2013-11-02 07:25:16 +00:00
|
|
|
/*
    WARNING

    This file contains an XSS prevention module I wrote myself.  It has not
    been verified by any external agency, and due to the nature of XSS I cannot
    guarantee that it will filter correctly.  Feel free to send me bug reports
    and I will do my best to fix them, but use at your own risk.
*/
|
|
|
|
|
|
|
|
/* Prototype for a basic XML tag parser */
|
2013-10-31 05:39:35 +00:00
|
|
|
function TagParser(text) {
|
|
|
|
this.text = text;
|
|
|
|
this.i = 0;
|
|
|
|
this.tag = this.parse();
|
|
|
|
}
|
|
|
|
|
2013-11-02 07:25:16 +00:00
|
|
|
/* Moves the position marker past any whitespace characters */
|
2013-10-31 05:39:35 +00:00
|
|
|
TagParser.prototype.skipWhitespace = function () {
|
|
|
|
while (this.i < this.text.length && this.text[this.i].match(/\s/)) {
|
|
|
|
this.i++;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2013-11-02 07:25:16 +00:00
|
|
|
/* Reads a literal value matching the given regexp. Defaults
|
|
|
|
to /[^\s>]/; i.e. any string not containing whitespace or
|
|
|
|
the end of tag character '>'
|
|
|
|
*/
|
2013-10-31 05:39:35 +00:00
|
|
|
TagParser.prototype.readLiteral = function (regexp) {
|
|
|
|
if (regexp === void 0) {
|
|
|
|
regexp = /[^\s>]/;
|
|
|
|
}
|
|
|
|
var str = "";
|
|
|
|
while (this.i < this.text.length && this.text[this.i].match(regexp)) {
|
|
|
|
str += this.text[this.i];
|
|
|
|
this.i++;
|
|
|
|
}
|
2014-01-23 17:45:08 +00:00
|
|
|
|
|
|
|
str = str.replace(/&#([0-9]{2,7});?/g, function (m, p1) {
|
|
|
|
return String.fromCharCode(parseInt(p1));
|
|
|
|
});
|
|
|
|
|
|
|
|
str = str.replace(/&#x([0-9a-fA-F]{2,7});?/g, function (m, p1) {
|
|
|
|
return String.fromCharCode(parseInt(p1, 16));
|
|
|
|
});
|
|
|
|
|
|
|
|
str = str.replace(/[\x00-\x1f]/g, "");
|
2013-10-31 05:39:35 +00:00
|
|
|
return str;
|
|
|
|
};
|
|
|
|
|
2013-11-02 07:25:16 +00:00
|
|
|
/* If the character at the current position is a quote, read
|
|
|
|
a string. Otherwise, read a literal
|
|
|
|
*/
|
2013-10-31 05:39:35 +00:00
|
|
|
TagParser.prototype.readLiteralOrString = function (regexp) {
|
2014-01-23 17:45:08 +00:00
|
|
|
if (this.text[this.i].match(/["'`]/)) {
|
2013-10-31 05:39:35 +00:00
|
|
|
return this.readString();
|
|
|
|
}
|
|
|
|
return this.readLiteral(regexp);
|
|
|
|
};
|
|
|
|
|
2013-11-02 07:25:16 +00:00
|
|
|
/* Read a string delimited by the character at the current
|
|
|
|
position. For XML tags this means strings enclosed in
|
|
|
|
" or '. Treats \" as a literal '"' symbol and not a
|
|
|
|
delimiter.
|
|
|
|
*/
|
2013-10-31 05:39:35 +00:00
|
|
|
TagParser.prototype.readString = function () {
|
|
|
|
var delim = this.text[this.i++];
|
|
|
|
|
|
|
|
var str = "";
|
|
|
|
while (this.i < this.text.length && this.text[this.i] !== delim) {
|
|
|
|
if (this.text[this.i] === "\\" && this.text[this.i+1] === delim) {
|
|
|
|
str += this.text[this.i+1];
|
|
|
|
this.i++;
|
|
|
|
} else {
|
|
|
|
str += this.text[this.i];
|
|
|
|
}
|
|
|
|
this.i++;
|
|
|
|
}
|
|
|
|
this.i++;
|
2014-01-23 17:45:08 +00:00
|
|
|
|
|
|
|
str = str.replace(/&#([0-9]{2,7});?/g, function (m, p1) {
|
|
|
|
return String.fromCharCode(parseInt(p1));
|
|
|
|
});
|
|
|
|
|
|
|
|
str = str.replace(/&#x([0-9a-fA-F]{2,7});?/g, function (m, p1) {
|
|
|
|
return String.fromCharCode(parseInt(p1, 16));
|
|
|
|
});
|
|
|
|
|
|
|
|
str = str.replace(/[\x00-\x1f]/g, "");
|
2013-10-31 05:39:35 +00:00
|
|
|
return str;
|
|
|
|
};
|
|
|
|
|
2013-11-02 07:25:16 +00:00
|
|
|
/* Attempts to parse a tagname and attributes from an
|
|
|
|
XML tag.
|
|
|
|
NOTE: Does not actually parse a DOM node, only parses
|
|
|
|
the tag between '<' and '>' because that's all I need
|
|
|
|
to do XSS filtering, I don't care what's between a tag
|
|
|
|
and its end tag (if it's another tag I handle that
|
|
|
|
separately)
|
|
|
|
*/
|
2013-10-31 05:39:35 +00:00
|
|
|
TagParser.prototype.parse = function () {
|
|
|
|
this.i = this.text.indexOf("<");
|
2013-11-02 07:25:16 +00:00
|
|
|
// Not a tag
|
2013-10-31 05:39:35 +00:00
|
|
|
if (this.i === -1) {
|
|
|
|
return null;
|
|
|
|
}
|
|
|
|
|
|
|
|
this.i++;
|
|
|
|
this.skipWhitespace();
|
|
|
|
|
2013-11-02 07:25:16 +00:00
|
|
|
// First non-whitespace string after the opening '<' is the tag name
|
2013-10-31 05:39:35 +00:00
|
|
|
var tname = this.readLiteral();
|
|
|
|
|
|
|
|
var attrs = {};
|
2013-11-02 07:25:16 +00:00
|
|
|
// Continue parsing attributes until the end of string is reached or
|
|
|
|
// the end of tag is reached
|
2013-10-31 05:48:01 +00:00
|
|
|
while (this.i < this.text.length && this.text[this.i] !== ">") {
|
2013-11-02 07:25:16 +00:00
|
|
|
// Read any string not containing equals, possibly delimited by
|
|
|
|
// " or '
|
2013-10-31 05:39:35 +00:00
|
|
|
var key = this.readLiteralOrString(/[^\s=>]/);
|
|
|
|
this.skipWhitespace();
|
2013-11-02 07:25:16 +00:00
|
|
|
// It's possible for tags to have attributes with no value, where
|
|
|
|
// the equals sign is not necessary
|
2013-10-31 05:39:35 +00:00
|
|
|
if (this.text[this.i] !== "=") {
|
|
|
|
if (key.trim().length > 0) {
|
|
|
|
attrs[key] = "";
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
this.i++;
|
2014-01-23 17:45:08 +00:00
|
|
|
//this.skipWhitespace();
|
2013-10-31 05:39:35 +00:00
|
|
|
var value = this.readLiteralOrString();
|
|
|
|
if (key.trim().length > 0) {
|
|
|
|
attrs[key] = value;
|
|
|
|
}
|
|
|
|
this.skipWhitespace();
|
|
|
|
}
|
2013-10-31 05:48:01 +00:00
|
|
|
|
2013-11-02 07:25:16 +00:00
|
|
|
// If end-of-string was not reached, consume the ending '>'
|
2013-10-31 05:48:01 +00:00
|
|
|
if (this.i < this.text.length) {
|
|
|
|
this.i++;
|
|
|
|
}
|
2013-10-31 05:39:35 +00:00
|
|
|
|
|
|
|
return {
|
|
|
|
tagName: tname,
|
|
|
|
attributes: attrs,
|
2013-11-02 07:25:16 +00:00
|
|
|
text: this.text.substring(0, this.i) // Original text (for replacement)
|
2013-10-31 05:39:35 +00:00
|
|
|
};
|
|
|
|
};
|
|
|
|
|
|
|
|
/* Some of these may not even be HTML tags, I borrowed them from the
|
|
|
|
[now deprecated] XSS module of node-validator
|
|
|
|
*/
|
|
|
|
const badTags = new RegExp([
|
|
|
|
"alert",
|
|
|
|
"applet",
|
|
|
|
"audio",
|
|
|
|
"basefont",
|
|
|
|
"base",
|
|
|
|
"behavior",
|
|
|
|
"bgsound",
|
|
|
|
"blink",
|
|
|
|
"body",
|
|
|
|
"embed",
|
|
|
|
"expression",
|
|
|
|
"form",
|
|
|
|
"frameset",
|
|
|
|
"frame",
|
|
|
|
"head",
|
|
|
|
"html",
|
|
|
|
"ilayer",
|
|
|
|
"iframe",
|
|
|
|
"input",
|
|
|
|
"layer",
|
|
|
|
"link",
|
|
|
|
"meta",
|
|
|
|
"object",
|
|
|
|
"style",
|
|
|
|
"script",
|
|
|
|
"textarea",
|
|
|
|
"title",
|
|
|
|
"video",
|
|
|
|
"xml",
|
|
|
|
"xss"
|
|
|
|
].join("|"), "i");
|
|
|
|
|
2013-11-02 07:25:16 +00:00
|
|
|
/* Nasty attributes. Anything starting with "on" is probably a javascript
|
|
|
|
callback, and I hope you see why formaction is a bad idea.
|
|
|
|
*/
|
2013-10-31 05:39:35 +00:00
|
|
|
const badAttrs = new RegExp([
|
|
|
|
"\\bon\\S*",
|
2014-01-23 17:45:08 +00:00
|
|
|
"\\bformaction",
|
|
|
|
"\\baction"
|
2013-10-31 05:39:35 +00:00
|
|
|
].join("|"), "i");
|
|
|
|
|
|
|
|
function sanitizeHTML(str) {
|
|
|
|
var i = str.indexOf("<");
|
|
|
|
if (i === -1) {
|
2013-11-02 07:25:16 +00:00
|
|
|
// No HTML tags in the string
|
2013-10-31 05:39:35 +00:00
|
|
|
return str;
|
|
|
|
}
|
|
|
|
|
2013-11-02 07:25:16 +00:00
|
|
|
// Loop across all tag delimiters '<' in string, parse each one,
|
|
|
|
// and replace the results with sanitized tags
|
2013-10-31 05:39:35 +00:00
|
|
|
while (i !== -1) {
|
|
|
|
var t = new TagParser(str.substring(i)).tag;
|
|
|
|
if (t.tagName.replace("/", "").match(badTags)) {
|
2013-11-02 07:25:16 +00:00
|
|
|
// Note: Important that I replace the tag with a nonempty value,
|
|
|
|
// otherwise <scr<script>ipt> would possibly defeat the filter.
|
2013-10-31 05:39:35 +00:00
|
|
|
str = str.replace(t.text, "[tag removed]");
|
|
|
|
i = str.indexOf("<", i+1);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
for (var k in t.attributes) {
|
2013-11-02 07:28:43 +00:00
|
|
|
// Keys should not contain non-word characters.
|
|
|
|
var k2 = k.replace(/[^\w]/g, "");
|
|
|
|
if (k2 !== k) {
|
|
|
|
t.attributes[k2] = t.attributes[k];
|
|
|
|
delete t.attributes[k];
|
|
|
|
k = k2;
|
|
|
|
}
|
2013-11-02 07:25:16 +00:00
|
|
|
// If it's an evil attribute, just nuke it entirely
|
2013-10-31 05:39:35 +00:00
|
|
|
if (k.match(badAttrs)) {
|
|
|
|
delete t.attributes[k];
|
2014-01-23 17:45:08 +00:00
|
|
|
} else {
|
2014-06-26 03:22:54 +00:00
|
|
|
if (t.attributes[k].replace(/\s/g, "").indexOf("javascript:") !== -1) {
|
|
|
|
t.attributes[k] = "[removed]";
|
2013-10-31 05:48:01 +00:00
|
|
|
}
|
|
|
|
|
2013-10-31 05:39:35 +00:00
|
|
|
}
|
|
|
|
}
|
2013-11-02 07:25:16 +00:00
|
|
|
// Build the sanitized tag
|
2013-10-31 05:39:35 +00:00
|
|
|
var fmt = "<" + t.tagName;
|
|
|
|
for (var k in t.attributes) {
|
2013-11-02 07:25:16 +00:00
|
|
|
if (k.trim().length > 0) {
|
|
|
|
fmt += " " + k;
|
|
|
|
if (t.attributes[k].trim().length > 0) {
|
2014-01-23 17:45:08 +00:00
|
|
|
var delim = '"';
|
|
|
|
if (t.attributes[k].match(/[^\\]"/)) {
|
|
|
|
delim = "'";
|
|
|
|
if (t.attributes[k].match(/[^\\]'/)) {
|
|
|
|
delim = "`";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
fmt += "=" + delim + t.attributes[k] + delim;
|
2013-11-02 07:25:16 +00:00
|
|
|
}
|
|
|
|
}
|
2013-10-31 05:39:35 +00:00
|
|
|
}
|
|
|
|
str = str.replace(t.text, fmt + ">");
|
|
|
|
i = str.indexOf("<", i + fmt.length + 1);
|
|
|
|
}
|
|
|
|
|
|
|
|
return str;
|
|
|
|
}
|
|
|
|
|
2013-11-02 07:25:16 +00:00
|
|
|
/* WIP: Sanitize a string where HTML is prohibited */
|
2013-10-31 23:53:03 +00:00
|
|
|
function sanitizeText(str) {
|
|
|
|
str = str.replace(/&/g, "&")
|
|
|
|
.replace(/</g, "<")
|
|
|
|
.replace(/>/g, ">")
|
|
|
|
.replace(/"/g, """)
|
|
|
|
.replace(/'/g, "'")
|
|
|
|
.replace(/\(/g, "(")
|
|
|
|
.replace(/\)/g, ")");
|
|
|
|
return str;
|
|
|
|
}
|
|
|
|
|
2013-11-05 16:37:50 +00:00
|
|
|
function decodeText(str) {
|
2014-01-23 17:45:08 +00:00
|
|
|
str = str.replace(/&#([0-9]{2,7});?/g, function (m, p1) {
|
2013-11-05 16:37:50 +00:00
|
|
|
return String.fromCharCode(parseInt(p1));
|
|
|
|
});
|
2014-01-23 17:45:08 +00:00
|
|
|
str = str.replace(/&#x([0-9a-f]{2,7});?/ig, function (m, p1) {
|
2013-11-05 16:37:50 +00:00
|
|
|
return String.fromCharCode(parseInt(p1, 16));
|
|
|
|
});
|
|
|
|
str = str.replace(/</g, "<")
|
|
|
|
.replace(/>/g, ">")
|
|
|
|
.replace(/"/g, "\"")
|
|
|
|
.replace(/&/g, "&");
|
|
|
|
return str;
|
|
|
|
}
|
|
|
|
|
2013-10-31 05:39:35 +00:00
|
|
|
module.exports.sanitizeHTML = sanitizeHTML;
|
2013-11-05 16:37:50 +00:00
|
|
|
module.exports.sanitizeText = sanitizeText;
|
|
|
|
module.exports.decodeText = decodeText;
|