sync/lib/xss.js

/*
    WARNING

    This file contains an XSS prevention module I wrote myself.  It has not
    been verified by any external agency, and due to the nature of XSS I cannot
    guarantee that it will filter correctly.  Feel free to send me bug reports
    and I will do my best to fix them, but use at your own risk.

*/

/* Constructor for a basic XML/HTML tag parser.  Stores the input text,
   resets the cursor `i` to zero and parses straight away; the outcome of
   parse() is kept in `tag`.
*/
function TagParser(text) {
    this.text = text;
    this.i = 0;

    // Parsing happens eagerly so callers can read `.tag` right after `new`
    var parsed = this.parse();
    this.tag = parsed;
}

/* Advances the cursor `i` until it rests on a non-whitespace character or on
   the end of the text, whichever comes first.
*/
TagParser.prototype.skipWhitespace = function () {
    for (var end = this.text.length; this.i < end; this.i++) {
        var ch = this.text[this.i];
        if (ch.match(/\s/) === null) {
            return;
        }
    }
};

/* Reads a literal value: consumes characters, starting at the cursor, for as
   long as each one matches the given regexp.  Defaults to /[^\s>]/; i.e. any
   string not containing whitespace or the end of tag character '>'.

   The consumed text is normalised before it is returned:
     - numeric character references are decoded (decimal or hex, upper or
       lower case 'x', any number of digits, trailing ';' optional);
     - control characters \x00-\x1f are removed.
   Both steps are repeated until the text no longer changes.  The sanitized
   tag is rebuilt from this value, so any reference left undecoded here
   (e.g. "&#x26;#106;" -> "&#106;") would be decoded by the browser later
   and could smuggle "javascript:" past the filter.

   Returns the normalised string ("" when nothing matched).
*/
TagParser.prototype.readLiteral = function (regexp) {
    if (regexp === void 0) {
        regexp = /[^\s>]/;
    }
    var str = "";
    while (this.i < this.text.length && this.text[this.i].match(regexp)) {
        str += this.text[this.i];
        this.i++;
    }

    // Every pass that changes the text makes it strictly shorter, so this
    // loop always terminates.
    var previous;
    do {
        previous = str;
        str = previous.replace(/&#(?:x([0-9a-f]+)|([0-9]+));?/gi, function (m, hex, dec) {
            var code = hex !== undefined ? parseInt(hex, 16) : parseInt(dec, 10);
            // fromCodePoint throws above U+10FFFF; browsers substitute U+FFFD
            return code <= 0x10ffff ? String.fromCodePoint(code) : "\ufffd";
        });
        // Stripping control characters can join the pieces of a new
        // reference ("&#\x00106;"), hence it is part of the loop.
        str = str.replace(/[\x00-\x1f]/g, "");
    } while (str !== previous);

    return str;
};

/* If the character at the current position is a quote (", ' or `), read a
   quoted string with readString().  Otherwise read a literal with
   readLiteral(regexp).

   When the cursor is already at the end of the text there is no character
   to inspect, so the call falls through to readLiteral() (which returns "")
   instead of throwing on `undefined.match`.
*/
TagParser.prototype.readLiteralOrString = function (regexp) {
    if (this.i < this.text.length && this.text[this.i].match(/["'`]/)) {
        return this.readString();
    }
    return this.readLiteral(regexp);
};

/* Read a string delimited by the character at the current position (the
   caller, readLiteralOrString, dispatches here for ", ' and `).  A backslash
   immediately followed by the delimiter is treated as a literal delimiter
   character and not as the end of the string.  If the closing delimiter is
   missing, everything up to the end of the text is taken.

   The string contents are normalised before they are returned:
     - numeric character references are decoded (decimal or hex, upper or
       lower case 'x', any number of digits, trailing ';' optional);
     - control characters \x00-\x1f are removed.
   Both steps are repeated until the text no longer changes.  The sanitized
   tag is rebuilt from this value, so any reference left undecoded here
   (e.g. "&#x26;#106;" -> "&#106;") would be decoded by the browser later
   and could smuggle "javascript:" past the filter.

   Returns the normalised contents, without the delimiters.
*/
TagParser.prototype.readString = function () {
    var delim = this.text[this.i++];

    var str = "";
    while (this.i < this.text.length && this.text[this.i] !== delim) {
        if (this.text[this.i] === "\\" && this.text[this.i+1] === delim) {
            str += this.text[this.i+1];
            this.i++;
        } else {
            str += this.text[this.i];
        }
        this.i++;
    }
    // Consume the closing delimiter, but never move past the end of the text
    if (this.i < this.text.length) {
        this.i++;
    }

    // Every pass that changes the text makes it strictly shorter, so this
    // loop always terminates.
    var previous;
    do {
        previous = str;
        str = previous.replace(/&#(?:x([0-9a-f]+)|([0-9]+));?/gi, function (m, hex, dec) {
            var code = hex !== undefined ? parseInt(hex, 16) : parseInt(dec, 10);
            // fromCodePoint throws above U+10FFFF; browsers substitute U+FFFD
            return code <= 0x10ffff ? String.fromCodePoint(code) : "\ufffd";
        });
        // Stripping control characters can join the pieces of a new
        // reference ("&#\x00106;"), hence it is part of the loop.
        str = str.replace(/[\x00-\x1f]/g, "");
    } while (str !== previous);

    return str;
};

/* Attempts to parse a tag name and attributes from the first XML/HTML tag
   in the text.
   NOTE: Does not actually parse a DOM node, only parses the tag between
   '<' and '>' because that is all XSS filtering needs; whatever sits
   between a tag and its end tag is not examined here.

   Returns null when the text contains no '<'.  Otherwise returns
     {
       tagName:    the tag name; closing tags keep their leading '/',
       attributes: object mapping attribute name -> value ("" if no value),
       text:       the text from the start up to and including the closing
                   '>' (or up to the end of the text if there is none)
     }

   The tag name ends at whitespace, '>' or '/'.  Browsers treat '/' inside
   a tag like whitespace, so "<img/src=x/onerror=alert(1)>" is an <img>
   with attributes; reading it as one long tag name would let those
   attributes through unexamined.
*/
TagParser.prototype.parse = function () {
    this.i = this.text.indexOf("<");
    // Not a tag
    if (this.i === -1) {
        return null;
    }

    this.i++;
    this.skipWhitespace();

    // First non-whitespace string after the opening '<' is the tag name.
    // A leading '/' (closing tag) stays part of the name; any later '/'
    // terminates it.
    var tname = "";
    if (this.text[this.i] === "/") {
        tname = "/";
        this.i++;
    }
    tname += this.readLiteral(/[^\s\/>]/);

    var attrs = {};
    // Continue parsing attributes until the end of string is reached or
    // the end of tag is reached
    while (this.i < this.text.length && this.text[this.i] !== ">") {
        // Read any string not containing equals, possibly delimited by
        // a quote character
        var key = this.readLiteralOrString(/[^\s=>]/);
        this.skipWhitespace();
        // It's possible for tags to have attributes with no value, where
        // the equals sign is not necessary
        if (this.text[this.i] !== "=") {
            if (key.trim().length > 0) {
                attrs[key] = "";
            }
            continue;
        }

        this.i++;
        // The text may end right after '='; there is then no value to read
        var value = this.i < this.text.length ? this.readLiteralOrString() : "";
        if (key.trim().length > 0) {
            attrs[key] = value;
        }
        this.skipWhitespace();
    }

    // If end-of-string was not reached, consume the ending '>'
    if (this.i < this.text.length) {
        this.i++;
    }

    return {
        tagName: tname,
        attributes: attrs,
        text: this.text.substring(0, this.i) // Original text (for replacement)
    };
};

/* Case-insensitive pattern of tag names that get removed outright.  The
   pattern is not anchored, so it matches anywhere inside a tag name.  Some
   of these may not even be HTML tags; the list was borrowed from the [now
   deprecated] XSS module of node-validator.
*/
const badTags = new RegExp(
    ("alert applet audio basefont base behavior bgsound blink body embed " +
     "expression form frameset frame head html ilayer iframe input layer " +
     "link meta object style script textarea title video xml xss")
        .split(" ")
        .join("|"),
    "i"
);

/* Case-insensitive pattern of attribute names that get dropped.  Anything
   beginning with "on" is probably a javascript callback, and formaction /
   action can redirect where a form submits.
*/
const badAttrs = /\bon\S*|\bformaction|\baction/i;

/* Returns true if an attribute value contains the "javascript:" scheme.
   The comparison ignores letter case, whitespace and control characters,
   and sees through the named entities browsers decode inside such a value
   (&Tab; and &NewLine; vanish in URLs, &colon; is ':').
*/
function containsScriptScheme(value) {
    var normalized = value
        .toLowerCase()
        .replace(/&(?:tab|newline);/g, "")
        .replace(/&colon;/g, ":")
        .replace(/[\s\x00-\x20]/g, "");
    return normalized.indexOf("javascript:") !== -1;
}

/* Wraps an attribute value in quotes such that the value cannot end the
   attribute early.  Prefers double quotes, falls back to single quotes,
   and when the value holds both kinds encodes its double quotes as &quot;.
   (A backtick is not a quote character in HTML, so it is never used.)
*/
function quoteAttributeValue(value) {
    if (value.indexOf('"') === -1) {
        return '"' + value + '"';
    }
    if (value.indexOf("'") === -1) {
        return "'" + value + "'";
    }
    return '"' + value.replace(/"/g, "&quot;") + '"';
}

/* Sanitizes a string that may contain HTML.

   Walks across every tag delimiter '<' in the string, parses the tag that
   starts there with TagParser, and rewrites it in place:
     - a tag whose name matches badTags becomes the text "[tag removed]";
     - otherwise the tag is rebuilt from its parsed name and attributes,
       where attribute names are reduced to word characters, attributes
       matching badAttrs are dropped, and values containing a "javascript:"
       scheme become "[removed]".

   Returns the sanitized string; a string without '<' is returned as is.
*/
function sanitizeHTML(str) {
    var i = str.indexOf("<");

    while (i !== -1) {
        var t = new TagParser(str.substring(i)).tag;
        var replacement;

        if (t.tagName.replace("/", "").match(badTags)) {
            // Note: Important to replace the tag with a nonempty value,
            // otherwise <scr<script>ipt> would possibly defeat the filter.
            replacement = "[tag removed]";
        } else {
            var cleaned = {};
            var rawNames = Object.keys(t.attributes);
            for (var a = 0; a < rawNames.length; a++) {
                // Keys should not contain non-word characters.
                var name = rawNames[a].replace(/[^\w]/g, "");
                // If it's an evil attribute, just nuke it entirely
                if (name.length === 0 || name.match(badAttrs)) {
                    continue;
                }
                var value = t.attributes[rawNames[a]];
                cleaned[name] = containsScriptScheme(value) ? "[removed]" : value;
            }

            // Build the sanitized tag
            replacement = "<" + t.tagName;
            var names = Object.keys(cleaned);
            for (var b = 0; b < names.length; b++) {
                replacement += " " + names[b];
                if (cleaned[names[b]].trim().length > 0) {
                    replacement += "=" + quoteAttributeValue(cleaned[names[b]]);
                }
            }
            replacement += ">";
        }

        // Splice at position i.  String.prototype.replace would rewrite the
        // FIRST textual occurrence of the tag (possibly an earlier attribute
        // value) and would expand "$&"-style patterns in the replacement,
        // either of which leaves the real tag unsanitized.
        str = str.substring(0, i) + replacement + str.substring(i + t.text.length);
        i = str.indexOf("<", i + replacement.length);
    }

    return str;
}

/* WIP: Sanitize a string where HTML is prohibited.  Each of the characters
   & < > " ' ( ) is replaced by its HTML entity; everything else is kept.
*/
function sanitizeText(str) {
    var entityFor = {
        "&": "&amp;",
        "<": "&lt;",
        ">": "&gt;",
        "\"": "&quot;",
        "'": "&#39;",
        "(": "&#40;",
        ")": "&#41;"
    };
    return str.replace(/[&<>"'()]/g, function (ch) {
        return entityFor[ch];
    });
}

/* Decodes HTML entities in a string, in this order:
     1. decimal references  &#NN;   (2 to 7 digits, ';' optional)
     2. hex references      &#xNN;  (2 to 7 hex digits, any case, ';' optional)
     3. the named entities  &lt; &gt; &quot; and finally &amp;
   Numeric references are decoded as full Unicode code points, so values
   above U+FFFF (e.g. &#128512;) yield the intended character.  Values
   beyond U+10FFFF are not valid code points and become U+FFFD.

   Returns the decoded string.
*/
function decodeText(str) {
    function fromCode(code) {
        // String.fromCodePoint throws a RangeError above U+10FFFF
        return code <= 0x10ffff ? String.fromCodePoint(code) : "\ufffd";
    }

    str = str.replace(/&#([0-9]{2,7});?/g, function (m, p1) {
        return fromCode(parseInt(p1, 10));
    });
    str = str.replace(/&#x([0-9a-f]{2,7});?/ig, function (m, p1) {
        return fromCode(parseInt(p1, 16));
    });
    // &amp; goes last so that "&amp;lt;" decodes to "&lt;" and not to "<"
    str = str.replace(/&lt;/g, "<")
             .replace(/&gt;/g, ">")
             .replace(/&quot;/g, "\"")
             .replace(/&amp;/g, "&");
    return str;
}

// Public API of this module
Object.assign(module.exports, {
    sanitizeHTML: sanitizeHTML,
    sanitizeText: sanitizeText,
    decodeText: decodeText
});
Comment xss.js 2013-11-02 07:25:16 +00:00			`/*`
			`WARNING`

			`This file contains an XSS prevention module I wrote myself. It has not`
			`been verified by any external agency, and due to the nature of XSS I cannot`
			`guarantee that it will filter correctly. Feel free to send me bug reports`
			`and I will do my best to fix them, but use at your own risk.`

			`*/`

			`/* Prototype for a basic XML tag parser */`
Implement basic XSS filter 2013-10-31 05:39:35 +00:00			`function TagParser(text) {`
			`this.text = text;`
			`this.i = 0;`
			`this.tag = this.parse();`
			`}`

Comment xss.js 2013-11-02 07:25:16 +00:00			`/* Moves the position marker past any whitespace characters */`
Implement basic XSS filter 2013-10-31 05:39:35 +00:00			`TagParser.prototype.skipWhitespace = function () {`
			`while (this.i < this.text.length && this.text[this.i].match(/\s/)) {`
			`this.i++;`
			`}`
			`};`

Comment xss.js 2013-11-02 07:25:16 +00:00			`/* Reads a literal value matching the given regexp. Defaults`
			`to /[^\s>]/; i.e. any string not containing whitespace or`
			`the end of tag character '>'`
			`*/`
Implement basic XSS filter 2013-10-31 05:39:35 +00:00			`TagParser.prototype.readLiteral = function (regexp) {`
			`if (regexp === void 0) {`
			`regexp = /[^\s>]/;`
			`}`
			`var str = "";`
			`while (this.i < this.text.length && this.text[this.i].match(regexp)) {`
			`str += this.text[this.i];`
			`this.i++;`
			`}`
Add XSS filter 2014-01-23 17:45:08 +00:00
			`str = str.replace(/&#([0-9]{2,7});?/g, function (m, p1) {`
			`return String.fromCharCode(parseInt(p1));`
			`});`

			`str = str.replace(/&#x([0-9a-fA-F]{2,7});?/g, function (m, p1) {`
			`return String.fromCharCode(parseInt(p1, 16));`
			`});`

			`str = str.replace(/[\x00-\x1f]/g, "");`
Implement basic XSS filter 2013-10-31 05:39:35 +00:00			`return str;`
			`};`

Comment xss.js 2013-11-02 07:25:16 +00:00			`/* If the character at the current position is a quote, read`
			`a string. Otherwise, read a literal`
			`*/`
Implement basic XSS filter 2013-10-31 05:39:35 +00:00			`TagParser.prototype.readLiteralOrString = function (regexp) {`
Add XSS filter 2014-01-23 17:45:08 +00:00			if (this.text[this.i].match(/["'`]/)) {
Implement basic XSS filter 2013-10-31 05:39:35 +00:00			`return this.readString();`
			`}`
			`return this.readLiteral(regexp);`
			`};`

Comment xss.js 2013-11-02 07:25:16 +00:00			`/* Read a string delimited by the character at the current`
			`position. For XML tags this means strings enclosed in`
			`" or '. Treats \" as a literal '"' symbol and not a`
			`delimiter.`
			`*/`
Implement basic XSS filter 2013-10-31 05:39:35 +00:00			`TagParser.prototype.readString = function () {`
			`var delim = this.text[this.i++];`

			`var str = "";`
			`while (this.i < this.text.length && this.text[this.i] !== delim) {`
			`if (this.text[this.i] === "\\" && this.text[this.i+1] === delim) {`
			`str += this.text[this.i+1];`
			`this.i++;`
			`} else {`
			`str += this.text[this.i];`
			`}`
			`this.i++;`
			`}`
			`this.i++;`
Add XSS filter 2014-01-23 17:45:08 +00:00
			`str = str.replace(/&#([0-9]{2,7});?/g, function (m, p1) {`
			`return String.fromCharCode(parseInt(p1));`
			`});`

			`str = str.replace(/&#x([0-9a-fA-F]{2,7});?/g, function (m, p1) {`
			`return String.fromCharCode(parseInt(p1, 16));`
			`});`

			`str = str.replace(/[\x00-\x1f]/g, "");`
Implement basic XSS filter 2013-10-31 05:39:35 +00:00			`return str;`
			`};`

Comment xss.js 2013-11-02 07:25:16 +00:00			`/* Attempts to parse a tagname and attributes from an`
			`XML tag.`
			`NOTE: Does not actually parse a DOM node, only parses`
			`the tag between '<' and '>' because that's all I need`
			`to do XSS filtering, I don't care what's between a tag`
			`and its end tag (if it's another tag I handle that`
			`separately)`
			`*/`
Implement basic XSS filter 2013-10-31 05:39:35 +00:00			`TagParser.prototype.parse = function () {`
			`this.i = this.text.indexOf("<");`
Comment xss.js 2013-11-02 07:25:16 +00:00			`// Not a tag`
Implement basic XSS filter 2013-10-31 05:39:35 +00:00			`if (this.i === -1) {`
			`return null;`
			`}`

			`this.i++;`
			`this.skipWhitespace();`

Comment xss.js 2013-11-02 07:25:16 +00:00			`// First non-whitespace string after the opening '<' is the tag name`
Implement basic XSS filter 2013-10-31 05:39:35 +00:00			`var tname = this.readLiteral();`

			`var attrs = {};`
Comment xss.js 2013-11-02 07:25:16 +00:00			`// Continue parsing attributes until the end of string is reached or`
			`// the end of tag is reached`
Fix a few edge cases for XSS 2013-10-31 05:48:01 +00:00			`while (this.i < this.text.length && this.text[this.i] !== ">") {`
Comment xss.js 2013-11-02 07:25:16 +00:00			`// Read any string not containing equals, possibly delimited by`
			`// " or '`
Implement basic XSS filter 2013-10-31 05:39:35 +00:00			`var key = this.readLiteralOrString(/[^\s=>]/);`
			`this.skipWhitespace();`
Comment xss.js 2013-11-02 07:25:16 +00:00			`// It's possible for tags to have attributes with no value, where`
			`// the equals sign is not necessary`
Implement basic XSS filter 2013-10-31 05:39:35 +00:00			`if (this.text[this.i] !== "=") {`
			`if (key.trim().length > 0) {`
			`attrs[key] = "";`
			`}`
			`continue;`
			`}`

			`this.i++;`
Add XSS filter 2014-01-23 17:45:08 +00:00			`//this.skipWhitespace();`
Implement basic XSS filter 2013-10-31 05:39:35 +00:00			`var value = this.readLiteralOrString();`
			`if (key.trim().length > 0) {`
			`attrs[key] = value;`
			`}`
			`this.skipWhitespace();`
			`}`
Fix a few edge cases for XSS 2013-10-31 05:48:01 +00:00
Comment xss.js 2013-11-02 07:25:16 +00:00			`// If end-of-string was not reached, consume the ending '>'`
Fix a few edge cases for XSS 2013-10-31 05:48:01 +00:00			`if (this.i < this.text.length) {`
			`this.i++;`
			`}`
Implement basic XSS filter 2013-10-31 05:39:35 +00:00
			`return {`
			`tagName: tname,`
			`attributes: attrs,`
Comment xss.js 2013-11-02 07:25:16 +00:00			`text: this.text.substring(0, this.i) // Original text (for replacement)`
Implement basic XSS filter 2013-10-31 05:39:35 +00:00			`};`
			`};`

			`/* Some of these may not even be HTML tags, I borrowed them from the`
			`[now deprecated] XSS module of node-validator`
			`*/`
			`const badTags = new RegExp([`
			`"alert",`
			`"applet",`
			`"audio",`
			`"basefont",`
			`"base",`
			`"behavior",`
			`"bgsound",`
			`"blink",`
			`"body",`
			`"embed",`
			`"expression",`
			`"form",`
			`"frameset",`
			`"frame",`
			`"head",`
			`"html",`
			`"ilayer",`
			`"iframe",`
			`"input",`
			`"layer",`
			`"link",`
			`"meta",`
			`"object",`
			`"style",`
			`"script",`
			`"textarea",`
			`"title",`
			`"video",`
			`"xml",`
			`"xss"`
			`].join("\|"), "i");`

Comment xss.js 2013-11-02 07:25:16 +00:00			`/* Nasty attributes. Anything starting with "on" is probably a javascript`
			`callback, and I hope you see why formaction is a bad idea.`
			`*/`
Implement basic XSS filter 2013-10-31 05:39:35 +00:00			`const badAttrs = new RegExp([`
			`"\\bon\\S*",`
Add XSS filter 2014-01-23 17:45:08 +00:00			`"\\bformaction",`
			`"\\baction"`
Implement basic XSS filter 2013-10-31 05:39:35 +00:00			`].join("\|"), "i");`

			`function sanitizeHTML(str) {`
			`var i = str.indexOf("<");`
			`if (i === -1) {`
Comment xss.js 2013-11-02 07:25:16 +00:00			`// No HTML tags in the string`
Implement basic XSS filter 2013-10-31 05:39:35 +00:00			`return str;`
			`}`

Comment xss.js 2013-11-02 07:25:16 +00:00			`// Loop across all tag delimiters '<' in string, parse each one,`
			`// and replace the results with sanitized tags`
Implement basic XSS filter 2013-10-31 05:39:35 +00:00			`while (i !== -1) {`
			`var t = new TagParser(str.substring(i)).tag;`
			`if (t.tagName.replace("/", "").match(badTags)) {`
Comment xss.js 2013-11-02 07:25:16 +00:00			`// Note: Important that I replace the tag with a nonempty value,`
			`// otherwise <scr<script>ipt> would possibly defeat the filter.`
Implement basic XSS filter 2013-10-31 05:39:35 +00:00			`str = str.replace(t.text, "[tag removed]");`
			`i = str.indexOf("<", i+1);`
			`continue;`
			`}`
			`for (var k in t.attributes) {`
Minor correction to xss.js 2013-11-02 07:28:43 +00:00			`// Keys should not contain non-word characters.`
			`var k2 = k.replace(/[^\w]/g, "");`
			`if (k2 !== k) {`
			`t.attributes[k2] = t.attributes[k];`
			`delete t.attributes[k];`
			`k = k2;`
			`}`
Comment xss.js 2013-11-02 07:25:16 +00:00			`// If it's an evil attribute, just nuke it entirely`
Implement basic XSS filter 2013-10-31 05:39:35 +00:00			`if (k.match(badAttrs)) {`
			`delete t.attributes[k];`
Add XSS filter 2014-01-23 17:45:08 +00:00			`} else {`
Update XSS filter 2014-06-26 03:22:54 +00:00			`if (t.attributes[k].replace(/\s/g, "").indexOf("javascript:") !== -1) {`
			`t.attributes[k] = "[removed]";`
Fix a few edge cases for XSS 2013-10-31 05:48:01 +00:00			`}`

Implement basic XSS filter 2013-10-31 05:39:35 +00:00			`}`
			`}`
Comment xss.js 2013-11-02 07:25:16 +00:00			`// Build the sanitized tag`
Implement basic XSS filter 2013-10-31 05:39:35 +00:00			`var fmt = "<" + t.tagName;`
			`for (var k in t.attributes) {`
Comment xss.js 2013-11-02 07:25:16 +00:00			`if (k.trim().length > 0) {`
			`fmt += " " + k;`
			`if (t.attributes[k].trim().length > 0) {`
Add XSS filter 2014-01-23 17:45:08 +00:00			`var delim = '"';`
			`if (t.attributes[k].match(/[^\\]"/)) {`
			`delim = "'";`
			`if (t.attributes[k].match(/[^\\]'/)) {`
			delim = "`";
			`}`
			`}`
			`fmt += "=" + delim + t.attributes[k] + delim;`
Comment xss.js 2013-11-02 07:25:16 +00:00			`}`
			`}`
Implement basic XSS filter 2013-10-31 05:39:35 +00:00			`}`
			`str = str.replace(t.text, fmt + ">");`
			`i = str.indexOf("<", i + fmt.length + 1);`
			`}`

			`return str;`
			`}`

Comment xss.js 2013-11-02 07:25:16 +00:00			`/* WIP: Sanitize a string where HTML is prohibited */`
Start working on text sanitizer 2013-10-31 23:53:03 +00:00			`function sanitizeText(str) {`
			`str = str.replace(/&/g, "&")`
			`.replace(/</g, "<")`
			`.replace(/>/g, ">")`
			`.replace(/"/g, """)`
			`.replace(/'/g, "'")`
			`.replace(/\(/g, "(")`
			`.replace(/\)/g, ")");`
			`return str;`
			`}`

Work on text filter 2013-11-05 16:37:50 +00:00			`function decodeText(str) {`
Add XSS filter 2014-01-23 17:45:08 +00:00			`str = str.replace(/&#([0-9]{2,7});?/g, function (m, p1) {`
Work on text filter 2013-11-05 16:37:50 +00:00			`return String.fromCharCode(parseInt(p1));`
			`});`
Add XSS filter 2014-01-23 17:45:08 +00:00			`str = str.replace(/&#x([0-9a-f]{2,7});?/ig, function (m, p1) {`
Work on text filter 2013-11-05 16:37:50 +00:00			`return String.fromCharCode(parseInt(p1, 16));`
			`});`
			`str = str.replace(/</g, "<")`
			`.replace(/>/g, ">")`
			`.replace(/"/g, "\"")`
			`.replace(/&/g, "&");`
			`return str;`
			`}`

Implement basic XSS filter 2013-10-31 05:39:35 +00:00			`module.exports.sanitizeHTML = sanitizeHTML;`
Work on text filter 2013-11-05 16:37:50 +00:00			`module.exports.sanitizeText = sanitizeText;`
			`module.exports.decodeText = decodeText;`