andy-h
2/5/2014 - 7:35 PM

Functions to encode/decode text for use in HTML 4.01.

Functions to encode/decode text for use in HTML 4.01.

//Encode the markup-significant characters (& < > " ') in a string for use in HTML 4.01.
//	str               - the text to encode
//	keepValidEntities - if truthy, an ampersand that already starts a valid character reference
//	                    (decimal "&#39;", hexadecimal "&#x27;", or one of the named HTML 4.01
//	                    entities listed below, each terminated by ";") is left as-is;
//	                    otherwise every ampersand is encoded
//Returns the encoded string.
function textToHTML(str, keepValidEntities){
	"use strict";
	var validEntityNames, rxp;
	
	if(keepValidEntities){
		//see http://www.w3.org/TR/html401/sgml/entities.html
		//(entity names are case-sensitive, so the pattern below is deliberately not case-insensitive)
		validEntityNames = ""+
			//markup-significant and internationalization characters
			"quot|amp|lt|gt|OElig|oelig|Scaron|scaron|Yuml|circ|tilde|ensp|emsp|thinsp|zwnj|zwj|lrm|"+
			"rlm|ndash|mdash|lsquo|rsquo|sbquo|ldquo|rdquo|bdquo|dagger|Dagger|permil|lsaquo|rsaquo|euro|"+
			//ISO 8859-1 characters
			"nbsp|iexcl|cent|pound|curren|yen|brvbar|sect|uml|copy|ordf|laquo|not|shy|reg|macr|deg|"+
			"plusmn|sup2|sup3|acute|micro|para|middot|cedil|sup1|ordm|raquo|frac14|frac12|frac34|iquest|"+
			"Agrave|Aacute|Acirc|Atilde|Auml|Aring|AElig|Ccedil|Egrave|Eacute|Ecirc|Euml|Igrave|Iacute|"+
			"Icirc|Iuml|ETH|Ntilde|Ograve|Oacute|Ocirc|Otilde|Ouml|times|Oslash|Ugrave|Uacute|Ucirc|Uuml|"+
			"Yacute|THORN|szlig|agrave|aacute|acirc|atilde|auml|aring|aelig|ccedil|egrave|eacute|ecirc|euml|"+
			"igrave|iacute|icirc|iuml|eth|ntilde|ograve|oacute|ocirc|otilde|ouml|divide|oslash|ugrave|"+
			"uacute|ucirc|uuml|yacute|thorn|yuml|"+
			//symbols, mathematical symbols, and Greek letters
			"fnof|Alpha|Beta|Gamma|Delta|Epsilon|Zeta|Eta|Theta|Iota|Kappa|Lambda|Mu|Nu|Xi|Omicron|Pi|"+
			"Rho|Sigma|Tau|Upsilon|Phi|Chi|Psi|Omega|alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|"+
			"kappa|lambda|mu|nu|xi|omicron|pi|rho|sigmaf|sigma|tau|upsilon|phi|chi|psi|omega|thetasym|"+
			"upsih|piv|bull|hellip|prime|Prime|oline|frasl|weierp|image|real|trade|alefsym|larr|uarr|rarr|"+
			"darr|harr|crarr|lArr|uArr|rArr|dArr|hArr|forall|part|exist|empty|nabla|isin|notin|ni|prod|"+
			"sum|minus|lowast|radic|prop|infin|ang|and|or|cap|cup|int|there4|sim|cong|asymp|ne|equiv|"+
			"le|ge|sub|sup|nsub|sube|supe|oplus|otimes|perp|sdot|lceil|rceil|lfloor|rfloor|lang|rang|"+
			"loz|spades|clubs|hearts|diams";
		
		//match every "&" that is NOT followed by "#<digits>;", "#x<hex digits>;" or "<valid name>;"
		rxp = new RegExp("&(?!(?:#([0-9]+|[xX][a-fA-F0-9]+)|"+validEntityNames+");)", "g");
		
		str = str.replace(rxp, "&amp;");	//encode ampersands that are not part of a valid character entity reference
	}
	else{
		str = str.replace(/&/g, "&amp;");	//encode all ampersands
	}
	//encode the other markup-significant characters
	//(this must happen after the ampersand step so the "&" of the entities produced here is not re-encoded)
	return str.replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;").replace(/'/g, "&#39;");
}

//Decode all HTML character entity references in the string (not just the reserved characters).
//	str - text that may contain character references such as "&amp;" or "&#39;"
//Returns the decoded text; an empty input yields an empty string.
//Requires a DOM (`document`): the decoding is delegated to the browser's HTML parser.
function HTMLToText(str){
	"use strict";
	var tmp, node;
	tmp = document.createElement("div");
	//escape angle brackets first so the input can never be parsed as markup; only entities get decoded
	tmp.innerHTML = str.replace(/</g, "&lt;").replace(/>/g, "&gt;");
	//an empty input produces no child node at all, so guard against dereferencing null
	node = tmp.firstChild;
	return node ? node.nodeValue : "";
}

//Escape a string for use as a JavaScript string value in embedded or inline code.
//See http://code.google.com/p/doctype/wiki/ArticleXSSInJavaScript
//Usage example:
//	document.body.innerHTML += ("<script>console.log(\"" + textToJavaScriptString(userInput) + "\")</script>");
function textToJavaScriptString(str){
	"use strict";
	//every character that needs escaping, mapped to its replacement text
	var escapes = {
		"\\": "\\\\",
		
		//quotes and line-breaking characters: prevent "escape from the quote" attacks
		"'": "\\u0027",
		"\"": "\\u0022",
		"\u0009": "\\t",		//tab
		"\u000A": "\\n",		//line feed
		"\u000D": "\\r",		//carriage return
		"\u0085": "\\u0085",	//next line
		"\u2028": "\\u2028",	//line separator
		"\u2029": "\\u2029",	//paragraph separator
		
		//angle brackets: prevent the string from closing the tag
		"<": "\\u003C",
		">": "\\u003E",
		
		//escaped just in case ("defense-in-depth")
		"&": "\\u0026",
		"=": "\\u003D"
	};
	
	//a single pass is enough: no replacement text contains a character that itself needs escaping
	//other than the backslash, and each input character is visited exactly once
	return str.replace(/[\\'"\u0009\u000A\u000D\u0085\u2028\u2029<>&=]/g, function(ch){
		return escapes[ch];
	});
}