Files
js-xss/lib/parser.js
2014-02-13 16:33:35 +08:00

181 lines
4.6 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
 * Simple HTML parser
 *
 * @author 老雷<leizongmin@gmail.com>
 */
/**
 * Extract the lower-cased tag name from the source of a single tag.
 *
 * The leading '/' of a closing tag and the trailing '/' of a self-closing
 * tag are not part of the returned name.
 *
 * @param {String} html full tag source, e.g. '<a href="#">', '</a>' or '<br/>'
 * @return {String} tag name, e.g. 'a' or 'br'
 */
function getTagName (html) {
  // The name ends at the first whitespace character of ANY kind: attributes
  // may be separated from the name by a tab or a newline, not only a space.
  var end = html.search(/\s/);
  var tagName;
  if (end === -1) {
    // No attributes: drop the surrounding '<' and '>'.
    tagName = html.slice(1, -1);
  } else {
    tagName = html.slice(1, end);
  }
  tagName = tagName.trim().toLowerCase();
  if (tagName.charAt(0) === '/') tagName = tagName.slice(1);
  if (tagName.charAt(tagName.length - 1) === '/') tagName = tagName.slice(0, -1);
  return tagName;
}
/**
 * Tell whether a tag's source is a closing tag, i.e. starts with '</'.
 *
 * @param {String} html full tag source, e.g. '<a href="#">' or '</a>'
 * @return {Boolean} true for a closing tag, false otherwise
 */
function isClosing (html) {
  var opensTag = html.charAt(0) === '<';
  var hasSlash = html.charAt(1) === '/';
  return opensTag && hasSlash;
}
/**
 * Scan an HTML string, hand every complete tag to `onTag`, and pass all
 * other text through `escapeHtml`. Returns the concatenated result.
 *
 * @param {String} html source HTML
 * @param {Function} onTag tag handler; its return value is appended to the output.
 *   Called as: onTag(sourcePosition, position, tag, html, isClosing)
 *     - sourcePosition: index of the tag's '<' in the input
 *     - position: length of the output built so far
 *     - tag: lower-cased tag name
 *     - html: tag source, from '<' through '>'
 *     - isClosing: true when the tag source starts with '</'
 * @param {Function} escapeHtml escapes a run of non-tag text
 * @return {String} processed HTML
 */
function parseTag (html, onTag, escapeHtml) {
  'use strict';
  var rethtml = '';        // output built so far
  var lastPos = 0;         // first input index not yet copied to the output
  var tagStart = false;    // index of the pending '<', or false outside a tag
  var quoteStart = false;  // quote character of the open attribute value, or false
  var len = html.length;
  var currentPos;
  var c;
  var currentHtml;
  var currentTagName;

  for (currentPos = 0; currentPos < len; currentPos++) {
    c = html.charAt(currentPos);

    // Outside a tag only '<' matters; plain text is flushed later in one slice.
    if (tagStart === false) {
      if (c === '<') tagStart = currentPos;
      continue;
    }

    // Inside a quoted attribute value everything is ignored until the
    // matching quote, so '<' and '>' in values do not end the tag.
    if (quoteStart !== false) {
      if (c === quoteStart) quoteStart = false;
      continue;
    }

    if (c === '<') {
      // A second '<' before any '>': the previous '<' was not a real tag.
      // Escape everything up to here and restart the tag at this position.
      rethtml += escapeHtml(html.slice(lastPos, currentPos));
      tagStart = currentPos;
      lastPos = currentPos;
    } else if (c === '>') {
      // Complete tag: flush the text before it, then delegate the tag itself.
      rethtml += escapeHtml(html.slice(lastPos, tagStart));
      currentHtml = html.slice(tagStart, currentPos + 1);
      currentTagName = getTagName(currentHtml);
      rethtml += onTag(tagStart,
                       rethtml.length,
                       currentTagName,
                       currentHtml,
                       isClosing(currentHtml));
      lastPos = currentPos + 1;
      tagStart = false;
    } else if (c === '"' || c === "'") {
      quoteStart = c;
    }
  }

  // Trailing text, including a tag that was never closed, is escaped as text.
  if (lastPos < len) {
    rethtml += escapeHtml(html.slice(lastPos));
  }
  return rethtml;
}
// Matches every character that is not allowed in an attribute name
var REGEXP_ATTR_NAME = /[^a-zA-Z0-9_:\.\-]/img;
/**
 * Parse the attribute part of a tag, pass each attribute to `onAttr`, and
 * join the non-empty results with single spaces.
 *
 * Attribute names are trimmed, stripped of characters outside
 * [a-zA-Z0-9_:.-] and lower-cased; attributes whose name ends up empty are
 * dropped. Values may be unquoted or wrapped in single or double quotes, and
 * quoted values are trimmed. Attributes are separated by any whitespace.
 * NOTE: whitespace around '=' is not supported; `a = "1"` is not read as a
 * single name/value pair.
 *
 * @param {String} html attribute source; for the tag '<a href="#" target="_blank">'
 *   this is 'href="#" target="_blank"'
 * @param {Function} onAttr attribute handler, called as onAttr(name, value);
 *   value is '' for an attribute without a value. An empty, null or undefined
 *   return value omits the attribute from the result.
 * @return {String} processed attribute source
 */
function parseAttr (html, onAttr) {
  'use strict';
  var lastPos = 0;       // start of the token currently being read
  var retAttrs = [];     // processed attributes, in source order
  var tmpName = false;   // pending attribute name after '=', or false
  var len = html.length;
  var whitespace = /\s/; // non-global, so .test() carries no lastIndex state
  var i;
  var j;
  var c;
  var v;

  function addAttr (name, value) {
    var ret;
    name = name.trim().replace(REGEXP_ATTR_NAME, '').toLowerCase();
    if (name.length < 1) return;
    ret = onAttr(name, value || '');
    // Skip filtered-out attributes so the joined output has no doubled spaces.
    if (ret === undefined || ret === null || ret === '') return;
    retAttrs.push(ret);
  }

  for (i = 0; i < len; i++) {
    c = html.charAt(i);

    // First '=' after a name: remember the name and start reading the value.
    if (tmpName === false && c === '=') {
      tmpName = html.slice(lastPos, i);
      lastPos = i + 1;
      continue;
    }

    // Quoted value: only recognised when the quote directly follows '='.
    if (tmpName !== false && i === lastPos && (c === '"' || c === "'")) {
      j = html.indexOf(c, i + 1);
      if (j === -1) {
        // Unterminated quote: stop scanning; the remainder is handled below.
        break;
      }
      v = html.slice(lastPos + 1, j).trim();
      addAttr(tmpName, v);
      tmpName = false;
      i = j;
      lastPos = i + 1;
      continue;
    }

    // Any whitespace (space, tab, newline, ...) ends the current token.
    if (whitespace.test(c)) {
      v = html.slice(lastPos, i).trim();
      if (tmpName === false) {
        addAttr(v);
      } else {
        addAttr(tmpName, v);
      }
      tmpName = false;
      lastPos = i + 1;
      continue;
    }
  }

  // Flush the last token, which has no trailing separator.
  if (lastPos < len) {
    if (tmpName === false) {
      addAttr(html.slice(lastPos));
    } else {
      addAttr(tmpName, html.slice(lastPos));
    }
  }
  return retAttrs.join(' ').trim();
}
// Public API: only the two parsers are exported; getTagName and isClosing stay module-private.
exports.parseTag = parseTag;
exports.parseAttr = parseAttr;