// htmlparser.js (7.0 KB)
/**
 * Converts an HTML string into uNode nodes.
 * @file
 * @module UE
 * @since 1.2.6.1
 */
/**
 * The shared UEditor namespace; all UEditor functionality is attached to it.
 * @unfile
 * @module UE
 */
  12. /**
  13. * html字符串转换成uNode节点的静态方法
  14. * @method htmlparser
  15. * @param { String } htmlstr 要转换的html代码
  16. * @param { Boolean } ignoreBlank 若设置为true,转换的时候忽略\n\r\t等空白字符
  17. * @return { uNode } 给定的html片段转换形成的uNode对象
  18. * @example
  19. * ```javascript
  20. * var root = UE.htmlparser('<p><b>htmlparser</b></p>', true);
  21. * ```
  22. */
  23. var htmlparser = (UE.htmlparser = function(htmlstr, ignoreBlank) {
  24. //todo 原来的方式 [^"'<>\/] 有\/就不能配对上 <TD vAlign=top background=../AAA.JPG> 这样的标签了
  25. //先去掉了,加上的原因忘了,这里先记录
  26. //var re_tag = /<(?:(?:\/([^>]+)>)|(?:!--([\S|\s]*?)-->)|(?:([^\s\/<>]+)\s*((?:(?:"[^"]*")|(?:'[^']*')|[^"'<>])*)\/?>))/g,
  27. //以上的正则表达式无法匹配:<div style="text-align:center;font-family:" font-size:14px;"=""><img src="http://hs-album.oss.aliyuncs.com/static/27/78/35/image/20161206/20161206174331_41105.gif" alt="" /><br /></div>
  28. //修改为如下正则表达式:
  29. var re_tag = /<(?:(?:\/([^>]+)>)|(?:!--([\S|\s]*?)-->)|(?:([^\/\s>]+)((?:\s+[\w\-:.]+(?:\s*=\s*?(?:(?:"[^"]*")|(?:'[^']*')|[^\s"'\/>]+))?)*)[\S\s]*?(\/?)>))/g,
  30. re_attr = /([\w\-:.]+)(?:(?:\s*=\s*(?:(?:"([^"]*)")|(?:'([^']*)')|([^\s>]+)))|(?=\s|$))/g;
  31. //ie下取得的html可能会有\n存在,要去掉,在处理replace(/[\t\r\n]*/g,'');代码高量的\n不能去除
  32. var allowEmptyTags = {
  33. b: 1,
  34. code: 1,
  35. i: 1,
  36. u: 1,
  37. strike: 1,
  38. s: 1,
  39. tt: 1,
  40. strong: 1,
  41. q: 1,
  42. samp: 1,
  43. em: 1,
  44. span: 1,
  45. sub: 1,
  46. img: 1,
  47. sup: 1,
  48. font: 1,
  49. big: 1,
  50. small: 1,
  51. iframe: 1,
  52. a: 1,
  53. br: 1,
  54. pre: 1
  55. };
  56. htmlstr = htmlstr.replace(new RegExp(domUtils.fillChar, "g"), "");
  57. if (!ignoreBlank) {
  58. htmlstr = htmlstr.replace(
  59. new RegExp(
  60. "[\\r\\t\\n" +
  61. (ignoreBlank ? "" : " ") +
  62. "]*</?(\\w+)\\s*(?:[^>]*)>[\\r\\t\\n" +
  63. (ignoreBlank ? "" : " ") +
  64. "]*",
  65. "g"
  66. ),
  67. function(a, b) {
  68. //br暂时单独处理
  69. if (b && allowEmptyTags[b.toLowerCase()]) {
  70. return a.replace(/(^[\n\r]+)|([\n\r]+$)/g, "");
  71. }
  72. return a
  73. .replace(new RegExp("^[\\r\\n" + (ignoreBlank ? "" : " ") + "]+"), "")
  74. .replace(
  75. new RegExp("[\\r\\n" + (ignoreBlank ? "" : " ") + "]+$"),
  76. ""
  77. );
  78. }
  79. );
  80. }
  81. var notTransAttrs = {
  82. href: 1,
  83. src: 1
  84. };
  85. var uNode = UE.uNode,
  86. needParentNode = {
  87. td: "tr",
  88. tr: ["tbody", "thead", "tfoot"],
  89. tbody: "table",
  90. th: "tr",
  91. thead: "table",
  92. tfoot: "table",
  93. caption: "table",
  94. li: ["ul", "ol"],
  95. dt: "dl",
  96. dd: "dl",
  97. option: "select"
  98. },
  99. needChild = {
  100. ol: "li",
  101. ul: "li"
  102. };
  103. function text(parent, data) {
  104. if (needChild[parent.tagName]) {
  105. var tmpNode = uNode.createElement(needChild[parent.tagName]);
  106. parent.appendChild(tmpNode);
  107. tmpNode.appendChild(uNode.createText(data));
  108. parent = tmpNode;
  109. } else {
  110. parent.appendChild(uNode.createText(data));
  111. }
  112. }
  113. function element(parent, tagName, htmlattr) {
  114. var needParentTag;
  115. if ((needParentTag = needParentNode[tagName])) {
  116. var tmpParent = parent,
  117. hasParent;
  118. while (tmpParent.type != "root") {
  119. if (
  120. utils.isArray(needParentTag)
  121. ? utils.indexOf(needParentTag, tmpParent.tagName) != -1
  122. : needParentTag == tmpParent.tagName
  123. ) {
  124. parent = tmpParent;
  125. hasParent = true;
  126. break;
  127. }
  128. tmpParent = tmpParent.parentNode;
  129. }
  130. if (!hasParent) {
  131. parent = element(
  132. parent,
  133. utils.isArray(needParentTag) ? needParentTag[0] : needParentTag
  134. );
  135. }
  136. }
  137. //按dtd处理嵌套
  138. // if(parent.type != 'root' && !dtd[parent.tagName][tagName])
  139. // parent = parent.parentNode;
  140. var elm = new uNode({
  141. parentNode: parent,
  142. type: "element",
  143. tagName: tagName.toLowerCase(),
  144. //是自闭合的处理一下
  145. children: dtd.$empty[tagName] ? null : []
  146. });
  147. //如果属性存在,处理属性
  148. if (htmlattr) {
  149. var attrs = {},
  150. match;
  151. while ((match = re_attr.exec(htmlattr))) {
  152. attrs[match[1].toLowerCase()] = notTransAttrs[match[1].toLowerCase()]
  153. ? match[2] || match[3] || match[4]
  154. : utils.unhtml(match[2] || match[3] || match[4]);
  155. }
  156. elm.attrs = attrs;
  157. }
  158. //trace:3970
  159. // //如果parent下不能放elm
  160. // if(dtd.$inline[parent.tagName] && dtd.$block[elm.tagName] && !dtd[parent.tagName][elm.tagName]){
  161. // parent = parent.parentNode;
  162. // elm.parentNode = parent;
  163. // }
  164. parent.children.push(elm);
  165. //如果是自闭合节点返回父亲节点
  166. return dtd.$empty[tagName] ? parent : elm;
  167. }
  168. function comment(parent, data) {
  169. parent.children.push(
  170. new uNode({
  171. type: "comment",
  172. data: data,
  173. parentNode: parent
  174. })
  175. );
  176. }
  177. var match,
  178. currentIndex = 0,
  179. nextIndex = 0;
  180. //设置根节点
  181. var root = new uNode({
  182. type: "root",
  183. children: []
  184. });
  185. var currentParent = root;
  186. while ((match = re_tag.exec(htmlstr))) {
  187. currentIndex = match.index;
  188. try {
  189. if (currentIndex > nextIndex) {
  190. //text node
  191. text(currentParent, htmlstr.slice(nextIndex, currentIndex));
  192. }
  193. if (match[3]) {
  194. if (dtd.$cdata[currentParent.tagName]) {
  195. text(currentParent, match[0]);
  196. } else {
  197. //start tag
  198. currentParent = element(
  199. currentParent,
  200. match[3].toLowerCase(),
  201. match[4]
  202. );
  203. }
  204. } else if (match[1]) {
  205. if (currentParent.type != "root") {
  206. if (dtd.$cdata[currentParent.tagName] && !dtd.$cdata[match[1]]) {
  207. text(currentParent, match[0]);
  208. } else {
  209. var tmpParent = currentParent;
  210. while (
  211. currentParent.type == "element" &&
  212. currentParent.tagName != match[1].toLowerCase()
  213. ) {
  214. currentParent = currentParent.parentNode;
  215. if (currentParent.type == "root") {
  216. currentParent = tmpParent;
  217. throw "break";
  218. }
  219. }
  220. //end tag
  221. currentParent = currentParent.parentNode;
  222. }
  223. }
  224. } else if (match[2]) {
  225. //comment
  226. comment(currentParent, match[2]);
  227. }
  228. } catch (e) {}
  229. nextIndex = re_tag.lastIndex;
  230. }
  231. //如果结束是文本,就有可能丢掉,所以这里手动判断一下
  232. //例如 <li>sdfsdfsdf<li>sdfsdfsdfsdf
  233. if (nextIndex < htmlstr.length) {
  234. text(currentParent, htmlstr.slice(nextIndex));
  235. }
  236. return root;
  237. });