| 1: | <?php
|
| 2: |
|
| 3: | |
| 4: | |
| 5: | |
| 6: | |
| 7: | |
| 8: | |
| 9: | |
| 10: | |
| 11: | |
| 12: | |
| 13: | |
| 14: | |
| 15: | |
| 16: | |
| 17: | |
| 18: | |
| 19: | |
| 20: | |
| 21: | |
| 22: | |
| 23: | |
| 24: | |
| 25: |
|
| 26: |
|
/**
 * Lexer that tokenizes HTML by loading it into a DOMDocument and walking
 * the resulting node tree.
 *
 * The input is wrapped in a minimal html/head/body/div skeleton, parsed with
 * DOMDocument::loadHTML() while PHP errors are muted, and the contents of the
 * wrapper div are then converted into tokens built by HTMLPurifier_TokenFactory.
 */
class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
{

    /**
     * Factory used to build every token this lexer emits.
     * @type HTMLPurifier_TokenFactory
     */
    private $factory;

    public function __construct()
    {
        // The parent constructor runs first; the token factory is set up afterwards.
        parent::__construct();
        $this->factory = new HTMLPurifier_TokenFactory();
    }

    /**
     * Converts an HTML string into a flat list of tokens.
     *
     * @param string $html Markup to tokenize.
     * @param HTMLPurifier_Config $config Read for Core.AggressivelyFixLt,
     *        Core.AllowParseManyTags and the HTML definition.
     * @param HTMLPurifier_Context $context Passed through to normalize() and wrapHTML().
     * @return array Tokens in document order.
     */
    public function tokenizeHTML($html, $config, $context)
    {
        $html = $this->normalize($html, $config, $context);

        // Optionally escape every "<" that cannot open a tag, a closing tag or
        // a declaration/comment (i.e. is not followed by a letter, "!" or "/").
        if ($config->get('Core.AggressivelyFixLt')) {
            $char = '[^a-z!\/]';
            $comment = "/<!--(.*?)(-->|\z)/is";
            // Armor "&" inside comments first so that the escaping below can be
            // reverted there without touching entities the author wrote.
            $html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html);
            // Repeat until stable: each match consumes the character after the
            // "<", so runs such as "<<" need more than one pass.
            do {
                $old = $html;
                $html = preg_replace("/<($char)/i", '&lt;\\1', $html);
            } while ($html !== $old);
            // Comment bodies must come out unchanged: undo both substitutions there.
            $html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html);
        }

        // Wrap the fragment in a full document skeleton before parsing.
        $html = $this->wrapHTML($html, $config, $context);

        $doc = new DOMDocument();
        $doc->encoding = 'UTF-8';

        $options = 0;
        // LIBXML_PARSEHUGE is only requested when the constant is defined.
        if ($config->get('Core.AllowParseManyTags') && defined('LIBXML_PARSEHUGE')) {
            $options |= LIBXML_PARSEHUGE;
        }

        // Silence whatever loadHTML() reports while parsing.
        set_error_handler(array($this, 'muteErrorHandler'));
        // The options argument is only passed when there is something to pass.
        if ($options) {
            $doc->loadHTML($html, $options);
        } else {
            $doc->loadHTML($html);
        }
        restore_error_handler();

        $body = $doc->getElementsByTagName('html')->item(0)->
            getElementsByTagName('body')->item(0);

        // First div inside body; wrapHTML() opens one right after <body>.
        $div = $body->getElementsByTagName('div')->item(0);
        $tokens = array();
        $this->tokenizeDOM($div, $tokens, $config);

        // wrapHTML() emits the div as the only child of body, so a following
        // sibling presumably means the input closed the wrapper early (stray
        // </div>). Drop the div that was already tokenized and tokenize what
        // is left in body.
        if ($div->nextSibling) {
            $body->removeChild($div);
            $this->tokenizeDOM($body, $tokens, $config);
        }
        return $tokens;
    }

    /**
     * Iteratively walks a DOM subtree and appends its tokens to $tokens.
     *
     * The root node sits at level 0: no start/empty/end token is emitted for
     * the root element itself, only for its descendants.
     *
     * @param DOMNode $node Root of the subtree to tokenize.
     * @param array $tokens Token list, appended to by reference.
     * @param HTMLPurifier_Config $config Forwarded to createStartNode().
     */
    protected function tokenizeDOM($node, &$tokens, $config)
    {
        $level = 0;
        // One queue of pending nodes per depth level.
        $nodes = array($level => new HTMLPurifier_Queue(array($node)));
        // Per level, the elements whose end token is still outstanding.
        $closingNodes = array();
        do {
            while (!$nodes[$level]->isEmpty()) {
                $node = $nodes[$level]->shift();
                // Element tokens are only collected below the root level.
                $collect = $level > 0;
                $needEndingTag = $this->createStartNode($node, $tokens, $collect, $config);
                if ($needEndingTag) {
                    $closingNodes[$level][] = $node;
                }
                if ($node->childNodes && $node->childNodes->length) {
                    // Descend: the children form the queue of the next level.
                    $level++;
                    $nodes[$level] = new HTMLPurifier_Queue();
                    foreach ($node->childNodes as $childNode) {
                        $nodes[$level]->push($childNode);
                    }
                }
            }
            // Current level exhausted: step back up and close what was opened there.
            $level--;
            if ($level && isset($closingNodes[$level])) {
                while ($node = array_pop($closingNodes[$level])) {
                    $this->createEndNode($node, $tokens);
                }
            }
        } while ($level > 0);
    }

    /**
     * Returns the first available name of a node.
     *
     * @param DOMNode $node
     * @return string|null tagName, else nodeName, else localName; null if none is set.
     */
    protected function getTagName($node)
    {
        if (isset($node->tagName)) {
            return $node->tagName;
        } elseif (isset($node->nodeName)) {
            return $node->nodeName;
        } elseif (isset($node->localName)) {
            return $node->localName;
        }
        return null;
    }

    /**
     * Returns the first available textual payload of a node.
     *
     * @param DOMNode $node
     * @return string|null data, else nodeValue, else textContent; null if none is set.
     */
    protected function getData($node)
    {
        if (isset($node->data)) {
            return $node->data;
        } elseif (isset($node->nodeValue)) {
            return $node->nodeValue;
        } elseif (isset($node->textContent)) {
            return $node->textContent;
        }
        return null;
    }

    /**
     * Appends the token(s) that open a node and reports whether an end token
     * must be emitted later.
     *
     * @param DOMNode $node Node to convert.
     * @param array $tokens Token list, appended to by reference.
     * @param bool $collect Whether start/empty tokens for elements are added;
     *        text, CDATA and comment tokens are added regardless.
     * @param HTMLPurifier_Config $config Forwarded to parseText() for CDATA.
     * @return bool True if the caller has to call createEndNode() for this node.
     */
    protected function createStartNode($node, &$tokens, $collect, $config)
    {
        if ($node->nodeType === XML_TEXT_NODE) {
            $data = $this->getData($node);
            if ($data !== null) {
                $tokens[] = $this->factory->createText($data);
            }
            return false;
        } elseif ($node->nodeType === XML_CDATA_SECTION_NODE) {
            // CDATA directly after a <script>/<style> start token gets an
            // enclosing "<!-- ... -->" stripped (presumably the parser exposes
            // the raw contents of those elements as CDATA - TODO confirm).
            $last = end($tokens);
            $data = $node->data;
            if ($last instanceof HTMLPurifier_Token_Start && ($last->name === 'script' || $last->name === 'style')) {
                $new_data = trim($data);
                if (substr($new_data, 0, 4) === '<!--') {
                    $data = substr($new_data, 4);
                    // An unterminated "<!--" keeps the rest of the text as-is.
                    if (substr($data, -3) === '-->') {
                        $data = substr($data, 0, -3);
                    }
                }
            }
            $tokens[] = $this->factory->createText($this->parseText($data, $config));
            return false;
        } elseif ($node->nodeType === XML_COMMENT_NODE) {
            $tokens[] = $this->factory->createComment($node->data);
            return false;
        } elseif ($node->nodeType !== XML_ELEMENT_NODE) {
            // Any other node type produces no token.
            return false;
        }
        $attr = $node->hasAttributes() ? $this->transformAttrToAssoc($node->attributes) : array();
        $tag_name = $this->getTagName($node);
        if (empty($tag_name)) {
            // Nameless element: no start token is emitted here.
            return (bool) $node->childNodes->length;
        }
        if (!$node->childNodes->length) {
            // Childless element: a single empty token, nothing to close.
            if ($collect) {
                $tokens[] = $this->factory->createEmpty($tag_name, $attr);
            }
            return false;
        }
        if ($collect) {
            $tokens[] = $this->factory->createStart($tag_name, $attr);
        }
        return true;
    }

    /**
     * Appends the end token for an element.
     *
     * @param DOMNode $node Element being closed.
     * @param array $tokens Token list, appended to by reference.
     */
    protected function createEndNode($node, &$tokens)
    {
        $tag_name = $this->getTagName($node);
        $tokens[] = $this->factory->createEnd($tag_name);
    }

    /**
     * Converts a DOM attribute map into a plain name => value array.
     *
     * @param DOMNamedNodeMap $node_map Attributes of an element.
     * @return array Attribute values keyed by attribute name.
     */
    protected function transformAttrToAssoc($node_map)
    {
        if ($node_map->length === 0) {
            return array();
        }
        $array = array();
        foreach ($node_map as $attr) {
            $array[$attr->name] = $attr->value;
        }
        return $array;
    }

    /**
     * Error handler that deliberately does nothing; installed around
     * loadHTML() in tokenizeHTML().
     *
     * @param int $errno
     * @param string $errstr
     */
    public function muteErrorHandler($errno, $errstr)
    {
    }

    /**
     * preg_replace_callback() callback that reverts, inside a comment, the
     * substitutions made while Core.AggressivelyFixLt is processed.
     *
     * @param array $matches [1] is the comment body, [2] the terminator
     *        ("-->" or empty at end of input).
     * @return string The comment with "&amp;" and "&lt;" turned back into "&" and "<".
     */
    public function callbackUndoCommentSubst($matches)
    {
        return '<!--' . strtr($matches[1], array('&amp;' => '&', '&lt;' => '<')) . $matches[2];
    }

    /**
     * preg_replace_callback() callback that escapes "&" inside a comment so
     * that callbackUndoCommentSubst() can later restore the body exactly.
     *
     * @param array $matches [1] is the comment body, [2] the terminator
     *        ("-->" or empty at end of input).
     * @return string The comment with every "&" replaced by "&amp;".
     */
    public function callbackArmorCommentEntities($matches)
    {
        return '<!--' . str_replace('&', '&amp;', $matches[1]) . $matches[2];
    }

    /**
     * Wraps an HTML fragment in a complete document skeleton.
     *
     * @param string $html Fragment to embed.
     * @param HTMLPurifier_Config $config Source of the HTML definition (doctype).
     * @param HTMLPurifier_Context $context Not used by this implementation.
     * @param bool $use_div Whether the fragment is additionally wrapped in a div.
     * @return string Full document: optional doctype, head with a UTF-8 meta
     *         tag, and body containing the fragment.
     */
    protected function wrapHTML($html, $config, $context, $use_div = true)
    {
        $def = $config->getDefinition('HTML');
        $ret = '';

        // A doctype is only emitted when the definition carries a public or
        // system identifier.
        if (!empty($def->doctype->dtdPublic) || !empty($def->doctype->dtdSystem)) {
            $ret .= '<!DOCTYPE html ';
            if (!empty($def->doctype->dtdPublic)) {
                $ret .= 'PUBLIC "' . $def->doctype->dtdPublic . '" ';
            }
            if (!empty($def->doctype->dtdSystem)) {
                $ret .= '"' . $def->doctype->dtdSystem . '" ';
            }
            $ret .= '>';
        }

        $ret .= '<html><head>';
        $ret .= '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';
        $ret .= '</head><body>';
        // Nothing here protects against a stray </div> in $html; tokenizeHTML()
        // compensates by checking the wrapper div for siblings.
        if ($use_div) {
            $ret .= '<div>';
        }
        $ret .= $html;
        if ($use_div) {
            $ret .= '</div>';
        }
        $ret .= '</body></html>';
        return $ret;
    }
}
|
| 337: |
|
| 338: |
|
| 339: | |