1: | <?php
|
2: |
|
3: | |
4: | |
5: | |
6: | |
7: | |
8: | |
9: | |
10: | |
11: | |
12: | |
13: | |
14: | |
15: | |
16: | |
17: | |
18: | |
19: | |
20: | |
21: | |
22: | |
23: | |
24: | |
25: |
|
26: |
|
/**
 * Lexer that lets PHP's DOM extension parse the HTML and then flattens the
 * resulting node tree into a list of tokens.
 *
 * The input is normalized, optionally has stray "<" characters escaped,
 * is wrapped in a full HTML document (see wrapHTML()), loaded with
 * DOMDocument::loadHTML() and finally walked by tokenizeDOM().
 *
 * @note Elements without child nodes are always emitted as "empty" tokens
 *       (see createStartNode()).
 */
class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
{
    /**
     * Factory used to build every token this lexer emits.
     *
     * @type HTMLPurifier_TokenFactory
     */
    private $factory;

    /**
     * Runs the parent constructor and creates the token factory.
     */
    public function __construct()
    {
        parent::__construct();
        $this->factory = new HTMLPurifier_TokenFactory();
    }

    /**
     * Converts an HTML fragment into a list of tokens.
     *
     * @param string $html    HTML fragment to tokenize.
     * @param object $config  Configuration object; read through get() here
     *                        and getDefinition() in wrapHTML().
     * @param object $context Context object; only forwarded to normalize()
     *                        and wrapHTML().
     * @return array List of tokens created by the token factory.
     */
    public function tokenizeHTML($html, $config, $context)
    {
        $html = $this->normalize($html, $config, $context);

        // Escape "<" characters that cannot start a tag, a closing tag or a
        // declaration/comment, i.e. those not followed by a letter, "!" or "/".
        if ($config->get('Core.AggressivelyFixLt')) {
            $char = '[^a-z!\/]';
            $comment = "/<!--(.*?)(-->|\z)/is";
            // Armor ampersands inside comments first, so that the escaping
            // below can be told apart from entities the author wrote.
            $html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html);
            // Repeat until stable: the character consumed by one match may
            // itself be a "<" that needs escaping (e.g. "<<").
            do {
                $old = $html;
                $html = preg_replace("/<($char)/i", '&lt;\\1', $html);
            } while ($html !== $old);
            // Comment contents must come out unchanged, so undo both the
            // armoring and the "<" escaping inside comments.
            $html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html);
        }

        // Wrap the fragment in a complete document that declares UTF-8.
        $html = $this->wrapHTML($html, $config, $context);

        $doc = new DOMDocument();
        $doc->encoding = 'UTF-8';

        $options = 0;
        if ($config->get('Core.AllowParseManyTags') && defined('LIBXML_PARSEHUGE')) {
            $options |= LIBXML_PARSEHUGE;
        }

        // Silence the warnings libxml raises for malformed markup.
        set_error_handler(array($this, 'muteErrorHandler'));
        // Only pass the second argument when a flag is actually set, so the
        // one-argument form is used whenever possible.
        if ($options) {
            $doc->loadHTML($html, $options);
        } else {
            $doc->loadHTML($html);
        }
        restore_error_handler();

        $body = $doc->getElementsByTagName('html')->item(0)->
            getElementsByTagName('body')->item(0);

        // The wrapper <div> added by wrapHTML().
        $div = $body->getElementsByTagName('div')->item(0);
        $tokens = array();
        $this->tokenizeDOM($div, $tokens, $config);

        // A sibling after the wrapper <div> means the wrapper was closed
        // before the end of the input. Drop the part that was already
        // tokenized and tokenize whatever remains in <body>.
        if ($div->nextSibling) {
            $body->removeChild($div);
            $this->tokenizeDOM($body, $tokens, $config);
        }
        return $tokens;
    }

    /**
     * Walks a DOM subtree iteratively and appends its tokens to $tokens.
     *
     * The traversal is depth-first and keeps one queue of pending nodes per
     * depth level. The root node itself (level 0) produces neither a start
     * nor an end token; only its descendants do.
     *
     * @param DOMNode $node   Root of the subtree to tokenize.
     * @param array   $tokens Token list, appended to by reference.
     * @param object  $config Configuration object, forwarded to createStartNode().
     */
    protected function tokenizeDOM($node, &$tokens, $config)
    {
        $level = 0;
        $nodes = array($level => new HTMLPurifier_Queue(array($node)));
        $closingNodes = array();
        do {
            while (!$nodes[$level]->isEmpty()) {
                $node = $nodes[$level]->shift();
                // Tokens are only collected below the root.
                $collect = $level > 0;
                $needEndingTag = $this->createStartNode($node, $tokens, $collect, $config);
                if ($needEndingTag) {
                    $closingNodes[$level][] = $node;
                }
                if ($node->childNodes && $node->childNodes->length) {
                    // Descend: the children become the queue of the next level.
                    $level++;
                    $nodes[$level] = new HTMLPurifier_Queue();
                    foreach ($node->childNodes as $childNode) {
                        $nodes[$level]->push($childNode);
                    }
                }
            }
            // The current level is exhausted: go back up and close the
            // element(s) whose children were just processed. Level 0 is
            // skipped so the root never gets an end token.
            $level--;
            if ($level && isset($closingNodes[$level])) {
                while ($node = array_pop($closingNodes[$level])) {
                    $this->createEndNode($node, $tokens);
                }
            }
        } while ($level > 0);
    }

    /**
     * Returns the name of an element node.
     *
     * Tries the tagName, nodeName and localName properties in that order.
     *
     * @param DOMNode $node
     * @return string|null First property that is set, or null if none is.
     */
    protected function getTagName($node)
    {
        if (isset($node->tagName)) {
            return $node->tagName;
        } elseif (isset($node->nodeName)) {
            return $node->nodeName;
        } elseif (isset($node->localName)) {
            return $node->localName;
        }
        return null;
    }

    /**
     * Returns the character data of a node.
     *
     * Tries the data, nodeValue and textContent properties in that order.
     *
     * @param DOMNode $node
     * @return string|null First property that is set, or null if none is.
     */
    protected function getData($node)
    {
        if (isset($node->data)) {
            return $node->data;
        } elseif (isset($node->nodeValue)) {
            return $node->nodeValue;
        } elseif (isset($node->textContent)) {
            return $node->textContent;
        }
        return null;
    }

    /**
     * Appends the token(s) that open the given node.
     *
     * Text, CDATA and comment nodes are always tokenized; element tokens are
     * only appended when $collect is true.
     *
     * @param DOMNode $node    Node to convert.
     * @param array   $tokens  Token list, appended to by reference.
     * @param bool    $collect Whether element tokens should be appended.
     * @param object  $config  Configuration object, forwarded to parseText().
     * @return bool True if an end token has to be emitted for this node once
     *              its children were processed, false otherwise.
     */
    protected function createStartNode($node, &$tokens, $collect, $config)
    {
        if ($node->nodeType === XML_TEXT_NODE) {
            $data = $this->getData($node);
            if ($data !== null) {
                $tokens[] = $this->factory->createText($data);
            }
            return false;
        } elseif ($node->nodeType === XML_CDATA_SECTION_NODE) {
            $last = end($tokens);
            $data = $node->data;
            // Directly inside <script> or <style>, strip a surrounding
            // "<!--" ... "-->" pair from the contents.
            if ($last instanceof HTMLPurifier_Token_Start && ($last->name == 'script' || $last->name == 'style')) {
                $new_data = trim($data);
                if (substr($new_data, 0, 4) === '<!--') {
                    $data = substr($new_data, 4);
                    // If the closing "-->" is missing, the remainder is
                    // kept as it is.
                    if (substr($data, -3) === '-->') {
                        $data = substr($data, 0, -3);
                    }
                }
            }
            $tokens[] = $this->factory->createText($this->parseText($data, $config));
            return false;
        } elseif ($node->nodeType === XML_COMMENT_NODE) {
            $tokens[] = $this->factory->createComment($node->data);
            return false;
        } elseif ($node->nodeType !== XML_ELEMENT_NODE) {
            // Any other node type produces no token.
            return false;
        }

        $attr = $node->hasAttributes() ? $this->transformAttrToAssoc($node->attributes) : array();
        $tag_name = $this->getTagName($node);
        if (empty($tag_name)) {
            // Nameless element: emit nothing, only report whether it has children.
            return (bool) $node->childNodes->length;
        }

        if (!$node->childNodes->length) {
            // No children: a single empty token, nothing to close later.
            if ($collect) {
                $tokens[] = $this->factory->createEmpty($tag_name, $attr);
            }
            return false;
        }

        if ($collect) {
            $tokens[] = $this->factory->createStart($tag_name, $attr);
        }
        return true;
    }

    /**
     * Appends the end token for the given element node.
     *
     * @param DOMNode $node   Element to close.
     * @param array   $tokens Token list, appended to by reference.
     */
    protected function createEndNode($node, &$tokens)
    {
        $tag_name = $this->getTagName($node);
        $tokens[] = $this->factory->createEnd($tag_name);
    }

    /**
     * Converts a DOM attribute map into an associative array.
     *
     * @param DOMNamedNodeMap $node_map Attribute nodes of an element.
     * @return array Attribute values keyed by attribute name.
     */
    protected function transformAttrToAssoc($node_map)
    {
        if ($node_map->length === 0) {
            return array();
        }
        $array = array();
        foreach ($node_map as $attr) {
            $array[$attr->name] = $attr->value;
        }
        return $array;
    }

    /**
     * Error handler that discards the error.
     *
     * tokenizeHTML() installs it around DOMDocument::loadHTML(); the body is
     * intentionally empty.
     *
     * @param int    $errno
     * @param string $errstr
     */
    public function muteErrorHandler($errno, $errstr)
    {
    }

    /**
     * preg_replace_callback() callback that restores the contents of a comment
     * after the "<" escaping pass of tokenizeHTML().
     *
     * Reverses callbackArmorCommentEntities() ("&amp;" back to "&") and the
     * escaping of "<" ("&lt;" back to "<") in a single, non-recursive pass.
     *
     * @param array $matches Match of the comment pattern: [1] is the comment
     *                       body, [2] the terminator ("-->" or empty).
     * @return string The comment with its original contents.
     */
    public function callbackUndoCommentSubst($matches)
    {
        return '<!--' . strtr($matches[1], array('&amp;' => '&', '&lt;' => '<')) . $matches[2];
    }

    /**
     * preg_replace_callback() callback that escapes every ampersand inside a
     * comment, so entities written by the author survive the later undo step.
     *
     * @param array $matches Match of the comment pattern: [1] is the comment
     *                       body, [2] the terminator ("-->" or empty).
     * @return string The comment with "&" replaced by "&amp;".
     */
    public function callbackArmorCommentEntities($matches)
    {
        return '<!--' . str_replace('&', '&amp;', $matches[1]) . $matches[2];
    }

    /**
     * Wraps an HTML fragment in a complete HTML document.
     *
     * A DOCTYPE is emitted when the HTML definition provides a public or
     * system identifier; the head always declares the UTF-8 charset.
     *
     * @param string $html    Fragment to wrap.
     * @param object $config  Configuration object providing the 'HTML' definition.
     * @param object $context Context object (not used here).
     * @param bool   $use_div Whether to enclose the fragment in a <div>.
     * @return string The complete document.
     */
    protected function wrapHTML($html, $config, $context, $use_div = true)
    {
        $def = $config->getDefinition('HTML');
        $ret = '';

        if (!empty($def->doctype->dtdPublic) || !empty($def->doctype->dtdSystem)) {
            $ret .= '<!DOCTYPE html ';
            if (!empty($def->doctype->dtdPublic)) {
                $ret .= 'PUBLIC "' . $def->doctype->dtdPublic . '" ';
            }
            if (!empty($def->doctype->dtdSystem)) {
                $ret .= '"' . $def->doctype->dtdSystem . '" ';
            }
            $ret .= '>';
        }

        $ret .= '<html><head>';
        $ret .= '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';
        $ret .= '</head><body>';
        if ($use_div) {
            $ret .= '<div>';
        }
        $ret .= $html;
        if ($use_div) {
            $ret .= '</div>';
        }
        $ret .= '</body></html>';
        return $ret;
    }
}
|
337: |
|
338: |
|
339: | |