1: | <?php
|
2: |
|
3: | |
4: | |
5: | |
6: | |
7: | |
8: | |
9: | |
10: | |
11: | |
12: |
|
13: | class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
14: | {
|
15: | |
16: | |
17: |
|
/**
 * Public flag, initialised to true.
 * NOTE(review): presumably tells callers that this lexer can attach
 * line numbers to the tokens it produces -- confirm against the code
 * that reads it.
 * @type bool
 */
public $tracksLineNumbers = true;
|
19: |
|
20: | |
21: | |
22: | |
23: |
|
/**
 * Bytes treated as whitespace by the strspn()/strcspn() scans in this
 * class: space (0x20), horizontal tab (0x09), carriage return (0x0D)
 * and line feed (0x0A).
 * @type string
 */
protected $_whitespace = "\x20\x09\x0D\x0A";
|
25: |
|
26: | |
27: | |
28: | |
29: | |
30: |
|
/**
 * preg_replace_callback() handler used by tokenizeHTML(): HTML-escapes
 * the middle capture group and leaves the surrounding groups untouched.
 *
 * @param array $matches PCRE match array; index 1 is the opening tag,
 *                       index 2 the element body, index 3 the closing tag.
 * @return string The three groups re-joined, with the body escaped using
 *                ENT_COMPAT (double quotes escaped, single quotes kept).
 */
protected function scriptCallback($matches)
{
    $openingTag = $matches[1];
    $escapedBody = htmlspecialchars($matches[2], ENT_COMPAT, 'UTF-8');
    $closingTag = $matches[3];

    return $openingTag . $escapedBody . $closingTag;
}
|
35: |
|
36: | |
37: | |
38: | |
39: | |
40: | |
41: |
|
/**
 * Splits an HTML string into a flat list of token objects using plain
 * string scanning (strpos/substr) rather than a DOM parser.
 *
 * The scanner alternates between two states: outside a tag (everything
 * up to the next '<' becomes a text token) and inside a tag (everything
 * up to the next '>' is classified as a comment, end tag, start tag,
 * self-closing tag, or a stray '<').
 *
 * While running, the current line and column are registered in $context
 * under 'CurrentLine' and 'CurrentCol'; both entries are destroyed again
 * before returning.
 *
 * @param string $html Markup to tokenize.
 * @param HTMLPurifier_Config $config Read for HTML.Trusted,
 *        Core.MaintainLineNumbers, Core.CollectErrors and
 *        Core.DirectLexLineNumberSyncInterval.
 * @param HTMLPurifier_Context $context Receives the position variables
 *        and supplies 'ErrorCollector' when error collection is on.
 * @return array List of HTMLPurifier_Token_* objects in document order.
 */
public function tokenizeHTML($html, $config, $context)
{
    // In trusted mode, escape the bodies of <script> elements matched by
    // the pattern below so that their contents are lexed as plain text.
    if ($config->get('HTML.Trusted')) {
        $html = preg_replace_callback(
            '#(<script[^>]*>)(\s*[^<].+?)(</script>)#si',
            array($this, 'scriptCallback'),
            $html
        );
    }

    $html = $this->normalize($html, $config, $context);

    $cursor = 0; // byte offset of the scanner in $html
    $inside_tag = false; // true once a '<' has been consumed
    $array = array(); // tokens produced so far

    // This switch controls column tracking as well as line tracking.
    $maintain_line_numbers = $config->get('Core.MaintainLineNumbers');

    if ($maintain_line_numbers === null) {
        // Not configured explicitly: track positions exactly when errors
        // are being collected.
        $maintain_line_numbers = $config->get('Core.CollectErrors');
    }

    if ($maintain_line_numbers) {
        $current_line = 1;
        $current_col = 0;
        $length = strlen($html);
    } else {
        $current_line = false;
        $current_col = false;
        $length = false;
    }
    $context->register('CurrentLine', $current_line);
    $context->register('CurrentCol', $current_col);
    $nl = "\n";

    // When non-zero, the line counter is recomputed from scratch every
    // this-many loop iterations (see below).
    $synchronize_interval = $config->get('Core.DirectLexLineNumberSyncInterval');

    $e = false;
    if ($config->get('Core.CollectErrors')) {
        $e =& $context->get('ErrorCollector');
    }

    // Iteration counter; it only feeds the resynchronisation check.
    $loops = 0;

    while (++$loops) {
        if ($maintain_line_numbers) {
            // Inside a tag the cursor already sits one past the '<', so
            // step back one byte to point at the start of the construct.
            $rcursor = $cursor - (int)$inside_tag;

            // Column = distance from the last newline found by searching
            // backwards from $rcursor (the negative offset makes strrpos
            // start that far from the end of the string). No newline
            // found means we are still on the first line.
            $nl_pos = strrpos($html, $nl, $rcursor - $length);
            $current_col = $rcursor - (is_bool($nl_pos) ? 0 : $nl_pos + 1);

            // Periodically recount the line number from the beginning of
            // the document instead of trusting the running total.
            if ($synchronize_interval &&
                $cursor > 0 &&
                $loops % $synchronize_interval === 0) {
                $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
            }
        }

        $position_next_lt = strpos($html, '<', $cursor);
        $position_next_gt = strpos($html, '>', $cursor);

        // The cursor is sitting on a '<': consume it and switch state.
        if ($position_next_lt === $cursor) {
            $inside_tag = true;
            $cursor++;
        }

        if (!$inside_tag && $position_next_lt !== false) {
            // Text running up to the next '<'.
            $token = new
            HTMLPurifier_Token_Text(
                $this->parseText(
                    substr(
                        $html,
                        $cursor,
                        $position_next_lt - $cursor
                    ), $config
                )
            );
            if ($maintain_line_numbers) {
                $token->rawPosition($current_line, $current_col);
                $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
            }
            $array[] = $token;
            $cursor = $position_next_lt + 1;
            $inside_tag = true;
            continue;
        } elseif (!$inside_tag) {
            // No further '<' in the document.
            if ($cursor === strlen($html)) {
                // Nothing left to emit.
                break;
            }
            // The remainder of the document is one text token.
            $token = new
            HTMLPurifier_Token_Text(
                $this->parseText(
                    substr(
                        $html,
                        $cursor
                    ), $config
                )
            );
            if ($maintain_line_numbers) {
                $token->rawPosition($current_line, $current_col);
            }
            $array[] = $token;
            break;
        } elseif ($inside_tag && $position_next_gt !== false) {
            // We are inside a tag and a closing '>' exists.
            $strlen_segment = $position_next_gt - $cursor;

            if ($strlen_segment < 1) {
                // Nothing between '<' and '>'. Step over the '>' and stay
                // in the inside-tag state; no token is emitted here. (A
                // text token used to be constructed at this point but was
                // never appended or read, so it has been dropped.)
                $cursor++;
                continue;
            }

            $segment = substr($html, $cursor, $strlen_segment);

            if ($segment === false) {
                // substr() failed (possible on PHP versions that return
                // false here); give up rather than loop.
                break;
            }

            // Comment?
            if (substr($segment, 0, 3) === '!--') {
                // The '>' found above may lie inside the comment body, so
                // look for the real terminator.
                $position_comment_end = strpos($html, '-->', $cursor);
                if ($position_comment_end === false) {
                    // Unterminated comment: it swallows the rest of the
                    // document.
                    if ($e) {
                        $e->send(E_WARNING, 'Lexer: Unclosed comment');
                    }
                    $position_comment_end = strlen($html);
                    $end = true;
                } else {
                    $end = false;
                }
                $strlen_segment = $position_comment_end - $cursor;
                $segment = substr($html, $cursor, $strlen_segment);
                // Strip the leading '!--' from the stored comment text.
                $token = new
                HTMLPurifier_Token_Comment(
                    substr(
                        $segment,
                        3,
                        $strlen_segment - 3
                    )
                );
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                    $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
                }
                $array[] = $token;
                // Skip the three bytes of '-->' unless we ran off the end.
                $cursor = $end ? $position_comment_end : $position_comment_end + 3;
                $inside_tag = false;
                continue;
            }

            // End tag?
            $is_end_tag = (strpos($segment, '/') === 0);
            if ($is_end_tag) {
                $type = substr($segment, 1);
                $token = new HTMLPurifier_Token_End($type);
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                    $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                }
                $array[] = $token;
                $inside_tag = false;
                $cursor = $position_next_gt + 1;
                continue;
            }

            // A tag name must start with a letter; anything else means the
            // '<' was a literal character.
            if (!ctype_alpha($segment[0])) {
                if ($e) {
                    $e->send(E_NOTICE, 'Lexer: Unescaped lt');
                }
                $token = new HTMLPurifier_Token_Text('<');
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                    // Do NOT add the segment's newlines to $current_line
                    // here: the cursor is left where it is, so the bytes
                    // after the '<' are lexed again as text on the next
                    // pass and that pass counts their newlines. Counting
                    // them here too made every later line number too high.
                }
                $array[] = $token;
                $inside_tag = false;
                continue;
            }

            // A trailing '/' marks a self-closing tag; strip it off before
            // looking at the name and attributes.
            $is_self_closing = (strrpos($segment, '/') === $strlen_segment - 1);
            if ($is_self_closing) {
                $strlen_segment--;
                $segment = substr($segment, 0, $strlen_segment);
            }

            // Offset of the first whitespace byte, or the segment length
            // when there is none.
            $position_first_space = strcspn($segment, $this->_whitespace);

            if ($position_first_space >= $strlen_segment) {
                // No whitespace: the whole segment is the tag name and
                // there are no attributes.
                if ($is_self_closing) {
                    $token = new HTMLPurifier_Token_Empty($segment);
                } else {
                    $token = new HTMLPurifier_Token_Start($segment);
                }
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                    $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                }
                $array[] = $token;
                $inside_tag = false;
                $cursor = $position_next_gt + 1;
                continue;
            }

            // Tag name followed by an attribute string.
            $type = substr($segment, 0, $position_first_space);
            $attribute_string =
                trim(
                    substr(
                        $segment,
                        $position_first_space
                    )
                );
            if ($attribute_string) {
                $attr = $this->parseAttributeString(
                    $attribute_string,
                    $config,
                    $context
                );
            } else {
                $attr = array();
            }

            if ($is_self_closing) {
                $token = new HTMLPurifier_Token_Empty($type, $attr);
            } else {
                $token = new HTMLPurifier_Token_Start($type, $attr);
            }
            if ($maintain_line_numbers) {
                $token->rawPosition($current_line, $current_col);
                $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
            }
            $array[] = $token;
            $cursor = $position_next_gt + 1;
            $inside_tag = false;
            continue;
        } else {
            // Inside a tag but no '>' remains: emit the '<' plus the rest
            // of the document as text.
            if ($e) {
                $e->send(E_WARNING, 'Lexer: Missing gt');
            }
            $token = new
            HTMLPurifier_Token_Text(
                '<' .
                $this->parseText(
                    substr($html, $cursor), $config
                )
            );
            if ($maintain_line_numbers) {
                $token->rawPosition($current_line, $current_col);
            }
            // No cursor update needed: we are leaving the loop.
            $array[] = $token;
            break;
        }
        // Safety net: every branch above either continues or breaks.
        break;
    }

    $context->destroy('CurrentLine');
    $context->destroy('CurrentCol');
    return $array;
}
|
345: |
|
346: | |
347: | |
348: | |
349: | |
350: | |
351: | |
352: | |
353: |
|
/**
 * Counts occurrences of $needle inside a window of $haystack.
 *
 * Uses the four-argument form of substr_count() directly, except on PHP
 * versions older than 5.1, where the window is cut out with substr()
 * first. The version check is done once and cached in a static.
 *
 * @param string $haystack String to search in.
 * @param string $needle Substring to count.
 * @param int $offset Start of the window within $haystack.
 * @param int $length Length of the window.
 * @return int Number of occurrences found in the window.
 */
protected function substrCount($haystack, $needle, $offset, $length)
{
    static $legacyPhp = null;
    if (null === $legacyPhp) {
        $legacyPhp = version_compare(PHP_VERSION, '5.1', '<');
    }

    if (!$legacyPhp) {
        return substr_count($haystack, $needle, $offset, $length);
    }

    $window = substr($haystack, $offset, $length);
    return substr_count($window, $needle);
}
|
367: |
|
368: | |
369: | |
370: | |
371: | |
372: | |
373: | |
374: | |
375: |
|
/**
 * Parses the attribute portion of a tag (everything after the tag name)
 * into an associative array.
 *
 * Boolean attributes (a name with no '=') map to their own name. Values
 * may be double-quoted, single-quoted or unquoted; values are passed
 * through $this->parseAttr() before being stored. Malformed input is
 * reported to the error collector (when enabled) and parsed leniently.
 *
 * @param string $string Raw attribute string, e.g. 'href="x" selected'.
 * @param HTMLPurifier_Config $config Read for Core.CollectErrors and
 *        forwarded to parseAttr().
 * @param HTMLPurifier_Context $context Supplies 'ErrorCollector' when
 *        error collection is on.
 * @return array Map of attribute name => attribute value.
 * @throws Exception If the scanner fails to advance between iterations.
 */
public function parseAttributeString($string, $config, $context)
{
    $string = (string)$string;

    if ($string == '') {
        // Nothing to parse.
        return array();
    }

    $e = false;
    if ($config->get('Core.CollectErrors')) {
        $e =& $context->get('ErrorCollector');
    }

    // Fast paths: with no whitespace at all the string can hold at most
    // one attribute. The check covers every byte in $this->_whitespace
    // (not only ' ') and is position-independent; testing strpos() for
    // truthiness missed tab/newline separators and a space at offset 0.
    $num_equal = substr_count($string, '=');
    $has_space = strcspn($string, $this->_whitespace) < strlen($string);
    if ($num_equal === 0 && !$has_space) {
        // Single boolean attribute.
        return array($string => $string);
    } elseif ($num_equal === 1 && !$has_space) {
        // Single key=value pair.
        list($key, $quoted_value) = explode('=', $string);
        $quoted_value = trim($quoted_value);
        if (!$key) {
            // NOTE(review): this also rejects the literal key "0".
            if ($e) {
                $e->send(E_ERROR, 'Lexer: Missing attribute key');
            }
            return array();
        }
        // Compare against '' explicitly: the string "0" is falsy in PHP,
        // so a truthiness test turned e.g. border=0 into border="".
        if ($quoted_value === '') {
            return array($key => '');
        }
        // $quoted_value is non-empty here, so both offsets exist.
        $first_char = $quoted_value[0];
        $last_char = $quoted_value[strlen($quoted_value) - 1];

        $same_quote = ($first_char == $last_char);
        $open_quote = ($first_char == '"' || $first_char == "'");

        if ($same_quote && $open_quote) {
            // Properly quoted: strip the surrounding quotes.
            $value = substr($quoted_value, 1, strlen($quoted_value) - 2);
        } else {
            if ($open_quote) {
                // Opening quote without a matching closing quote.
                if ($e) {
                    $e->send(E_ERROR, 'Lexer: Missing end quote');
                }
                $value = substr($quoted_value, 1);
            } else {
                // Unquoted value.
                $value = $quoted_value;
            }
        }
        if ($value === false) {
            // substr() can return false on older PHP versions.
            $value = '';
        }
        return array($key => $this->parseAttr($value, $config));
    }

    // General case: walk the string attribute by attribute.
    $array = array();
    $cursor = 0;
    $size = strlen($string);

    // Pad with one space so that an unquoted value at the very end is
    // terminated by whitespace like any other. $size deliberately keeps
    // the unpadded length.
    $string .= ' ';

    $old_cursor = -1;
    while ($cursor < $size) {
        // Guard against a scan that stops making progress.
        if ($old_cursor >= $cursor) {
            throw new Exception("Infinite loop detected");
        }
        $old_cursor = $cursor;

        // Skip whitespace before the key.
        $cursor += strspn($string, $this->_whitespace, $cursor);

        $key_begin = $cursor;

        // The key runs up to the next whitespace byte or '='.
        $cursor += strcspn($string, $this->_whitespace . '=', $cursor);

        $key_end = $cursor;

        $key = substr($string, $key_begin, $key_end - $key_begin);

        if (!$key) {
            // NOTE(review): this also rejects the literal key "0".
            if ($e) {
                $e->send(E_ERROR, 'Lexer: Missing attribute key');
            }
            // Skip past the offending byte and on to the next whitespace.
            $cursor += 1 + strcspn($string, $this->_whitespace, $cursor + 1);
            continue;
        }

        // Skip whitespace between the key and a possible '='.
        $cursor += strspn($string, $this->_whitespace, $cursor);

        if ($cursor >= $size) {
            // Input ended right after the key: boolean attribute.
            $array[$key] = $key;
            break;
        }

        // $cursor < $size here, so the offset exists.
        $first_char = $string[$cursor];

        if ($first_char == '=') {
            // key=value: step over '=' and any whitespace after it.
            $cursor++;
            $cursor += strspn($string, $this->_whitespace, $cursor);

            // Input ended after the '=': the value is empty. (This used
            // to test `$cursor === false`, which integer arithmetic can
            // never produce.)
            if ($cursor >= $size) {
                $array[$key] = '';
                break;
            }

            $char = $string[$cursor];

            if ($char == '"' || $char == "'") {
                // Quoted value: runs to the next matching quote.
                $cursor++;
                $value_begin = $cursor;
                $cursor = strpos($string, $char, $cursor);
                $value_end = $cursor;
            } else {
                // Unquoted value: runs to the next whitespace byte.
                $value_begin = $cursor;
                $cursor += strcspn($string, $this->_whitespace, $cursor);
                $value_end = $cursor;
            }

            // No closing quote: take everything up to the end of input.
            if ($cursor === false) {
                $cursor = $size;
                $value_end = $cursor;
            }

            $value = substr($string, $value_begin, $value_end - $value_begin);
            if ($value === false) {
                // substr() can return false on older PHP versions.
                $value = '';
            }
            $array[$key] = $this->parseAttr($value, $config);
            // Step over the closing quote or terminating whitespace.
            $cursor++;
        } else {
            // Key followed by something other than '=': boolean attribute.
            if ($key !== '') {
                $array[$key] = $key;
            } else {
                if ($e) {
                    $e->send(E_ERROR, 'Lexer: Missing attribute key');
                }
            }
        }
    }
    return $array;
}
|
537: | }
|
538: |
|
539: |
|
540: | |