1: | <?php
|
2: |
|
3: | |
4: | |
5: | |
6: | |
7: | |
8: | |
9: | |
10: | |
11: | |
12: | |
13: | |
14: | |
15: | |
16: | |
17: | |
18: | |
19: | |
20: | |
21: | |
22: | |
23: | |
24: | |
25: | |
26: | |
27: | |
28: | |
29: | |
30: | |
31: | |
32: | |
33: | |
34: | |
35: | |
36: | |
37: | |
38: | |
39: | |
40: | |
41: |
|
/**
 * Base class for HTML lexers.
 *
 * Provides the factory that selects a concrete lexer implementation
 * (create()), the entity-decoding helpers shared by implementations
 * (parseText(), parseAttr(), parseData()) and the pre-tokenization cleanup
 * pass (normalize()). The tokenizeHTML() method here only raises an error;
 * concrete lexers are expected to override it.
 */
class HTMLPurifier_Lexer
{
    /**
     * Whether this lexer implementation keeps track of line numbers.
     * create() refuses to return a lexer with this flag unset when
     * %Core.MaintainLineNumbers or %Core.CollectErrors is enabled.
     * @type bool
     */
    public $tracksLineNumbers = false;

    /**
     * Entity parser used by parseData() and normalize().
     * @type HTMLPurifier_EntityParser
     */
    private $_entity_parser;

    /**
     * Retrieves or instantiates a lexer according to the configuration.
     *
     * %Core.LexerImpl may be an object (returned as-is), one of the strings
     * 'DOMLex', 'DirectLex' or 'PH5P', or null for auto-detection.
     *
     * @param HTMLPurifier_Config $config Configuration object. Passing a
     *        lexer prototype instead is deprecated and triggers an
     *        E_USER_WARNING.
     * @return HTMLPurifier_Lexer Concrete lexer instance.
     * @throws HTMLPurifier_Exception If the lexer type is unrecognized, if
     *         nothing was instantiated, or if the chosen lexer cannot track
     *         line numbers although the configuration requires it.
     */
    public static function create($config)
    {
        $has_config = $config instanceof HTMLPurifier_Config;

        if ($has_config) {
            $lexer = $config->get('Core.LexerImpl');
        } else {
            // Deprecated calling convention: the argument is the lexer itself.
            $lexer = $config;
            trigger_error(
                "Passing a prototype to
                HTMLPurifier_Lexer::create() is deprecated, please instead
                use %Core.LexerImpl",
                E_USER_WARNING
            );
        }

        // Only query the configuration when we actually have one; in the
        // deprecated prototype case $config is not a configuration object
        // and calling get() on it would be a fatal error.
        $needs_tracking = $has_config && (
            $config->get('Core.MaintainLineNumbers') ||
            $config->get('Core.CollectErrors')
        );

        $inst = null;
        if (is_object($lexer)) {
            $inst = $lexer;
        } else {
            if (is_null($lexer)) {
                // Auto-detection.
                if ($needs_tracking) {
                    // Line tracking was requested, so pick DirectLex.
                    $lexer = 'DirectLex';
                } elseif (class_exists('DOMDocument', false) &&
                    method_exists('DOMDocument', 'loadHTML') &&
                    !extension_loaded('domxml')
                ) {
                    // DOMDocument is usable and the domxml extension is not
                    // loaded, so the DOM-based lexer can be used.
                    $lexer = 'DOMLex';
                } else {
                    $lexer = 'DirectLex';
                }
            }

            // Instantiate the recognized lexer by name.
            switch ($lexer) {
                case 'DOMLex':
                    $inst = new HTMLPurifier_Lexer_DOMLex();
                    break;
                case 'DirectLex':
                    $inst = new HTMLPurifier_Lexer_DirectLex();
                    break;
                case 'PH5P':
                    $inst = new HTMLPurifier_Lexer_PH5P();
                    break;
                default:
                    throw new HTMLPurifier_Exception(
                        "Cannot instantiate unrecognized Lexer type " .
                        htmlspecialchars($lexer)
                    );
            }
        }

        if (!$inst) {
            throw new HTMLPurifier_Exception('No lexer was instantiated');
        }

        // Once everything is resolved, make sure the lexer can satisfy the
        // line-number requirement.
        if ($needs_tracking && !$inst->tracksLineNumbers) {
            throw new HTMLPurifier_Exception(
                'Cannot use lexer that does not support line numbers with ' .
                'Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)'
            );
        }

        return $inst;
    }

    /**
     * Sets up the entity parser used by the decoding helpers.
     */
    public function __construct()
    {
        $this->_entity_parser = new HTMLPurifier_EntityParser();
    }

    /**
     * Most common entity to raw value conversion table for special entities.
     * Applied with strtr() in parseData() before falling back to the
     * entity parser.
     * @type array
     */
    protected $_special_entity2str =
        array(
            '&quot;' => '"',
            '&amp;' => '&',
            '&lt;' => '<',
            '&gt;' => '>',
            '&#39;' => "'",
            '&#039;' => "'",
            '&#x27;' => "'"
        );

    /**
     * Decodes entities in text content; shorthand for parseData() with
     * $is_attr set to false.
     *
     * @param string $string String to decode.
     * @param HTMLPurifier_Config $config
     * @return string Decoded string.
     */
    public function parseText($string, $config)
    {
        return $this->parseData($string, false, $config);
    }

    /**
     * Decodes entities in an attribute value; shorthand for parseData()
     * with $is_attr set to true.
     *
     * @param string $string String to decode.
     * @param HTMLPurifier_Config $config
     * @return string Decoded string.
     */
    public function parseAttr($string, $config)
    {
        return $this->parseData($string, true, $config);
    }

    /**
     * Decodes the entities in a string of character data.
     *
     * Cheap paths are tried first: strings without any candidate ampersand
     * are returned untouched, and strings whose entities are all covered by
     * the special-entity table are handled with a single strtr(). Only when
     * candidate ampersands remain is the entity parser consulted.
     *
     * @param string $string String of character data to be parsed.
     * @param bool $is_attr Whether the data comes from an attribute value;
     *        selects the attribute or text substitution when
     *        %Core.LegacyEntityDecoder is off.
     * @param HTMLPurifier_Config $config Only consulted when the entity
     *        parser is actually needed.
     * @return string Parsed character data.
     */
    public function parseData($string, $is_attr, $config)
    {
        // Following functions require at least one character.
        if ($string === '') {
            return '';
        }

        // Count ampersands, discounting those that cannot start an entity:
        // ones followed by a space and one at the very end of the string.
        $num_amp = substr_count($string, '&') - substr_count($string, '& ') -
            ($string[strlen($string) - 1] === '&' ? 1 : 0);

        if (!$num_amp) {
            // No entities at all.
            return $string;
        }

        // Each "&amp;" legitimately leaves one bare "&" behind after the
        // table substitution, so remember how many to expect.
        $num_esc_amp = substr_count($string, '&amp;');
        $string = strtr($string, $this->_special_entity2str);

        // Same count as above, repeated on the substituted string.
        $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
            ($string[strlen($string) - 1] === '&' ? 1 : 0);

        if ($num_amp_2 <= $num_esc_amp) {
            // Every remaining ampersand is accounted for by an "&amp;".
            return $string;
        }

        // Hmm... now we have some uncommon entities. Use the entity parser.
        if ($config->get('Core.LegacyEntityDecoder')) {
            $string = $this->_entity_parser->substituteSpecialEntities($string);
        } else {
            if ($is_attr) {
                $string = $this->_entity_parser->substituteAttrEntities($string);
            } else {
                $string = $this->_entity_parser->substituteTextEntities($string);
            }
        }
        return $string;
    }

    /**
     * Lexes an HTML string into tokens.
     *
     * This base implementation does no lexing: it raises an E_USER_ERROR
     * ('Call to abstract class'). Concrete lexers are expected to override it.
     *
     * @param string $string HTML to tokenize.
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     */
    public function tokenizeHTML($string, $config, $context)
    {
        trigger_error('Call to abstract class', E_USER_ERROR);
    }

    /**
     * Replaces CDATA sections with their contents, HTML-escaped.
     *
     * @param string $string HTML string to process.
     * @return string HTML with CDATA sections escaped.
     */
    protected static function escapeCDATA($string)
    {
        return preg_replace_callback(
            '/<!\[CDATA\[(.+?)\]\]>/s',
            array('HTMLPurifier_Lexer', 'CDATACallback'),
            $string
        );
    }

    /**
     * Variant of escapeCDATA() for CDATA sections that are wrapped in
     * "<!--//--><![CDATA[//><!-- ... //--><!]]>" comment markers.
     *
     * @param string $string HTML string to process.
     * @return string HTML with the commented CDATA sections escaped.
     */
    protected static function escapeCommentedCDATA($string)
    {
        return preg_replace_callback(
            '#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s',
            array('HTMLPurifier_Lexer', 'CDATACallback'),
            $string
        );
    }

    /**
     * Strips "<!--[if ...]> ... <![endif]-->" conditional comments,
     * including everything between the markers.
     *
     * @param string $string HTML string to process.
     * @return string HTML with conditional comments removed.
     */
    protected static function removeIEConditional($string)
    {
        return preg_replace(
            '#<!--\[if [^>]+\]>.*?<!\[endif\]-->#si',
            '',
            $string
        );
    }

    /**
     * Callback for escapeCDATA() and escapeCommentedCDATA(): returns the
     * captured CDATA contents with HTML special characters escaped.
     *
     * @param array $matches PCRE matches array; index 1 holds the CDATA
     *        contents.
     * @return string Escaped contents of the CDATA section.
     */
    protected static function CDATACallback($matches)
    {
        // Not exactly sure why the character set is needed, but whatever.
        return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8');
    }

    /**
     * Takes a piece of HTML and cleans it up before tokenization.
     *
     * Steps, in order: optional newline normalization, CDATA escaping,
     * conditional-comment removal, optional body extraction, optional legacy
     * entity substitution, UTF-8 cleaning, optional processing-instruction
     * removal and optional aggressive <script> removal.
     *
     * @param string $html HTML to normalize.
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context Used to fetch 'ErrorCollector'
     *        when %Core.CollectErrors is on.
     * @return string Normalized HTML.
     */
    public function normalize($html, $config, $context)
    {
        // Normalize newlines to \n.
        if ($config->get('Core.NormalizeNewlines')) {
            $html = str_replace("\r\n", "\n", (string)$html);
            $html = str_replace("\r", "\n", (string)$html);
        }

        if ($config->get('HTML.Trusted')) {
            // Escape the comment-wrapped CDATA convention first.
            $html = $this->escapeCommentedCDATA($html);
        }

        // Escape plain CDATA sections.
        $html = $this->escapeCDATA($html);

        $html = $this->removeIEConditional($html);

        // Extract the body from the document if applicable.
        if ($config->get('Core.ConvertDocumentToFragment')) {
            $e = false;
            if ($config->get('Core.CollectErrors')) {
                $e =& $context->get('ErrorCollector');
            }
            $new_html = $this->extractBody($html);
            if ($e && $new_html != $html) {
                $e->send(E_WARNING, 'Lexer: Extracted body');
            }
            $html = $new_html;
        }

        // Expand entities that aren't the big five.
        if ($config->get('Core.LegacyEntityDecoder')) {
            $html = $this->_entity_parser->substituteNonSpecialEntities($html);
        }

        // Clean the string as UTF-8; done after entity expansion (see
        // HTMLPurifier_Encoder::cleanUTF8 for what exactly is removed).
        $html = HTMLPurifier_Encoder::cleanUTF8($html);

        // If processing instructions are to be removed, remove them now.
        if ($config->get('Core.RemoveProcessingInstructions')) {
            $html = preg_replace('#<\?.+?\?>#s', '', $html);
        }

        // Aggressive script removal only applies to untrusted HTML when
        // script contents are being removed and "script" is a hidden element.
        // NOTE(review): the pattern has no "s" modifier, so a <script> block
        // spanning several lines is not matched here - confirm whether that
        // is intended before changing it.
        $hidden_elements = $config->get('Core.HiddenElements');
        if ($config->get('Core.AggressivelyRemoveScript') &&
            !$config->get('HTML.Trusted') &&
            $config->get('Core.RemoveScriptContents') &&
            !empty($hidden_elements["script"])
        ) {
            $html = preg_replace('#<script[^>]*>.*?</script>#i', '', $html);
        }

        return $html;
    }

    /**
     * Takes a string of HTML (fragment or document) and returns the content
     * of its <body> element, or the input unchanged when no usable body is
     * found.
     *
     * @param string $html HTML fragment or document.
     * @return string Body contents, or $html itself.
     */
    public function extractBody($html)
    {
        $matches = array();
        $result = preg_match('|(.*?)<body[^>]*>(.*)</body>|is', $html, $matches);
        if ($result) {
            // Make sure the <body> tag we found is not inside a comment:
            // either no comment opens before it, or the last comment opened
            // before it has also been closed.
            $comment_start = strrpos($matches[1], '<!--');
            $comment_end = strrpos($matches[1], '-->');
            if ($comment_start === false ||
                ($comment_end !== false && $comment_end > $comment_start)) {
                return $matches[2];
            }
        }
        return $html;
    }
}
|
386: |
|
387: |
|
388: | |