File xoops_lib\modules\protector\library\HTMLPurifier\Lexer.php

1:	<?php
2:
3:	/**
4:	* Forgivingly lexes HTML (SGML-style) markup into tokens.
5:	*
6:	* A lexer parses a string of SGML-style markup and converts it into
7:	* corresponding tokens. It doesn't check for well-formedness, although its
8:	* internal mechanism may make this automatic (such as the case of
9:	* HTMLPurifier_Lexer_DOMLex). There are several implementations to choose
10:	* from.
11:	*
12:	* A lexer is HTML-oriented: it might work with XML, but it's not
13:	* recommended, as we adhere to a subset of the specification for optimization
14:	* reasons. This might change in the future. Also, most tokenizers are not
15:	* expected to handle DTDs or PIs.
16:	*
17:	* This class should not be directly instantiated, but you may use create() to
18:	* retrieve a default copy of the lexer. Being a supertype, this class
19:	* does not actually define any implementation, but offers commonly used
20:	* convenience functions for subclasses.
21:	*
22:	* @note The unit tests will instantiate this class for testing purposes, as
23:	* many of the utility functions require a class to be instantiated.
24:	* This means that, even though this class is not runnable, it will
25:	* not be declared abstract.
26:	*
27:	* @par
28:	*
29:	* @note
30:	* We use tokens rather than create a DOM representation because DOM would:
31:	*
32:	* @par
33:	* -# Require more processing and memory to create,
34:	* -# Is not streamable, and
35:	* -# Has the entire document structure (html and body not needed).
36:	*
37:	* @par
38:	* However, DOM is helpful in that it makes it easy to move around nodes
39:	* without a lot of lookaheads to see when a tag is closed. This is a
40:	* limitation of the token system and some workarounds would be nice.
41:	*/
class HTMLPurifier_Lexer
{

    /**
     * Whether or not this lexer implements line-number/column-number tracking.
     * If it does, set to true.
     * @type bool
     */
    public $tracksLineNumbers = false;

    /**
     * Entity decoder used by parseData() and normalize().
     * @type HTMLPurifier_EntityParser
     */
    private $_entity_parser;

    // -- STATIC ----------------------------------------------------------

    /**
     * Retrieves or sets the default Lexer as a Prototype Factory.
     *
     * By default HTMLPurifier_Lexer_DOMLex will be returned. There are
     * a few exceptions involving special features that only DirectLex
     * implements.
     *
     * @note The behavior of this class has changed, rather than accepting
     *       a prototype object, it now accepts a configuration object.
     *       To specify your own prototype, set %Core.LexerImpl to it.
     *       This change in behavior de-singletonizes the lexer object.
     *
     * @note Passing anything other than an HTMLPurifier_Config is deprecated
     *       and raises E_USER_WARNING. In that case no configuration is
     *       available, so line-number tracking is treated as not requested.
     *
     * @param HTMLPurifier_Config $config
     * @return HTMLPurifier_Lexer
     * @throws HTMLPurifier_Exception
     */
    public static function create($config)
    {
        if ($config instanceof HTMLPurifier_Config) {
            $lexer = $config->get('Core.LexerImpl');
            $needs_tracking =
                $config->get('Core.MaintainLineNumbers') ||
                $config->get('Core.CollectErrors');
        } else {
            // Deprecated path: the argument is a prototype (or lexer name),
            // not a configuration object, so there is nothing to call
            // ->get() on and no tracking requirement can be read from it.
            $lexer = $config;
            $needs_tracking = false;
            trigger_error(
                "Passing a prototype to
                HTMLPurifier_Lexer::create() is deprecated, please instead
                use %Core.LexerImpl",
                E_USER_WARNING
            );
        }

        $inst = null;
        if (is_object($lexer)) {
            $inst = $lexer;
        } else {
            if (is_null($lexer)) {
                // auto-detection algorithm: tracking forces DirectLex;
                // otherwise prefer DOMLex when a usable DOM is present.
                //
                // check for DOM support, because while it's part of the
                // core, it can be disabled compile time. Also, the PECL
                // domxml extension overrides the default DOM, and is evil
                // and nasty and we shan't bother to support it
                if (!$needs_tracking &&
                    class_exists('DOMDocument', false) &&
                    method_exists('DOMDocument', 'loadHTML') &&
                    !extension_loaded('domxml')
                ) {
                    $lexer = 'DOMLex';
                } else {
                    $lexer = 'DirectLex';
                }
            }

            // instantiate recognized string names
            switch ($lexer) {
                case 'DOMLex':
                    $inst = new HTMLPurifier_Lexer_DOMLex();
                    break;
                case 'DirectLex':
                    $inst = new HTMLPurifier_Lexer_DirectLex();
                    break;
                case 'PH5P':
                    $inst = new HTMLPurifier_Lexer_PH5P();
                    break;
                default:
                    throw new HTMLPurifier_Exception(
                        "Cannot instantiate unrecognized Lexer type " .
                        htmlspecialchars($lexer)
                    );
            }
        }

        if (!$inst) {
            throw new HTMLPurifier_Exception('No lexer was instantiated');
        }

        // once PHP DOM implements native line numbers, or we
        // hack out something using XSLT, remove this stipulation
        if ($needs_tracking && !$inst->tracksLineNumbers) {
            throw new HTMLPurifier_Exception(
                'Cannot use lexer that does not support line numbers with ' .
                'Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)'
            );
        }

        return $inst;

    }

    // -- CONVENIENCE MEMBERS ---------------------------------------------

    public function __construct()
    {
        $this->_entity_parser = new HTMLPurifier_EntityParser();
    }

    /**
     * Most common entity to raw value conversion table for special entities.
     * @type array
     */
    protected $_special_entity2str =
        array(
            '&quot;' => '"',
            '&amp;' => '&',
            '&lt;' => '<',
            '&gt;' => '>',
            '&#39;' => "'",
            '&#039;' => "'",
            '&#x27;' => "'"
        );

    /**
     * Parses character data found in a text node.
     *
     * @param string $string String character data to be parsed.
     * @param HTMLPurifier_Config $config
     * @return string Parsed character data.
     */
    public function parseText($string, $config) {
        return $this->parseData($string, false, $config);
    }

    /**
     * Parses character data found in an attribute value.
     *
     * @param string $string String character data to be parsed.
     * @param HTMLPurifier_Config $config
     * @return string Parsed character data.
     */
    public function parseAttr($string, $config) {
        return $this->parseData($string, true, $config);
    }

    /**
     * Parses special entities into the proper characters.
     *
     * This string will translate escaped versions of the special characters
     * into the correct ones.
     *
     * @param string $string String character data to be parsed.
     * @param bool $is_attr True when $string is an attribute value, false
     *                      when it is text; selects which entity-parser
     *                      routine handles the uncommon entities.
     * @param HTMLPurifier_Config $config Only consulted (for
     *                      %Core.LegacyEntityDecoder) when the fast path
     *                      below cannot finish the job.
     * @return string Parsed character data.
     */
    public function parseData($string, $is_attr, $config)
    {
        // following functions require at least one character
        if ($string === '') {
            return '';
        }

        // subtracts amps that cannot possibly be escaped
        $num_amp = substr_count($string, '&') - substr_count($string, '& ') -
            ($string[strlen($string) - 1] === '&' ? 1 : 0);

        if (!$num_amp) {
            return $string;
        } // abort if no entities
        $num_esc_amp = substr_count($string, '&amp;');
        $string = strtr($string, $this->_special_entity2str);

        // code duplication for sake of optimization, see above
        $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
            ($string[strlen($string) - 1] === '&' ? 1 : 0);

        // every remaining candidate ampersand is accounted for by a
        // decoded &amp;, so nothing is left to expand
        if ($num_amp_2 <= $num_esc_amp) {
            return $string;
        }

        // hmm... now we have some uncommon entities. Use the callback.
        if ($config->get('Core.LegacyEntityDecoder')) {
            $string = $this->_entity_parser->substituteSpecialEntities($string);
        } else {
            if ($is_attr) {
                $string = $this->_entity_parser->substituteAttrEntities($string);
            } else {
                $string = $this->_entity_parser->substituteTextEntities($string);
            }
        }
        return $string;
    }

    /**
     * Lexes an HTML string into tokens.
     *
     * This base implementation only raises E_USER_ERROR; concrete lexers
     * are expected to override it.
     *
     * @param $string String HTML.
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return HTMLPurifier_Token[] array representation of HTML.
     */
    public function tokenizeHTML($string, $config, $context)
    {
        trigger_error('Call to abstract class', E_USER_ERROR);
    }

    /**
     * Translates CDATA sections into regular sections (through escaping).
     * @param string $string HTML string to process.
     * @return string HTML with CDATA sections escaped.
     */
    protected static function escapeCDATA($string)
    {
        return preg_replace_callback(
            '/<!\[CDATA\[(.+?)\]\]>/s',
            array('HTMLPurifier_Lexer', 'CDATACallback'),
            $string
        );
    }

    /**
     * Special CDATA case that is especially convoluted for <script>
     * @param string $string HTML string to process.
     * @return string HTML with CDATA sections escaped.
     */
    protected static function escapeCommentedCDATA($string)
    {
        return preg_replace_callback(
            '#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s',
            array('HTMLPurifier_Lexer', 'CDATACallback'),
            $string
        );
    }

    /**
     * Special Internet Explorer conditional comments should be removed.
     * @param string $string HTML string to process.
     * @return string HTML with conditional comments removed.
     */
    protected static function removeIEConditional($string)
    {
        return preg_replace(
            '#<!--\[if [^>]+\]>.*?<!\[endif\]-->#si', // probably should generalize for all strings
            '',
            $string
        );
    }

    /**
     * Callback function for escapeCDATA() and escapeCommentedCDATA() that
     * does the work.
     *
     * @warning This is a protected static method referenced by name in the
     *          preg_replace_callback() calls of this class; calling it
     *          directly is not recommended.
     * @param array $matches PCRE matches array, with index 0 the entire match
     *                  and 1 the inside of the CDATA section.
     * @return string Escaped internals of the CDATA section.
     */
    protected static function CDATACallback($matches)
    {
        // not exactly sure why the character set is needed, but whatever
        return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8');
    }

    /**
     * Takes a piece of HTML and normalizes it by converting entities, fixing
     * encoding, extracting bits, and other good stuff.
     * @param string $html HTML.
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return string
     * @todo Consider making protected
     */
    public function normalize($html, $config, $context)
    {
        // normalize newlines to \n
        if ($config->get('Core.NormalizeNewlines')) {
            $html = str_replace("\r\n", "\n", (string)$html);
            $html = str_replace("\r", "\n", (string)$html);
        }

        if ($config->get('HTML.Trusted')) {
            // escape convoluted CDATA
            $html = $this->escapeCommentedCDATA($html);
        }

        // escape CDATA
        $html = $this->escapeCDATA($html);

        $html = $this->removeIEConditional($html);

        // extract body from document if applicable
        if ($config->get('Core.ConvertDocumentToFragment')) {
            $e = false;
            if ($config->get('Core.CollectErrors')) {
                $e =& $context->get('ErrorCollector');
            }
            $new_html = $this->extractBody($html);
            // strict comparison: loose != would compare numeric-looking
            // strings by value rather than byte-for-byte
            if ($e && $new_html !== $html) {
                $e->send(E_WARNING, 'Lexer: Extracted body');
            }
            $html = $new_html;
        }

        // expand entities that aren't the big five
        if ($config->get('Core.LegacyEntityDecoder')) {
            $html = $this->_entity_parser->substituteNonSpecialEntities($html);
        }

        // clean into wellformed UTF-8 string for an SGML context: this has
        // to be done after entity expansion because the entities sometimes
        // represent non-SGML characters (horror, horror!)
        $html = HTMLPurifier_Encoder::cleanUTF8($html);

        // if processing instructions are to be removed, remove them now
        if ($config->get('Core.RemoveProcessingInstructions')) {
            $html = preg_replace('#<\?.+?\?>#s', '', $html);
        }

        // Strip whole <script> elements up front, but only when scripts
        // would be dropped anyway (untrusted HTML, script contents removed,
        // and "script" listed as a hidden element).
        $hidden_elements = $config->get('Core.HiddenElements');
        if ($config->get('Core.AggressivelyRemoveScript') &&
            !($config->get('HTML.Trusted') || !$config->get('Core.RemoveScriptContents')
            || empty($hidden_elements["script"]))) {
            $html = preg_replace('#<script[^>]*>.*?</script>#i', '', $html);
        }

        return $html;
    }

    /**
     * Takes a string of HTML (fragment or document) and returns the content
     * of its body element.
     *
     * @param string $html HTML fragment or document.
     * @return string Contents between <body ...> and the last </body>, or
     *                $html unchanged when no body element is found or the
     *                opening tag sits inside an unterminated comment.
     * @todo Consider making protected
     */
    public function extractBody($html)
    {
        $matches = array();
        $result = preg_match('|(.*?)<body[^>]*>(.*)</body>|is', $html, $matches);
        if ($result) {
            // Make sure it's not in a comment
            $comment_start = strrpos($matches[1], '<!--');
            $comment_end = strrpos($matches[1], '-->');
            if ($comment_start === false ||
                ($comment_end !== false && $comment_end > $comment_start)) {
                return $matches[2];
            }
        }
        return $html;
    }
}
386:
387:	// vim: et sw=4 sts=4
388:

Namespaces

Classes

Interfaces

Exceptions

Functions