File xoops_lib\modules\protector\library\HTMLPurifier\Lexer\DOMLex.php

1:	<?php
2:
/**
 * Parser that uses PHP 5's DOM extension (part of the core).
 *
 * In PHP 5, the DOM XML extension was revamped into DOM and added to the core.
 * It gives us a forgiving HTML parser, which we use to transform the HTML
 * into a DOM, and then into the tokens. It is blazingly fast (for large
 * documents, it performs twenty times faster than
 * HTMLPurifier_Lexer_DirectLex), and is the default choice for PHP 5.
 *
 * @note Any empty elements will have empty tokens associated with them, even if
 *       this is prohibited by the spec. This cannot be fixed until the spec
 *       comes into play.
 *
 * @note PHP's DOM extension does not actually parse any entities, we use
 *       our own function to do that.
 *
 * @warning DOM tends to drop whitespace, which may wreak havoc on indenting.
 *          If this is a huge problem, due to the fact that HTML is hand
 *          edited and you are unable to get a parser cache that caches the
 *          output of HTML Purifier while keeping the original HTML lying
 *          around, you may want to run Tidy on the resulting output or use
 *          HTMLPurifier_Lexer_DirectLex
 */

class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
{

    /**
     * Factory used to build every token emitted by this lexer.
     * @type HTMLPurifier_TokenFactory
     */
    private $factory;

    /**
     * Runs the parent lexer constructor, then sets up the token factory.
     */
    public function __construct()
    {
        parent::__construct();
        // setup the factory
        $this->factory = new HTMLPurifier_TokenFactory();
    }

    /**
     * Tokenizes an HTML fragment by loading it into a DOMDocument and
     * walking the resulting tree.
     *
     * @param string $html
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return HTMLPurifier_Token[]
     */
    public function tokenizeHTML($html, $config, $context)
    {
        $html = $this->normalize($html, $config, $context);

        // attempt to armor stray angled brackets that cannot possibly
        // form tags and thus are probably being used as emoticons
        if ($config->get('Core.AggressivelyFixLt')) {
            $char = '[^a-z!\/]';
            // A comment runs up to its closing "-->" or, when it is never
            // closed, up to the end of the input.
            $comment = "/<!--(.*?)(-->|\z)/is";
            // Protect ampersands inside comments first, so that undoing the
            // bracket armoring below can be told apart from entities the
            // author actually wrote.
            $html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html);
            // Repeat until stable: in a run such as "<<" each pass can only
            // escape every other bracket, because a match consumes the
            // character that follows it.
            do {
                $old = $html;
                $html = preg_replace("/<($char)/i", '&lt;\\1', $html);
            } while ($html !== $old);
            $html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html); // fix comments
        }

        // preprocess html, essential for UTF-8
        $html = $this->wrapHTML($html, $config, $context);

        $doc = new DOMDocument();
        $doc->encoding = 'UTF-8'; // theoretically, the above has this covered

        $options = 0;
        if ($config->get('Core.AllowParseManyTags') && defined('LIBXML_PARSEHUGE')) {
            $options |= LIBXML_PARSEHUGE;
        }

        // Silence the warnings libxml raises while parsing.
        set_error_handler(array($this, 'muteErrorHandler'));
        // loadHTML() fails on PHP 5.3 when second parameter is given
        if ($options) {
            $doc->loadHTML($html, $options);
        } else {
            $doc->loadHTML($html);
        }
        restore_error_handler();

        $body = $doc->getElementsByTagName('html')->item(0)-> // <html>
                      getElementsByTagName('body')->item(0);  // <body>

        $div = $body->getElementsByTagName('div')->item(0); // <div>
        $tokens = array();
        $this->tokenizeDOM($div, $tokens, $config);
        // If the div has a sibling, that means we tripped across
        // a premature </div> tag. So remove the div we parsed,
        // and then tokenize the rest of body. We can't tokenize
        // the sibling directly as we'll lose the tags in that case.
        if ($div->nextSibling) {
            $body->removeChild($div);
            $this->tokenizeDOM($body, $tokens, $config);
        }
        return $tokens;
    }

    /**
     * Iterative function that tokenizes a node, putting it into an accumulator.
     * To iterate is human, to recurse divine - L. Peter Deutsch
     *
     * The node passed in acts as an implicit wrapper: no start or end token
     * is emitted for it, only for its descendants.
     *
     * @param DOMNode $node DOMNode to be tokenized.
     * @param HTMLPurifier_Token[] $tokens Array-list of already tokenized tokens;
     *        new tokens are appended to it by reference.
     * @param HTMLPurifier_Config $config
     * @return void
     */
    protected function tokenizeDOM($node, &$tokens, $config)
    {
        $level = 0;
        // One FIFO queue of pending nodes per depth level.
        $nodes = array($level => new HTMLPurifier_Queue(array($node)));
        // Per level, the elements that still need an end token.
        $closingNodes = array();
        do {
            while (!$nodes[$level]->isEmpty()) {
                $node = $nodes[$level]->shift(); // FIFO
                // Level 0 holds only the wrapper node, which is not collected.
                $collect = $level > 0;
                $needEndingTag = $this->createStartNode($node, $tokens, $collect, $config);
                if ($needEndingTag) {
                    $closingNodes[$level][] = $node;
                }
                if ($node->childNodes && $node->childNodes->length) {
                    // Descend: the children become the queue of the next
                    // level and are processed before this level resumes.
                    $level++;
                    $nodes[$level] = new HTMLPurifier_Queue();
                    foreach ($node->childNodes as $childNode) {
                        $nodes[$level]->push($childNode);
                    }
                }
            }
            $level--;
            // Close whatever was opened on the level we returned to; level 0
            // is skipped so the wrapper never receives an end token.
            if ($level && isset($closingNodes[$level])) {
                while ($node = array_pop($closingNodes[$level])) {
                    $this->createEndNode($node, $tokens);
                }
            }
        } while ($level > 0);
    }

    /**
     * Portably retrieve the tag name of a node; deals with older versions
     * of libxml like 2.7.6
     *
     * @param DOMNode $node
     * @return string|null First of tagName, nodeName, localName that is set,
     *         or null when none is.
     */
    protected function getTagName($node)
    {
        if (isset($node->tagName)) {
            return $node->tagName;
        }
        if (isset($node->nodeName)) {
            return $node->nodeName;
        }
        if (isset($node->localName)) {
            return $node->localName;
        }
        return null;
    }

    /**
     * Portably retrieve the data of a node; deals with older versions
     * of libxml like 2.7.6
     *
     * @param DOMNode $node
     * @return string|null First of data, nodeValue, textContent that is set,
     *         or null when none is.
     */
    protected function getData($node)
    {
        if (isset($node->data)) {
            return $node->data;
        }
        if (isset($node->nodeValue)) {
            return $node->nodeValue;
        }
        if (isset($node->textContent)) {
            return $node->textContent;
        }
        return null;
    }

    /**
     * Appends the token(s) that open a node and reports whether a matching
     * end token must be emitted later.
     *
     * @param DOMNode $node DOMNode to be tokenized.
     * @param HTMLPurifier_Token[] $tokens Array-list of already tokenized tokens.
     * @param bool $collect Says whether or not start and close are collected, set to
     *                    false at first recursion because it's the implicit DIV
     *                    tag you're dealing with.
     * @param HTMLPurifier_Config $config
     * @return bool if the token needs an endtoken
     * @todo data and tagName properties don't seem to exist in DOMNode?
     */
    protected function createStartNode($node, &$tokens, $collect, $config)
    {
        // intercept non element nodes. WE MUST catch all of them,
        // but we're not getting the character reference nodes because
        // those should have been preprocessed
        if ($node->nodeType === XML_TEXT_NODE) {
            $data = $this->getData($node); // Handle variable data property
            if ($data !== null) {
                $tokens[] = $this->factory->createText($data);
            }
            return false;
        }

        if ($node->nodeType === XML_CDATA_SECTION_NODE) {
            // undo libxml's special treatment of <script> and <style> tags
            $last = end($tokens);
            $data = $node->data;
            // (note $node->tagname is already normalized)
            if ($last instanceof HTMLPurifier_Token_Start && ($last->name == 'script' || $last->name == 'style')) {
                $new_data = trim($data);
                if (substr($new_data, 0, 4) === '<!--') {
                    // Strip the comment wrapper around the script/style body.
                    $data = substr($new_data, 4);
                    if (substr($data, -3) === '-->') {
                        $data = substr($data, 0, -3);
                    } else {
                        // Highly suspicious! Not sure what to do...
                    }
                }
            }
            $tokens[] = $this->factory->createText($this->parseText($data, $config));
            return false;
        }

        if ($node->nodeType === XML_COMMENT_NODE) {
            // this code is only invoked for comments in script/style in versions
            // of libxml pre-2.6.28 (regular comments, of course, are still
            // handled regularly)
            $tokens[] = $this->factory->createComment($node->data);
            return false;
        }

        if ($node->nodeType !== XML_ELEMENT_NODE) {
            // not-well tested: there may be other nodes we have to grab
            return false;
        }

        $attr = $node->hasAttributes() ? $this->transformAttrToAssoc($node->attributes) : array();
        $tag_name = $this->getTagName($node); // Handle variable tagName property
        if (empty($tag_name)) {
            return (bool) $node->childNodes->length;
        }

        // We still have to make sure that the element actually IS empty
        if (!$node->childNodes->length) {
            if ($collect) {
                $tokens[] = $this->factory->createEmpty($tag_name, $attr);
            }
            return false;
        }

        if ($collect) {
            $tokens[] = $this->factory->createStart($tag_name, $attr);
        }
        return true;
    }

    /**
     * Appends the end token for an element node.
     *
     * @param DOMNode $node
     * @param HTMLPurifier_Token[] $tokens
     */
    protected function createEndNode($node, &$tokens)
    {
        $tag_name = $this->getTagName($node); // Handle variable tagName property
        $tokens[] = $this->factory->createEnd($tag_name);
    }

    /**
     * Converts a DOMNamedNodeMap of DOMAttr objects into an assoc array.
     *
     * @param DOMNamedNodeMap $node_map DOMNamedNodeMap of DOMAttr objects.
     * @return array Associative array of attributes (name => value).
     */
    protected function transformAttrToAssoc($node_map)
    {
        // This relies on the node map being iterable with foreach and
        // exposing a ->length attribute.
        if ($node_map->length === 0) {
            return array();
        }
        $array = array();
        foreach ($node_map as $attr) {
            $array[$attr->name] = $attr->value;
        }
        return $array;
    }

    /**
     * An error handler that mutes all errors
     *
     * @param int $errno
     * @param string $errstr
     */
    public function muteErrorHandler($errno, $errstr)
    {
    }

    /**
     * Callback function for undoing escaping of stray angled brackets
     * in comments
     *
     * Reverses both the bracket armoring done in tokenizeHTML() and the
     * ampersand armoring done by callbackArmorCommentEntities().
     *
     * @param array $matches Regex match: [1] is the comment body, [2] is the
     *        closing "-->" or an empty string for an unterminated comment.
     * @return string
     */
    public function callbackUndoCommentSubst($matches)
    {
        return '<!--' . strtr($matches[1], array('&amp;' => '&', '&lt;' => '<')) . $matches[2];
    }

    /**
     * Callback function that entity-izes ampersands in comments so that
     * callbackUndoCommentSubst doesn't clobber them
     *
     * @param array $matches Regex match: [1] is the comment body, [2] is the
     *        closing "-->" or an empty string for an unterminated comment.
     * @return string
     */
    public function callbackArmorCommentEntities($matches)
    {
        return '<!--' . str_replace('&', '&amp;', $matches[1]) . $matches[2];
    }

    /**
     * Wraps an HTML fragment in the necessary HTML
     *
     * @param string $html
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @param bool $use_div Whether to enclose the fragment in a <div> inside
     *        <body>; defaults to true.
     * @return string
     */
    protected function wrapHTML($html, $config, $context, $use_div = true)
    {
        $def = $config->getDefinition('HTML');
        $ret = '';

        // Emit a doctype only when the definition supplies an identifier.
        if (!empty($def->doctype->dtdPublic) || !empty($def->doctype->dtdSystem)) {
            $ret .= '<!DOCTYPE html ';
            if (!empty($def->doctype->dtdPublic)) {
                $ret .= 'PUBLIC "' . $def->doctype->dtdPublic . '" ';
            }
            if (!empty($def->doctype->dtdSystem)) {
                $ret .= '"' . $def->doctype->dtdSystem . '" ';
            }
            $ret .= '>';
        }

        $ret .= '<html><head>';
        $ret .= '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';
        // No protection if $html contains a stray </div>!
        $ret .= '</head><body>';
        if ($use_div) {
            $ret .= '<div>';
        }
        $ret .= $html;
        if ($use_div) {
            $ret .= '</div>';
        }
        $ret .= '</body></html>';
        return $ret;
    }
}
337:
338:	// vim: et sw=4 sts=4
339:

Namespaces

Classes

Interfaces

Exceptions

Functions