1: <?php
2:
3: /**
4: * Experimental HTML5-based parser using Jeroen van der Meer's PH5P library.
5: * Occupies space in the HTML5 pseudo-namespace, which may cause conflicts.
6: *
7: * @note
8: * Recent changes to PHP's DOM extension have resulted in some fatal
9: * error conditions with the original version of PH5P. Pending changes,
10: * this lexer will punt to DirectLex if DOM throws an exception.
11: */
12:
13: class HTMLPurifier_Lexer_PH5P extends HTMLPurifier_Lexer_DOMLex
14: {
15: /**
16: * @param string $html
17: * @param HTMLPurifier_Config $config
18: * @param HTMLPurifier_Context $context
19: * @return HTMLPurifier_Token[]
20: */
21: public function tokenizeHTML($html, $config, $context)
22: {
23: $new_html = $this->normalize($html, $config, $context);
24: $new_html = $this->wrapHTML($new_html, $config, $context, false /* no div */);
25: try {
26: $parser = new HTML5($new_html);
27: $doc = $parser->save();
28: } catch (DOMException $e) {
29: // Uh oh, it failed. Punt to DirectLex.
30: $lexer = new HTMLPurifier_Lexer_DirectLex();
31: $context->register('PH5PError', $e); // save the error, so we can detect it
32: return $lexer->tokenizeHTML($html, $config, $context); // use original HTML
33: }
34: $tokens = array();
35: $this->tokenizeDOM(
36: $doc->getElementsByTagName('html')->item(0)-> // <html>
37: getElementsByTagName('body')->item(0) // <body>
38: ,
39: $tokens, $config
40: );
41: return $tokens;
42: }
43: }
44:
45: /*
46:
47: Copyright 2007 Jeroen van der Meer <http://jero.net/>
48:
49: Permission is hereby granted, free of charge, to any person obtaining a
50: copy of this software and associated documentation files (the
51: "Software"), to deal in the Software without restriction, including
52: without limitation the rights to use, copy, modify, merge, publish,
53: distribute, sublicense, and/or sell copies of the Software, and to
54: permit persons to whom the Software is furnished to do so, subject to
55: the following conditions:
56:
57: The above copyright notice and this permission notice shall be included
58: in all copies or substantial portions of the Software.
59:
60: THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
61: OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
62: MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
63: IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
64: CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
65: TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
66: SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
67:
68: */
69:
70: class HTML5
71: {
72: private $data;
73: private $char;
74: private $EOF;
75: private $state;
76: private $tree;
77: private $token;
78: private $content_model;
79: private $escape = false;
80: private $entities = array(
81: 'AElig;',
82: 'AElig',
83: 'AMP;',
84: 'AMP',
85: 'Aacute;',
86: 'Aacute',
87: 'Acirc;',
88: 'Acirc',
89: 'Agrave;',
90: 'Agrave',
91: 'Alpha;',
92: 'Aring;',
93: 'Aring',
94: 'Atilde;',
95: 'Atilde',
96: 'Auml;',
97: 'Auml',
98: 'Beta;',
99: 'COPY;',
100: 'COPY',
101: 'Ccedil;',
102: 'Ccedil',
103: 'Chi;',
104: 'Dagger;',
105: 'Delta;',
106: 'ETH;',
107: 'ETH',
108: 'Eacute;',
109: 'Eacute',
110: 'Ecirc;',
111: 'Ecirc',
112: 'Egrave;',
113: 'Egrave',
114: 'Epsilon;',
115: 'Eta;',
116: 'Euml;',
117: 'Euml',
118: 'GT;',
119: 'GT',
120: 'Gamma;',
121: 'Iacute;',
122: 'Iacute',
123: 'Icirc;',
124: 'Icirc',
125: 'Igrave;',
126: 'Igrave',
127: 'Iota;',
128: 'Iuml;',
129: 'Iuml',
130: 'Kappa;',
131: 'LT;',
132: 'LT',
133: 'Lambda;',
134: 'Mu;',
135: 'Ntilde;',
136: 'Ntilde',
137: 'Nu;',
138: 'OElig;',
139: 'Oacute;',
140: 'Oacute',
141: 'Ocirc;',
142: 'Ocirc',
143: 'Ograve;',
144: 'Ograve',
145: 'Omega;',
146: 'Omicron;',
147: 'Oslash;',
148: 'Oslash',
149: 'Otilde;',
150: 'Otilde',
151: 'Ouml;',
152: 'Ouml',
153: 'Phi;',
154: 'Pi;',
155: 'Prime;',
156: 'Psi;',
157: 'QUOT;',
158: 'QUOT',
159: 'REG;',
160: 'REG',
161: 'Rho;',
162: 'Scaron;',
163: 'Sigma;',
164: 'THORN;',
165: 'THORN',
166: 'TRADE;',
167: 'Tau;',
168: 'Theta;',
169: 'Uacute;',
170: 'Uacute',
171: 'Ucirc;',
172: 'Ucirc',
173: 'Ugrave;',
174: 'Ugrave',
175: 'Upsilon;',
176: 'Uuml;',
177: 'Uuml',
178: 'Xi;',
179: 'Yacute;',
180: 'Yacute',
181: 'Yuml;',
182: 'Zeta;',
183: 'aacute;',
184: 'aacute',
185: 'acirc;',
186: 'acirc',
187: 'acute;',
188: 'acute',
189: 'aelig;',
190: 'aelig',
191: 'agrave;',
192: 'agrave',
193: 'alefsym;',
194: 'alpha;',
195: 'amp;',
196: 'amp',
197: 'and;',
198: 'ang;',
199: 'apos;',
200: 'aring;',
201: 'aring',
202: 'asymp;',
203: 'atilde;',
204: 'atilde',
205: 'auml;',
206: 'auml',
207: 'bdquo;',
208: 'beta;',
209: 'brvbar;',
210: 'brvbar',
211: 'bull;',
212: 'cap;',
213: 'ccedil;',
214: 'ccedil',
215: 'cedil;',
216: 'cedil',
217: 'cent;',
218: 'cent',
219: 'chi;',
220: 'circ;',
221: 'clubs;',
222: 'cong;',
223: 'copy;',
224: 'copy',
225: 'crarr;',
226: 'cup;',
227: 'curren;',
228: 'curren',
229: 'dArr;',
230: 'dagger;',
231: 'darr;',
232: 'deg;',
233: 'deg',
234: 'delta;',
235: 'diams;',
236: 'divide;',
237: 'divide',
238: 'eacute;',
239: 'eacute',
240: 'ecirc;',
241: 'ecirc',
242: 'egrave;',
243: 'egrave',
244: 'empty;',
245: 'emsp;',
246: 'ensp;',
247: 'epsilon;',
248: 'equiv;',
249: 'eta;',
250: 'eth;',
251: 'eth',
252: 'euml;',
253: 'euml',
254: 'euro;',
255: 'exist;',
256: 'fnof;',
257: 'forall;',
258: 'frac12;',
259: 'frac12',
260: 'frac14;',
261: 'frac14',
262: 'frac34;',
263: 'frac34',
264: 'frasl;',
265: 'gamma;',
266: 'ge;',
267: 'gt;',
268: 'gt',
269: 'hArr;',
270: 'harr;',
271: 'hearts;',
272: 'hellip;',
273: 'iacute;',
274: 'iacute',
275: 'icirc;',
276: 'icirc',
277: 'iexcl;',
278: 'iexcl',
279: 'igrave;',
280: 'igrave',
281: 'image;',
282: 'infin;',
283: 'int;',
284: 'iota;',
285: 'iquest;',
286: 'iquest',
287: 'isin;',
288: 'iuml;',
289: 'iuml',
290: 'kappa;',
291: 'lArr;',
292: 'lambda;',
293: 'lang;',
294: 'laquo;',
295: 'laquo',
296: 'larr;',
297: 'lceil;',
298: 'ldquo;',
299: 'le;',
300: 'lfloor;',
301: 'lowast;',
302: 'loz;',
303: 'lrm;',
304: 'lsaquo;',
305: 'lsquo;',
306: 'lt;',
307: 'lt',
308: 'macr;',
309: 'macr',
310: 'mdash;',
311: 'micro;',
312: 'micro',
313: 'middot;',
314: 'middot',
315: 'minus;',
316: 'mu;',
317: 'nabla;',
318: 'nbsp;',
319: 'nbsp',
320: 'ndash;',
321: 'ne;',
322: 'ni;',
323: 'not;',
324: 'not',
325: 'notin;',
326: 'nsub;',
327: 'ntilde;',
328: 'ntilde',
329: 'nu;',
330: 'oacute;',
331: 'oacute',
332: 'ocirc;',
333: 'ocirc',
334: 'oelig;',
335: 'ograve;',
336: 'ograve',
337: 'oline;',
338: 'omega;',
339: 'omicron;',
340: 'oplus;',
341: 'or;',
342: 'ordf;',
343: 'ordf',
344: 'ordm;',
345: 'ordm',
346: 'oslash;',
347: 'oslash',
348: 'otilde;',
349: 'otilde',
350: 'otimes;',
351: 'ouml;',
352: 'ouml',
353: 'para;',
354: 'para',
355: 'part;',
356: 'permil;',
357: 'perp;',
358: 'phi;',
359: 'pi;',
360: 'piv;',
361: 'plusmn;',
362: 'plusmn',
363: 'pound;',
364: 'pound',
365: 'prime;',
366: 'prod;',
367: 'prop;',
368: 'psi;',
369: 'quot;',
370: 'quot',
371: 'rArr;',
372: 'radic;',
373: 'rang;',
374: 'raquo;',
375: 'raquo',
376: 'rarr;',
377: 'rceil;',
378: 'rdquo;',
379: 'real;',
380: 'reg;',
381: 'reg',
382: 'rfloor;',
383: 'rho;',
384: 'rlm;',
385: 'rsaquo;',
386: 'rsquo;',
387: 'sbquo;',
388: 'scaron;',
389: 'sdot;',
390: 'sect;',
391: 'sect',
392: 'shy;',
393: 'shy',
394: 'sigma;',
395: 'sigmaf;',
396: 'sim;',
397: 'spades;',
398: 'sub;',
399: 'sube;',
400: 'sum;',
401: 'sup1;',
402: 'sup1',
403: 'sup2;',
404: 'sup2',
405: 'sup3;',
406: 'sup3',
407: 'sup;',
408: 'supe;',
409: 'szlig;',
410: 'szlig',
411: 'tau;',
412: 'there4;',
413: 'theta;',
414: 'thetasym;',
415: 'thinsp;',
416: 'thorn;',
417: 'thorn',
418: 'tilde;',
419: 'times;',
420: 'times',
421: 'trade;',
422: 'uArr;',
423: 'uacute;',
424: 'uacute',
425: 'uarr;',
426: 'ucirc;',
427: 'ucirc',
428: 'ugrave;',
429: 'ugrave',
430: 'uml;',
431: 'uml',
432: 'upsih;',
433: 'upsilon;',
434: 'uuml;',
435: 'uuml',
436: 'weierp;',
437: 'xi;',
438: 'yacute;',
439: 'yacute',
440: 'yen;',
441: 'yen',
442: 'yuml;',
443: 'yuml',
444: 'zeta;',
445: 'zwj;',
446: 'zwnj;'
447: );
448:
449: const PCDATA = 0;
450: const RCDATA = 1;
451: const CDATA = 2;
452: const PLAINTEXT = 3;
453:
454: const DOCTYPE = 0;
455: const STARTTAG = 1;
456: const ENDTAG = 2;
457: const COMMENT = 3;
458: const CHARACTR = 4;
459: const EOF = 5;
460:
461: public function __construct($data)
462: {
463: $this->data = $data;
464: $this->char = -1;
465: $this->EOF = strlen($data);
466: $this->tree = new HTML5TreeConstructer;
467: $this->content_model = self::PCDATA;
468:
469: $this->state = 'data';
470:
471: while ($this->state !== null) {
472: $this->{$this->state . 'State'}();
473: }
474: }
475:
476: public function save()
477: {
478: return $this->tree->save();
479: }
480:
481: private function char()
482: {
483: return ($this->char < $this->EOF)
484: ? $this->data[$this->char]
485: : false;
486: }
487:
488: private function character($s, $l = 0)
489: {
490: if ($s + $l < $this->EOF) {
491: if ($l === 0) {
492: return $this->data[$s];
493: } else {
494: return substr($this->data, $s, $l);
495: }
496: }
497: }
498:
499: private function characters($char_class, $start)
500: {
501: return preg_replace('#^([' . $char_class . ']+).*#s', '\\1', substr($this->data, $start));
502: }
503:
504: private function dataState()
505: {
506: // Consume the next input character
507: $this->char++;
508: $char = $this->char();
509:
510: if ($char === '&' && ($this->content_model === self::PCDATA || $this->content_model === self::RCDATA)) {
511: /* U+0026 AMPERSAND (&)
512: When the content model flag is set to one of the PCDATA or RCDATA
513: states: switch to the entity data state. Otherwise: treat it as per
514: the "anything else" entry below. */
515: $this->state = 'entityData';
516:
517: } elseif ($char === '-') {
518: /* If the content model flag is set to either the RCDATA state or
519: the CDATA state, and the escape flag is false, and there are at
520: least three characters before this one in the input stream, and the
521: last four characters in the input stream, including this one, are
522: U+003C LESS-THAN SIGN, U+0021 EXCLAMATION MARK, U+002D HYPHEN-MINUS,
523: and U+002D HYPHEN-MINUS ("<!--"), then set the escape flag to true. */
524: if (($this->content_model === self::RCDATA || $this->content_model ===
525: self::CDATA) && $this->escape === false &&
526: $this->char >= 3 && $this->character($this->char - 4, 4) === '<!--'
527: ) {
528: $this->escape = true;
529: }
530:
531: /* In any case, emit the input character as a character token. Stay
532: in the data state. */
533: $this->emitToken(
534: array(
535: 'type' => self::CHARACTR,
536: 'data' => $char
537: )
538: );
539:
540: /* U+003C LESS-THAN SIGN (<) */
541: } elseif ($char === '<' && ($this->content_model === self::PCDATA ||
542: (($this->content_model === self::RCDATA ||
543: $this->content_model === self::CDATA) && $this->escape === false))
544: ) {
545: /* When the content model flag is set to the PCDATA state: switch
546: to the tag open state.
547:
548: When the content model flag is set to either the RCDATA state or
549: the CDATA state and the escape flag is false: switch to the tag
550: open state.
551:
552: Otherwise: treat it as per the "anything else" entry below. */
553: $this->state = 'tagOpen';
554:
555: /* U+003E GREATER-THAN SIGN (>) */
556: } elseif ($char === '>') {
557: /* If the content model flag is set to either the RCDATA state or
558: the CDATA state, and the escape flag is true, and the last three
559: characters in the input stream including this one are U+002D
560: HYPHEN-MINUS, U+002D HYPHEN-MINUS, U+003E GREATER-THAN SIGN ("-->"),
561: set the escape flag to false. */
562: if (($this->content_model === self::RCDATA ||
563: $this->content_model === self::CDATA) && $this->escape === true &&
564: $this->character($this->char, 3) === '-->'
565: ) {
566: $this->escape = false;
567: }
568:
569: /* In any case, emit the input character as a character token.
570: Stay in the data state. */
571: $this->emitToken(
572: array(
573: 'type' => self::CHARACTR,
574: 'data' => $char
575: )
576: );
577:
578: } elseif ($this->char === $this->EOF) {
579: /* EOF
580: Emit an end-of-file token. */
581: $this->EOF();
582:
583: } elseif ($this->content_model === self::PLAINTEXT) {
584: /* When the content model flag is set to the PLAINTEXT state
585: THIS DIFFERS GREATLY FROM THE SPEC: Get the remaining characters of
586: the text and emit it as a character token. */
587: $this->emitToken(
588: array(
589: 'type' => self::CHARACTR,
590: 'data' => substr($this->data, $this->char)
591: )
592: );
593:
594: $this->EOF();
595:
596: } else {
597: /* Anything else
598: THIS DIFFERS GREATLY FROM THE SPEC: Get as many character that
599: otherwise would also be treated as a character token and emit it
600: as a single character token. Stay in the data state. */
601: $len = strcspn($this->data, '<&', $this->char);
602: $char = substr($this->data, $this->char, $len);
603: $this->char += $len - 1;
604:
605: $this->emitToken(
606: array(
607: 'type' => self::CHARACTR,
608: 'data' => $char
609: )
610: );
611:
612: $this->state = 'data';
613: }
614: }
615:
616: private function entityDataState()
617: {
618: // Attempt to consume an entity.
619: $entity = $this->entity();
620:
621: // If nothing is returned, emit a U+0026 AMPERSAND character token.
622: // Otherwise, emit the character token that was returned.
623: $char = (!$entity) ? '&' : $entity;
624: $this->emitToken(
625: array(
626: 'type' => self::CHARACTR,
627: 'data' => $char
628: )
629: );
630:
631: // Finally, switch to the data state.
632: $this->state = 'data';
633: }
634:
635: private function tagOpenState()
636: {
637: switch ($this->content_model) {
638: case self::RCDATA:
639: case self::CDATA:
640: /* If the next input character is a U+002F SOLIDUS (/) character,
641: consume it and switch to the close tag open state. If the next
642: input character is not a U+002F SOLIDUS (/) character, emit a
643: U+003C LESS-THAN SIGN character token and switch to the data
644: state to process the next input character. */
645: if ($this->character($this->char + 1) === '/') {
646: $this->char++;
647: $this->state = 'closeTagOpen';
648:
649: } else {
650: $this->emitToken(
651: array(
652: 'type' => self::CHARACTR,
653: 'data' => '<'
654: )
655: );
656:
657: $this->state = 'data';
658: }
659: break;
660:
661: case self::PCDATA:
662: // If the content model flag is set to the PCDATA state
663: // Consume the next input character:
664: $this->char++;
665: $char = $this->char();
666:
667: if ($char === '!') {
668: /* U+0021 EXCLAMATION MARK (!)
669: Switch to the markup declaration open state. */
670: $this->state = 'markupDeclarationOpen';
671:
672: } elseif ($char === '/') {
673: /* U+002F SOLIDUS (/)
674: Switch to the close tag open state. */
675: $this->state = 'closeTagOpen';
676:
677: } elseif (preg_match('/^[A-Za-z]$/', $char)) {
678: /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
679: Create a new start tag token, set its tag name to the lowercase
680: version of the input character (add 0x0020 to the character's code
681: point), then switch to the tag name state. (Don't emit the token
682: yet; further details will be filled in before it is emitted.) */
683: $this->token = array(
684: 'name' => strtolower($char),
685: 'type' => self::STARTTAG,
686: 'attr' => array()
687: );
688:
689: $this->state = 'tagName';
690:
691: } elseif ($char === '>') {
692: /* U+003E GREATER-THAN SIGN (>)
693: Parse error. Emit a U+003C LESS-THAN SIGN character token and a
694: U+003E GREATER-THAN SIGN character token. Switch to the data state. */
695: $this->emitToken(
696: array(
697: 'type' => self::CHARACTR,
698: 'data' => '<>'
699: )
700: );
701:
702: $this->state = 'data';
703:
704: } elseif ($char === '?') {
705: /* U+003F QUESTION MARK (?)
706: Parse error. Switch to the bogus comment state. */
707: $this->state = 'bogusComment';
708:
709: } else {
710: /* Anything else
711: Parse error. Emit a U+003C LESS-THAN SIGN character token and
712: reconsume the current input character in the data state. */
713: $this->emitToken(
714: array(
715: 'type' => self::CHARACTR,
716: 'data' => '<'
717: )
718: );
719:
720: $this->char--;
721: $this->state = 'data';
722: }
723: break;
724: }
725: }
726:
727: private function closeTagOpenState()
728: {
729: $next_node = strtolower($this->characters('A-Za-z', $this->char + 1));
730: $the_same = count($this->tree->stack) > 0 && $next_node === end($this->tree->stack)->nodeName;
731:
732: if (($this->content_model === self::RCDATA || $this->content_model === self::CDATA) &&
733: (!$the_same || ($the_same && (!preg_match(
734: '/[\t\n\x0b\x0c >\/]/',
735: $this->character($this->char + 1 + strlen($next_node))
736: ) || $this->EOF === $this->char)))
737: ) {
738: /* If the content model flag is set to the RCDATA or CDATA states then
739: examine the next few characters. If they do not match the tag name of
740: the last start tag token emitted (case insensitively), or if they do but
741: they are not immediately followed by one of the following characters:
742: * U+0009 CHARACTER TABULATION
743: * U+000A LINE FEED (LF)
744: * U+000B LINE TABULATION
745: * U+000C FORM FEED (FF)
746: * U+0020 SPACE
747: * U+003E GREATER-THAN SIGN (>)
748: * U+002F SOLIDUS (/)
749: * EOF
750: ...then there is a parse error. Emit a U+003C LESS-THAN SIGN character
751: token, a U+002F SOLIDUS character token, and switch to the data state
752: to process the next input character. */
753: $this->emitToken(
754: array(
755: 'type' => self::CHARACTR,
756: 'data' => '</'
757: )
758: );
759:
760: $this->state = 'data';
761:
762: } else {
763: /* Otherwise, if the content model flag is set to the PCDATA state,
764: or if the next few characters do match that tag name, consume the
765: next input character: */
766: $this->char++;
767: $char = $this->char();
768:
769: if (preg_match('/^[A-Za-z]$/', $char)) {
770: /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
771: Create a new end tag token, set its tag name to the lowercase version
772: of the input character (add 0x0020 to the character's code point), then
773: switch to the tag name state. (Don't emit the token yet; further details
774: will be filled in before it is emitted.) */
775: $this->token = array(
776: 'name' => strtolower($char),
777: 'type' => self::ENDTAG
778: );
779:
780: $this->state = 'tagName';
781:
782: } elseif ($char === '>') {
783: /* U+003E GREATER-THAN SIGN (>)
784: Parse error. Switch to the data state. */
785: $this->state = 'data';
786:
787: } elseif ($this->char === $this->EOF) {
788: /* EOF
789: Parse error. Emit a U+003C LESS-THAN SIGN character token and a U+002F
790: SOLIDUS character token. Reconsume the EOF character in the data state. */
791: $this->emitToken(
792: array(
793: 'type' => self::CHARACTR,
794: 'data' => '</'
795: )
796: );
797:
798: $this->char--;
799: $this->state = 'data';
800:
801: } else {
802: /* Parse error. Switch to the bogus comment state. */
803: $this->state = 'bogusComment';
804: }
805: }
806: }
807:
808: private function tagNameState()
809: {
810: // Consume the next input character:
811: $this->char++;
812: $char = $this->character($this->char);
813:
814: if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
815: /* U+0009 CHARACTER TABULATION
816: U+000A LINE FEED (LF)
817: U+000B LINE TABULATION
818: U+000C FORM FEED (FF)
819: U+0020 SPACE
820: Switch to the before attribute name state. */
821: $this->state = 'beforeAttributeName';
822:
823: } elseif ($char === '>') {
824: /* U+003E GREATER-THAN SIGN (>)
825: Emit the current tag token. Switch to the data state. */
826: $this->emitToken($this->token);
827: $this->state = 'data';
828:
829: } elseif ($this->char === $this->EOF) {
830: /* EOF
831: Parse error. Emit the current tag token. Reconsume the EOF
832: character in the data state. */
833: $this->emitToken($this->token);
834:
835: $this->char--;
836: $this->state = 'data';
837:
838: } elseif ($char === '/') {
839: /* U+002F SOLIDUS (/)
840: Parse error unless this is a permitted slash. Switch to the before
841: attribute name state. */
842: $this->state = 'beforeAttributeName';
843:
844: } else {
845: /* Anything else
846: Append the current input character to the current tag token's tag name.
847: Stay in the tag name state. */
848: $this->token['name'] .= strtolower($char);
849: $this->state = 'tagName';
850: }
851: }
852:
853: private function beforeAttributeNameState()
854: {
855: // Consume the next input character:
856: $this->char++;
857: $char = $this->character($this->char);
858:
859: if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
860: /* U+0009 CHARACTER TABULATION
861: U+000A LINE FEED (LF)
862: U+000B LINE TABULATION
863: U+000C FORM FEED (FF)
864: U+0020 SPACE
865: Stay in the before attribute name state. */
866: $this->state = 'beforeAttributeName';
867:
868: } elseif ($char === '>') {
869: /* U+003E GREATER-THAN SIGN (>)
870: Emit the current tag token. Switch to the data state. */
871: $this->emitToken($this->token);
872: $this->state = 'data';
873:
874: } elseif ($char === '/') {
875: /* U+002F SOLIDUS (/)
876: Parse error unless this is a permitted slash. Stay in the before
877: attribute name state. */
878: $this->state = 'beforeAttributeName';
879:
880: } elseif ($this->char === $this->EOF) {
881: /* EOF
882: Parse error. Emit the current tag token. Reconsume the EOF
883: character in the data state. */
884: $this->emitToken($this->token);
885:
886: $this->char--;
887: $this->state = 'data';
888:
889: } else {
890: /* Anything else
891: Start a new attribute in the current tag token. Set that attribute's
892: name to the current input character, and its value to the empty string.
893: Switch to the attribute name state. */
894: $this->token['attr'][] = array(
895: 'name' => strtolower($char),
896: 'value' => null
897: );
898:
899: $this->state = 'attributeName';
900: }
901: }
902:
903: private function attributeNameState()
904: {
905: // Consume the next input character:
906: $this->char++;
907: $char = $this->character($this->char);
908:
909: if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
910: /* U+0009 CHARACTER TABULATION
911: U+000A LINE FEED (LF)
912: U+000B LINE TABULATION
913: U+000C FORM FEED (FF)
914: U+0020 SPACE
915: Stay in the before attribute name state. */
916: $this->state = 'afterAttributeName';
917:
918: } elseif ($char === '=') {
919: /* U+003D EQUALS SIGN (=)
920: Switch to the before attribute value state. */
921: $this->state = 'beforeAttributeValue';
922:
923: } elseif ($char === '>') {
924: /* U+003E GREATER-THAN SIGN (>)
925: Emit the current tag token. Switch to the data state. */
926: $this->emitToken($this->token);
927: $this->state = 'data';
928:
929: } elseif ($char === '/' && $this->character($this->char + 1) !== '>') {
930: /* U+002F SOLIDUS (/)
931: Parse error unless this is a permitted slash. Switch to the before
932: attribute name state. */
933: $this->state = 'beforeAttributeName';
934:
935: } elseif ($this->char === $this->EOF) {
936: /* EOF
937: Parse error. Emit the current tag token. Reconsume the EOF
938: character in the data state. */
939: $this->emitToken($this->token);
940:
941: $this->char--;
942: $this->state = 'data';
943:
944: } else {
945: /* Anything else
946: Append the current input character to the current attribute's name.
947: Stay in the attribute name state. */
948: $last = count($this->token['attr']) - 1;
949: $this->token['attr'][$last]['name'] .= strtolower($char);
950:
951: $this->state = 'attributeName';
952: }
953: }
954:
955: private function afterAttributeNameState()
956: {
957: // Consume the next input character:
958: $this->char++;
959: $char = $this->character($this->char);
960:
961: if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
962: /* U+0009 CHARACTER TABULATION
963: U+000A LINE FEED (LF)
964: U+000B LINE TABULATION
965: U+000C FORM FEED (FF)
966: U+0020 SPACE
967: Stay in the after attribute name state. */
968: $this->state = 'afterAttributeName';
969:
970: } elseif ($char === '=') {
971: /* U+003D EQUALS SIGN (=)
972: Switch to the before attribute value state. */
973: $this->state = 'beforeAttributeValue';
974:
975: } elseif ($char === '>') {
976: /* U+003E GREATER-THAN SIGN (>)
977: Emit the current tag token. Switch to the data state. */
978: $this->emitToken($this->token);
979: $this->state = 'data';
980:
981: } elseif ($char === '/' && $this->character($this->char + 1) !== '>') {
982: /* U+002F SOLIDUS (/)
983: Parse error unless this is a permitted slash. Switch to the
984: before attribute name state. */
985: $this->state = 'beforeAttributeName';
986:
987: } elseif ($this->char === $this->EOF) {
988: /* EOF
989: Parse error. Emit the current tag token. Reconsume the EOF
990: character in the data state. */
991: $this->emitToken($this->token);
992:
993: $this->char--;
994: $this->state = 'data';
995:
996: } else {
997: /* Anything else
998: Start a new attribute in the current tag token. Set that attribute's
999: name to the current input character, and its value to the empty string.
1000: Switch to the attribute name state. */
1001: $this->token['attr'][] = array(
1002: 'name' => strtolower($char),
1003: 'value' => null
1004: );
1005:
1006: $this->state = 'attributeName';
1007: }
1008: }
1009:
1010: private function beforeAttributeValueState()
1011: {
1012: // Consume the next input character:
1013: $this->char++;
1014: $char = $this->character($this->char);
1015:
1016: if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1017: /* U+0009 CHARACTER TABULATION
1018: U+000A LINE FEED (LF)
1019: U+000B LINE TABULATION
1020: U+000C FORM FEED (FF)
1021: U+0020 SPACE
1022: Stay in the before attribute value state. */
1023: $this->state = 'beforeAttributeValue';
1024:
1025: } elseif ($char === '"') {
1026: /* U+0022 QUOTATION MARK (")
1027: Switch to the attribute value (double-quoted) state. */
1028: $this->state = 'attributeValueDoubleQuoted';
1029:
1030: } elseif ($char === '&') {
1031: /* U+0026 AMPERSAND (&)
1032: Switch to the attribute value (unquoted) state and reconsume
1033: this input character. */
1034: $this->char--;
1035: $this->state = 'attributeValueUnquoted';
1036:
1037: } elseif ($char === '\'') {
1038: /* U+0027 APOSTROPHE (')
1039: Switch to the attribute value (single-quoted) state. */
1040: $this->state = 'attributeValueSingleQuoted';
1041:
1042: } elseif ($char === '>') {
1043: /* U+003E GREATER-THAN SIGN (>)
1044: Emit the current tag token. Switch to the data state. */
1045: $this->emitToken($this->token);
1046: $this->state = 'data';
1047:
1048: } else {
1049: /* Anything else
1050: Append the current input character to the current attribute's value.
1051: Switch to the attribute value (unquoted) state. */
1052: $last = count($this->token['attr']) - 1;
1053: $this->token['attr'][$last]['value'] .= $char;
1054:
1055: $this->state = 'attributeValueUnquoted';
1056: }
1057: }
1058:
1059: private function attributeValueDoubleQuotedState()
1060: {
1061: // Consume the next input character:
1062: $this->char++;
1063: $char = $this->character($this->char);
1064:
1065: if ($char === '"') {
1066: /* U+0022 QUOTATION MARK (")
1067: Switch to the before attribute name state. */
1068: $this->state = 'beforeAttributeName';
1069:
1070: } elseif ($char === '&') {
1071: /* U+0026 AMPERSAND (&)
1072: Switch to the entity in attribute value state. */
1073: $this->entityInAttributeValueState('double');
1074:
1075: } elseif ($this->char === $this->EOF) {
1076: /* EOF
1077: Parse error. Emit the current tag token. Reconsume the character
1078: in the data state. */
1079: $this->emitToken($this->token);
1080:
1081: $this->char--;
1082: $this->state = 'data';
1083:
1084: } else {
1085: /* Anything else
1086: Append the current input character to the current attribute's value.
1087: Stay in the attribute value (double-quoted) state. */
1088: $last = count($this->token['attr']) - 1;
1089: $this->token['attr'][$last]['value'] .= $char;
1090:
1091: $this->state = 'attributeValueDoubleQuoted';
1092: }
1093: }
1094:
1095: private function attributeValueSingleQuotedState()
1096: {
1097: // Consume the next input character:
1098: $this->char++;
1099: $char = $this->character($this->char);
1100:
1101: if ($char === '\'') {
1102: /* U+0022 QUOTATION MARK (')
1103: Switch to the before attribute name state. */
1104: $this->state = 'beforeAttributeName';
1105:
1106: } elseif ($char === '&') {
1107: /* U+0026 AMPERSAND (&)
1108: Switch to the entity in attribute value state. */
1109: $this->entityInAttributeValueState('single');
1110:
1111: } elseif ($this->char === $this->EOF) {
1112: /* EOF
1113: Parse error. Emit the current tag token. Reconsume the character
1114: in the data state. */
1115: $this->emitToken($this->token);
1116:
1117: $this->char--;
1118: $this->state = 'data';
1119:
1120: } else {
1121: /* Anything else
1122: Append the current input character to the current attribute's value.
1123: Stay in the attribute value (single-quoted) state. */
1124: $last = count($this->token['attr']) - 1;
1125: $this->token['attr'][$last]['value'] .= $char;
1126:
1127: $this->state = 'attributeValueSingleQuoted';
1128: }
1129: }
1130:
1131: private function attributeValueUnquotedState()
1132: {
1133: // Consume the next input character:
1134: $this->char++;
1135: $char = $this->character($this->char);
1136:
1137: if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1138: /* U+0009 CHARACTER TABULATION
1139: U+000A LINE FEED (LF)
1140: U+000B LINE TABULATION
1141: U+000C FORM FEED (FF)
1142: U+0020 SPACE
1143: Switch to the before attribute name state. */
1144: $this->state = 'beforeAttributeName';
1145:
1146: } elseif ($char === '&') {
1147: /* U+0026 AMPERSAND (&)
1148: Switch to the entity in attribute value state. */
1149: $this->entityInAttributeValueState();
1150:
1151: } elseif ($char === '>') {
1152: /* U+003E GREATER-THAN SIGN (>)
1153: Emit the current tag token. Switch to the data state. */
1154: $this->emitToken($this->token);
1155: $this->state = 'data';
1156:
1157: } else {
1158: /* Anything else
1159: Append the current input character to the current attribute's value.
1160: Stay in the attribute value (unquoted) state. */
1161: $last = count($this->token['attr']) - 1;
1162: $this->token['attr'][$last]['value'] .= $char;
1163:
1164: $this->state = 'attributeValueUnquoted';
1165: }
1166: }
1167:
1168: private function entityInAttributeValueState()
1169: {
1170: // Attempt to consume an entity.
1171: $entity = $this->entity();
1172:
1173: // If nothing is returned, append a U+0026 AMPERSAND character to the
1174: // current attribute's value. Otherwise, emit the character token that
1175: // was returned.
1176: $char = (!$entity)
1177: ? '&'
1178: : $entity;
1179:
1180: $last = count($this->token['attr']) - 1;
1181: $this->token['attr'][$last]['value'] .= $char;
1182: }
1183:
1184: private function bogusCommentState()
1185: {
1186: /* Consume every character up to the first U+003E GREATER-THAN SIGN
1187: character (>) or the end of the file (EOF), whichever comes first. Emit
1188: a comment token whose data is the concatenation of all the characters
1189: starting from and including the character that caused the state machine
1190: to switch into the bogus comment state, up to and including the last
1191: consumed character before the U+003E character, if any, or up to the
1192: end of the file otherwise. (If the comment was started by the end of
1193: the file (EOF), the token is empty.) */
1194: $data = $this->characters('^>', $this->char);
1195: $this->emitToken(
1196: array(
1197: 'data' => $data,
1198: 'type' => self::COMMENT
1199: )
1200: );
1201:
1202: $this->char += strlen($data);
1203:
1204: /* Switch to the data state. */
1205: $this->state = 'data';
1206:
1207: /* If the end of the file was reached, reconsume the EOF character. */
1208: if ($this->char === $this->EOF) {
1209: $this->char = $this->EOF - 1;
1210: }
1211: }
1212:
1213: private function markupDeclarationOpenState()
1214: {
1215: /* If the next two characters are both U+002D HYPHEN-MINUS (-)
1216: characters, consume those two characters, create a comment token whose
1217: data is the empty string, and switch to the comment state. */
1218: if ($this->character($this->char + 1, 2) === '--') {
1219: $this->char += 2;
1220: $this->state = 'comment';
1221: $this->token = array(
1222: 'data' => null,
1223: 'type' => self::COMMENT
1224: );
1225:
1226: /* Otherwise if the next seven chacacters are a case-insensitive match
1227: for the word "DOCTYPE", then consume those characters and switch to the
1228: DOCTYPE state. */
1229: } elseif (strtolower($this->character($this->char + 1, 7)) === 'doctype') {
1230: $this->char += 7;
1231: $this->state = 'doctype';
1232:
1233: /* Otherwise, is is a parse error. Switch to the bogus comment state.
1234: The next character that is consumed, if any, is the first character
1235: that will be in the comment. */
1236: } else {
1237: $this->char++;
1238: $this->state = 'bogusComment';
1239: }
1240: }
1241:
1242: private function commentState()
1243: {
1244: /* Consume the next input character: */
1245: $this->char++;
1246: $char = $this->char();
1247:
1248: /* U+002D HYPHEN-MINUS (-) */
1249: if ($char === '-') {
1250: /* Switch to the comment dash state */
1251: $this->state = 'commentDash';
1252:
1253: /* EOF */
1254: } elseif ($this->char === $this->EOF) {
1255: /* Parse error. Emit the comment token. Reconsume the EOF character
1256: in the data state. */
1257: $this->emitToken($this->token);
1258: $this->char--;
1259: $this->state = 'data';
1260:
1261: /* Anything else */
1262: } else {
1263: /* Append the input character to the comment token's data. Stay in
1264: the comment state. */
1265: $this->token['data'] .= $char;
1266: }
1267: }
1268:
1269: private function commentDashState()
1270: {
1271: /* Consume the next input character: */
1272: $this->char++;
1273: $char = $this->char();
1274:
1275: /* U+002D HYPHEN-MINUS (-) */
1276: if ($char === '-') {
1277: /* Switch to the comment end state */
1278: $this->state = 'commentEnd';
1279:
1280: /* EOF */
1281: } elseif ($this->char === $this->EOF) {
1282: /* Parse error. Emit the comment token. Reconsume the EOF character
1283: in the data state. */
1284: $this->emitToken($this->token);
1285: $this->char--;
1286: $this->state = 'data';
1287:
1288: /* Anything else */
1289: } else {
1290: /* Append a U+002D HYPHEN-MINUS (-) character and the input
1291: character to the comment token's data. Switch to the comment state. */
1292: $this->token['data'] .= '-' . $char;
1293: $this->state = 'comment';
1294: }
1295: }
1296:
1297: private function commentEndState()
1298: {
1299: /* Consume the next input character: */
1300: $this->char++;
1301: $char = $this->char();
1302:
1303: if ($char === '>') {
1304: $this->emitToken($this->token);
1305: $this->state = 'data';
1306:
1307: } elseif ($char === '-') {
1308: $this->token['data'] .= '-';
1309:
1310: } elseif ($this->char === $this->EOF) {
1311: $this->emitToken($this->token);
1312: $this->char--;
1313: $this->state = 'data';
1314:
1315: } else {
1316: $this->token['data'] .= '--' . $char;
1317: $this->state = 'comment';
1318: }
1319: }
1320:
1321: private function doctypeState()
1322: {
1323: /* Consume the next input character: */
1324: $this->char++;
1325: $char = $this->char();
1326:
1327: if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1328: $this->state = 'beforeDoctypeName';
1329:
1330: } else {
1331: $this->char--;
1332: $this->state = 'beforeDoctypeName';
1333: }
1334: }
1335:
1336: private function beforeDoctypeNameState()
1337: {
1338: /* Consume the next input character: */
1339: $this->char++;
1340: $char = $this->char();
1341:
1342: if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1343: // Stay in the before DOCTYPE name state.
1344:
1345: } elseif (preg_match('/^[a-z]$/', $char)) {
1346: $this->token = array(
1347: 'name' => strtoupper($char),
1348: 'type' => self::DOCTYPE,
1349: 'error' => true
1350: );
1351:
1352: $this->state = 'doctypeName';
1353:
1354: } elseif ($char === '>') {
1355: $this->emitToken(
1356: array(
1357: 'name' => null,
1358: 'type' => self::DOCTYPE,
1359: 'error' => true
1360: )
1361: );
1362:
1363: $this->state = 'data';
1364:
1365: } elseif ($this->char === $this->EOF) {
1366: $this->emitToken(
1367: array(
1368: 'name' => null,
1369: 'type' => self::DOCTYPE,
1370: 'error' => true
1371: )
1372: );
1373:
1374: $this->char--;
1375: $this->state = 'data';
1376:
1377: } else {
1378: $this->token = array(
1379: 'name' => $char,
1380: 'type' => self::DOCTYPE,
1381: 'error' => true
1382: );
1383:
1384: $this->state = 'doctypeName';
1385: }
1386: }
1387:
1388: private function doctypeNameState()
1389: {
1390: /* Consume the next input character: */
1391: $this->char++;
1392: $char = $this->char();
1393:
1394: if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1395: $this->state = 'AfterDoctypeName';
1396:
1397: } elseif ($char === '>') {
1398: $this->emitToken($this->token);
1399: $this->state = 'data';
1400:
1401: } elseif (preg_match('/^[a-z]$/', $char)) {
1402: $this->token['name'] .= strtoupper($char);
1403:
1404: } elseif ($this->char === $this->EOF) {
1405: $this->emitToken($this->token);
1406: $this->char--;
1407: $this->state = 'data';
1408:
1409: } else {
1410: $this->token['name'] .= $char;
1411: }
1412:
1413: $this->token['error'] = ($this->token['name'] === 'HTML')
1414: ? false
1415: : true;
1416: }
1417:
1418: private function afterDoctypeNameState()
1419: {
1420: /* Consume the next input character: */
1421: $this->char++;
1422: $char = $this->char();
1423:
1424: if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1425: // Stay in the DOCTYPE name state.
1426:
1427: } elseif ($char === '>') {
1428: $this->emitToken($this->token);
1429: $this->state = 'data';
1430:
1431: } elseif ($this->char === $this->EOF) {
1432: $this->emitToken($this->token);
1433: $this->char--;
1434: $this->state = 'data';
1435:
1436: } else {
1437: $this->token['error'] = true;
1438: $this->state = 'bogusDoctype';
1439: }
1440: }
1441:
1442: private function bogusDoctypeState()
1443: {
1444: /* Consume the next input character: */
1445: $this->char++;
1446: $char = $this->char();
1447:
1448: if ($char === '>') {
1449: $this->emitToken($this->token);
1450: $this->state = 'data';
1451:
1452: } elseif ($this->char === $this->EOF) {
1453: $this->emitToken($this->token);
1454: $this->char--;
1455: $this->state = 'data';
1456:
1457: } else {
1458: // Stay in the bogus DOCTYPE state.
1459: }
1460: }
1461:
1462: private function entity()
1463: {
1464: $start = $this->char;
1465:
1466: // This section defines how to consume an entity. This definition is
1467: // used when parsing entities in text and in attributes.
1468:
1469: // The behaviour depends on the identity of the next character (the
1470: // one immediately after the U+0026 AMPERSAND character):
1471:
1472: switch ($this->character($this->char + 1)) {
1473: // U+0023 NUMBER SIGN (#)
1474: case '#':
1475:
1476: // The behaviour further depends on the character after the
1477: // U+0023 NUMBER SIGN:
1478: switch ($this->character($this->char + 1)) {
1479: // U+0078 LATIN SMALL LETTER X
1480: // U+0058 LATIN CAPITAL LETTER X
1481: case 'x':
1482: case 'X':
1483: // Follow the steps below, but using the range of
1484: // characters U+0030 DIGIT ZERO through to U+0039 DIGIT
1485: // NINE, U+0061 LATIN SMALL LETTER A through to U+0066
1486: // LATIN SMALL LETTER F, and U+0041 LATIN CAPITAL LETTER
1487: // A, through to U+0046 LATIN CAPITAL LETTER F (in other
1488: // words, 0-9, A-F, a-f).
1489: $char = 1;
1490: $char_class = '0-9A-Fa-f';
1491: break;
1492:
1493: // Anything else
1494: default:
1495: // Follow the steps below, but using the range of
1496: // characters U+0030 DIGIT ZERO through to U+0039 DIGIT
1497: // NINE (i.e. just 0-9).
1498: $char = 0;
1499: $char_class = '0-9';
1500: break;
1501: }
1502:
1503: // Consume as many characters as match the range of characters
1504: // given above.
1505: $this->char++;
1506: $e_name = $this->characters($char_class, $this->char + $char + 1);
1507: $entity = $this->character($start, $this->char);
1508: $cond = strlen($e_name) > 0;
1509:
1510: // The rest of the parsing happens below.
1511: break;
1512:
1513: // Anything else
1514: default:
1515: // Consume the maximum number of characters possible, with the
1516: // consumed characters case-sensitively matching one of the
1517: // identifiers in the first column of the entities table.
1518:
1519: $e_name = $this->characters('0-9A-Za-z;', $this->char + 1);
1520: $len = strlen($e_name);
1521:
1522: for ($c = 1; $c <= $len; $c++) {
1523: $id = substr($e_name, 0, $c);
1524: $this->char++;
1525:
1526: if (in_array($id, $this->entities)) {
1527: if ($e_name[$c - 1] !== ';') {
1528: if ($c < $len && $e_name[$c] == ';') {
1529: $this->char++; // consume extra semicolon
1530: }
1531: }
1532: $entity = $id;
1533: break;
1534: }
1535: }
1536:
1537: $cond = isset($entity);
1538: // The rest of the parsing happens below.
1539: break;
1540: }
1541:
1542: if (!$cond) {
1543: // If no match can be made, then this is a parse error. No
1544: // characters are consumed, and nothing is returned.
1545: $this->char = $start;
1546: return false;
1547: }
1548:
1549: // Return a character token for the character corresponding to the
1550: // entity name (as given by the second column of the entities table).
1551: return html_entity_decode('&' . rtrim($entity, ';') . ';', ENT_QUOTES, 'UTF-8');
1552: }
1553:
1554: private function emitToken($token)
1555: {
1556: $emit = $this->tree->emitToken($token);
1557:
1558: if (is_int($emit)) {
1559: $this->content_model = $emit;
1560:
1561: } elseif ($token['type'] === self::ENDTAG) {
1562: $this->content_model = self::PCDATA;
1563: }
1564: }
1565:
1566: private function EOF()
1567: {
1568: $this->state = null;
1569: $this->tree->emitToken(
1570: array(
1571: 'type' => self::EOF
1572: )
1573: );
1574: }
1575: }
1576:
1577: class HTML5TreeConstructer
1578: {
1579: public $stack = array();
1580:
1581: private $phase;
1582: private $mode;
1583: private $dom;
1584: private $foster_parent = null;
1585: private $a_formatting = array();
1586:
1587: private $head_pointer = null;
1588: private $form_pointer = null;
1589:
1590: private $scoping = array('button', 'caption', 'html', 'marquee', 'object', 'table', 'td', 'th');
1591: private $formatting = array(
1592: 'a',
1593: 'b',
1594: 'big',
1595: 'em',
1596: 'font',
1597: 'i',
1598: 'nobr',
1599: 's',
1600: 'small',
1601: 'strike',
1602: 'strong',
1603: 'tt',
1604: 'u'
1605: );
1606: private $special = array(
1607: 'address',
1608: 'area',
1609: 'base',
1610: 'basefont',
1611: 'bgsound',
1612: 'blockquote',
1613: 'body',
1614: 'br',
1615: 'center',
1616: 'col',
1617: 'colgroup',
1618: 'dd',
1619: 'dir',
1620: 'div',
1621: 'dl',
1622: 'dt',
1623: 'embed',
1624: 'fieldset',
1625: 'form',
1626: 'frame',
1627: 'frameset',
1628: 'h1',
1629: 'h2',
1630: 'h3',
1631: 'h4',
1632: 'h5',
1633: 'h6',
1634: 'head',
1635: 'hr',
1636: 'iframe',
1637: 'image',
1638: 'img',
1639: 'input',
1640: 'isindex',
1641: 'li',
1642: 'link',
1643: 'listing',
1644: 'menu',
1645: 'meta',
1646: 'noembed',
1647: 'noframes',
1648: 'noscript',
1649: 'ol',
1650: 'optgroup',
1651: 'option',
1652: 'p',
1653: 'param',
1654: 'plaintext',
1655: 'pre',
1656: 'script',
1657: 'select',
1658: 'spacer',
1659: 'style',
1660: 'tbody',
1661: 'textarea',
1662: 'tfoot',
1663: 'thead',
1664: 'title',
1665: 'tr',
1666: 'ul',
1667: 'wbr'
1668: );
1669:
1670: // The different phases.
1671: const INIT_PHASE = 0;
1672: const ROOT_PHASE = 1;
1673: const MAIN_PHASE = 2;
1674: const END_PHASE = 3;
1675:
1676: // The different insertion modes for the main phase.
1677: const BEFOR_HEAD = 0;
1678: const IN_HEAD = 1;
1679: const AFTER_HEAD = 2;
1680: const IN_BODY = 3;
1681: const IN_TABLE = 4;
1682: const IN_CAPTION = 5;
1683: const IN_CGROUP = 6;
1684: const IN_TBODY = 7;
1685: const IN_ROW = 8;
1686: const IN_CELL = 9;
1687: const IN_SELECT = 10;
1688: const AFTER_BODY = 11;
1689: const IN_FRAME = 12;
1690: const AFTR_FRAME = 13;
1691:
1692: // The different types of elements.
1693: const SPECIAL = 0;
1694: const SCOPING = 1;
1695: const FORMATTING = 2;
1696: const PHRASING = 3;
1697:
1698: const MARKER = 0;
1699:
1700: public function __construct()
1701: {
1702: $this->phase = self::INIT_PHASE;
1703: $this->mode = self::BEFOR_HEAD;
1704: $this->dom = new DOMDocument;
1705:
1706: $this->dom->encoding = 'UTF-8';
1707: $this->dom->preserveWhiteSpace = true;
1708: $this->dom->substituteEntities = true;
1709: $this->dom->strictErrorChecking = false;
1710: }
1711:
1712: // Process tag tokens
1713: public function emitToken($token)
1714: {
1715: switch ($this->phase) {
1716: case self::INIT_PHASE:
1717: return $this->initPhase($token);
1718: break;
1719: case self::ROOT_PHASE:
1720: return $this->rootElementPhase($token);
1721: break;
1722: case self::MAIN_PHASE:
1723: return $this->mainPhase($token);
1724: break;
1725: case self::END_PHASE :
1726: return $this->trailingEndPhase($token);
1727: break;
1728: }
1729: }
1730:
1731: private function initPhase($token)
1732: {
1733: /* Initially, the tree construction stage must handle each token
1734: emitted from the tokenisation stage as follows: */
1735:
1736: /* A DOCTYPE token that is marked as being in error
1737: A comment token
1738: A start tag token
1739: An end tag token
1740: A character token that is not one of one of U+0009 CHARACTER TABULATION,
1741: U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1742: or U+0020 SPACE
1743: An end-of-file token */
1744: if ((isset($token['error']) && $token['error']) ||
1745: $token['type'] === HTML5::COMMENT ||
1746: $token['type'] === HTML5::STARTTAG ||
1747: $token['type'] === HTML5::ENDTAG ||
1748: $token['type'] === HTML5::EOF ||
1749: ($token['type'] === HTML5::CHARACTR && isset($token['data']) &&
1750: !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']))
1751: ) {
1752: /* This specification does not define how to handle this case. In
1753: particular, user agents may ignore the entirety of this specification
1754: altogether for such documents, and instead invoke special parse modes
1755: with a greater emphasis on backwards compatibility. */
1756:
1757: $this->phase = self::ROOT_PHASE;
1758: return $this->rootElementPhase($token);
1759:
1760: /* A DOCTYPE token marked as being correct */
1761: } elseif (isset($token['error']) && !$token['error']) {
1762: /* Append a DocumentType node to the Document node, with the name
1763: attribute set to the name given in the DOCTYPE token (which will be
1764: "HTML"), and the other attributes specific to DocumentType objects
1765: set to null, empty lists, or the empty string as appropriate. */
1766: $doctype = new DOMDocumentType(null, null, 'HTML');
1767:
1768: /* Then, switch to the root element phase of the tree construction
1769: stage. */
1770: $this->phase = self::ROOT_PHASE;
1771:
1772: /* A character token that is one of one of U+0009 CHARACTER TABULATION,
1773: U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1774: or U+0020 SPACE */
1775: } elseif (isset($token['data']) && preg_match(
1776: '/^[\t\n\x0b\x0c ]+$/',
1777: $token['data']
1778: )
1779: ) {
1780: /* Append that character to the Document node. */
1781: $text = $this->dom->createTextNode($token['data']);
1782: $this->dom->appendChild($text);
1783: }
1784: }
1785:
1786: private function rootElementPhase($token)
1787: {
1788: /* After the initial phase, as each token is emitted from the tokenisation
1789: stage, it must be processed as described in this section. */
1790:
1791: /* A DOCTYPE token */
1792: if ($token['type'] === HTML5::DOCTYPE) {
1793: // Parse error. Ignore the token.
1794:
1795: /* A comment token */
1796: } elseif ($token['type'] === HTML5::COMMENT) {
1797: /* Append a Comment node to the Document object with the data
1798: attribute set to the data given in the comment token. */
1799: $comment = $this->dom->createComment($token['data']);
1800: $this->dom->appendChild($comment);
1801:
1802: /* A character token that is one of one of U+0009 CHARACTER TABULATION,
1803: U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1804: or U+0020 SPACE */
1805: } elseif ($token['type'] === HTML5::CHARACTR &&
1806: preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
1807: ) {
1808: /* Append that character to the Document node. */
1809: $text = $this->dom->createTextNode($token['data']);
1810: $this->dom->appendChild($text);
1811:
1812: /* A character token that is not one of U+0009 CHARACTER TABULATION,
1813: U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED
1814: (FF), or U+0020 SPACE
1815: A start tag token
1816: An end tag token
1817: An end-of-file token */
1818: } elseif (($token['type'] === HTML5::CHARACTR &&
1819: !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) ||
1820: $token['type'] === HTML5::STARTTAG ||
1821: $token['type'] === HTML5::ENDTAG ||
1822: $token['type'] === HTML5::EOF
1823: ) {
1824: /* Create an HTMLElement node with the tag name html, in the HTML
1825: namespace. Append it to the Document object. Switch to the main
1826: phase and reprocess the current token. */
1827: $html = $this->dom->createElement('html');
1828: $this->dom->appendChild($html);
1829: $this->stack[] = $html;
1830:
1831: $this->phase = self::MAIN_PHASE;
1832: return $this->mainPhase($token);
1833: }
1834: }
1835:
1836: private function mainPhase($token)
1837: {
1838: /* Tokens in the main phase must be handled as follows: */
1839:
1840: /* A DOCTYPE token */
1841: if ($token['type'] === HTML5::DOCTYPE) {
1842: // Parse error. Ignore the token.
1843:
1844: /* A start tag token with the tag name "html" */
1845: } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'html') {
1846: /* If this start tag token was not the first start tag token, then
1847: it is a parse error. */
1848:
1849: /* For each attribute on the token, check to see if the attribute
1850: is already present on the top element of the stack of open elements.
1851: If it is not, add the attribute and its corresponding value to that
1852: element. */
1853: foreach ($token['attr'] as $attr) {
1854: if (!$this->stack[0]->hasAttribute($attr['name'])) {
1855: $this->stack[0]->setAttribute($attr['name'], $attr['value']);
1856: }
1857: }
1858:
1859: /* An end-of-file token */
1860: } elseif ($token['type'] === HTML5::EOF) {
1861: /* Generate implied end tags. */
1862: $this->generateImpliedEndTags();
1863:
1864: /* Anything else. */
1865: } else {
1866: /* Depends on the insertion mode: */
1867: switch ($this->mode) {
1868: case self::BEFOR_HEAD:
1869: return $this->beforeHead($token);
1870: break;
1871: case self::IN_HEAD:
1872: return $this->inHead($token);
1873: break;
1874: case self::AFTER_HEAD:
1875: return $this->afterHead($token);
1876: break;
1877: case self::IN_BODY:
1878: return $this->inBody($token);
1879: break;
1880: case self::IN_TABLE:
1881: return $this->inTable($token);
1882: break;
1883: case self::IN_CAPTION:
1884: return $this->inCaption($token);
1885: break;
1886: case self::IN_CGROUP:
1887: return $this->inColumnGroup($token);
1888: break;
1889: case self::IN_TBODY:
1890: return $this->inTableBody($token);
1891: break;
1892: case self::IN_ROW:
1893: return $this->inRow($token);
1894: break;
1895: case self::IN_CELL:
1896: return $this->inCell($token);
1897: break;
1898: case self::IN_SELECT:
1899: return $this->inSelect($token);
1900: break;
1901: case self::AFTER_BODY:
1902: return $this->afterBody($token);
1903: break;
1904: case self::IN_FRAME:
1905: return $this->inFrameset($token);
1906: break;
1907: case self::AFTR_FRAME:
1908: return $this->afterFrameset($token);
1909: break;
1910: case self::END_PHASE:
1911: return $this->trailingEndPhase($token);
1912: break;
1913: }
1914: }
1915: }
1916:
1917: private function beforeHead($token)
1918: {
1919: /* Handle the token as follows: */
1920:
1921: /* A character token that is one of one of U+0009 CHARACTER TABULATION,
1922: U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1923: or U+0020 SPACE */
1924: if ($token['type'] === HTML5::CHARACTR &&
1925: preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
1926: ) {
1927: /* Append the character to the current node. */
1928: $this->insertText($token['data']);
1929:
1930: /* A comment token */
1931: } elseif ($token['type'] === HTML5::COMMENT) {
1932: /* Append a Comment node to the current node with the data attribute
1933: set to the data given in the comment token. */
1934: $this->insertComment($token['data']);
1935:
1936: /* A start tag token with the tag name "head" */
1937: } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'head') {
1938: /* Create an element for the token, append the new element to the
1939: current node and push it onto the stack of open elements. */
1940: $element = $this->insertElement($token);
1941:
1942: /* Set the head element pointer to this new element node. */
1943: $this->head_pointer = $element;
1944:
1945: /* Change the insertion mode to "in head". */
1946: $this->mode = self::IN_HEAD;
1947:
1948: /* A start tag token whose tag name is one of: "base", "link", "meta",
1949: "script", "style", "title". Or an end tag with the tag name "html".
1950: Or a character token that is not one of U+0009 CHARACTER TABULATION,
1951: U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1952: or U+0020 SPACE. Or any other start tag token */
1953: } elseif ($token['type'] === HTML5::STARTTAG ||
1954: ($token['type'] === HTML5::ENDTAG && $token['name'] === 'html') ||
1955: ($token['type'] === HTML5::CHARACTR && !preg_match(
1956: '/^[\t\n\x0b\x0c ]$/',
1957: $token['data']
1958: ))
1959: ) {
1960: /* Act as if a start tag token with the tag name "head" and no
1961: attributes had been seen, then reprocess the current token. */
1962: $this->beforeHead(
1963: array(
1964: 'name' => 'head',
1965: 'type' => HTML5::STARTTAG,
1966: 'attr' => array()
1967: )
1968: );
1969:
1970: return $this->inHead($token);
1971:
1972: /* Any other end tag */
1973: } elseif ($token['type'] === HTML5::ENDTAG) {
1974: /* Parse error. Ignore the token. */
1975: }
1976: }
1977:
1978: private function inHead($token)
1979: {
1980: /* Handle the token as follows: */
1981:
1982: /* A character token that is one of one of U+0009 CHARACTER TABULATION,
1983: U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1984: or U+0020 SPACE.
1985:
1986: THIS DIFFERS FROM THE SPEC: If the current node is either a title, style
1987: or script element, append the character to the current node regardless
1988: of its content. */
1989: if (($token['type'] === HTML5::CHARACTR &&
1990: preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) || (
1991: $token['type'] === HTML5::CHARACTR && in_array(
1992: end($this->stack)->nodeName,
1993: array('title', 'style', 'script')
1994: ))
1995: ) {
1996: /* Append the character to the current node. */
1997: $this->insertText($token['data']);
1998:
1999: /* A comment token */
2000: } elseif ($token['type'] === HTML5::COMMENT) {
2001: /* Append a Comment node to the current node with the data attribute
2002: set to the data given in the comment token. */
2003: $this->insertComment($token['data']);
2004:
2005: } elseif ($token['type'] === HTML5::ENDTAG &&
2006: in_array($token['name'], array('title', 'style', 'script'))
2007: ) {
2008: array_pop($this->stack);
2009: return HTML5::PCDATA;
2010:
2011: /* A start tag with the tag name "title" */
2012: } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'title') {
2013: /* Create an element for the token and append the new element to the
2014: node pointed to by the head element pointer, or, if that is null
2015: (innerHTML case), to the current node. */
2016: if ($this->head_pointer !== null) {
2017: $element = $this->insertElement($token, false);
2018: $this->head_pointer->appendChild($element);
2019:
2020: } else {
2021: $element = $this->insertElement($token);
2022: }
2023:
2024: /* Switch the tokeniser's content model flag to the RCDATA state. */
2025: return HTML5::RCDATA;
2026:
2027: /* A start tag with the tag name "style" */
2028: } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'style') {
2029: /* Create an element for the token and append the new element to the
2030: node pointed to by the head element pointer, or, if that is null
2031: (innerHTML case), to the current node. */
2032: if ($this->head_pointer !== null) {
2033: $element = $this->insertElement($token, false);
2034: $this->head_pointer->appendChild($element);
2035:
2036: } else {
2037: $this->insertElement($token);
2038: }
2039:
2040: /* Switch the tokeniser's content model flag to the CDATA state. */
2041: return HTML5::CDATA;
2042:
2043: /* A start tag with the tag name "script" */
2044: } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'script') {
2045: /* Create an element for the token. */
2046: $element = $this->insertElement($token, false);
2047: $this->head_pointer->appendChild($element);
2048:
2049: /* Switch the tokeniser's content model flag to the CDATA state. */
2050: return HTML5::CDATA;
2051:
2052: /* A start tag with the tag name "base", "link", or "meta" */
2053: } elseif ($token['type'] === HTML5::STARTTAG && in_array(
2054: $token['name'],
2055: array('base', 'link', 'meta')
2056: )
2057: ) {
2058: /* Create an element for the token and append the new element to the
2059: node pointed to by the head element pointer, or, if that is null
2060: (innerHTML case), to the current node. */
2061: if ($this->head_pointer !== null) {
2062: $element = $this->insertElement($token, false);
2063: $this->head_pointer->appendChild($element);
2064: array_pop($this->stack);
2065:
2066: } else {
2067: $this->insertElement($token);
2068: }
2069:
2070: /* An end tag with the tag name "head" */
2071: } elseif ($token['type'] === HTML5::ENDTAG && $token['name'] === 'head') {
2072: /* If the current node is a head element, pop the current node off
2073: the stack of open elements. */
2074: if ($this->head_pointer->isSameNode(end($this->stack))) {
2075: array_pop($this->stack);
2076:
2077: /* Otherwise, this is a parse error. */
2078: } else {
2079: // k
2080: }
2081:
2082: /* Change the insertion mode to "after head". */
2083: $this->mode = self::AFTER_HEAD;
2084:
2085: /* A start tag with the tag name "head" or an end tag except "html". */
2086: } elseif (($token['type'] === HTML5::STARTTAG && $token['name'] === 'head') ||
2087: ($token['type'] === HTML5::ENDTAG && $token['name'] !== 'html')
2088: ) {
2089: // Parse error. Ignore the token.
2090:
2091: /* Anything else */
2092: } else {
2093: /* If the current node is a head element, act as if an end tag
2094: token with the tag name "head" had been seen. */
2095: if ($this->head_pointer->isSameNode(end($this->stack))) {
2096: $this->inHead(
2097: array(
2098: 'name' => 'head',
2099: 'type' => HTML5::ENDTAG
2100: )
2101: );
2102:
2103: /* Otherwise, change the insertion mode to "after head". */
2104: } else {
2105: $this->mode = self::AFTER_HEAD;
2106: }
2107:
2108: /* Then, reprocess the current token. */
2109: return $this->afterHead($token);
2110: }
2111: }
2112:
2113: private function afterHead($token)
2114: {
2115: /* Handle the token as follows: */
2116:
2117: /* A character token that is one of one of U+0009 CHARACTER TABULATION,
2118: U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
2119: or U+0020 SPACE */
2120: if ($token['type'] === HTML5::CHARACTR &&
2121: preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
2122: ) {
2123: /* Append the character to the current node. */
2124: $this->insertText($token['data']);
2125:
2126: /* A comment token */
2127: } elseif ($token['type'] === HTML5::COMMENT) {
2128: /* Append a Comment node to the current node with the data attribute
2129: set to the data given in the comment token. */
2130: $this->insertComment($token['data']);
2131:
2132: /* A start tag token with the tag name "body" */
2133: } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'body') {
2134: /* Insert a body element for the token. */
2135: $this->insertElement($token);
2136:
2137: /* Change the insertion mode to "in body". */
2138: $this->mode = self::IN_BODY;
2139:
2140: /* A start tag token with the tag name "frameset" */
2141: } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'frameset') {
2142: /* Insert a frameset element for the token. */
2143: $this->insertElement($token);
2144:
2145: /* Change the insertion mode to "in frameset". */
2146: $this->mode = self::IN_FRAME;
2147:
2148: /* A start tag token whose tag name is one of: "base", "link", "meta",
2149: "script", "style", "title" */
2150: } elseif ($token['type'] === HTML5::STARTTAG && in_array(
2151: $token['name'],
2152: array('base', 'link', 'meta', 'script', 'style', 'title')
2153: )
2154: ) {
2155: /* Parse error. Switch the insertion mode back to "in head" and
2156: reprocess the token. */
2157: $this->mode = self::IN_HEAD;
2158: return $this->inHead($token);
2159:
2160: /* Anything else */
2161: } else {
2162: /* Act as if a start tag token with the tag name "body" and no
2163: attributes had been seen, and then reprocess the current token. */
2164: $this->afterHead(
2165: array(
2166: 'name' => 'body',
2167: 'type' => HTML5::STARTTAG,
2168: 'attr' => array()
2169: )
2170: );
2171:
2172: return $this->inBody($token);
2173: }
2174: }
2175:
2176: private function inBody($token)
2177: {
2178: /* Handle the token as follows: */
2179:
2180: switch ($token['type']) {
2181: /* A character token */
2182: case HTML5::CHARACTR:
2183: /* Reconstruct the active formatting elements, if any. */
2184: $this->reconstructActiveFormattingElements();
2185:
2186: /* Append the token's character to the current node. */
2187: $this->insertText($token['data']);
2188: break;
2189:
2190: /* A comment token */
2191: case HTML5::COMMENT:
2192: /* Append a Comment node to the current node with the data
2193: attribute set to the data given in the comment token. */
2194: $this->insertComment($token['data']);
2195: break;
2196:
2197: case HTML5::STARTTAG:
2198: switch ($token['name']) {
2199: /* A start tag token whose tag name is one of: "script",
2200: "style" */
2201: case 'script':
2202: case 'style':
2203: /* Process the token as if the insertion mode had been "in
2204: head". */
2205: return $this->inHead($token);
2206: break;
2207:
2208: /* A start tag token whose tag name is one of: "base", "link",
2209: "meta", "title" */
2210: case 'base':
2211: case 'link':
2212: case 'meta':
2213: case 'title':
2214: /* Parse error. Process the token as if the insertion mode
2215: had been "in head". */
2216: return $this->inHead($token);
2217: break;
2218:
2219: /* A start tag token with the tag name "body" */
2220: case 'body':
2221: /* Parse error. If the second element on the stack of open
2222: elements is not a body element, or, if the stack of open
2223: elements has only one node on it, then ignore the token.
2224: (innerHTML case) */
2225: if (count($this->stack) === 1 || $this->stack[1]->nodeName !== 'body') {
2226: // Ignore
2227:
2228: /* Otherwise, for each attribute on the token, check to see
2229: if the attribute is already present on the body element (the
2230: second element) on the stack of open elements. If it is not,
2231: add the attribute and its corresponding value to that
2232: element. */
2233: } else {
2234: foreach ($token['attr'] as $attr) {
2235: if (!$this->stack[1]->hasAttribute($attr['name'])) {
2236: $this->stack[1]->setAttribute($attr['name'], $attr['value']);
2237: }
2238: }
2239: }
2240: break;
2241:
2242: /* A start tag whose tag name is one of: "address",
2243: "blockquote", "center", "dir", "div", "dl", "fieldset",
2244: "listing", "menu", "ol", "p", "ul" */
2245: case 'address':
2246: case 'blockquote':
2247: case 'center':
2248: case 'dir':
2249: case 'div':
2250: case 'dl':
2251: case 'fieldset':
2252: case 'listing':
2253: case 'menu':
2254: case 'ol':
2255: case 'p':
2256: case 'ul':
2257: /* If the stack of open elements has a p element in scope,
2258: then act as if an end tag with the tag name p had been
2259: seen. */
2260: if ($this->elementInScope('p')) {
2261: $this->emitToken(
2262: array(
2263: 'name' => 'p',
2264: 'type' => HTML5::ENDTAG
2265: )
2266: );
2267: }
2268:
2269: /* Insert an HTML element for the token. */
2270: $this->insertElement($token);
2271: break;
2272:
2273: /* A start tag whose tag name is "form" */
2274: case 'form':
2275: /* If the form element pointer is not null, ignore the
2276: token with a parse error. */
2277: if ($this->form_pointer !== null) {
2278: // Ignore.
2279:
2280: /* Otherwise: */
2281: } else {
2282: /* If the stack of open elements has a p element in
2283: scope, then act as if an end tag with the tag name p
2284: had been seen. */
2285: if ($this->elementInScope('p')) {
2286: $this->emitToken(
2287: array(
2288: 'name' => 'p',
2289: 'type' => HTML5::ENDTAG
2290: )
2291: );
2292: }
2293:
2294: /* Insert an HTML element for the token, and set the
2295: form element pointer to point to the element created. */
2296: $element = $this->insertElement($token);
2297: $this->form_pointer = $element;
2298: }
2299: break;
2300:
2301: /* A start tag whose tag name is "li", "dd" or "dt" */
2302: case 'li':
2303: case 'dd':
2304: case 'dt':
2305: /* If the stack of open elements has a p element in scope,
2306: then act as if an end tag with the tag name p had been
2307: seen. */
2308: if ($this->elementInScope('p')) {
2309: $this->emitToken(
2310: array(
2311: 'name' => 'p',
2312: 'type' => HTML5::ENDTAG
2313: )
2314: );
2315: }
2316:
2317: $stack_length = count($this->stack) - 1;
2318:
2319: for ($n = $stack_length; 0 <= $n; $n--) {
2320: /* 1. Initialise node to be the current node (the
2321: bottommost node of the stack). */
2322: $stop = false;
2323: $node = $this->stack[$n];
2324: $cat = $this->getElementCategory($node->tagName);
2325:
2326: /* 2. If node is an li, dd or dt element, then pop all
2327: the nodes from the current node up to node, including
2328: node, then stop this algorithm. */
2329: if ($token['name'] === $node->tagName || ($token['name'] !== 'li'
2330: && ($node->tagName === 'dd' || $node->tagName === 'dt'))
2331: ) {
2332: for ($x = $stack_length; $x >= $n; $x--) {
2333: array_pop($this->stack);
2334: }
2335:
2336: break;
2337: }
2338:
2339: /* 3. If node is not in the formatting category, and is
2340: not in the phrasing category, and is not an address or
2341: div element, then stop this algorithm. */
2342: if ($cat !== self::FORMATTING && $cat !== self::PHRASING &&
2343: $node->tagName !== 'address' && $node->tagName !== 'div'
2344: ) {
2345: break;
2346: }
2347: }
2348:
2349: /* Finally, insert an HTML element with the same tag
2350: name as the token's. */
2351: $this->insertElement($token);
2352: break;
2353:
2354: /* A start tag token whose tag name is "plaintext" */
2355: case 'plaintext':
2356: /* If the stack of open elements has a p element in scope,
2357: then act as if an end tag with the tag name p had been
2358: seen. */
2359: if ($this->elementInScope('p')) {
2360: $this->emitToken(
2361: array(
2362: 'name' => 'p',
2363: 'type' => HTML5::ENDTAG
2364: )
2365: );
2366: }
2367:
2368: /* Insert an HTML element for the token. */
2369: $this->insertElement($token);
2370:
2371: return HTML5::PLAINTEXT;
2372: break;
2373:
2374: /* A start tag whose tag name is one of: "h1", "h2", "h3", "h4",
2375: "h5", "h6" */
2376: case 'h1':
2377: case 'h2':
2378: case 'h3':
2379: case 'h4':
2380: case 'h5':
2381: case 'h6':
2382: /* If the stack of open elements has a p element in scope,
2383: then act as if an end tag with the tag name p had been seen. */
2384: if ($this->elementInScope('p')) {
2385: $this->emitToken(
2386: array(
2387: 'name' => 'p',
2388: 'type' => HTML5::ENDTAG
2389: )
2390: );
2391: }
2392:
2393: /* If the stack of open elements has in scope an element whose
2394: tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
2395: this is a parse error; pop elements from the stack until an
2396: element with one of those tag names has been popped from the
2397: stack. */
2398: while ($this->elementInScope(array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))) {
2399: array_pop($this->stack);
2400: }
2401:
2402: /* Insert an HTML element for the token. */
2403: $this->insertElement($token);
2404: break;
2405:
2406: /* A start tag whose tag name is "a" */
2407: case 'a':
2408: /* If the list of active formatting elements contains
2409: an element whose tag name is "a" between the end of the
2410: list and the last marker on the list (or the start of
2411: the list if there is no marker on the list), then this
2412: is a parse error; act as if an end tag with the tag name
2413: "a" had been seen, then remove that element from the list
2414: of active formatting elements and the stack of open
2415: elements if the end tag didn't already remove it (it
2416: might not have if the element is not in table scope). */
2417: $leng = count($this->a_formatting);
2418:
2419: for ($n = $leng - 1; $n >= 0; $n--) {
2420: if ($this->a_formatting[$n] === self::MARKER) {
2421: break;
2422:
2423: } elseif ($this->a_formatting[$n]->nodeName === 'a') {
2424: $this->emitToken(
2425: array(
2426: 'name' => 'a',
2427: 'type' => HTML5::ENDTAG
2428: )
2429: );
2430: break;
2431: }
2432: }
2433:
2434: /* Reconstruct the active formatting elements, if any. */
2435: $this->reconstructActiveFormattingElements();
2436:
2437: /* Insert an HTML element for the token. */
2438: $el = $this->insertElement($token);
2439:
2440: /* Add that element to the list of active formatting
2441: elements. */
2442: $this->a_formatting[] = $el;
2443: break;
2444:
2445: /* A start tag whose tag name is one of: "b", "big", "em", "font",
2446: "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
2447: case 'b':
2448: case 'big':
2449: case 'em':
2450: case 'font':
2451: case 'i':
2452: case 'nobr':
2453: case 's':
2454: case 'small':
2455: case 'strike':
2456: case 'strong':
2457: case 'tt':
2458: case 'u':
2459: /* Reconstruct the active formatting elements, if any. */
2460: $this->reconstructActiveFormattingElements();
2461:
2462: /* Insert an HTML element for the token. */
2463: $el = $this->insertElement($token);
2464:
2465: /* Add that element to the list of active formatting
2466: elements. */
2467: $this->a_formatting[] = $el;
2468: break;
2469:
2470: /* A start tag token whose tag name is "button" */
2471: case 'button':
2472: /* If the stack of open elements has a button element in scope,
2473: then this is a parse error; act as if an end tag with the tag
2474: name "button" had been seen, then reprocess the token. (We don't
2475: do that. Unnecessary.) */
2476: if ($this->elementInScope('button')) {
2477: $this->inBody(
2478: array(
2479: 'name' => 'button',
2480: 'type' => HTML5::ENDTAG
2481: )
2482: );
2483: }
2484:
2485: /* Reconstruct the active formatting elements, if any. */
2486: $this->reconstructActiveFormattingElements();
2487:
2488: /* Insert an HTML element for the token. */
2489: $this->insertElement($token);
2490:
2491: /* Insert a marker at the end of the list of active
2492: formatting elements. */
2493: $this->a_formatting[] = self::MARKER;
2494: break;
2495:
2496: /* A start tag token whose tag name is one of: "marquee", "object" */
2497: case 'marquee':
2498: case 'object':
2499: /* Reconstruct the active formatting elements, if any. */
2500: $this->reconstructActiveFormattingElements();
2501:
2502: /* Insert an HTML element for the token. */
2503: $this->insertElement($token);
2504:
2505: /* Insert a marker at the end of the list of active
2506: formatting elements. */
2507: $this->a_formatting[] = self::MARKER;
2508: break;
2509:
2510: /* A start tag token whose tag name is "xmp" */
2511: case 'xmp':
2512: /* Reconstruct the active formatting elements, if any. */
2513: $this->reconstructActiveFormattingElements();
2514:
2515: /* Insert an HTML element for the token. */
2516: $this->insertElement($token);
2517:
2518: /* Switch the content model flag to the CDATA state. */
2519: return HTML5::CDATA;
2520: break;
2521:
2522: /* A start tag whose tag name is "table" */
2523: case 'table':
2524: /* If the stack of open elements has a p element in scope,
2525: then act as if an end tag with the tag name p had been seen. */
2526: if ($this->elementInScope('p')) {
2527: $this->emitToken(
2528: array(
2529: 'name' => 'p',
2530: 'type' => HTML5::ENDTAG
2531: )
2532: );
2533: }
2534:
2535: /* Insert an HTML element for the token. */
2536: $this->insertElement($token);
2537:
2538: /* Change the insertion mode to "in table". */
2539: $this->mode = self::IN_TABLE;
2540: break;
2541:
2542: /* A start tag whose tag name is one of: "area", "basefont",
2543: "bgsound", "br", "embed", "img", "param", "spacer", "wbr" */
2544: case 'area':
2545: case 'basefont':
2546: case 'bgsound':
2547: case 'br':
2548: case 'embed':
2549: case 'img':
2550: case 'param':
2551: case 'spacer':
2552: case 'wbr':
2553: /* Reconstruct the active formatting elements, if any. */
2554: $this->reconstructActiveFormattingElements();
2555:
2556: /* Insert an HTML element for the token. */
2557: $this->insertElement($token);
2558:
2559: /* Immediately pop the current node off the stack of open elements. */
2560: array_pop($this->stack);
2561: break;
2562:
2563: /* A start tag whose tag name is "hr" */
2564: case 'hr':
2565: /* If the stack of open elements has a p element in scope,
2566: then act as if an end tag with the tag name p had been seen. */
2567: if ($this->elementInScope('p')) {
2568: $this->emitToken(
2569: array(
2570: 'name' => 'p',
2571: 'type' => HTML5::ENDTAG
2572: )
2573: );
2574: }
2575:
2576: /* Insert an HTML element for the token. */
2577: $this->insertElement($token);
2578:
2579: /* Immediately pop the current node off the stack of open elements. */
2580: array_pop($this->stack);
2581: break;
2582:
2583: /* A start tag whose tag name is "image" */
2584: case 'image':
2585: /* Parse error. Change the token's tag name to "img" and
2586: reprocess it. (Don't ask.) */
2587: $token['name'] = 'img';
2588: return $this->inBody($token);
2589: break;
2590:
2591: /* A start tag whose tag name is "input" */
2592: case 'input':
2593: /* Reconstruct the active formatting elements, if any. */
2594: $this->reconstructActiveFormattingElements();
2595:
2596: /* Insert an input element for the token. */
2597: $element = $this->insertElement($token, false);
2598:
2599: /* If the form element pointer is not null, then associate the
2600: input element with the form element pointed to by the form
2601: element pointer. */
2602: $this->form_pointer !== null
2603: ? $this->form_pointer->appendChild($element)
2604: : end($this->stack)->appendChild($element);
2605:
2606: /* Pop that input element off the stack of open elements. */
2607: array_pop($this->stack);
2608: break;
2609:
2610: /* A start tag whose tag name is "isindex" */
2611: case 'isindex':
2612: /* Parse error. */
2613: // w/e
2614:
2615: /* If the form element pointer is not null,
2616: then ignore the token. */
2617: if ($this->form_pointer === null) {
2618: /* Act as if a start tag token with the tag name "form" had
2619: been seen. */
2620: $this->inBody(
2621: array(
2622: 'name' => 'body',
2623: 'type' => HTML5::STARTTAG,
2624: 'attr' => array()
2625: )
2626: );
2627:
2628: /* Act as if a start tag token with the tag name "hr" had
2629: been seen. */
2630: $this->inBody(
2631: array(
2632: 'name' => 'hr',
2633: 'type' => HTML5::STARTTAG,
2634: 'attr' => array()
2635: )
2636: );
2637:
2638: /* Act as if a start tag token with the tag name "p" had
2639: been seen. */
2640: $this->inBody(
2641: array(
2642: 'name' => 'p',
2643: 'type' => HTML5::STARTTAG,
2644: 'attr' => array()
2645: )
2646: );
2647:
2648: /* Act as if a start tag token with the tag name "label"
2649: had been seen. */
2650: $this->inBody(
2651: array(
2652: 'name' => 'label',
2653: 'type' => HTML5::STARTTAG,
2654: 'attr' => array()
2655: )
2656: );
2657:
2658: /* Act as if a stream of character tokens had been seen. */
2659: $this->insertText(
2660: 'This is a searchable index. ' .
2661: 'Insert your search keywords here: '
2662: );
2663:
2664: /* Act as if a start tag token with the tag name "input"
2665: had been seen, with all the attributes from the "isindex"
2666: token, except with the "name" attribute set to the value
2667: "isindex" (ignoring any explicit "name" attribute). */
2668: $attr = $token['attr'];
2669: $attr[] = array('name' => 'name', 'value' => 'isindex');
2670:
2671: $this->inBody(
2672: array(
2673: 'name' => 'input',
2674: 'type' => HTML5::STARTTAG,
2675: 'attr' => $attr
2676: )
2677: );
2678:
2679: /* Act as if a stream of character tokens had been seen
2680: (see below for what they should say). */
2681: $this->insertText(
2682: 'This is a searchable index. ' .
2683: 'Insert your search keywords here: '
2684: );
2685:
2686: /* Act as if an end tag token with the tag name "label"
2687: had been seen. */
2688: $this->inBody(
2689: array(
2690: 'name' => 'label',
2691: 'type' => HTML5::ENDTAG
2692: )
2693: );
2694:
2695: /* Act as if an end tag token with the tag name "p" had
2696: been seen. */
2697: $this->inBody(
2698: array(
2699: 'name' => 'p',
2700: 'type' => HTML5::ENDTAG
2701: )
2702: );
2703:
2704: /* Act as if a start tag token with the tag name "hr" had
2705: been seen. */
2706: $this->inBody(
2707: array(
2708: 'name' => 'hr',
2709: 'type' => HTML5::ENDTAG
2710: )
2711: );
2712:
2713: /* Act as if an end tag token with the tag name "form" had
2714: been seen. */
2715: $this->inBody(
2716: array(
2717: 'name' => 'form',
2718: 'type' => HTML5::ENDTAG
2719: )
2720: );
2721: }
2722: break;
2723:
2724: /* A start tag whose tag name is "textarea" */
2725: case 'textarea':
2726: $this->insertElement($token);
2727:
2728: /* Switch the tokeniser's content model flag to the
2729: RCDATA state. */
2730: return HTML5::RCDATA;
2731: break;
2732:
2733: /* A start tag whose tag name is one of: "iframe", "noembed",
2734: "noframes" */
2735: case 'iframe':
2736: case 'noembed':
2737: case 'noframes':
2738: $this->insertElement($token);
2739:
2740: /* Switch the tokeniser's content model flag to the CDATA state. */
2741: return HTML5::CDATA;
2742: break;
2743:
2744: /* A start tag whose tag name is "select" */
2745: case 'select':
2746: /* Reconstruct the active formatting elements, if any. */
2747: $this->reconstructActiveFormattingElements();
2748:
2749: /* Insert an HTML element for the token. */
2750: $this->insertElement($token);
2751:
2752: /* Change the insertion mode to "in select". */
2753: $this->mode = self::IN_SELECT;
2754: break;
2755:
2756: /* A start or end tag whose tag name is one of: "caption", "col",
2757: "colgroup", "frame", "frameset", "head", "option", "optgroup",
2758: "tbody", "td", "tfoot", "th", "thead", "tr". */
2759: case 'caption':
2760: case 'col':
2761: case 'colgroup':
2762: case 'frame':
2763: case 'frameset':
2764: case 'head':
2765: case 'option':
2766: case 'optgroup':
2767: case 'tbody':
2768: case 'td':
2769: case 'tfoot':
2770: case 'th':
2771: case 'thead':
2772: case 'tr':
2773: // Parse error. Ignore the token.
2774: break;
2775:
2776: /* A start or end tag whose tag name is one of: "event-source",
2777: "section", "nav", "article", "aside", "header", "footer",
2778: "datagrid", "command" */
2779: case 'event-source':
2780: case 'section':
2781: case 'nav':
2782: case 'article':
2783: case 'aside':
2784: case 'header':
2785: case 'footer':
2786: case 'datagrid':
2787: case 'command':
2788: // Work in progress!
2789: break;
2790:
2791: /* A start tag token not covered by the previous entries */
2792: default:
2793: /* Reconstruct the active formatting elements, if any. */
2794: $this->reconstructActiveFormattingElements();
2795:
2796: $this->insertElement($token, true, true);
2797: break;
2798: }
2799: break;
2800:
2801: case HTML5::ENDTAG:
2802: switch ($token['name']) {
2803: /* An end tag with the tag name "body" */
2804: case 'body':
2805: /* If the second element in the stack of open elements is
2806: not a body element, this is a parse error. Ignore the token.
2807: (innerHTML case) */
2808: if (count($this->stack) < 2 || $this->stack[1]->nodeName !== 'body') {
2809: // Ignore.
2810:
2811: /* If the current node is not the body element, then this
2812: is a parse error. */
2813: } elseif (end($this->stack)->nodeName !== 'body') {
2814: // Parse error.
2815: }
2816:
2817: /* Change the insertion mode to "after body". */
2818: $this->mode = self::AFTER_BODY;
2819: break;
2820:
2821: /* An end tag with the tag name "html" */
2822: case 'html':
2823: /* Act as if an end tag with tag name "body" had been seen,
2824: then, if that token wasn't ignored, reprocess the current
2825: token. */
2826: $this->inBody(
2827: array(
2828: 'name' => 'body',
2829: 'type' => HTML5::ENDTAG
2830: )
2831: );
2832:
2833: return $this->afterBody($token);
2834: break;
2835:
2836: /* An end tag whose tag name is one of: "address", "blockquote",
2837: "center", "dir", "div", "dl", "fieldset", "listing", "menu",
2838: "ol", "pre", "ul" */
2839: case 'address':
2840: case 'blockquote':
2841: case 'center':
2842: case 'dir':
2843: case 'div':
2844: case 'dl':
2845: case 'fieldset':
2846: case 'listing':
2847: case 'menu':
2848: case 'ol':
2849: case 'pre':
2850: case 'ul':
2851: /* If the stack of open elements has an element in scope
2852: with the same tag name as that of the token, then generate
2853: implied end tags. */
2854: if ($this->elementInScope($token['name'])) {
2855: $this->generateImpliedEndTags();
2856:
2857: /* Now, if the current node is not an element with
2858: the same tag name as that of the token, then this
2859: is a parse error. */
2860: // w/e
2861:
2862: /* If the stack of open elements has an element in
2863: scope with the same tag name as that of the token,
2864: then pop elements from this stack until an element
2865: with that tag name has been popped from the stack. */
2866: for ($n = count($this->stack) - 1; $n >= 0; $n--) {
2867: if ($this->stack[$n]->nodeName === $token['name']) {
2868: $n = -1;
2869: }
2870:
2871: array_pop($this->stack);
2872: }
2873: }
2874: break;
2875:
2876: /* An end tag whose tag name is "form" */
2877: case 'form':
2878: /* If the stack of open elements has an element in scope
2879: with the same tag name as that of the token, then generate
2880: implied end tags. */
2881: if ($this->elementInScope($token['name'])) {
2882: $this->generateImpliedEndTags();
2883:
2884: }
2885:
2886: if (end($this->stack)->nodeName !== $token['name']) {
2887: /* Now, if the current node is not an element with the
2888: same tag name as that of the token, then this is a parse
2889: error. */
2890: // w/e
2891:
2892: } else {
2893: /* Otherwise, if the current node is an element with
2894: the same tag name as that of the token pop that element
2895: from the stack. */
2896: array_pop($this->stack);
2897: }
2898:
2899: /* In any case, set the form element pointer to null. */
2900: $this->form_pointer = null;
2901: break;
2902:
2903: /* An end tag whose tag name is "p" */
2904: case 'p':
2905: /* If the stack of open elements has a p element in scope,
2906: then generate implied end tags, except for p elements. */
2907: if ($this->elementInScope('p')) {
2908: $this->generateImpliedEndTags(array('p'));
2909:
2910: /* If the current node is not a p element, then this is
2911: a parse error. */
2912: // k
2913:
2914: /* If the stack of open elements has a p element in
2915: scope, then pop elements from this stack until the stack
2916: no longer has a p element in scope. */
2917: for ($n = count($this->stack) - 1; $n >= 0; $n--) {
2918: if ($this->elementInScope('p')) {
2919: array_pop($this->stack);
2920:
2921: } else {
2922: break;
2923: }
2924: }
2925: }
2926: break;
2927:
2928: /* An end tag whose tag name is "dd", "dt", or "li" */
2929: case 'dd':
2930: case 'dt':
2931: case 'li':
2932: /* If the stack of open elements has an element in scope
2933: whose tag name matches the tag name of the token, then
2934: generate implied end tags, except for elements with the
2935: same tag name as the token. */
2936: if ($this->elementInScope($token['name'])) {
2937: $this->generateImpliedEndTags(array($token['name']));
2938:
2939: /* If the current node is not an element with the same
2940: tag name as the token, then this is a parse error. */
2941: // w/e
2942:
2943: /* If the stack of open elements has an element in scope
2944: whose tag name matches the tag name of the token, then
2945: pop elements from this stack until an element with that
2946: tag name has been popped from the stack. */
2947: for ($n = count($this->stack) - 1; $n >= 0; $n--) {
2948: if ($this->stack[$n]->nodeName === $token['name']) {
2949: $n = -1;
2950: }
2951:
2952: array_pop($this->stack);
2953: }
2954: }
2955: break;
2956:
2957: /* An end tag whose tag name is one of: "h1", "h2", "h3", "h4",
2958: "h5", "h6" */
2959: case 'h1':
2960: case 'h2':
2961: case 'h3':
2962: case 'h4':
2963: case 'h5':
2964: case 'h6':
2965: $elements = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6');
2966:
2967: /* If the stack of open elements has in scope an element whose
2968: tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
2969: generate implied end tags. */
2970: if ($this->elementInScope($elements)) {
2971: $this->generateImpliedEndTags();
2972:
2973: /* Now, if the current node is not an element with the same
2974: tag name as that of the token, then this is a parse error. */
2975: // w/e
2976:
2977: /* If the stack of open elements has in scope an element
2978: whose tag name is one of "h1", "h2", "h3", "h4", "h5", or
2979: "h6", then pop elements from the stack until an element
2980: with one of those tag names has been popped from the stack. */
2981: while ($this->elementInScope($elements)) {
2982: array_pop($this->stack);
2983: }
2984: }
2985: break;
2986:
2987: /* An end tag whose tag name is one of: "a", "b", "big", "em",
2988: "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
2989: case 'a':
2990: case 'b':
2991: case 'big':
2992: case 'em':
2993: case 'font':
2994: case 'i':
2995: case 'nobr':
2996: case 's':
2997: case 'small':
2998: case 'strike':
2999: case 'strong':
3000: case 'tt':
3001: case 'u':
3002: /* 1. Let the formatting element be the last element in
3003: the list of active formatting elements that:
3004: * is between the end of the list and the last scope
3005: marker in the list, if any, or the start of the list
3006: otherwise, and
3007: * has the same tag name as the token.
3008: */
3009: while (true) {
3010: for ($a = count($this->a_formatting) - 1; $a >= 0; $a--) {
3011: if ($this->a_formatting[$a] === self::MARKER) {
3012: break;
3013:
3014: } elseif ($this->a_formatting[$a]->tagName === $token['name']) {
3015: $formatting_element = $this->a_formatting[$a];
3016: $in_stack = in_array($formatting_element, $this->stack, true);
3017: $fe_af_pos = $a;
3018: break;
3019: }
3020: }
3021:
3022: /* If there is no such node, or, if that node is
3023: also in the stack of open elements but the element
3024: is not in scope, then this is a parse error. Abort
3025: these steps. The token is ignored. */
3026: if (!isset($formatting_element) || ($in_stack &&
3027: !$this->elementInScope($token['name']))
3028: ) {
3029: break;
3030:
3031: /* Otherwise, if there is such a node, but that node
3032: is not in the stack of open elements, then this is a
3033: parse error; remove the element from the list, and
3034: abort these steps. */
3035: } elseif (isset($formatting_element) && !$in_stack) {
3036: unset($this->a_formatting[$fe_af_pos]);
3037: $this->a_formatting = array_merge($this->a_formatting);
3038: break;
3039: }
3040:
3041: /* 2. Let the furthest block be the topmost node in the
3042: stack of open elements that is lower in the stack
3043: than the formatting element, and is not an element in
3044: the phrasing or formatting categories. There might
3045: not be one. */
3046: $fe_s_pos = array_search($formatting_element, $this->stack, true);
3047: $length = count($this->stack);
3048:
3049: for ($s = $fe_s_pos + 1; $s < $length; $s++) {
3050: $category = $this->getElementCategory($this->stack[$s]->nodeName);
3051:
3052: if ($category !== self::PHRASING && $category !== self::FORMATTING) {
3053: $furthest_block = $this->stack[$s];
3054: }
3055: }
3056:
3057: /* 3. If there is no furthest block, then the UA must
3058: skip the subsequent steps and instead just pop all
3059: the nodes from the bottom of the stack of open
3060: elements, from the current node up to the formatting
3061: element, and remove the formatting element from the
3062: list of active formatting elements. */
3063: if (!isset($furthest_block)) {
3064: for ($n = $length - 1; $n >= $fe_s_pos; $n--) {
3065: array_pop($this->stack);
3066: }
3067:
3068: unset($this->a_formatting[$fe_af_pos]);
3069: $this->a_formatting = array_merge($this->a_formatting);
3070: break;
3071: }
3072:
3073: /* 4. Let the common ancestor be the element
3074: immediately above the formatting element in the stack
3075: of open elements. */
3076: $common_ancestor = $this->stack[$fe_s_pos - 1];
3077:
3078: /* 5. If the furthest block has a parent node, then
3079: remove the furthest block from its parent node. */
3080: if ($furthest_block->parentNode !== null) {
3081: $furthest_block->parentNode->removeChild($furthest_block);
3082: }
3083:
3084: /* 6. Let a bookmark note the position of the
3085: formatting element in the list of active formatting
3086: elements relative to the elements on either side
3087: of it in the list. */
3088: $bookmark = $fe_af_pos;
3089:
3090: /* 7. Let node and last node be the furthest block.
3091: Follow these steps: */
3092: $node = $furthest_block;
3093: $last_node = $furthest_block;
3094:
3095: while (true) {
3096: for ($n = array_search($node, $this->stack, true) - 1; $n >= 0; $n--) {
3097: /* 7.1 Let node be the element immediately
3098: prior to node in the stack of open elements. */
3099: $node = $this->stack[$n];
3100:
3101: /* 7.2 If node is not in the list of active
3102: formatting elements, then remove node from
3103: the stack of open elements and then go back
3104: to step 1. */
3105: if (!in_array($node, $this->a_formatting, true)) {
3106: unset($this->stack[$n]);
3107: $this->stack = array_merge($this->stack);
3108:
3109: } else {
3110: break;
3111: }
3112: }
3113:
3114: /* 7.3 Otherwise, if node is the formatting
3115: element, then go to the next step in the overall
3116: algorithm. */
3117: if ($node === $formatting_element) {
3118: break;
3119:
3120: /* 7.4 Otherwise, if last node is the furthest
3121: block, then move the aforementioned bookmark to
3122: be immediately after the node in the list of
3123: active formatting elements. */
3124: } elseif ($last_node === $furthest_block) {
3125: $bookmark = array_search($node, $this->a_formatting, true) + 1;
3126: }
3127:
3128: /* 7.5 If node has any children, perform a
3129: shallow clone of node, replace the entry for
3130: node in the list of active formatting elements
3131: with an entry for the clone, replace the entry
3132: for node in the stack of open elements with an
3133: entry for the clone, and let node be the clone. */
3134: if ($node->hasChildNodes()) {
3135: $clone = $node->cloneNode();
3136: $s_pos = array_search($node, $this->stack, true);
3137: $a_pos = array_search($node, $this->a_formatting, true);
3138:
3139: $this->stack[$s_pos] = $clone;
3140: $this->a_formatting[$a_pos] = $clone;
3141: $node = $clone;
3142: }
3143:
3144: /* 7.6 Insert last node into node, first removing
3145: it from its previous parent node if any. */
3146: if ($last_node->parentNode !== null) {
3147: $last_node->parentNode->removeChild($last_node);
3148: }
3149:
3150: $node->appendChild($last_node);
3151:
3152: /* 7.7 Let last node be node. */
3153: $last_node = $node;
3154: }
3155:
3156: /* 8. Insert whatever last node ended up being in
3157: the previous step into the common ancestor node,
3158: first removing it from its previous parent node if
3159: any. */
3160: if ($last_node->parentNode !== null) {
3161: $last_node->parentNode->removeChild($last_node);
3162: }
3163:
3164: $common_ancestor->appendChild($last_node);
3165:
3166: /* 9. Perform a shallow clone of the formatting
3167: element. */
3168: $clone = $formatting_element->cloneNode();
3169:
3170: /* 10. Take all of the child nodes of the furthest
3171: block and append them to the clone created in the
3172: last step. */
3173: while ($furthest_block->hasChildNodes()) {
3174: $child = $furthest_block->firstChild;
3175: $furthest_block->removeChild($child);
3176: $clone->appendChild($child);
3177: }
3178:
3179: /* 11. Append that clone to the furthest block. */
3180: $furthest_block->appendChild($clone);
3181:
3182: /* 12. Remove the formatting element from the list
3183: of active formatting elements, and insert the clone
3184: into the list of active formatting elements at the
3185: position of the aforementioned bookmark. */
3186: $fe_af_pos = array_search($formatting_element, $this->a_formatting, true);
3187: unset($this->a_formatting[$fe_af_pos]);
3188: $this->a_formatting = array_merge($this->a_formatting);
3189:
3190: $af_part1 = array_slice($this->a_formatting, 0, $bookmark - 1);
3191: $af_part2 = array_slice($this->a_formatting, $bookmark, count($this->a_formatting));
3192: $this->a_formatting = array_merge($af_part1, array($clone), $af_part2);
3193:
3194: /* 13. Remove the formatting element from the stack
3195: of open elements, and insert the clone into the stack
3196: of open elements immediately after (i.e. in a more
3197: deeply nested position than) the position of the
3198: furthest block in that stack. */
3199: $fe_s_pos = array_search($formatting_element, $this->stack, true);
3200: $fb_s_pos = array_search($furthest_block, $this->stack, true);
3201: unset($this->stack[$fe_s_pos]);
3202:
3203: $s_part1 = array_slice($this->stack, 0, $fb_s_pos);
3204: $s_part2 = array_slice($this->stack, $fb_s_pos + 1, count($this->stack));
3205: $this->stack = array_merge($s_part1, array($clone), $s_part2);
3206:
3207: /* 14. Jump back to step 1 in this series of steps. */
3208: unset($formatting_element, $fe_af_pos, $fe_s_pos, $furthest_block);
3209: }
3210: break;
3211:
3212: /* An end tag token whose tag name is one of: "button",
3213: "marquee", "object" */
3214: case 'button':
3215: case 'marquee':
3216: case 'object':
3217: /* If the stack of open elements has an element in scope whose
3218: tag name matches the tag name of the token, then generate implied
3219: tags. */
3220: if ($this->elementInScope($token['name'])) {
3221: $this->generateImpliedEndTags();
3222:
3223: /* Now, if the current node is not an element with the same
3224: tag name as the token, then this is a parse error. */
3225: // k
3226:
3227: /* Now, if the stack of open elements has an element in scope
3228: whose tag name matches the tag name of the token, then pop
3229: elements from the stack until that element has been popped from
3230: the stack, and clear the list of active formatting elements up
3231: to the last marker. */
3232: for ($n = count($this->stack) - 1; $n >= 0; $n--) {
3233: if ($this->stack[$n]->nodeName === $token['name']) {
3234: $n = -1;
3235: }
3236:
3237: array_pop($this->stack);
3238: }
3239:
3240: $marker = end(array_keys($this->a_formatting, self::MARKER, true));
3241:
3242: for ($n = count($this->a_formatting) - 1; $n > $marker; $n--) {
3243: array_pop($this->a_formatting);
3244: }
3245: }
3246: break;
3247:
3248: /* Or an end tag whose tag name is one of: "area", "basefont",
3249: "bgsound", "br", "embed", "hr", "iframe", "image", "img",
3250: "input", "isindex", "noembed", "noframes", "param", "select",
3251: "spacer", "table", "textarea", "wbr" */
3252: case 'area':
3253: case 'basefont':
3254: case 'bgsound':
3255: case 'br':
3256: case 'embed':
3257: case 'hr':
3258: case 'iframe':
3259: case 'image':
3260: case 'img':
3261: case 'input':
3262: case 'isindex':
3263: case 'noembed':
3264: case 'noframes':
3265: case 'param':
3266: case 'select':
3267: case 'spacer':
3268: case 'table':
3269: case 'textarea':
3270: case 'wbr':
3271: // Parse error. Ignore the token.
3272: break;
3273:
3274: /* An end tag token not covered by the previous entries */
3275: default:
3276: for ($n = count($this->stack) - 1; $n >= 0; $n--) {
3277: /* Initialise node to be the current node (the bottommost
3278: node of the stack). */
3279: $node = end($this->stack);
3280:
3281: /* If node has the same tag name as the end tag token,
3282: then: */
3283: if ($token['name'] === $node->nodeName) {
3284: /* Generate implied end tags. */
3285: $this->generateImpliedEndTags();
3286:
3287: /* If the tag name of the end tag token does not
3288: match the tag name of the current node, this is a
3289: parse error. */
3290: // k
3291:
3292: /* Pop all the nodes from the current node up to
3293: node, including node, then stop this algorithm. */
3294: for ($x = count($this->stack) - $n; $x >= $n; $x--) {
3295: array_pop($this->stack);
3296: }
3297:
3298: } else {
3299: $category = $this->getElementCategory($node);
3300:
3301: if ($category !== self::SPECIAL && $category !== self::SCOPING) {
3302: /* Otherwise, if node is in neither the formatting
3303: category nor the phrasing category, then this is a
3304: parse error. Stop this algorithm. The end tag token
3305: is ignored. */
3306: return false;
3307: }
3308: }
3309: }
3310: break;
3311: }
3312: break;
3313: }
3314: }
3315:
3316: private function inTable($token)
3317: {
3318: $clear = array('html', 'table');
3319:
3320: /* A character token that is one of one of U+0009 CHARACTER TABULATION,
3321: U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
3322: or U+0020 SPACE */
3323: if ($token['type'] === HTML5::CHARACTR &&
3324: preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
3325: ) {
3326: /* Append the character to the current node. */
3327: $text = $this->dom->createTextNode($token['data']);
3328: end($this->stack)->appendChild($text);
3329:
3330: /* A comment token */
3331: } elseif ($token['type'] === HTML5::COMMENT) {
3332: /* Append a Comment node to the current node with the data
3333: attribute set to the data given in the comment token. */
3334: $comment = $this->dom->createComment($token['data']);
3335: end($this->stack)->appendChild($comment);
3336:
3337: /* A start tag whose tag name is "caption" */
3338: } elseif ($token['type'] === HTML5::STARTTAG &&
3339: $token['name'] === 'caption'
3340: ) {
3341: /* Clear the stack back to a table context. */
3342: $this->clearStackToTableContext($clear);
3343:
3344: /* Insert a marker at the end of the list of active
3345: formatting elements. */
3346: $this->a_formatting[] = self::MARKER;
3347:
3348: /* Insert an HTML element for the token, then switch the
3349: insertion mode to "in caption". */
3350: $this->insertElement($token);
3351: $this->mode = self::IN_CAPTION;
3352:
3353: /* A start tag whose tag name is "colgroup" */
3354: } elseif ($token['type'] === HTML5::STARTTAG &&
3355: $token['name'] === 'colgroup'
3356: ) {
3357: /* Clear the stack back to a table context. */
3358: $this->clearStackToTableContext($clear);
3359:
3360: /* Insert an HTML element for the token, then switch the
3361: insertion mode to "in column group". */
3362: $this->insertElement($token);
3363: $this->mode = self::IN_CGROUP;
3364:
3365: /* A start tag whose tag name is "col" */
3366: } elseif ($token['type'] === HTML5::STARTTAG &&
3367: $token['name'] === 'col'
3368: ) {
3369: $this->inTable(
3370: array(
3371: 'name' => 'colgroup',
3372: 'type' => HTML5::STARTTAG,
3373: 'attr' => array()
3374: )
3375: );
3376:
3377: $this->inColumnGroup($token);
3378:
3379: /* A start tag whose tag name is one of: "tbody", "tfoot", "thead" */
3380: } elseif ($token['type'] === HTML5::STARTTAG && in_array(
3381: $token['name'],
3382: array('tbody', 'tfoot', 'thead')
3383: )
3384: ) {
3385: /* Clear the stack back to a table context. */
3386: $this->clearStackToTableContext($clear);
3387:
3388: /* Insert an HTML element for the token, then switch the insertion
3389: mode to "in table body". */
3390: $this->insertElement($token);
3391: $this->mode = self::IN_TBODY;
3392:
3393: /* A start tag whose tag name is one of: "td", "th", "tr" */
3394: } elseif ($token['type'] === HTML5::STARTTAG &&
3395: in_array($token['name'], array('td', 'th', 'tr'))
3396: ) {
3397: /* Act as if a start tag token with the tag name "tbody" had been
3398: seen, then reprocess the current token. */
3399: $this->inTable(
3400: array(
3401: 'name' => 'tbody',
3402: 'type' => HTML5::STARTTAG,
3403: 'attr' => array()
3404: )
3405: );
3406:
3407: return $this->inTableBody($token);
3408:
3409: /* A start tag whose tag name is "table" */
3410: } elseif ($token['type'] === HTML5::STARTTAG &&
3411: $token['name'] === 'table'
3412: ) {
3413: /* Parse error. Act as if an end tag token with the tag name "table"
3414: had been seen, then, if that token wasn't ignored, reprocess the
3415: current token. */
3416: $this->inTable(
3417: array(
3418: 'name' => 'table',
3419: 'type' => HTML5::ENDTAG
3420: )
3421: );
3422:
3423: return $this->mainPhase($token);
3424:
3425: /* An end tag whose tag name is "table" */
3426: } elseif ($token['type'] === HTML5::ENDTAG &&
3427: $token['name'] === 'table'
3428: ) {
3429: /* If the stack of open elements does not have an element in table
3430: scope with the same tag name as the token, this is a parse error.
3431: Ignore the token. (innerHTML case) */
3432: if (!$this->elementInScope($token['name'], true)) {
3433: return false;
3434:
3435: /* Otherwise: */
3436: } else {
3437: /* Generate implied end tags. */
3438: $this->generateImpliedEndTags();
3439:
3440: /* Now, if the current node is not a table element, then this
3441: is a parse error. */
3442: // w/e
3443:
3444: /* Pop elements from this stack until a table element has been
3445: popped from the stack. */
3446: while (true) {
3447: $current = end($this->stack)->nodeName;
3448: array_pop($this->stack);
3449:
3450: if ($current === 'table') {
3451: break;
3452: }
3453: }
3454:
3455: /* Reset the insertion mode appropriately. */
3456: $this->resetInsertionMode();
3457: }
3458:
3459: /* An end tag whose tag name is one of: "body", "caption", "col",
3460: "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
3461: } elseif ($token['type'] === HTML5::ENDTAG && in_array(
3462: $token['name'],
3463: array(
3464: 'body',
3465: 'caption',
3466: 'col',
3467: 'colgroup',
3468: 'html',
3469: 'tbody',
3470: 'td',
3471: 'tfoot',
3472: 'th',
3473: 'thead',
3474: 'tr'
3475: )
3476: )
3477: ) {
3478: // Parse error. Ignore the token.
3479:
3480: /* Anything else */
3481: } else {
3482: /* Parse error. Process the token as if the insertion mode was "in
3483: body", with the following exception: */
3484:
3485: /* If the current node is a table, tbody, tfoot, thead, or tr
3486: element, then, whenever a node would be inserted into the current
3487: node, it must instead be inserted into the foster parent element. */
3488: if (in_array(
3489: end($this->stack)->nodeName,
3490: array('table', 'tbody', 'tfoot', 'thead', 'tr')
3491: )
3492: ) {
3493: /* The foster parent element is the parent element of the last
3494: table element in the stack of open elements, if there is a
3495: table element and it has such a parent element. If there is no
3496: table element in the stack of open elements (innerHTML case),
3497: then the foster parent element is the first element in the
3498: stack of open elements (the html element). Otherwise, if there
3499: is a table element in the stack of open elements, but the last
3500: table element in the stack of open elements has no parent, or
3501: its parent node is not an element, then the foster parent
3502: element is the element before the last table element in the
3503: stack of open elements. */
3504: for ($n = count($this->stack) - 1; $n >= 0; $n--) {
3505: if ($this->stack[$n]->nodeName === 'table') {
3506: $table = $this->stack[$n];
3507: break;
3508: }
3509: }
3510:
3511: if (isset($table) && $table->parentNode !== null) {
3512: $this->foster_parent = $table->parentNode;
3513:
3514: } elseif (!isset($table)) {
3515: $this->foster_parent = $this->stack[0];
3516:
3517: } elseif (isset($table) && ($table->parentNode === null ||
3518: $table->parentNode->nodeType !== XML_ELEMENT_NODE)
3519: ) {
3520: $this->foster_parent = $this->stack[$n - 1];
3521: }
3522: }
3523:
3524: $this->inBody($token);
3525: }
3526: }
3527:
3528: private function inCaption($token)
3529: {
3530: /* An end tag whose tag name is "caption" */
3531: if ($token['type'] === HTML5::ENDTAG && $token['name'] === 'caption') {
3532: /* If the stack of open elements does not have an element in table
3533: scope with the same tag name as the token, this is a parse error.
3534: Ignore the token. (innerHTML case) */
3535: if (!$this->elementInScope($token['name'], true)) {
3536: // Ignore
3537:
3538: /* Otherwise: */
3539: } else {
3540: /* Generate implied end tags. */
3541: $this->generateImpliedEndTags();
3542:
3543: /* Now, if the current node is not a caption element, then this
3544: is a parse error. */
3545: // w/e
3546:
3547: /* Pop elements from this stack until a caption element has
3548: been popped from the stack. */
3549: while (true) {
3550: $node = end($this->stack)->nodeName;
3551: array_pop($this->stack);
3552:
3553: if ($node === 'caption') {
3554: break;
3555: }
3556: }
3557:
3558: /* Clear the list of active formatting elements up to the last
3559: marker. */
3560: $this->clearTheActiveFormattingElementsUpToTheLastMarker();
3561:
3562: /* Switch the insertion mode to "in table". */
3563: $this->mode = self::IN_TABLE;
3564: }
3565:
3566: /* A start tag whose tag name is one of: "caption", "col", "colgroup",
3567: "tbody", "td", "tfoot", "th", "thead", "tr", or an end tag whose tag
3568: name is "table" */
3569: } elseif (($token['type'] === HTML5::STARTTAG && in_array(
3570: $token['name'],
3571: array(
3572: 'caption',
3573: 'col',
3574: 'colgroup',
3575: 'tbody',
3576: 'td',
3577: 'tfoot',
3578: 'th',
3579: 'thead',
3580: 'tr'
3581: )
3582: )) || ($token['type'] === HTML5::ENDTAG &&
3583: $token['name'] === 'table')
3584: ) {
3585: /* Parse error. Act as if an end tag with the tag name "caption"
3586: had been seen, then, if that token wasn't ignored, reprocess the
3587: current token. */
3588: $this->inCaption(
3589: array(
3590: 'name' => 'caption',
3591: 'type' => HTML5::ENDTAG
3592: )
3593: );
3594:
3595: return $this->inTable($token);
3596:
3597: /* An end tag whose tag name is one of: "body", "col", "colgroup",
3598: "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
3599: } elseif ($token['type'] === HTML5::ENDTAG && in_array(
3600: $token['name'],
3601: array(
3602: 'body',
3603: 'col',
3604: 'colgroup',
3605: 'html',
3606: 'tbody',
3607: 'tfoot',
3608: 'th',
3609: 'thead',
3610: 'tr'
3611: )
3612: )
3613: ) {
3614: // Parse error. Ignore the token.
3615:
3616: /* Anything else */
3617: } else {
3618: /* Process the token as if the insertion mode was "in body". */
3619: $this->inBody($token);
3620: }
3621: }
3622:
3623: private function inColumnGroup($token)
3624: {
3625: /* A character token that is one of one of U+0009 CHARACTER TABULATION,
3626: U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
3627: or U+0020 SPACE */
3628: if ($token['type'] === HTML5::CHARACTR &&
3629: preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
3630: ) {
3631: /* Append the character to the current node. */
3632: $text = $this->dom->createTextNode($token['data']);
3633: end($this->stack)->appendChild($text);
3634:
3635: /* A comment token */
3636: } elseif ($token['type'] === HTML5::COMMENT) {
3637: /* Append a Comment node to the current node with the data
3638: attribute set to the data given in the comment token. */
3639: $comment = $this->dom->createComment($token['data']);
3640: end($this->stack)->appendChild($comment);
3641:
3642: /* A start tag whose tag name is "col" */
3643: } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'col') {
3644: /* Insert a col element for the token. Immediately pop the current
3645: node off the stack of open elements. */
3646: $this->insertElement($token);
3647: array_pop($this->stack);
3648:
3649: /* An end tag whose tag name is "colgroup" */
3650: } elseif ($token['type'] === HTML5::ENDTAG &&
3651: $token['name'] === 'colgroup'
3652: ) {
3653: /* If the current node is the root html element, then this is a
3654: parse error, ignore the token. (innerHTML case) */
3655: if (end($this->stack)->nodeName === 'html') {
3656: // Ignore
3657:
3658: /* Otherwise, pop the current node (which will be a colgroup
3659: element) from the stack of open elements. Switch the insertion
3660: mode to "in table". */
3661: } else {
3662: array_pop($this->stack);
3663: $this->mode = self::IN_TABLE;
3664: }
3665:
3666: /* An end tag whose tag name is "col" */
3667: } elseif ($token['type'] === HTML5::ENDTAG && $token['name'] === 'col') {
3668: /* Parse error. Ignore the token. */
3669:
3670: /* Anything else */
3671: } else {
3672: /* Act as if an end tag with the tag name "colgroup" had been seen,
3673: and then, if that token wasn't ignored, reprocess the current token. */
3674: $this->inColumnGroup(
3675: array(
3676: 'name' => 'colgroup',
3677: 'type' => HTML5::ENDTAG
3678: )
3679: );
3680:
3681: return $this->inTable($token);
3682: }
3683: }
3684:
3685: private function inTableBody($token)
3686: {
3687: $clear = array('tbody', 'tfoot', 'thead', 'html');
3688:
3689: /* A start tag whose tag name is "tr" */
3690: if ($token['type'] === HTML5::STARTTAG && $token['name'] === 'tr') {
3691: /* Clear the stack back to a table body context. */
3692: $this->clearStackToTableContext($clear);
3693:
3694: /* Insert a tr element for the token, then switch the insertion
3695: mode to "in row". */
3696: $this->insertElement($token);
3697: $this->mode = self::IN_ROW;
3698:
3699: /* A start tag whose tag name is one of: "th", "td" */
3700: } elseif ($token['type'] === HTML5::STARTTAG &&
3701: ($token['name'] === 'th' || $token['name'] === 'td')
3702: ) {
3703: /* Parse error. Act as if a start tag with the tag name "tr" had
3704: been seen, then reprocess the current token. */
3705: $this->inTableBody(
3706: array(
3707: 'name' => 'tr',
3708: 'type' => HTML5::STARTTAG,
3709: 'attr' => array()
3710: )
3711: );
3712:
3713: return $this->inRow($token);
3714:
3715: /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
3716: } elseif ($token['type'] === HTML5::ENDTAG &&
3717: in_array($token['name'], array('tbody', 'tfoot', 'thead'))
3718: ) {
3719: /* If the stack of open elements does not have an element in table
3720: scope with the same tag name as the token, this is a parse error.
3721: Ignore the token. */
3722: if (!$this->elementInScope($token['name'], true)) {
3723: // Ignore
3724:
3725: /* Otherwise: */
3726: } else {
3727: /* Clear the stack back to a table body context. */
3728: $this->clearStackToTableContext($clear);
3729:
3730: /* Pop the current node from the stack of open elements. Switch
3731: the insertion mode to "in table". */
3732: array_pop($this->stack);
3733: $this->mode = self::IN_TABLE;
3734: }
3735:
3736: /* A start tag whose tag name is one of: "caption", "col", "colgroup",
3737: "tbody", "tfoot", "thead", or an end tag whose tag name is "table" */
3738: } elseif (($token['type'] === HTML5::STARTTAG && in_array(
3739: $token['name'],
3740: array('caption', 'col', 'colgroup', 'tbody', 'tfoor', 'thead')
3741: )) ||
3742: ($token['type'] === HTML5::STARTTAG && $token['name'] === 'table')
3743: ) {
3744: /* If the stack of open elements does not have a tbody, thead, or
3745: tfoot element in table scope, this is a parse error. Ignore the
3746: token. (innerHTML case) */
3747: if (!$this->elementInScope(array('tbody', 'thead', 'tfoot'), true)) {
3748: // Ignore.
3749:
3750: /* Otherwise: */
3751: } else {
3752: /* Clear the stack back to a table body context. */
3753: $this->clearStackToTableContext($clear);
3754:
3755: /* Act as if an end tag with the same tag name as the current
3756: node ("tbody", "tfoot", or "thead") had been seen, then
3757: reprocess the current token. */
3758: $this->inTableBody(
3759: array(
3760: 'name' => end($this->stack)->nodeName,
3761: 'type' => HTML5::ENDTAG
3762: )
3763: );
3764:
3765: return $this->mainPhase($token);
3766: }
3767:
3768: /* An end tag whose tag name is one of: "body", "caption", "col",
3769: "colgroup", "html", "td", "th", "tr" */
3770: } elseif ($token['type'] === HTML5::ENDTAG && in_array(
3771: $token['name'],
3772: array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr')
3773: )
3774: ) {
3775: /* Parse error. Ignore the token. */
3776:
3777: /* Anything else */
3778: } else {
3779: /* Process the token as if the insertion mode was "in table". */
3780: $this->inTable($token);
3781: }
3782: }
3783:
3784: private function inRow($token)
3785: {
3786: $clear = array('tr', 'html');
3787:
3788: /* A start tag whose tag name is one of: "th", "td" */
3789: if ($token['type'] === HTML5::STARTTAG &&
3790: ($token['name'] === 'th' || $token['name'] === 'td')
3791: ) {
3792: /* Clear the stack back to a table row context. */
3793: $this->clearStackToTableContext($clear);
3794:
3795: /* Insert an HTML element for the token, then switch the insertion
3796: mode to "in cell". */
3797: $this->insertElement($token);
3798: $this->mode = self::IN_CELL;
3799:
3800: /* Insert a marker at the end of the list of active formatting
3801: elements. */
3802: $this->a_formatting[] = self::MARKER;
3803:
3804: /* An end tag whose tag name is "tr" */
3805: } elseif ($token['type'] === HTML5::ENDTAG && $token['name'] === 'tr') {
3806: /* If the stack of open elements does not have an element in table
3807: scope with the same tag name as the token, this is a parse error.
3808: Ignore the token. (innerHTML case) */
3809: if (!$this->elementInScope($token['name'], true)) {
3810: // Ignore.
3811:
3812: /* Otherwise: */
3813: } else {
3814: /* Clear the stack back to a table row context. */
3815: $this->clearStackToTableContext($clear);
3816:
3817: /* Pop the current node (which will be a tr element) from the
3818: stack of open elements. Switch the insertion mode to "in table
3819: body". */
3820: array_pop($this->stack);
3821: $this->mode = self::IN_TBODY;
3822: }
3823:
3824: /* A start tag whose tag name is one of: "caption", "col", "colgroup",
3825: "tbody", "tfoot", "thead", "tr" or an end tag whose tag name is "table" */
3826: } elseif ($token['type'] === HTML5::STARTTAG && in_array(
3827: $token['name'],
3828: array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr')
3829: )
3830: ) {
3831: /* Act as if an end tag with the tag name "tr" had been seen, then,
3832: if that token wasn't ignored, reprocess the current token. */
3833: $this->inRow(
3834: array(
3835: 'name' => 'tr',
3836: 'type' => HTML5::ENDTAG
3837: )
3838: );
3839:
3840: return $this->inCell($token);
3841:
3842: /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
3843: } elseif ($token['type'] === HTML5::ENDTAG &&
3844: in_array($token['name'], array('tbody', 'tfoot', 'thead'))
3845: ) {
3846: /* If the stack of open elements does not have an element in table
3847: scope with the same tag name as the token, this is a parse error.
3848: Ignore the token. */
3849: if (!$this->elementInScope($token['name'], true)) {
3850: // Ignore.
3851:
3852: /* Otherwise: */
3853: } else {
3854: /* Otherwise, act as if an end tag with the tag name "tr" had
3855: been seen, then reprocess the current token. */
3856: $this->inRow(
3857: array(
3858: 'name' => 'tr',
3859: 'type' => HTML5::ENDTAG
3860: )
3861: );
3862:
3863: return $this->inCell($token);
3864: }
3865:
3866: /* An end tag whose tag name is one of: "body", "caption", "col",
3867: "colgroup", "html", "td", "th" */
3868: } elseif ($token['type'] === HTML5::ENDTAG && in_array(
3869: $token['name'],
3870: array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr')
3871: )
3872: ) {
3873: /* Parse error. Ignore the token. */
3874:
3875: /* Anything else */
3876: } else {
3877: /* Process the token as if the insertion mode was "in table". */
3878: $this->inTable($token);
3879: }
3880: }
3881:
3882: private function inCell($token)
3883: {
3884: /* An end tag whose tag name is one of: "td", "th" */
3885: if ($token['type'] === HTML5::ENDTAG &&
3886: ($token['name'] === 'td' || $token['name'] === 'th')
3887: ) {
3888: /* If the stack of open elements does not have an element in table
3889: scope with the same tag name as that of the token, then this is a
3890: parse error and the token must be ignored. */
3891: if (!$this->elementInScope($token['name'], true)) {
3892: // Ignore.
3893:
3894: /* Otherwise: */
3895: } else {
3896: /* Generate implied end tags, except for elements with the same
3897: tag name as the token. */
3898: $this->generateImpliedEndTags(array($token['name']));
3899:
3900: /* Now, if the current node is not an element with the same tag
3901: name as the token, then this is a parse error. */
3902: // k
3903:
3904: /* Pop elements from this stack until an element with the same
3905: tag name as the token has been popped from the stack. */
3906: while (true) {
3907: $node = end($this->stack)->nodeName;
3908: array_pop($this->stack);
3909:
3910: if ($node === $token['name']) {
3911: break;
3912: }
3913: }
3914:
3915: /* Clear the list of active formatting elements up to the last
3916: marker. */
3917: $this->clearTheActiveFormattingElementsUpToTheLastMarker();
3918:
3919: /* Switch the insertion mode to "in row". (The current node
3920: will be a tr element at this point.) */
3921: $this->mode = self::IN_ROW;
3922: }
3923:
3924: /* A start tag whose tag name is one of: "caption", "col", "colgroup",
3925: "tbody", "td", "tfoot", "th", "thead", "tr" */
3926: } elseif ($token['type'] === HTML5::STARTTAG && in_array(
3927: $token['name'],
3928: array(
3929: 'caption',
3930: 'col',
3931: 'colgroup',
3932: 'tbody',
3933: 'td',
3934: 'tfoot',
3935: 'th',
3936: 'thead',
3937: 'tr'
3938: )
3939: )
3940: ) {
3941: /* If the stack of open elements does not have a td or th element
3942: in table scope, then this is a parse error; ignore the token.
3943: (innerHTML case) */
3944: if (!$this->elementInScope(array('td', 'th'), true)) {
3945: // Ignore.
3946:
3947: /* Otherwise, close the cell (see below) and reprocess the current
3948: token. */
3949: } else {
3950: $this->closeCell();
3951: return $this->inRow($token);
3952: }
3953:
3954: /* A start tag whose tag name is one of: "caption", "col", "colgroup",
3955: "tbody", "td", "tfoot", "th", "thead", "tr" */
3956: } elseif ($token['type'] === HTML5::STARTTAG && in_array(
3957: $token['name'],
3958: array(
3959: 'caption',
3960: 'col',
3961: 'colgroup',
3962: 'tbody',
3963: 'td',
3964: 'tfoot',
3965: 'th',
3966: 'thead',
3967: 'tr'
3968: )
3969: )
3970: ) {
3971: /* If the stack of open elements does not have a td or th element
3972: in table scope, then this is a parse error; ignore the token.
3973: (innerHTML case) */
3974: if (!$this->elementInScope(array('td', 'th'), true)) {
3975: // Ignore.
3976:
3977: /* Otherwise, close the cell (see below) and reprocess the current
3978: token. */
3979: } else {
3980: $this->closeCell();
3981: return $this->inRow($token);
3982: }
3983:
3984: /* An end tag whose tag name is one of: "body", "caption", "col",
3985: "colgroup", "html" */
3986: } elseif ($token['type'] === HTML5::ENDTAG && in_array(
3987: $token['name'],
3988: array('body', 'caption', 'col', 'colgroup', 'html')
3989: )
3990: ) {
3991: /* Parse error. Ignore the token. */
3992:
3993: /* An end tag whose tag name is one of: "table", "tbody", "tfoot",
3994: "thead", "tr" */
3995: } elseif ($token['type'] === HTML5::ENDTAG && in_array(
3996: $token['name'],
3997: array('table', 'tbody', 'tfoot', 'thead', 'tr')
3998: )
3999: ) {
4000: /* If the stack of open elements does not have an element in table
4001: scope with the same tag name as that of the token (which can only
4002: happen for "tbody", "tfoot" and "thead", or, in the innerHTML case),
4003: then this is a parse error and the token must be ignored. */
4004: if (!$this->elementInScope($token['name'], true)) {
4005: // Ignore.
4006:
4007: /* Otherwise, close the cell (see below) and reprocess the current
4008: token. */
4009: } else {
4010: $this->closeCell();
4011: return $this->inRow($token);
4012: }
4013:
4014: /* Anything else */
4015: } else {
4016: /* Process the token as if the insertion mode was "in body". */
4017: $this->inBody($token);
4018: }
4019: }
4020:
4021: private function inSelect($token)
4022: {
4023: /* Handle the token as follows: */
4024:
4025: /* A character token */
4026: if ($token['type'] === HTML5::CHARACTR) {
4027: /* Append the token's character to the current node. */
4028: $this->insertText($token['data']);
4029:
4030: /* A comment token */
4031: } elseif ($token['type'] === HTML5::COMMENT) {
4032: /* Append a Comment node to the current node with the data
4033: attribute set to the data given in the comment token. */
4034: $this->insertComment($token['data']);
4035:
4036: /* A start tag token whose tag name is "option" */
4037: } elseif ($token['type'] === HTML5::STARTTAG &&
4038: $token['name'] === 'option'
4039: ) {
4040: /* If the current node is an option element, act as if an end tag
4041: with the tag name "option" had been seen. */
4042: if (end($this->stack)->nodeName === 'option') {
4043: $this->inSelect(
4044: array(
4045: 'name' => 'option',
4046: 'type' => HTML5::ENDTAG
4047: )
4048: );
4049: }
4050:
4051: /* Insert an HTML element for the token. */
4052: $this->insertElement($token);
4053:
4054: /* A start tag token whose tag name is "optgroup" */
4055: } elseif ($token['type'] === HTML5::STARTTAG &&
4056: $token['name'] === 'optgroup'
4057: ) {
4058: /* If the current node is an option element, act as if an end tag
4059: with the tag name "option" had been seen. */
4060: if (end($this->stack)->nodeName === 'option') {
4061: $this->inSelect(
4062: array(
4063: 'name' => 'option',
4064: 'type' => HTML5::ENDTAG
4065: )
4066: );
4067: }
4068:
4069: /* If the current node is an optgroup element, act as if an end tag
4070: with the tag name "optgroup" had been seen. */
4071: if (end($this->stack)->nodeName === 'optgroup') {
4072: $this->inSelect(
4073: array(
4074: 'name' => 'optgroup',
4075: 'type' => HTML5::ENDTAG
4076: )
4077: );
4078: }
4079:
4080: /* Insert an HTML element for the token. */
4081: $this->insertElement($token);
4082:
4083: /* An end tag token whose tag name is "optgroup" */
4084: } elseif ($token['type'] === HTML5::ENDTAG &&
4085: $token['name'] === 'optgroup'
4086: ) {
4087: /* First, if the current node is an option element, and the node
4088: immediately before it in the stack of open elements is an optgroup
4089: element, then act as if an end tag with the tag name "option" had
4090: been seen. */
4091: $elements_in_stack = count($this->stack);
4092:
4093: if ($this->stack[$elements_in_stack - 1]->nodeName === 'option' &&
4094: $this->stack[$elements_in_stack - 2]->nodeName === 'optgroup'
4095: ) {
4096: $this->inSelect(
4097: array(
4098: 'name' => 'option',
4099: 'type' => HTML5::ENDTAG
4100: )
4101: );
4102: }
4103:
4104: /* If the current node is an optgroup element, then pop that node
4105: from the stack of open elements. Otherwise, this is a parse error,
4106: ignore the token. */
4107: if ($this->stack[$elements_in_stack - 1] === 'optgroup') {
4108: array_pop($this->stack);
4109: }
4110:
4111: /* An end tag token whose tag name is "option" */
4112: } elseif ($token['type'] === HTML5::ENDTAG &&
4113: $token['name'] === 'option'
4114: ) {
4115: /* If the current node is an option element, then pop that node
4116: from the stack of open elements. Otherwise, this is a parse error,
4117: ignore the token. */
4118: if (end($this->stack)->nodeName === 'option') {
4119: array_pop($this->stack);
4120: }
4121:
4122: /* An end tag whose tag name is "select" */
4123: } elseif ($token['type'] === HTML5::ENDTAG &&
4124: $token['name'] === 'select'
4125: ) {
4126: /* If the stack of open elements does not have an element in table
4127: scope with the same tag name as the token, this is a parse error.
4128: Ignore the token. (innerHTML case) */
4129: if (!$this->elementInScope($token['name'], true)) {
4130: // w/e
4131:
4132: /* Otherwise: */
4133: } else {
4134: /* Pop elements from the stack of open elements until a select
4135: element has been popped from the stack. */
4136: while (true) {
4137: $current = end($this->stack)->nodeName;
4138: array_pop($this->stack);
4139:
4140: if ($current === 'select') {
4141: break;
4142: }
4143: }
4144:
4145: /* Reset the insertion mode appropriately. */
4146: $this->resetInsertionMode();
4147: }
4148:
4149: /* A start tag whose tag name is "select" */
4150: } elseif ($token['name'] === 'select' &&
4151: $token['type'] === HTML5::STARTTAG
4152: ) {
4153: /* Parse error. Act as if the token had been an end tag with the
4154: tag name "select" instead. */
4155: $this->inSelect(
4156: array(
4157: 'name' => 'select',
4158: 'type' => HTML5::ENDTAG
4159: )
4160: );
4161:
4162: /* An end tag whose tag name is one of: "caption", "table", "tbody",
4163: "tfoot", "thead", "tr", "td", "th" */
4164: } elseif (in_array(
4165: $token['name'],
4166: array(
4167: 'caption',
4168: 'table',
4169: 'tbody',
4170: 'tfoot',
4171: 'thead',
4172: 'tr',
4173: 'td',
4174: 'th'
4175: )
4176: ) && $token['type'] === HTML5::ENDTAG
4177: ) {
4178: /* Parse error. */
4179: // w/e
4180:
4181: /* If the stack of open elements has an element in table scope with
4182: the same tag name as that of the token, then act as if an end tag
4183: with the tag name "select" had been seen, and reprocess the token.
4184: Otherwise, ignore the token. */
4185: if ($this->elementInScope($token['name'], true)) {
4186: $this->inSelect(
4187: array(
4188: 'name' => 'select',
4189: 'type' => HTML5::ENDTAG
4190: )
4191: );
4192:
4193: $this->mainPhase($token);
4194: }
4195:
4196: /* Anything else */
4197: } else {
4198: /* Parse error. Ignore the token. */
4199: }
4200: }
4201:
4202: private function afterBody($token)
4203: {
4204: /* Handle the token as follows: */
4205:
4206: /* A character token that is one of one of U+0009 CHARACTER TABULATION,
4207: U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
4208: or U+0020 SPACE */
4209: if ($token['type'] === HTML5::CHARACTR &&
4210: preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
4211: ) {
4212: /* Process the token as it would be processed if the insertion mode
4213: was "in body". */
4214: $this->inBody($token);
4215:
4216: /* A comment token */
4217: } elseif ($token['type'] === HTML5::COMMENT) {
4218: /* Append a Comment node to the first element in the stack of open
4219: elements (the html element), with the data attribute set to the
4220: data given in the comment token. */
4221: $comment = $this->dom->createComment($token['data']);
4222: $this->stack[0]->appendChild($comment);
4223:
4224: /* An end tag with the tag name "html" */
4225: } elseif ($token['type'] === HTML5::ENDTAG && $token['name'] === 'html') {
4226: /* If the parser was originally created in order to handle the
4227: setting of an element's innerHTML attribute, this is a parse error;
4228: ignore the token. (The element will be an html element in this
4229: case.) (innerHTML case) */
4230:
4231: /* Otherwise, switch to the trailing end phase. */
4232: $this->phase = self::END_PHASE;
4233:
4234: /* Anything else */
4235: } else {
4236: /* Parse error. Set the insertion mode to "in body" and reprocess
4237: the token. */
4238: $this->mode = self::IN_BODY;
4239: return $this->inBody($token);
4240: }
4241: }
4242:
4243: private function inFrameset($token)
4244: {
4245: /* Handle the token as follows: */
4246:
4247: /* A character token that is one of one of U+0009 CHARACTER TABULATION,
4248: U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
4249: U+000D CARRIAGE RETURN (CR), or U+0020 SPACE */
4250: if ($token['type'] === HTML5::CHARACTR &&
4251: preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
4252: ) {
4253: /* Append the character to the current node. */
4254: $this->insertText($token['data']);
4255:
4256: /* A comment token */
4257: } elseif ($token['type'] === HTML5::COMMENT) {
4258: /* Append a Comment node to the current node with the data
4259: attribute set to the data given in the comment token. */
4260: $this->insertComment($token['data']);
4261:
4262: /* A start tag with the tag name "frameset" */
4263: } elseif ($token['name'] === 'frameset' &&
4264: $token['type'] === HTML5::STARTTAG
4265: ) {
4266: $this->insertElement($token);
4267:
4268: /* An end tag with the tag name "frameset" */
4269: } elseif ($token['name'] === 'frameset' &&
4270: $token['type'] === HTML5::ENDTAG
4271: ) {
4272: /* If the current node is the root html element, then this is a
4273: parse error; ignore the token. (innerHTML case) */
4274: if (end($this->stack)->nodeName === 'html') {
4275: // Ignore
4276:
4277: } else {
4278: /* Otherwise, pop the current node from the stack of open
4279: elements. */
4280: array_pop($this->stack);
4281:
4282: /* If the parser was not originally created in order to handle
4283: the setting of an element's innerHTML attribute (innerHTML case),
4284: and the current node is no longer a frameset element, then change
4285: the insertion mode to "after frameset". */
4286: $this->mode = self::AFTR_FRAME;
4287: }
4288:
4289: /* A start tag with the tag name "frame" */
4290: } elseif ($token['name'] === 'frame' &&
4291: $token['type'] === HTML5::STARTTAG
4292: ) {
4293: /* Insert an HTML element for the token. */
4294: $this->insertElement($token);
4295:
4296: /* Immediately pop the current node off the stack of open elements. */
4297: array_pop($this->stack);
4298:
4299: /* A start tag with the tag name "noframes" */
4300: } elseif ($token['name'] === 'noframes' &&
4301: $token['type'] === HTML5::STARTTAG
4302: ) {
4303: /* Process the token as if the insertion mode had been "in body". */
4304: $this->inBody($token);
4305:
4306: /* Anything else */
4307: } else {
4308: /* Parse error. Ignore the token. */
4309: }
4310: }
4311:
4312: private function afterFrameset($token)
4313: {
4314: /* Handle the token as follows: */
4315:
4316: /* A character token that is one of one of U+0009 CHARACTER TABULATION,
4317: U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
4318: U+000D CARRIAGE RETURN (CR), or U+0020 SPACE */
4319: if ($token['type'] === HTML5::CHARACTR &&
4320: preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
4321: ) {
4322: /* Append the character to the current node. */
4323: $this->insertText($token['data']);
4324:
4325: /* A comment token */
4326: } elseif ($token['type'] === HTML5::COMMENT) {
4327: /* Append a Comment node to the current node with the data
4328: attribute set to the data given in the comment token. */
4329: $this->insertComment($token['data']);
4330:
4331: /* An end tag with the tag name "html" */
4332: } elseif ($token['name'] === 'html' &&
4333: $token['type'] === HTML5::ENDTAG
4334: ) {
4335: /* Switch to the trailing end phase. */
4336: $this->phase = self::END_PHASE;
4337:
4338: /* A start tag with the tag name "noframes" */
4339: } elseif ($token['name'] === 'noframes' &&
4340: $token['type'] === HTML5::STARTTAG
4341: ) {
4342: /* Process the token as if the insertion mode had been "in body". */
4343: $this->inBody($token);
4344:
4345: /* Anything else */
4346: } else {
4347: /* Parse error. Ignore the token. */
4348: }
4349: }
4350:
4351: private function trailingEndPhase($token)
4352: {
4353: /* After the main phase, as each token is emitted from the tokenisation
4354: stage, it must be processed as described in this section. */
4355:
4356: /* A DOCTYPE token */
4357: if ($token['type'] === HTML5::DOCTYPE) {
4358: // Parse error. Ignore the token.
4359:
4360: /* A comment token */
4361: } elseif ($token['type'] === HTML5::COMMENT) {
4362: /* Append a Comment node to the Document object with the data
4363: attribute set to the data given in the comment token. */
4364: $comment = $this->dom->createComment($token['data']);
4365: $this->dom->appendChild($comment);
4366:
4367: /* A character token that is one of one of U+0009 CHARACTER TABULATION,
4368: U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
4369: or U+0020 SPACE */
4370: } elseif ($token['type'] === HTML5::CHARACTR &&
4371: preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
4372: ) {
4373: /* Process the token as it would be processed in the main phase. */
4374: $this->mainPhase($token);
4375:
4376: /* A character token that is not one of U+0009 CHARACTER TABULATION,
4377: U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
4378: or U+0020 SPACE. Or a start tag token. Or an end tag token. */
4379: } elseif (($token['type'] === HTML5::CHARACTR &&
4380: preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) ||
4381: $token['type'] === HTML5::STARTTAG || $token['type'] === HTML5::ENDTAG
4382: ) {
4383: /* Parse error. Switch back to the main phase and reprocess the
4384: token. */
4385: $this->phase = self::MAIN_PHASE;
4386: return $this->mainPhase($token);
4387:
4388: /* An end-of-file token */
4389: } elseif ($token['type'] === HTML5::EOF) {
4390: /* OMG DONE!! */
4391: }
4392: }
4393:
4394: private function insertElement($token, $append = true, $check = false)
4395: {
4396: // Proprietary workaround for libxml2's limitations with tag names
4397: if ($check) {
4398: // Slightly modified HTML5 tag-name modification,
4399: // removing anything that's not an ASCII letter, digit, or hyphen
4400: $token['name'] = preg_replace('/[^a-z0-9-]/i', '', $token['name']);
4401: // Remove leading hyphens and numbers
4402: $token['name'] = ltrim($token['name'], '-0..9');
4403: // In theory, this should ever be needed, but just in case
4404: if ($token['name'] === '') {
4405: $token['name'] = 'span';
4406: } // arbitrary generic choice
4407: }
4408:
4409: $el = $this->dom->createElement($token['name']);
4410:
4411: foreach ($token['attr'] as $attr) {
4412: if (!$el->hasAttribute($attr['name'])) {
4413: $el->setAttribute($attr['name'], (string)$attr['value']);
4414: }
4415: }
4416:
4417: $this->appendToRealParent($el);
4418: $this->stack[] = $el;
4419:
4420: return $el;
4421: }
4422:
4423: private function insertText($data)
4424: {
4425: $text = $this->dom->createTextNode($data);
4426: $this->appendToRealParent($text);
4427: }
4428:
4429: private function insertComment($data)
4430: {
4431: $comment = $this->dom->createComment($data);
4432: $this->appendToRealParent($comment);
4433: }
4434:
4435: private function appendToRealParent($node)
4436: {
4437: if ($this->foster_parent === null) {
4438: end($this->stack)->appendChild($node);
4439:
4440: } elseif ($this->foster_parent !== null) {
4441: /* If the foster parent element is the parent element of the
4442: last table element in the stack of open elements, then the new
4443: node must be inserted immediately before the last table element
4444: in the stack of open elements in the foster parent element;
4445: otherwise, the new node must be appended to the foster parent
4446: element. */
4447: for ($n = count($this->stack) - 1; $n >= 0; $n--) {
4448: if ($this->stack[$n]->nodeName === 'table' &&
4449: $this->stack[$n]->parentNode !== null
4450: ) {
4451: $table = $this->stack[$n];
4452: break;
4453: }
4454: }
4455:
4456: if (isset($table) && $this->foster_parent->isSameNode($table->parentNode)) {
4457: $this->foster_parent->insertBefore($node, $table);
4458: } else {
4459: $this->foster_parent->appendChild($node);
4460: }
4461:
4462: $this->foster_parent = null;
4463: }
4464: }
4465:
4466: private function elementInScope($el, $table = false)
4467: {
4468: if (is_array($el)) {
4469: foreach ($el as $element) {
4470: if ($this->elementInScope($element, $table)) {
4471: return true;
4472: }
4473: }
4474:
4475: return false;
4476: }
4477:
4478: $leng = count($this->stack);
4479:
4480: for ($n = 0; $n < $leng; $n++) {
4481: /* 1. Initialise node to be the current node (the bottommost node of
4482: the stack). */
4483: $node = $this->stack[$leng - 1 - $n];
4484:
4485: if ($node->tagName === $el) {
4486: /* 2. If node is the target node, terminate in a match state. */
4487: return true;
4488:
4489: } elseif ($node->tagName === 'table') {
4490: /* 3. Otherwise, if node is a table element, terminate in a failure
4491: state. */
4492: return false;
4493:
4494: } elseif ($table === true && in_array(
4495: $node->tagName,
4496: array(
4497: 'caption',
4498: 'td',
4499: 'th',
4500: 'button',
4501: 'marquee',
4502: 'object'
4503: )
4504: )
4505: ) {
4506: /* 4. Otherwise, if the algorithm is the "has an element in scope"
4507: variant (rather than the "has an element in table scope" variant),
4508: and node is one of the following, terminate in a failure state. */
4509: return false;
4510:
4511: } elseif ($node === $node->ownerDocument->documentElement) {
4512: /* 5. Otherwise, if node is an html element (root element), terminate
4513: in a failure state. (This can only happen if the node is the topmost
4514: node of the stack of open elements, and prevents the next step from
4515: being invoked if there are no more elements in the stack.) */
4516: return false;
4517: }
4518:
4519: /* Otherwise, set node to the previous entry in the stack of open
4520: elements and return to step 2. (This will never fail, since the loop
4521: will always terminate in the previous step if the top of the stack
4522: is reached.) */
4523: }
4524: }
4525:
4526: private function reconstructActiveFormattingElements()
4527: {
4528: /* 1. If there are no entries in the list of active formatting elements,
4529: then there is nothing to reconstruct; stop this algorithm. */
4530: $formatting_elements = count($this->a_formatting);
4531:
4532: if ($formatting_elements === 0) {
4533: return false;
4534: }
4535:
4536: /* 3. Let entry be the last (most recently added) element in the list
4537: of active formatting elements. */
4538: $entry = end($this->a_formatting);
4539:
4540: /* 2. If the last (most recently added) entry in the list of active
4541: formatting elements is a marker, or if it is an element that is in the
4542: stack of open elements, then there is nothing to reconstruct; stop this
4543: algorithm. */
4544: if ($entry === self::MARKER || in_array($entry, $this->stack, true)) {
4545: return false;
4546: }
4547:
4548: for ($a = $formatting_elements - 1; $a >= 0; true) {
4549: /* 4. If there are no entries before entry in the list of active
4550: formatting elements, then jump to step 8. */
4551: if ($a === 0) {
4552: $step_seven = false;
4553: break;
4554: }
4555:
4556: /* 5. Let entry be the entry one earlier than entry in the list of
4557: active formatting elements. */
4558: $a--;
4559: $entry = $this->a_formatting[$a];
4560:
4561: /* 6. If entry is neither a marker nor an element that is also in
4562: thetack of open elements, go to step 4. */
4563: if ($entry === self::MARKER || in_array($entry, $this->stack, true)) {
4564: break;
4565: }
4566: }
4567:
4568: while (true) {
4569: /* 7. Let entry be the element one later than entry in the list of
4570: active formatting elements. */
4571: if (isset($step_seven) && $step_seven === true) {
4572: $a++;
4573: $entry = $this->a_formatting[$a];
4574: }
4575:
4576: /* 8. Perform a shallow clone of the element entry to obtain clone. */
4577: $clone = $entry->cloneNode();
4578:
4579: /* 9. Append clone to the current node and push it onto the stack
4580: of open elements so that it is the new current node. */
4581: end($this->stack)->appendChild($clone);
4582: $this->stack[] = $clone;
4583:
4584: /* 10. Replace the entry for entry in the list with an entry for
4585: clone. */
4586: $this->a_formatting[$a] = $clone;
4587:
4588: /* 11. If the entry for clone in the list of active formatting
4589: elements is not the last entry in the list, return to step 7. */
4590: if (end($this->a_formatting) !== $clone) {
4591: $step_seven = true;
4592: } else {
4593: break;
4594: }
4595: }
4596: }
4597:
4598: private function clearTheActiveFormattingElementsUpToTheLastMarker()
4599: {
4600: /* When the steps below require the UA to clear the list of active
4601: formatting elements up to the last marker, the UA must perform the
4602: following steps: */
4603:
4604: while (true) {
4605: /* 1. Let entry be the last (most recently added) entry in the list
4606: of active formatting elements. */
4607: $entry = end($this->a_formatting);
4608:
4609: /* 2. Remove entry from the list of active formatting elements. */
4610: array_pop($this->a_formatting);
4611:
4612: /* 3. If entry was a marker, then stop the algorithm at this point.
4613: The list has been cleared up to the last marker. */
4614: if ($entry === self::MARKER) {
4615: break;
4616: }
4617: }
4618: }
4619:
4620: private function generateImpliedEndTags($exclude = array())
4621: {
4622: /* When the steps below require the UA to generate implied end tags,
4623: then, if the current node is a dd element, a dt element, an li element,
4624: a p element, a td element, a th element, or a tr element, the UA must
4625: act as if an end tag with the respective tag name had been seen and
4626: then generate implied end tags again. */
4627: $node = end($this->stack);
4628: $elements = array_diff(array('dd', 'dt', 'li', 'p', 'td', 'th', 'tr'), $exclude);
4629:
4630: while (in_array(end($this->stack)->nodeName, $elements)) {
4631: array_pop($this->stack);
4632: }
4633: }
4634:
4635: private function getElementCategory($node)
4636: {
4637: $name = $node->tagName;
4638: if (in_array($name, $this->special)) {
4639: return self::SPECIAL;
4640: } elseif (in_array($name, $this->scoping)) {
4641: return self::SCOPING;
4642: } elseif (in_array($name, $this->formatting)) {
4643: return self::FORMATTING;
4644: } else {
4645: return self::PHRASING;
4646: }
4647: }
4648:
4649: private function clearStackToTableContext($elements)
4650: {
4651: /* When the steps above require the UA to clear the stack back to a
4652: table context, it means that the UA must, while the current node is not
4653: a table element or an html element, pop elements from the stack of open
4654: elements. If this causes any elements to be popped from the stack, then
4655: this is a parse error. */
4656: while (true) {
4657: $node = end($this->stack)->nodeName;
4658:
4659: if (in_array($node, $elements)) {
4660: break;
4661: } else {
4662: array_pop($this->stack);
4663: }
4664: }
4665: }
4666:
4667: private function resetInsertionMode()
4668: {
4669: /* 1. Let last be false. */
4670: $last = false;
4671: $leng = count($this->stack);
4672:
4673: for ($n = $leng - 1; $n >= 0; $n--) {
4674: /* 2. Let node be the last node in the stack of open elements. */
4675: $node = $this->stack[$n];
4676:
4677: /* 3. If node is the first node in the stack of open elements, then
4678: set last to true. If the element whose innerHTML attribute is being
4679: set is neither a td element nor a th element, then set node to the
4680: element whose innerHTML attribute is being set. (innerHTML case) */
4681: if ($this->stack[0]->isSameNode($node)) {
4682: $last = true;
4683: }
4684:
4685: /* 4. If node is a select element, then switch the insertion mode to
4686: "in select" and abort these steps. (innerHTML case) */
4687: if ($node->nodeName === 'select') {
4688: $this->mode = self::IN_SELECT;
4689: break;
4690:
4691: /* 5. If node is a td or th element, then switch the insertion mode
4692: to "in cell" and abort these steps. */
4693: } elseif ($node->nodeName === 'td' || $node->nodeName === 'th') {
4694: $this->mode = self::IN_CELL;
4695: break;
4696:
4697: /* 6. If node is a tr element, then switch the insertion mode to
4698: "in row" and abort these steps. */
4699: } elseif ($node->nodeName === 'tr') {
4700: $this->mode = self::IN_ROW;
4701: break;
4702:
4703: /* 7. If node is a tbody, thead, or tfoot element, then switch the
4704: insertion mode to "in table body" and abort these steps. */
4705: } elseif (in_array($node->nodeName, array('tbody', 'thead', 'tfoot'))) {
4706: $this->mode = self::IN_TBODY;
4707: break;
4708:
4709: /* 8. If node is a caption element, then switch the insertion mode
4710: to "in caption" and abort these steps. */
4711: } elseif ($node->nodeName === 'caption') {
4712: $this->mode = self::IN_CAPTION;
4713: break;
4714:
4715: /* 9. If node is a colgroup element, then switch the insertion mode
4716: to "in column group" and abort these steps. (innerHTML case) */
4717: } elseif ($node->nodeName === 'colgroup') {
4718: $this->mode = self::IN_CGROUP;
4719: break;
4720:
4721: /* 10. If node is a table element, then switch the insertion mode
4722: to "in table" and abort these steps. */
4723: } elseif ($node->nodeName === 'table') {
4724: $this->mode = self::IN_TABLE;
4725: break;
4726:
4727: /* 11. If node is a head element, then switch the insertion mode
4728: to "in body" ("in body"! not "in head"!) and abort these steps.
4729: (innerHTML case) */
4730: } elseif ($node->nodeName === 'head') {
4731: $this->mode = self::IN_BODY;
4732: break;
4733:
4734: /* 12. If node is a body element, then switch the insertion mode to
4735: "in body" and abort these steps. */
4736: } elseif ($node->nodeName === 'body') {
4737: $this->mode = self::IN_BODY;
4738: break;
4739:
4740: /* 13. If node is a frameset element, then switch the insertion
4741: mode to "in frameset" and abort these steps. (innerHTML case) */
4742: } elseif ($node->nodeName === 'frameset') {
4743: $this->mode = self::IN_FRAME;
4744: break;
4745:
4746: /* 14. If node is an html element, then: if the head element
4747: pointer is null, switch the insertion mode to "before head",
4748: otherwise, switch the insertion mode to "after head". In either
4749: case, abort these steps. (innerHTML case) */
4750: } elseif ($node->nodeName === 'html') {
4751: $this->mode = ($this->head_pointer === null)
4752: ? self::BEFOR_HEAD
4753: : self::AFTER_HEAD;
4754:
4755: break;
4756:
4757: /* 15. If last is true, then set the insertion mode to "in body"
4758: and abort these steps. (innerHTML case) */
4759: } elseif ($last) {
4760: $this->mode = self::IN_BODY;
4761: break;
4762: }
4763: }
4764: }
4765:
4766: private function closeCell()
4767: {
4768: /* If the stack of open elements has a td or th element in table scope,
4769: then act as if an end tag token with that tag name had been seen. */
4770: foreach (array('td', 'th') as $cell) {
4771: if ($this->elementInScope($cell, true)) {
4772: $this->inCell(
4773: array(
4774: 'name' => $cell,
4775: 'type' => HTML5::ENDTAG
4776: )
4777: );
4778:
4779: break;
4780: }
4781: }
4782: }
4783:
4784: public function save()
4785: {
4786: return $this->dom;
4787: }
4788: }
4789: