1: <?php
2:
3: /**
4: * Takes a well formed list of tokens and fixes their nesting.
5: *
6: * HTML elements dictate which elements are allowed to be their children,
7: * for example, you can't have a p tag in a span tag. Other elements have
8: * much more rigorous definitions: tables, for instance, require a specific
9: * order for their elements. There are also constraints not expressible by
10: * document type definitions, such as the chameleon nature of ins/del
11: * tags and global child exclusions.
12: *
13: * The first major objective of this strategy is to iterate through all
14: * the nodes and determine whether or not their children conform to the
15: * element's definition. If they do not, the child definition may
16: * optionally supply an amended list of elements that is valid or
17: * require that the entire node be deleted (and the previous node
18: * rescanned).
19: *
20: * The second objective is to ensure that explicitly excluded elements of
21: * an element do not appear in its children. Code that accomplishes this
22: * task is pervasive through the strategy, though the two are distinct tasks
23: * and could, theoretically, be separated (although it's not recommended).
24: *
25: * @note Whether or not unrecognized children are silently dropped or
26: * translated into text depends on the child definitions.
27: *
28: * @todo Enable nodes to be bubbled out of the structure. This is
29: * easier with our new algorithm.
30: */
31:
class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
{

    /**
     * Checks the nesting of a token list and returns the corrected list.
     *
     * The tokens are turned into a node tree with
     * HTMLPurifier_Arborize::arborize(). Every element node is then visited
     * in post-order (children before their parent) using an explicit stack
     * instead of recursion. For each visited node:
     *
     *  - if excludes are enabled ('Core.DisableExcludes' is falsy) and the
     *    node's name is a key of the exclusions inherited from its
     *    ancestors, the node is marked dead;
     *  - otherwise its children that are not dead are handed to the child
     *    definition's validateChildren(): a result of true keeps those
     *    children, false marks the node itself dead, and any other result
     *    replaces the node's children.
     *
     * Finally the tree is converted back with HTMLPurifier_Arborize::flatten().
     *
     * While running, the context variables 'IsInline', 'CurrentNode' and
     * 'CurrentToken' are registered; they are destroyed before returning.
     *
     * @param HTMLPurifier_Token[] $tokens
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return array|HTMLPurifier_Token[]
     */
    public function execute($tokens, $config, $context)
    {
        //####################################################################//
        // Pre-processing

        // O(n) pass to convert to a tree, so that we can efficiently
        // refer to substrings
        $top_node = HTMLPurifier_Arborize::arborize($tokens, $config, $context);

        // HTML definition, and the definition used for the root of the tree
        $definition = $config->getHTMLDefinition();
        $parent_def = $definition->info_parent_def;

        $excludes_enabled = !$config->get('Core.DisableExcludes');

        // Context variable 'IsInline', for chameleon processing. It starts
        // out as the root definition's flag and is overwritten with the flag
        // of whichever stack frame is currently being processed.
        // NOTE(review): $is_inline, $node and $token are handed to
        // $context->register() and reassigned in the loop below; this
        // presumably relies on register() binding by reference, so these
        // three variables must not be renamed or replaced by copies --
        // confirm against HTMLPurifier_Context::register().
        $is_inline = $parent_def->descendants_are_inline;
        $context->register('IsInline', $is_inline);

        // Error collector; may be empty, hence the `if ($e)` guards below.
        $e =& $context->get('ErrorCollector', true);

        //####################################################################//
        // Loop initialization

        // $node / $token describe the node currently being processed, which
        // enables error reporting to do its job. Only the first element of
        // the token pair is needed here.
        $node = $top_node;
        list($token) = $node->toTokenPair();
        $context->register('CurrentNode', $node);
        $context->register('CurrentToken', $token);

        //####################################################################//
        // Loop

        // We need to implement a post-order traversal iteratively, to
        // avoid running into stack space limits. This is pretty tricky
        // to reason about, so we just manually stack-ify the recursive
        // variant:
        //
        //      function f($node) {
        //          foreach ($node->children as $child) {
        //              f($child);
        //          }
        //          validate($node);
        //      }
        //
        // A stack frame is array($node, $is_inline, $excludes, $ix) where
        // $excludes are the exclusions inherited from the ancestors and $ix
        // is the index of the next child of $node still to be looked at.
        $stack = array(
            array(
                $top_node,
                $parent_def->descendants_are_inline,
                $parent_def->excludes, // exclusions
                0
            )
        );

        while (!empty($stack)) {
            list($node, $is_inline, $excludes, $ix) = array_pop($stack);

            // If nothing is left on the stack after the pop, this frame is
            // the root, which is validated against the parent definition
            // instead of being looked up by element name.
            $def = empty($stack) ? $parent_def : $definition->info[$node->name];

            // "recursive call": find the next element child; if there is
            // one, re-push this frame (remembering where to resume) and
            // push a fresh frame for the child so it is handled first.
            $go = false;
            while (isset($node->children[$ix])) {
                $child = $node->children[$ix++];
                if ($child instanceof HTMLPurifier_Node_Element) {
                    $go = true;
                    $stack[] = array($node, $is_inline, $excludes, $ix);
                    $stack[] = array(
                        $child,
                        // ToDo: I don't think it matters if it's def or
                        // child_def, but double check this...
                        $is_inline || $def->descendants_are_inline,
                        empty($def->excludes)
                            ? $excludes
                            : array_merge($excludes, $def->excludes),
                        0
                    );
                    break;
                }
            }
            if ($go) {
                continue;
            }

            // "base case": every element child has been processed, so the
            // node itself can be validated now.
            list($token) = $node->toTokenPair();

            if ($excludes_enabled && isset($excludes[$node->name])) {
                // an ancestor forbids this element
                $node->dead = true;
                if ($e) {
                    $e->send(E_ERROR, 'Strategy_FixNesting: Node excluded');
                }
                continue;
            }

            // XXX I suppose it would be slightly more efficient to
            // avoid the allocation here and have children
            // strategies handle it
            $children = array();
            foreach ($node->children as $child) {
                if (!$child->dead) {
                    $children[] = $child;
                }
            }

            $result = $def->child->validateChildren($children, $config, $context);
            if ($result === true) {
                // children are valid as they are; still store the list so
                // that dead children are dropped from the node
                $node->children = $children;
            } elseif ($result === false) {
                // children cannot be repaired: the whole node goes
                $node->dead = true;
                if ($e) {
                    $e->send(E_ERROR, 'Strategy_FixNesting: Node removed');
                }
            } else {
                // the child definition supplied an amended child list
                $node->children = $result;
                if ($e) {
                    // XXX This will miss mutations of internal nodes. Perhaps defer to the child validators
                    if (empty($result) && !empty($children)) {
                        $e->send(E_ERROR, 'Strategy_FixNesting: Node contents removed');
                    } elseif ($result != $children) {
                        $e->send(E_WARNING, 'Strategy_FixNesting: Node reorganized');
                    }
                }
            }
        }

        //####################################################################//
        // Post-processing

        // remove context variables
        $context->destroy('IsInline');
        $context->destroy('CurrentNode');
        $context->destroy('CurrentToken');

        //####################################################################//
        // Return

        // The root frame sits at the bottom of the stack and is therefore
        // the last one popped, so $node is the root of the tree again here.
        return HTMLPurifier_Arborize::flatten($node, $config, $context);
    }
}
180:
181: // vim: et sw=4 sts=4
182: