1: | <?php |
2: | |
3: | /** |
4: | * Takes a well formed list of tokens and fixes their nesting. |
5: | * |
6: | * HTML elements dictate which elements are allowed to be their children, |
7: | * for example, you can't have a p tag in a span tag. Other elements have |
8: | * much more rigorous definitions: tables, for instance, require a specific |
9: | * order for their elements. There are also constraints not expressible by |
10: | * document type definitions, such as the chameleon nature of ins/del |
11: | * tags and global child exclusions. |
12: | * |
13: | * The first major objective of this strategy is to iterate through all |
14: | * the nodes and determine whether or not their children conform to the |
15: | * element's definition. If they do not, the child definition may |
16: | * optionally supply an amended list of elements that is valid or |
17: | * require that the entire node be deleted (and the previous node |
18: | * rescanned). |
19: | * |
20: | * The second objective is to ensure that explicitly excluded elements of |
21: | * an element do not appear in its children. Code that accomplishes this |
22: | * task is pervasive through the strategy, though the two are distinct tasks |
23: | * and could, theoretically, be separated (although it's not recommended). |
24: | * |
25: | * @note Whether or not unrecognized children are silently dropped or |
26: | * translated into text depends on the child definitions. |
27: | * |
28: | * @todo Enable nodes to be bubbled out of the structure. This is |
29: | * easier with our new algorithm. |
30: | */ |
31: | |
class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
{

    /**
     * Validates the nesting of a token list and returns the corrected list.
     *
     * The tokens are converted into a node tree, the tree is walked in
     * post-order (children before their parent) with an explicit stack,
     * and each visited node is either marked dead (its name is in the
     * exclusion set accumulated from its ancestors, or its child definition
     * rejects its children) or has its children replaced by the validated
     * list. The tree is then flattened back into tokens.
     *
     * While running, the context variables 'IsInline', 'CurrentNode' and
     * 'CurrentToken' are registered; they are destroyed before returning.
     *
     * @param HTMLPurifier_Token[] $tokens
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return array|HTMLPurifier_Token[]
     */
    public function execute($tokens, $config, $context)
    {

        //####################################################################//
        // Pre-processing

        // O(n) pass to convert to a tree, so that we can efficiently
        // refer to substrings
        $top_node = HTMLPurifier_Arborize::arborize($tokens, $config, $context);

        // get a copy of the HTML definition
        $definition = $config->getHTMLDefinition();
        $parent_def = $definition->info_parent_def;

        $excludes_enabled = !$config->get('Core.DisableExcludes');

        // setup the context variable 'IsInline', for chameleon processing
        // is 'false' when we are not inline, 'true' when it must always
        // be inline, and an integer when it is inline for a certain
        // branch of the document tree
        //
        // NOTE(review): $is_inline, $node and $token are handed to
        // $context->register() and then reassigned inside the loop; this
        // presumably relies on register() binding them by reference, so do
        // not rename or replace these variables without confirming that.
        $is_inline = $parent_def->descendants_are_inline;
        $context->register('IsInline', $is_inline);

        // setup error collector
        $e =& $context->get('ErrorCollector', true);

        //####################################################################//
        // Loop initialization

        // variables that contain the node currently being processed and
        // its start token. This enables error reporting to do its job
        $node = $top_node;
        // dummy token
        list($token) = $node->toTokenPair();
        $context->register('CurrentNode', $node);
        $context->register('CurrentToken', $token);

        //####################################################################//
        // Loop

        // We need to implement a post-order traversal iteratively, to
        // avoid running into stack space limits. This is pretty tricky
        // to reason about, so we just manually stack-ify the recursive
        // variant:
        //
        //  function f($node) {
        //      foreach ($node->children as $child) {
        //          f($child);
        //      }
        //      validate($node);
        //  }
        //
        // A stack frame is array($node, $is_inline, $excludes, $ix):
        //   - $node      the node the frame belongs to
        //   - $is_inline inline state that applies to this node
        //   - $excludes  element names excluded at this point of the tree
        //   - $ix        index of the next child of $node to look at
        $stack = array(
            array(
                $top_node,
                $parent_def->descendants_are_inline,
                $parent_def->excludes, // exclusions
                0
            )
        );

        while (!empty($stack)) {
            list($node, $is_inline, $excludes, $ix) = array_pop($stack);

            // only the root frame leaves the stack empty once it is popped;
            // the root is validated against the parent definition
            $def = empty($stack) ? $parent_def : $definition->info[$node->name];

            // recursive call: find the next element child, remember where
            // to resume in this node, and process the child first
            $descend = false;
            while (isset($node->children[$ix])) {
                $child = $node->children[$ix++];
                if ($child instanceof HTMLPurifier_Node_Element) {
                    $descend = true;
                    $stack[] = array($node, $is_inline, $excludes, $ix);
                    $stack[] = array(
                        $child,
                        // ToDo: I don't think it matters if it's def or
                        // child_def, but double check this...
                        $is_inline || $def->descendants_are_inline,
                        empty($def->excludes) ? $excludes
                                              : array_merge($excludes, $def->excludes),
                        0
                    );
                    break;
                }
            }
            if ($descend) {
                continue;
            }

            // base case: every child has been visited, validate this node
            list($token) = $node->toTokenPair();
            if ($excludes_enabled && isset($excludes[$node->name])) {
                $node->dead = true;
                if ($e) {
                    $e->send(E_ERROR, 'Strategy_FixNesting: Node excluded');
                }
                continue;
            }

            // XXX I suppose it would be slightly more efficient to
            // avoid the allocation here and have children
            // strategies handle it
            $children = array();
            foreach ($node->children as $child) {
                if (!$child->dead) {
                    $children[] = $child;
                }
            }

            $result = $def->child->validateChildren($children, $config, $context);
            if ($result === true) {
                // children are valid as they are; still store the list so
                // that dead children are dropped from the node
                $node->children = $children;
            } elseif ($result === false) {
                // the child definition rejected the node as a whole
                $node->dead = true;
                if ($e) {
                    $e->send(E_ERROR, 'Strategy_FixNesting: Node removed');
                }
            } else {
                // the child definition supplied a replacement child list
                $node->children = $result;
                if ($e) {
                    // XXX This will miss mutations of internal nodes. Perhaps defer to the child validators
                    if (empty($result) && !empty($children)) {
                        $e->send(E_ERROR, 'Strategy_FixNesting: Node contents removed');
                    } elseif ($result != $children) {
                        $e->send(E_WARNING, 'Strategy_FixNesting: Node reorganized');
                    }
                }
            }
        }

        //####################################################################//
        // Post-processing

        // remove context variables
        $context->destroy('IsInline');
        $context->destroy('CurrentNode');
        $context->destroy('CurrentToken');

        //####################################################################//
        // Return

        // the root frame is always the last one popped, so the whole tree
        // has been validated by now; flatten it starting from the root
        return HTMLPurifier_Arborize::flatten($top_node, $config, $context);
    }
}
180: | |
181: | // vim: et sw=4 sts=4 |
182: |