| 1: | <?php |
| 2: | |
| 3: | /** |
| 4: | * Takes a well formed list of tokens and fixes their nesting. |
| 5: | * |
| 6: | * HTML elements dictate which elements are allowed to be their children, |
| 7: | * for example, you can't have a p tag in a span tag. Other elements have |
| 8: | * much more rigorous definitions: tables, for instance, require a specific |
| 9: | * order for their elements. There are also constraints not expressible by |
| 10: | * document type definitions, such as the chameleon nature of ins/del |
| 11: | * tags and global child exclusions. |
| 12: | * |
| 13: | * The first major objective of this strategy is to iterate through all |
| 14: | * the nodes and determine whether or not their children conform to the |
| 15: | * element's definition. If they do not, the child definition may |
| 16: | * optionally supply an amended list of elements that is valid or |
| 17: | * require that the entire node be deleted (and the previous node |
| 18: | * rescanned). |
| 19: | * |
| 20: | * The second objective is to ensure that explicitly excluded elements of |
| 21: | * an element do not appear in its children. Code that accomplishes this |
| 22: | * task is pervasive through the strategy, though the two are distinct tasks |
| 23: | * and could, theoretically, be separated (although it's not recommended). |
| 24: | * |
| 25: | * @note Whether or not unrecognized children are silently dropped or |
| 26: | * translated into text depends on the child definitions. |
| 27: | * |
| 28: | * @todo Enable nodes to be bubbled out of the structure. This is |
| 29: | * easier with our new algorithm. |
| 30: | */ |
| 31: | |
class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
{

    /**
     * Arborizes the token list, validates every node's children against
     * that node's child definition in post-order, and flattens the tree
     * back into a token list.
     *
     * A node is marked dead (and is therefore skipped when its parent
     * collects its live children) when exclusions are enabled and its
     * name appears in the exclusions accumulated from its ancestors, or
     * when its child definition's validateChildren() returns false. When
     * validateChildren() returns a replacement list, that list becomes
     * the node's children.
     *
     * @param HTMLPurifier_Token[] $tokens
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return array|HTMLPurifier_Token[]
     */
    public function execute($tokens, $config, $context)
    {
        //####################################################################//
        // Pre-processing

        // O(n) pass to convert to a tree, so that we can efficiently
        // refer to substrings
        $top_node = HTMLPurifier_Arborize::arborize($tokens, $config, $context);

        // get a copy of the HTML definition
        $definition = $config->getHTMLDefinition();
        $parent_def = $definition->info_parent_def;

        $excludes_enabled = !$config->get('Core.DisableExcludes');

        // setup the context variable 'IsInline', for chameleon processing
        // is 'false' when we are not inline, 'true' when it must always
        // be inline, and an integer when it is inline for a certain
        // branch of the document tree
        //
        // NOTE(review): $is_inline, $node and $token are reassigned inside
        // the loop below after being registered here. This presumably
        // relies on HTMLPurifier_Context::register() binding its argument
        // by reference, so that the context always sees the current
        // values -- confirm before renaming or replacing these variables.
        $is_inline = $parent_def->descendants_are_inline;
        $context->register('IsInline', $is_inline);

        // setup error collector
        $e =& $context->get('ErrorCollector', true);

        //####################################################################//
        // Loop initialization

        // variable that contains the start token while we are processing
        // nodes. This enables error reporting to do its job
        $node = $top_node;
        // dummy token (only the start token of the pair is needed)
        list($token) = $node->toTokenPair();
        $context->register('CurrentNode', $node);
        $context->register('CurrentToken', $token);

        //####################################################################//
        // Loop

        // We need to implement a post-order traversal iteratively, to
        // avoid running into stack space limits.  This is pretty tricky
        // to reason about, so we just manually stack-ify the recursive
        // variant:
        //
        //  function f($node) {
        //      foreach ($node->children as $child) {
        //          f($child);
        //      }
        //      validate($node);
        //  }
        //
        // A stack frame is array($node, $is_inline, $excludes, $ix):
        //   - $node:      the node this frame belongs to
        //   - $is_inline: the 'IsInline' value in effect for this node
        //   - $excludes:  exclusions accumulated from the ancestors
        //   - $ix:        index of the next child that still has to be
        //                 visited; children before it are already done

        $stack = array(
            array(
                $top_node,
                $parent_def->descendants_are_inline,
                $parent_def->excludes, // exclusions
                0
            )
        );

        while (!empty($stack)) {
            list($node, $is_inline, $excludes, $ix) = array_pop($stack);

            // The root frame is the only one popped while the stack is
            // otherwise empty; it is validated against the parent definition.
            $def = empty($stack) ? $parent_def : $definition->info[$node->name];

            // recursive call: descend into the next unvisited element child
            $descended = false;
            while (isset($node->children[$ix])) {
                $child = $node->children[$ix++];
                if (!($child instanceof HTMLPurifier_Node_Element)) {
                    // non-element children have no children of their own
                    continue;
                }
                $descended = true;
                // re-push this frame so we resume after the child is done
                $stack[] = array($node, $is_inline, $excludes, $ix);
                $stack[] = array(
                    $child,
                    // ToDo: I don't think it matters if it's def or
                    // child_def, but double check this...
                    $is_inline || $def->descendants_are_inline,
                    empty($def->excludes) ? $excludes : array_merge($excludes, $def->excludes),
                    0
                );
                break;
            }
            if ($descended) {
                continue;
            }

            // All children have been processed: refresh the current token
            // for error reporting and validate this node.
            list($token) = $node->toTokenPair();

            // base case
            if ($excludes_enabled && isset($excludes[$node->name])) {
                $node->dead = true;
                if ($e) {
                    $e->send(E_ERROR, 'Strategy_FixNesting: Node excluded');
                }
                continue;
            }

            // XXX I suppose it would be slightly more efficient to
            // avoid the allocation here and have children
            // strategies handle it
            $children = array();
            foreach ($node->children as $child) {
                if (!$child->dead) {
                    $children[] = $child;
                }
            }

            $result = $def->child->validateChildren($children, $config, $context);
            if ($result === true) {
                // children are valid as they are; just drop the dead ones
                $node->children = $children;
            } elseif ($result === false) {
                // the node cannot be repaired and is removed as a whole
                $node->dead = true;
                if ($e) {
                    $e->send(E_ERROR, 'Strategy_FixNesting: Node removed');
                }
            } else {
                // the child definition supplied an amended child list
                $node->children = $result;
                if ($e) {
                    // XXX This will miss mutations of internal nodes. Perhaps defer to the child validators
                    if (empty($result) && !empty($children)) {
                        $e->send(E_ERROR, 'Strategy_FixNesting: Node contents removed');
                    } elseif ($result != $children) {
                        $e->send(E_WARNING, 'Strategy_FixNesting: Node reorganized');
                    }
                }
            }
        }

        //####################################################################//
        // Post-processing

        // remove context variables
        $context->destroy('IsInline');
        $context->destroy('CurrentNode');
        $context->destroy('CurrentToken');

        //####################################################################//
        // Return

        // The last frame popped is always the root frame, so the tree to
        // flatten is the (now validated) top node.
        return HTMLPurifier_Arborize::flatten($top_node, $config, $context);
    }
}
| 180: | |
| 181: | // vim: et sw=4 sts=4 |
| 182: |