| 1: | <?php | 
| 2: | |
| 3: | /** | 
| 4: | * Takes a well formed list of tokens and fixes their nesting. | 
| 5: | * | 
| 6: | * HTML elements dictate which elements are allowed to be their children, | 
| 7: | * for example, you can't have a p tag in a span tag. Other elements have | 
| 8: | * much more rigorous definitions: tables, for instance, require a specific | 
| 9: | * order for their elements. There are also constraints not expressible by | 
| 10: | * document type definitions, such as the chameleon nature of ins/del | 
| 11: | * tags and global child exclusions. | 
| 12: | * | 
| 13: | * The first major objective of this strategy is to iterate through all | 
| 14: | * the nodes and determine whether or not their children conform to the | 
| 15: | * element's definition. If they do not, the child definition may | 
| 16: | * optionally supply an amended list of elements that is valid or | 
| 17: | * require that the entire node be deleted (and the previous node | 
| 18: | * rescanned). | 
| 19: | * | 
| 20: | * The second objective is to ensure that explicitly excluded elements of | 
| 21: | * an element do not appear in its children. Code that accomplishes this | 
| 22: | * task is pervasive through the strategy, though the two are distinct tasks | 
| 23: | * and could, theoretically, be separated (although it's not recommended). | 
| 24: | * | 
| 25: | * @note Whether or not unrecognized children are silently dropped or | 
| 26: | * translated into text depends on the child definitions. | 
| 27: | * | 
| 28: | * @todo Enable nodes to be bubbled out of the structure. This is | 
| 29: | * easier with our new algorithm. | 
| 30: | */ | 
| 31: | |
class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
{

    /**
     * Converts the token list into a node tree, validates the children of
     * every node bottom-up against that node's child definition, marks
     * excluded or invalid nodes as dead, and flattens the tree back into
     * a token list.
     *
     * While running, the context variables 'IsInline', 'CurrentNode' and
     * 'CurrentToken' are registered; they are destroyed before returning.
     *
     * @param HTMLPurifier_Token[] $tokens
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return array|HTMLPurifier_Token[]
     */
    public function execute($tokens, $config, $context)
    {

        //####################################################################//
        // Pre-processing

        // O(n) pass to convert to a tree, so that we can efficiently
        // refer to substrings
        $top_node = HTMLPurifier_Arborize::arborize($tokens, $config, $context);

        // get a copy of the HTML definition
        $definition = $config->getHTMLDefinition();

        // definition of the (virtual) parent element that wraps the
        // whole fragment; used for the root node of the tree
        $parent_def = $definition->info_parent_def;

        $excludes_enabled = !$config->get('Core.DisableExcludes');

        // setup the context variable 'IsInline', for chameleon processing.
        // It starts out as the parent definition's descendants_are_inline
        // flag and is overwritten with the per-frame value every time a
        // stack frame is popped below.
        // NOTE(review): this (and CurrentNode/CurrentToken below) only has
        // an effect if register() binds the variable by reference - confirm
        // against HTMLPurifier_Context.
        $is_inline = $parent_def->descendants_are_inline;
        $context->register('IsInline', $is_inline);

        // setup error collector (may be empty when none is registered,
        // hence the "if ($e)" guards before every send())
        $e =& $context->get('ErrorCollector', true);

        //####################################################################//
        // Loop initialization

        // variable that contains the node (and its start token) while we
        // are processing nodes. This enables error reporting to do its job
        $node = $top_node;
        // dummy token; only the first element of the pair is needed
        list($token) = $node->toTokenPair();
        $context->register('CurrentNode', $node);
        $context->register('CurrentToken', $token);

        //####################################################################//
        // Loop

        // We need to implement a post-order traversal iteratively, to
        // avoid running into stack space limits.  This is pretty tricky
        // to reason about, so we just manually stack-ify the recursive
        // variant:
        //
        //  function f($node) {
        //      foreach ($node->children as $child) {
        //          f($child);
        //      }
        //      validate($node);
        //  }
        //
        // Thus, we will represent a stack frame as
        //      array($node, $is_inline, $excludes, $ix)
        // where $excludes is the set of element names excluded by the
        // node's ancestors and $ix is the index of the next child of
        // $node that still has to be visited.

        $stack = array(
            array(
                $top_node,
                $parent_def->descendants_are_inline,
                $parent_def->excludes, // exclusions
                0
            )
        );

        while (!empty($stack)) {
            list($node, $is_inline, $excludes, $ix) = array_pop($stack);

            // The root frame is the only one that leaves the stack empty
            // once popped, so it is validated against the parent definition;
            // every other node uses the definition registered for its name.
            $def = empty($stack) ? $parent_def : $definition->info[$node->name];

            // recursive call: descend into the next unvisited element child
            $go = false;
            while (isset($node->children[$ix])) {
                $child = $node->children[$ix++];
                if ($child instanceof HTMLPurifier_Node_Element) {
                    $go = true;
                    // resume this node later, starting after $child
                    $stack[] = array($node, $is_inline, $excludes, $ix);
                    $stack[] = array(
                        $child,
                        // ToDo: I don't think it matters if it's def or
                        // child_def, but double check this...
                        $is_inline || $def->descendants_are_inline,
                        empty($def->excludes) ? $excludes
                                              : array_merge($excludes, $def->excludes),
                        0
                    );
                    break;
                }
            }
            if ($go) {
                continue;
            }

            // base case: all element children have been processed, so the
            // node itself can be validated now
            list($token) = $node->toTokenPair();
            if ($excludes_enabled && isset($excludes[$node->name])) {
                // an ancestor explicitly excludes this element
                $node->dead = true;
                if ($e) {
                    $e->send(E_ERROR, 'Strategy_FixNesting: Node excluded');
                }
            } else {
                // XXX I suppose it would be slightly more efficient to
                // avoid the allocation here and have children
                // strategies handle it
                $children = array();
                foreach ($node->children as $child) {
                    if (!$child->dead) {
                        $children[] = $child;
                    }
                }
                $result = $def->child->validateChildren($children, $config, $context);
                if ($result === true) {
                    // children are valid as-is; still store the filtered
                    // list so that dead children are dropped
                    $node->children = $children;
                } elseif ($result === false) {
                    // the node cannot be repaired: remove it entirely
                    $node->dead = true;
                    if ($e) {
                        $e->send(E_ERROR, 'Strategy_FixNesting: Node removed');
                    }
                } else {
                    // the child definition supplied an amended child list
                    $node->children = $result;
                    if ($e) {
                        // XXX This will miss mutations of internal nodes. Perhaps defer to the child validators
                        if (empty($result) && !empty($children)) {
                            $e->send(E_ERROR, 'Strategy_FixNesting: Node contents removed');
                        } elseif ($result != $children) {
                            $e->send(E_WARNING, 'Strategy_FixNesting: Node reorganized');
                        }
                    }
                }
            }
        }

        //####################################################################//
        // Post-processing

        // remove context variables
        $context->destroy('IsInline');
        $context->destroy('CurrentNode');
        $context->destroy('CurrentToken');

        //####################################################################//
        // Return

        // The root frame is always the last one processed, so $node equals
        // $top_node here; use $top_node to make that explicit.
        return HTMLPurifier_Arborize::flatten($top_node, $config, $context);
    }
}

// vim: et sw=4 sts=4
| 180: | |
| 181: | // vim: et sw=4 sts=4 | 
| 182: |