Source for file FixNesting.php
Documentation is available at FixNesting.php
* Takes a well formed list of tokens and fixes their nesting.
* HTML elements dictate which elements are allowed to be their children,
* for example, you can't have a p tag in a span tag. Other elements have
* much more rigorous definitions: tables, for instance, require a specific
* order for their elements. There are also constraints not expressible by
* document type definitions, such as the chameleon nature of ins/del
* tags and global child exclusions.
* The first major objective of this strategy is to iterate through all the
* nodes (not tokens) of the list of tokens and determine whether or not
* their children conform to the element's definition. If they do not, the
* child definition may optionally supply an amended list of elements that
* is valid or require that the entire node be deleted (and the previous
* node rescanned).
* The second objective is to ensure that explicitly excluded elements of
* an element do not appear in its children. Code that accomplishes this
* task is pervasive through the strategy, though the two are distinct tasks
* and could, theoretically, be separated (although it's not recommended).
* @note Whether or not unrecognized children are silently dropped or
* translated into text depends on the child definitions.
* @todo Enable nodes to be bubbled out of the structure.
/**
 * Walks the token list node by node, has each element's child definition
 * validate that node's children, and enforces parent exclusions.
 *
 * NOTE(review): this listing appears to have lost statements and closing
 * braces (for example, no initialisation of $stack or $child_tokens is
 * visible) -- confirm details against the original file.
 *
 * @param $tokens  Token list; read by index, and start tokens expose ->name.
 * @param $config  Configuration object; supplies the HTML definition.
 * @param $context Context object; 'IsInline' and 'CurrentToken' are
 *                 registered on it here and destroyed before the end.
 */
public function execute($tokens, $config, $context) {
//####################################################################//
// get a copy of the HTML definition
$definition =
$config->getHTMLDefinition();
// insert implicit "parent" node, will be removed at end.
$parent_name =
$definition->info_parent;
// setup the context variable 'IsInline', for chameleon processing
// is 'false' when we are not inline, 'true' when it must always
// be inline, and an integer when it is inline for a certain
// branch of the document tree
$is_inline =
$definition->info_parent_def->descendants_are_inline;
$context->register('IsInline', $is_inline);
// NOTE(review): presumably the second argument makes a missing
// 'ErrorCollector' non-fatal, in which case $e may be empty -- confirm
// that every $e->send() call below is guarded in the original file.
$e =
& $context->get('ErrorCollector', true);
//####################################################################//
// stack that contains the indexes of all parents,
// $stack[count($stack)-1] being the current parent
// stack that contains all elements that are excluded
// it is organized by parent elements, similar to $stack,
// but it is only populated when an element with exclusions is
// processed, i.e. there won't be empty exclusions.
$exclude_stack =
array();
// variable that contains the start token while we are processing
// nodes. This enables error reporting to do its job
$context->register('CurrentToken', $start_token);
//####################################################################//
// iterate through all start nodes. Determining the start node
// is complicated so it has been omitted from the loop construct
for ($i =
0, $size =
count($tokens) ; $i <
$size; ) {
//################################################################//
// Gather information on children
// child token accumulator
// scroll to the end of this node, report number, and collect
for ($j =
$i, $depth =
0; ; $j++
) {
// skip token assignment on first iteration, this is the
// token we currently are on
if ($depth ==
1) continue;
// skip token assignment on last iteration, this is the
// end token of the token we're currently on
$child_tokens[] =
$tokens[$j];
// $i is index of start token
// $j is index of end token
$start_token =
$tokens[$i]; // to make token available via CurrentToken
//################################################################//
// Gather information on parent
// calculate parent information
// (the assignment inside the condition is intentional: $count is
// reused below to index the stack and for the inline check)
if ($count =
count($stack)) {
$parent_index =
$stack[$count-
1];
$parent_name =
$tokens[$parent_index]->name;
if ($parent_index ==
0) {
$parent_def =
$definition->info_parent_def;
$parent_def =
$definition->info[$parent_name];
// processing as if the parent were the "root" node
// unknown info, it won't be used anyway, in the future,
// we may want to enforce one element only (this is
// necessary for HTML Purifier to clean entire documents)
$parent_index =
$parent_name =
$parent_def =
null;
if ($is_inline ===
false) {
// check if conditions make it inline
if (!empty($parent_def) &&
$parent_def->descendants_are_inline) {
// check if we're out of inline
if ($count ===
$is_inline) {
//################################################################//
// Determine whether element is explicitly excluded SGML-style
// determine whether or not element is excluded by checking all
// parent exclusions. The array should not be very large, two
if (!empty($exclude_stack)) {
foreach ($exclude_stack as $lookup) {
if (isset
($lookup[$tokens[$i]->name])) {
// no need to continue processing
//################################################################//
// Perform child validation
// there is an exclusion, remove the entire node
$excludes =
array(); // not used, but good to initialize anyway
// special processing for the first node
$def =
$definition->info_parent_def;
$def =
$definition->info[$tokens[$i]->name];
if (!empty($def->child)) {
// have DTD child def validate children
$result =
$def->child->validateChildren(
$child_tokens, $config, $context);
// weird, no child definition, get rid of everything
// determine whether or not this element has any exclusions
$excludes =
$def->excludes;
// $result is now a bool or array
//################################################################//
// Process result by interpreting $result
// children are kept as-is when the validator returned true or
// handed back a token list identical to the one it was given
if ($result ===
true ||
$child_tokens ===
$result) {
// register start token as a parental node start
// register exclusions if there are any
if (!empty($excludes)) $exclude_stack[] =
$excludes;
// move cursor to next possible start node
} elseif($result ===
false) {
$e->send(E_ERROR, 'Strategy_FixNesting: Node excluded');
$e->send(E_ERROR, 'Strategy_FixNesting: Node removed');
// calculate length of inner tokens and current tokens
// there is no start token to register,
// current node is now the next possible start node
// unless it turns out that we need to do a double-check
// this is a rough heuristic that covers 100% of HTML's
// cases and 99% of all other cases. A child definition
// that would be tricked by this would be something like:
// ( | a b c) where it's all or nothing. Fortunately,
// our current implementation claims that that case would
// not allow empty, even if it did
if (!$parent_def->child->allow_empty) {
// we need to do a double-check
// PROJECTED OPTIMIZATION: Process all children elements before
// reprocessing parent node.
// replace node with $result
// calculate length of inner tokens
if (empty($result) &&
$length) {
$e->send(E_ERROR, 'Strategy_FixNesting: Node contents removed');
$e->send(E_WARNING, 'Strategy_FixNesting: Node reorganized');
// register start token as a parental node start
// register exclusions if there are any
if (!empty($excludes)) $exclude_stack[] =
$excludes;
// move cursor to next possible start node
//################################################################//
// Scroll to next start node
// We assume, at this point, that $i is the index of the token
// that is the first possible new start point for a node.
// Test if the token indeed is a start tag, if not, move forward
// pop a token index off the stack if we ended a node
// pop an exclusion lookup off exclusion stack if
// we ended node and that node had exclusions
if ($i ==
0 ||
$i ==
$size -
1) {
// use specialized var if it's the super-parent
$s_excludes =
$definition->info_parent_def->excludes;
$s_excludes =
$definition->info[$tokens[$i]->name]->excludes;
//####################################################################//
// remove implicit parent tokens at the beginning and end
// remove context variables
$context->destroy('IsInline');
$context->destroy('CurrentToken');
Documentation generated on Thu, 19 Jun 2008 18:49:12 -0400 by phpDocumentor 1.4.2