HTMLPurifier 4.4.0
|
<?php

/**
 * Strategy that fixes the nesting of a token list according to the child
 * definitions of the active HTML definition.
 *
 * The token list is wrapped in an implicit parent element, then every start
 * token is visited in document order. For each node the direct and indirect
 * child tokens are collected and handed to the element's child definition
 * (`$def->child->validateChildren()`); depending on the answer the node is
 * kept, its contents are replaced, or the whole node is removed. SGML-style
 * exclusions (`$def->excludes`) of all currently open ancestors are honoured
 * as well.
 *
 * @note The node scanner has no bounds check: it relies on every start token
 *       having a matching end token in the list it is given.
 */
class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
{

    /**
     * Validates and repairs element nesting in a token list.
     *
     * @param array $tokens  List of tokens to process; start/end tokens are
     *                       expected to be balanced (see class note).
     * @param object $config  Configuration object; only getHTMLDefinition()
     *                        is used here.
     * @param object $context Context object; 'IsInline' and 'CurrentToken'
     *                        are registered for the duration of the call and
     *                        destroyed afterwards, 'ErrorCollector' is read.
     * @return array The processed token list, without the implicit parent.
     */
    public function execute($tokens, $config, $context) {
        //####################################################################//
        // Pre-processing

        // get a copy of the HTML definition
        $definition = $config->getHTMLDefinition();

        // insert implicit "parent" node, will be removed at end.
        // DEFINITION CALL
        $parent_name = $definition->info_parent;
        array_unshift($tokens, new HTMLPurifier_Token_Start($parent_name));
        $tokens[] = new HTMLPurifier_Token_End($parent_name);

        // setup the context variable 'IsInline', for chameleon processing
        // is 'false' when we are not inline, 'true' when it must always
        // be inline, and an integer when it is inline for a certain
        // branch of the document tree
        $is_inline = $definition->info_parent_def->descendants_are_inline;
        $context->register('IsInline', $is_inline);

        // setup error collector; every use below is guarded by if ($e),
        // so a falsy value simply disables error reporting
        $e =& $context->get('ErrorCollector', true);

        //####################################################################//
        // Loop initialization

        // stack that contains the indexes of all parents,
        // $stack[count($stack)-1] being the current parent
        $stack = array();

        // stack that contains all elements that are excluded
        // it is organized by parent elements, similar to $stack,
        // but it is only populated when an element with exclusions is
        // processed, i.e. there won't be empty exclusions.
        $exclude_stack = array();

        // variable that contains the start token while we are processing
        // nodes. This enables error reporting to do its job
        $start_token = false;
        $context->register('CurrentToken', $start_token);

        //####################################################################//
        // Loop

        // iterate through all start nodes. Determining the start node
        // is complicated so it has been omitted from the loop construct
        for ($i = 0, $size = count($tokens); $i < $size; ) {

            //################################################################//
            // Gather information on children

            // child token accumulator
            $child_tokens = array();

            // scroll to the end of this node, report number, and collect
            // all children
            for ($j = $i, $depth = 0; ; $j++) {
                if ($tokens[$j] instanceof HTMLPurifier_Token_Start) {
                    $depth++;
                    // skip token assignment on first iteration, this is the
                    // token we currently are on
                    if ($depth == 1) continue;
                } elseif ($tokens[$j] instanceof HTMLPurifier_Token_End) {
                    $depth--;
                    // skip token assignment on last iteration, this is the
                    // end token of the token we're currently on
                    if ($depth == 0) break;
                }
                $child_tokens[] = $tokens[$j];
            }

            // $i is index of start token
            // $j is index of end token

            $start_token = $tokens[$i]; // to make token available via CurrentToken

            //################################################################//
            // Gather information on parent

            // calculate parent information
            if ($count = count($stack)) {
                $parent_index = $stack[$count - 1];
                $parent_name  = $tokens[$parent_index]->name;
                if ($parent_index === 0) {
                    $parent_def = $definition->info_parent_def;
                } else {
                    $parent_def = $definition->info[$parent_name];
                }
            } else {
                // processing as if the parent were the "root" node
                // unknown info, it won't be used anyway, in the future,
                // we may want to enforce one element only (this is
                // necessary for HTML Purifier to clean entire documents)
                $parent_index = $parent_name = $parent_def = null;
            }

            // calculate context
            if ($is_inline === false) {
                // check if conditions make it inline
                if (!empty($parent_def) && $parent_def->descendants_are_inline) {
                    $is_inline = $count - 1;
                }
            } else {
                // check if we're out of inline
                if ($count === $is_inline) {
                    $is_inline = false;
                }
            }

            //################################################################//
            // Determine whether element is explicitly excluded SGML-style

            // determine whether or not element is excluded by checking all
            // parent exclusions. The array should not be very large, two
            // elements at most.
            $excluded = false;
            if (!empty($exclude_stack)) {
                foreach ($exclude_stack as $lookup) {
                    if (isset($lookup[$tokens[$i]->name])) {
                        $excluded = true;
                        // no need to continue processing
                        break;
                    }
                }
            }

            //################################################################//
            // Perform child validation

            if ($excluded) {
                // there is an exclusion, remove the entire node
                $result = false;
                $excludes = array(); // not used, but good to initialize anyway
            } else {
                // DEFINITION CALL
                if ($i === 0) {
                    // special processing for the first node
                    $def = $definition->info_parent_def;
                } else {
                    $def = $definition->info[$tokens[$i]->name];
                }

                if (!empty($def->child)) {
                    // have DTD child def validate children
                    $result = $def->child->validateChildren(
                        $child_tokens, $config, $context);
                } else {
                    // weird, no child definition, get rid of everything
                    $result = false;
                }

                // determine whether or not this element has any exclusions
                $excludes = $def->excludes;
            }

            // $result is now a bool or array

            //################################################################//
            // Process result by interpreting $result

            if ($result === true || $child_tokens === $result) {
                // leave the node as is

                // register start token as a parental node start
                $stack[] = $i;

                // register exclusions if there are any
                if (!empty($excludes)) $exclude_stack[] = $excludes;

                // move cursor to next possible start node
                $i++;

            } elseif ($result === false) {
                // remove entire node

                if ($e) {
                    if ($excluded) {
                        $e->send(E_ERROR, 'Strategy_FixNesting: Node excluded');
                    } else {
                        $e->send(E_ERROR, 'Strategy_FixNesting: Node removed');
                    }
                }

                // calculate length of inner tokens and current tokens
                $length = $j - $i + 1;

                // perform removal ($size is recomputed from the token list
                // before it is read again, so no manual bookkeeping here)
                array_splice($tokens, $i, $length);

                // there is no start token to register,
                // current node is now the next possible start node
                // unless it turns out that we need to do a double-check

                // this is a rough heuristic that covers 100% of HTML's
                // cases and 99% of all other cases. A child definition
                // that would be tricked by this would be something like:
                // ( | a b c) where it's all or nothing. Fortunately,
                // our current implementation claims that that case would
                // not allow empty, even if it did
                //
                // $parent_def is null when the removed node was the implicit
                // root itself (empty stack); there is nothing to backtrack
                // to in that case, so the check must be skipped.
                if ($parent_def !== null && !$parent_def->child->allow_empty) {
                    // we need to do a double-check
                    $i = $parent_index;
                    array_pop($stack);
                    // the parent is registered again (including its
                    // exclusions) once it has been revalidated, so its
                    // current exclusion entry has to be unwound as well;
                    // otherwise it would be checked against its own
                    // exclusions and the entry would outlive its end tag.
                    if (!empty($parent_def->excludes)) {
                        array_pop($exclude_stack);
                    }
                }

                // PROJECTED OPTIMIZATION: Process all children elements before
                // reprocessing parent node.

            } else {
                // replace node with $result

                // calculate length of inner tokens
                $length = $j - $i - 1;

                if ($e) {
                    if (empty($result) && $length) {
                        $e->send(E_ERROR, 'Strategy_FixNesting: Node contents removed');
                    } else {
                        $e->send(E_WARNING, 'Strategy_FixNesting: Node reorganized');
                    }
                }

                // perform replacement ($size is recomputed below)
                array_splice($tokens, $i + 1, $length, $result);

                // register start token as a parental node start
                $stack[] = $i;

                // register exclusions if there are any
                if (!empty($excludes)) $exclude_stack[] = $excludes;

                // move cursor to next possible start node
                $i++;

            }

            //################################################################//
            // Scroll to next start node

            // We assume, at this point, that $i is the index of the token
            // that is the first possible new start point for a node.

            // Test if the token indeed is a start tag, if not, move forward
            // and test again.
            $size = count($tokens);
            while ($i < $size && !($tokens[$i] instanceof HTMLPurifier_Token_Start)) {
                if ($tokens[$i] instanceof HTMLPurifier_Token_End) {
                    // pop a token index off the stack if we ended a node
                    array_pop($stack);
                    // pop an exclusion lookup off exclusion stack if
                    // we ended node and that node had exclusions
                    if ($i === 0 || $i === $size - 1) {
                        // use specialized var if it's the super-parent
                        $s_excludes = $definition->info_parent_def->excludes;
                    } else {
                        $s_excludes = $definition->info[$tokens[$i]->name]->excludes;
                    }
                    if ($s_excludes) {
                        array_pop($exclude_stack);
                    }
                }
                $i++;
            }

        }

        //####################################################################//
        // Post-processing

        // remove implicit parent tokens at the beginning and end
        array_shift($tokens);
        array_pop($tokens);

        // remove context variables
        $context->destroy('IsInline');
        $context->destroy('CurrentToken');

        //####################################################################//
        // Return

        return $tokens;

    }

}

// vim: et sw=4 sts=4