HTMLPurifier 4.4.0
|
<?php

/**
 * Lexer that tokenizes HTML by parsing it with PHP's DOM extension
 * (DOMDocument::loadHTML) and then flattening the resulting tree into a
 * list of tokens built by HTMLPurifier_TokenFactory.
 *
 * The input is wrapped in a full <html><body><div>...</div></body></html>
 * document before parsing; only the content of that wrapper is tokenized.
 */
class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
{

    /**
     * Token factory used to create every token this lexer emits.
     */
    private $factory;

    public function __construct() {
        // setup the factory
        parent::__construct();
        $this->factory = new HTMLPurifier_TokenFactory();
    }

    /**
     * Converts an HTML string into a flat array of tokens.
     *
     * @param $html    String of HTML to tokenize.
     * @param $config  Configuration object; read for Core.AggressivelyFixLt
     *                 and (via wrapHTML) the HTML definition's doctype.
     * @param $context Context object, passed through to normalize()/wrapHTML().
     * @return Array of tokens, in document order.
     */
    public function tokenizeHTML($html, $config, $context) {

        $html = $this->normalize($html, $config, $context);

        // attempt to armor stray angled brackets that cannot possibly
        // form tags and thus are probably being used as emoticons
        if ($config->get('Core.AggressivelyFixLt')) {
            $char = '[^a-z!\/]';
            $comment = "/<!--(.*?)(-->|\z)/is";
            // Escape ampersands inside comments first, so that the undo
            // step below can tell entities we introduced from entities
            // that were already in the comment.
            $html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html);
            // Repeat until stable: a replacement can expose a new match
            // (e.g. "<<" needs two passes).
            do {
                $old = $html;
                $html = preg_replace("/<($char)/i", '&lt;\\1', $html);
            } while ($html !== $old);
            $html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html); // fix comments
        }

        // preprocess html, essential for UTF-8
        $html = $this->wrapHTML($html, $config, $context);

        $doc = new DOMDocument();
        $doc->encoding = 'UTF-8'; // theoretically, the above has this covered

        // libxml reports malformed markup as PHP warnings; silence them
        // for the duration of the parse only.
        set_error_handler(array($this, 'muteErrorHandler'));
        $doc->loadHTML($html);
        restore_error_handler();

        $body = $doc->getElementsByTagName('html')->item(0)-> // <html>
                      getElementsByTagName('body')->item(0);  // <body>
        $div = $body->getElementsByTagName('div')->item(0);   // <div>

        $tokens = array();
        $this->tokenizeDOM($div, $tokens);

        // If the wrapper div has a following sibling, the input contained
        // a premature </div> that closed the wrapper early. Drop the div
        // we already handled and tokenize whatever is left in <body>, so
        // the trailing content is not silently discarded.
        if ($div->nextSibling) {
            $body->removeChild($div);
            $this->tokenizeDOM($body, $tokens);
        }

        return $tokens;
    }

    /**
     * Iteratively walks a DOM subtree and appends the corresponding tokens.
     *
     * The root node itself is never emitted (only nodes at depth > 0 are
     * collected); it serves purely as a container.
     *
     * @param $node   DOMNode whose descendants should be tokenized.
     * @param $tokens Array of tokens, appended to by reference.
     */
    protected function tokenizeDOM($node, &$tokens) {

        $level = 0;
        // Per-depth queue of nodes still to be visited.
        $nodes = array($level => array($node));
        // Per-depth stack of elements whose end tag is still pending.
        $closingNodes = array();
        do {
            while (!empty($nodes[$level])) {
                $node = array_shift($nodes[$level]); // FIFO
                $collect = $level > 0;
                $needEndingTag = $this->createStartNode($node, $tokens, $collect);
                if ($needEndingTag) {
                    $closingNodes[$level][] = $node;
                }
                if ($node->childNodes && $node->childNodes->length) {
                    // Descend: the children become the queue one level down.
                    $level++;
                    $nodes[$level] = array();
                    foreach ($node->childNodes as $childNode) {
                        array_push($nodes[$level], $childNode);
                    }
                }
            }
            $level--;
            // Level 0 is the container node, which never gets an end token.
            if ($level && isset($closingNodes[$level])) {
                while ($node = array_pop($closingNodes[$level])) {
                    $this->createEndNode($node, $tokens);
                }
            }
        } while ($level > 0);
    }

    /**
     * Appends the token(s) that open (or fully represent) a single node.
     *
     * @param $node    DOMNode to be tokenized.
     * @param $tokens  Array of tokens, appended to by reference.
     * @param $collect Whether element tokens should be emitted for this
     *                 node. Text, CDATA and comment tokens are emitted
     *                 regardless of this flag.
     * @return True if the node is an element with children, i.e. an end
     *         token must be emitted later; false otherwise.
     */
    protected function createStartNode($node, &$tokens, $collect) {
        // intercept non element nodes. WE MUST catch all of them,
        // but we're not getting the character reference nodes because
        // those should have been preprocessed
        if ($node->nodeType === XML_TEXT_NODE) {
            $tokens[] = $this->factory->createText($node->data);
            return false;
        } elseif ($node->nodeType === XML_CDATA_SECTION_NODE) {
            // undo libxml's special treatment of <script> and <style> tags
            $last = end($tokens);
            $data = $node->data;
            // (note $node->tagname is already normalized)
            if ($last instanceof HTMLPurifier_Token_Start && ($last->name == 'script' || $last->name == 'style')) {
                $new_data = trim($data);
                if (substr($new_data, 0, 4) === '<!--') {
                    // Strip the HTML comment wrapper around the contents.
                    $data = substr($new_data, 4);
                    if (substr($data, -3) === '-->') {
                        $data = substr($data, 0, -3);
                    }
                    // An unterminated comment is highly suspicious, but
                    // there is nothing sensible to do about it here; the
                    // data is kept with only the opener removed.
                }
            }
            $tokens[] = $this->factory->createText($this->parseData($data));
            return false;
        } elseif ($node->nodeType === XML_COMMENT_NODE) {
            // this code is only invoked for comments in script/style in
            // versions of libxml pre-2.6.28 (regular comments, of course,
            // are still handled regularly)
            $tokens[] = $this->factory->createComment($node->data);
            return false;
        } elseif (
            // not-well tested: there may be other nodes we have to grab
            $node->nodeType !== XML_ELEMENT_NODE
        ) {
            return false;
        }

        $attr = $node->hasAttributes() ? $this->transformAttrToAssoc($node->attributes) : array();

        // We still have to make sure that the element actually IS empty
        if (!$node->childNodes->length) {
            if ($collect) {
                $tokens[] = $this->factory->createEmpty($node->tagName, $attr);
            }
            return false;
        } else {
            if ($collect) {
                $tokens[] = $this->factory->createStart(
                    $tag_name = $node->tagName, // somehow, it gets dropped
                    $attr
                );
            }
            return true;
        }
    }

    /**
     * Appends the end token for an element.
     *
     * @param $node   DOMNode (element) being closed.
     * @param $tokens Array of tokens, appended to by reference.
     */
    protected function createEndNode($node, &$tokens) {
        $tokens[] = $this->factory->createEnd($node->tagName);
    }


    /**
     * Converts a DOM attribute map into an associative array.
     *
     * @param $node_map Attribute map (the ->attributes of an element).
     * @return Associative array of attribute name => attribute value.
     */
    protected function transformAttrToAssoc($node_map) {
        // Relies on the map being iterable with foreach and exposing a
        // ->length property.
        if ($node_map->length === 0) return array();
        $array = array();
        foreach ($node_map as $attr) {
            $array[$attr->name] = $attr->value;
        }
        return $array;
    }

    /**
     * An error handler that mutes all errors; installed around loadHTML().
     */
    public function muteErrorHandler($errno, $errstr) {}

    /**
     * Callback function for undoing the escaping of stray angled brackets
     * (and the ampersand armoring) inside comments.
     *
     * @param $matches Regex matches: [1] is the comment body, [2] the
     *                 comment terminator (or empty at end of input).
     */
    public function callbackUndoCommentSubst($matches) {
        return '<!--' . strtr($matches[1], array('&amp;'=>'&','&lt;'=>'<')) . $matches[2];
    }

    /**
     * Callback function that entity-izes ampersands in comments so that
     * callbackUndoCommentSubst doesn't clobber them.
     *
     * @param $matches Regex matches: [1] is the comment body, [2] the
     *                 comment terminator (or empty at end of input).
     */
    public function callbackArmorCommentEntities($matches) {
        return '<!--' . str_replace('&', '&amp;', $matches[1]) . $matches[2];
    }

    /**
     * Wraps an HTML fragment in the necessary HTML document scaffolding
     * (optional doctype, UTF-8 meta tag, body and a wrapper div).
     *
     * @param $html    HTML fragment to wrap.
     * @param $config  Configuration object; its HTML definition supplies
     *                 the doctype identifiers.
     * @param $context Context object (unused here).
     * @return The complete HTML document as a string.
     */
    protected function wrapHTML($html, $config, $context) {
        $def = $config->getDefinition('HTML');
        $ret = '';

        if (!empty($def->doctype->dtdPublic) || !empty($def->doctype->dtdSystem)) {
            $ret .= '<!DOCTYPE html ';
            if (!empty($def->doctype->dtdPublic)) $ret .= 'PUBLIC "' . $def->doctype->dtdPublic . '" ';
            if (!empty($def->doctype->dtdSystem)) $ret .= '"' . $def->doctype->dtdSystem . '" ';
            $ret .= '>';
        }

        $ret .= '<html><head>';
        $ret .= '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';
        // A stray </div> in $html closes this wrapper div early; the
        // content after it then ends up as a sibling of the div, which
        // tokenizeHTML() picks up separately.
        $ret .= '</head><body><div>'.$html.'</div></body></html>';
        return $ret;
    }

}

// vim: et sw=4 sts=4