HTMLPurifier 4.4.0
/home/ezyang/Dev/htmlpurifier/library/HTMLPurifier/Lexer/DOMLex.php
Go to the documentation of this file.
00001 <?php
00002 
/**
 * Lexer that parses HTML with PHP's DOM extension (DOMDocument::loadHTML())
 * and flattens the resulting node tree into a list of tokens created by
 * HTMLPurifier_TokenFactory.
 *
 * The input fragment is embedded in an <html><body><div>...</div> skeleton
 * before parsing (see wrapHTML()); only what ends up inside <body> as part
 * of, or following, that wrapper <div> is turned into tokens.
 */
class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
{

    /**
     * Token factory used to build every token this lexer emits.
     */
    private $factory;

    public function __construct() {
        // setup the factory
        parent::__construct();
        $this->factory = new HTMLPurifier_TokenFactory();
    }

    /**
     * Converts an HTML fragment into a flat list of tokens.
     *
     * @param string $html    HTML fragment to tokenize
     * @param object $config  Configuration; read for 'Core.AggressivelyFixLt'
     *                        and for the HTML definition (see wrapHTML())
     * @param object $context Passed through to normalize() and wrapHTML()
     * @return array List of tokens, in document order
     */
    public function tokenizeHTML($html, $config, $context) {

        // normalize() is inherited from the parent lexer
        $html = $this->normalize($html, $config, $context);

        // attempt to armor stray angled brackets that cannot possibly
        // form tags and thus are probably being used as emoticons
        if ($config->get('Core.AggressivelyFixLt')) {
            $html = $this->armorStrayLt($html);
        }

        // preprocess html, essential for UTF-8
        $html = $this->wrapHTML($html, $config, $context);

        $doc = new DOMDocument();
        $doc->encoding = 'UTF-8'; // theoretically, the above has this covered

        // libxml reports malformed markup through PHP warnings; mute them
        // for the duration of the parse only
        set_error_handler(array($this, 'muteErrorHandler'));
        $doc->loadHTML($html);
        restore_error_handler();

        $body = $doc->getElementsByTagName('html')->item(0)-> // <html>
                      getElementsByTagName('body')->item(0);  //   <body>
        $div = $body->getElementsByTagName('div')->item(0);   //     <div>

        $tokens = array();
        $this->tokenizeDOM($div, $tokens);

        // If the wrapper div has a sibling, the input contained a premature
        // </div> that closed the wrapper early and the rest of the input
        // landed directly in <body>. Drop the div we already processed and
        // tokenize what is left of <body>, so that content is not lost.
        // (<body> itself is the level-0 node and therefore emits no token.)
        if ($div->nextSibling) {
            $body->removeChild($div);
            $this->tokenizeDOM($body, $tokens);
        }

        return $tokens;
    }

    /**
     * Escapes "<" characters that cannot start a tag (anything not followed
     * by a letter, "!" or "/") as "&lt;", leaving the inside of comments as
     * it was.
     *
     * Comment bodies are protected by first escaping their "&" and undoing
     * both substitutions afterwards. The work is done on a copy: if any PCRE
     * call fails (preg_* returns null, e.g. when the backtrack limit is hit),
     * the untouched input is returned instead of a half-processed or empty
     * string.
     *
     * @param string $html HTML to preprocess
     * @return string Preprocessed HTML, or $html unchanged on PCRE failure
     */
    private function armorStrayLt($html) {
        $char = '[^a-z!\/]';
        $comment = "/<!--(.*?)(-->|\z)/is";

        $armored = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html);
        if ($armored === null) {
            return $html;
        }

        // repeat until stable: in a run such as "<<<" one pass consumes every
        // second bracket as the "following character"
        do {
            $old = $armored;
            $armored = preg_replace("/<($char)/i", '&lt;\\1', $armored);
            if ($armored === null) {
                return $html;
            }
        } while ($armored !== $old);

        $armored = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $armored); // fix comments
        if ($armored === null) {
            return $html;
        }

        return $armored;
    }

    /**
     * Walks a DOM subtree iteratively, depth first and in document order,
     * appending the corresponding tokens to $tokens.
     *
     * The node passed in (level 0) is only a container: no start or end
     * token is produced for it, only for its descendants.
     *
     * @param DOMNode $node   Root of the subtree to tokenize
     * @param array   $tokens Token list to append to (by reference)
     */
    protected function tokenizeDOM($node, &$tokens) {

        $level = 0;
        // Pending nodes per depth, stored in REVERSE document order so that
        // array_pop() hands them out first-in-first-out without the
        // re-indexing cost array_shift() has on every call.
        $nodes = array($level => array($node));
        $closingNodes = array();
        do {
            while (!empty($nodes[$level])) {
                $node = array_pop($nodes[$level]);
                $collect = $level > 0;
                $needEndingTag = $this->createStartNode($node, $tokens, $collect);
                if ($needEndingTag) {
                    $closingNodes[$level][] = $node;
                }
                if ($node->childNodes && $node->childNodes->length) {
                    // descend immediately; siblings of $node wait at the
                    // shallower level until this subtree is finished
                    $level++;
                    $children = array();
                    foreach ($node->childNodes as $childNode) {
                        $children[] = $childNode;
                    }
                    $nodes[$level] = array_reverse($children);
                }
            }
            // the current depth is exhausted: go back up and close the
            // element whose children were just processed
            $level--;
            if ($level && isset($closingNodes[$level])) {
                while($node = array_pop($closingNodes[$level])) {
                    $this->createEndNode($node, $tokens);
                }
            }
        } while ($level > 0);
    }

    /**
     * Appends the token(s) that open the given node.
     *
     * Text, CDATA and comment nodes produce a single token; elements produce
     * an empty or a start token; every other node type is ignored.
     *
     * @param DOMNode $node    Node to convert
     * @param array   $tokens  Token list to append to (by reference)
     * @param bool    $collect Whether an element token should actually be
     *                         emitted (false for the level-0 container);
     *                         text, CDATA and comment tokens are emitted
     *                         regardless
     * @return bool True if the node is an element with children, i.e. a
     *              matching end token has to be emitted later
     */
    protected function createStartNode($node, &$tokens, $collect) {
        // intercept non element nodes. WE MUST catch all of them,
        // but we're not getting the character reference nodes because
        // those should have been preprocessed
        if ($node->nodeType === XML_TEXT_NODE) {
            $tokens[] = $this->factory->createText($node->data);
            return false;
        }

        if ($node->nodeType === XML_CDATA_SECTION_NODE) {
            // undo libxml's special treatment of <script> and <style> tags
            $last = end($tokens);
            $data = $node->data;
            // (note $node->tagname is already normalized)
            if ($last instanceof HTMLPurifier_Token_Start && ($last->name == 'script' || $last->name == 'style')) {
                // strip an HTML comment wrapper around the script/style body
                $new_data = trim($data);
                if (substr($new_data, 0, 4) === '<!--') {
                    $data = substr($new_data, 4);
                    if (substr($data, -3) === '-->') {
                        $data = substr($data, 0, -3);
                    } else {
                        // Highly suspicious! Not sure what to do...
                    }
                }
            }
            // parseData() is inherited from the parent lexer
            $tokens[] = $this->factory->createText($this->parseData($data));
            return false;
        }

        if ($node->nodeType === XML_COMMENT_NODE) {
            // this code is only invoked for comments in script/style in
            // versions of libxml pre-2.6.28 (regular comments, of course,
            // are still handled regularly)
            $tokens[] = $this->factory->createComment($node->data);
            return false;
        }

        // not well tested: there may be other nodes we have to grab
        if ($node->nodeType !== XML_ELEMENT_NODE) {
            return false;
        }

        $attr = $node->hasAttributes() ? $this->transformAttrToAssoc($node->attributes) : array();

        // We still have to make sure that the element actually IS empty
        if (!$node->childNodes->length) {
            if ($collect) {
                $tokens[] = $this->factory->createEmpty($node->tagName, $attr);
            }
            return false;
        }

        if ($collect) {
            // copy the tag name into a local first; the original code notes
            // that it somehow gets dropped otherwise
            $tag_name = $node->tagName;
            $tokens[] = $this->factory->createStart($tag_name, $attr);
        }
        return true;
    }

    /**
     * Appends the end token for an element.
     *
     * @param DOMNode $node   Element being closed
     * @param array   $tokens Token list to append to (by reference)
     */
    protected function createEndNode($node, &$tokens) {
        $tokens[] = $this->factory->createEnd($node->tagName);
    }


    /**
     * Converts a DOM attribute map into a plain name => value array.
     *
     * @param DOMNamedNodeMap $node_map Attribute map of an element
     * @return array Associative array of attribute name => attribute value
     */
    protected function transformAttrToAssoc($node_map) {
        // relies on the map exposing ->length and being usable in foreach
        if ($node_map->length === 0) return array();
        $array = array();
        foreach ($node_map as $attr) {
            $array[$attr->name] = $attr->value;
        }
        return $array;
    }

    /**
     * No-op error handler, installed around DOMDocument::loadHTML() so that
     * parser warnings are discarded.
     */
    public function muteErrorHandler($errno, $errstr) {}

    /**
     * preg_replace_callback() callback: restores "&" and "<" inside a
     * comment after the stray-bracket armoring pass.
     *
     * @param array $matches [1] is the comment body, [2] its terminator
     *                       ("-->" or empty at end of input)
     * @return string The rebuilt comment
     */
    public function callbackUndoCommentSubst($matches) {
        return '<!--' . strtr($matches[1], array('&amp;'=>'&','&lt;'=>'<')) . $matches[2];
    }

    /**
     * preg_replace_callback() callback: escapes "&" inside a comment so the
     * later undo step can tell original entities from inserted ones.
     *
     * @param array $matches [1] is the comment body, [2] its terminator
     *                       ("-->" or empty at end of input)
     * @return string The rebuilt comment
     */
    public function callbackArmorCommentEntities($matches) {
        return '<!--' . str_replace('&', '&amp;', $matches[1]) . $matches[2];
    }

    /**
     * Wraps an HTML fragment in a complete document: optional DOCTYPE taken
     * from the HTML definition, a UTF-8 Content-Type <meta>, and a
     * <body><div> wrapper around the fragment.
     *
     * @param string $html    HTML fragment
     * @param object $config  Configuration providing the HTML definition
     * @param object $context Unused here
     * @return string Complete HTML document
     */
    protected function wrapHTML($html, $config, $context) {
        $def = $config->getDefinition('HTML');
        $ret = '';

        if (!empty($def->doctype->dtdPublic) || !empty($def->doctype->dtdSystem)) {
            $ret .= '<!DOCTYPE html ';
            if (!empty($def->doctype->dtdPublic)) $ret .= 'PUBLIC "' . $def->doctype->dtdPublic . '" ';
            if (!empty($def->doctype->dtdSystem)) $ret .= '"' . $def->doctype->dtdSystem . '" ';
            $ret .= '>';
        }

        $ret .= '<html><head>';
        $ret .= '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';
        // A stray </div> in $html closes the wrapper div early; tokenizeHTML()
        // compensates by also tokenizing whatever follows the wrapper in <body>.
        $ret .= '</head><body><div>'.$html.'</div></body></html>';
        return $ret;
    }

}
00242 
00243 // vim: et sw=4 sts=4