library/HTMLPurifier/Lexer.php

Go to the documentation of this file.
00001 <?php
00002 
/**
 * Base class and factory for HTML Purifier lexers.
 *
 * Provides create() for choosing a concrete lexer implementation, plus
 * helpers shared by implementations: entity translation for text nodes
 * (parseData()), CDATA escaping, and input normalization (normalize()).
 * tokenizeHTML() only raises an error here, so concrete lexers are
 * expected to override it.
 */
class HTMLPurifier_Lexer
{

    // -- STATIC ----------------------------------------------------------

    /**
     * Returns a lexer instance chosen according to the configuration.
     *
     * The %Core.LexerImpl directive may be:
     *  - an object: returned as-is;
     *  - one of the strings 'DOMLex', 'DirectLex' or 'PH5P': the matching
     *    HTMLPurifier_Lexer_* class is instantiated;
     *  - null: the implementation is auto-detected (see autoDetect()).
     *
     * @param HTMLPurifier_Config|mixed $config Configuration object. Passing
     *        a lexer prototype (object, name or null) directly instead is
     *        deprecated and raises an E_USER_WARNING.
     * @return object|null The lexer; null if the requested type is not
     *         recognized and the error handler lets execution continue
     *         after the E_USER_ERROR.
     */
    public static function create($config) {

        $has_config = ($config instanceof HTMLPurifier_Config);

        if (!$has_config) {
            $lexer = $config;
            trigger_error("Passing a prototype to 
              HTMLPurifier_Lexer::create() is deprecated, please instead
              use %Core.LexerImpl", E_USER_WARNING);
        } else {
            $lexer = $config->get('Core', 'LexerImpl');
        }

        if (is_object($lexer)) {
            return $lexer;
        }

        if (is_null($lexer)) {
            // Only hand over a real configuration object: a deprecated
            // prototype argument must never have ->get() called on it.
            $lexer = self::autoDetect($has_config ? $config : null);
        }

        // instantiate recognized string names; the is_string() guard keeps
        // the switch's loose comparison from matching booleans or integers
        if (is_string($lexer)) {
            switch ($lexer) {
                case 'DOMLex':
                    return new HTMLPurifier_Lexer_DOMLex();
                case 'DirectLex':
                    return new HTMLPurifier_Lexer_DirectLex();
                case 'PH5P':
                    return new HTMLPurifier_Lexer_PH5P();
            }
        }

        // htmlspecialchars() only accepts strings, so describe anything
        // non-scalar (e.g. an array) by its type instead
        $shown = is_scalar($lexer)
            ? htmlspecialchars((string) $lexer)
            : gettype($lexer);
        trigger_error("Cannot instantiate unrecognized Lexer type " . $shown, E_USER_ERROR);
        return null;

    }

    /**
     * Picks a lexer name when none was configured.
     *
     * 'DirectLex' is chosen when %Core.MaintainLineNumbers is true, or when
     * it is null and %Core.CollectErrors is enabled. Otherwise 'DOMLex' is
     * chosen if the DOMDocument class exists, and 'DirectLex' if not.
     *
     * @param HTMLPurifier_Config|null $config Configuration to consult, or
     *        null when none is available (line-number check is skipped).
     * @return string 'DOMLex' or 'DirectLex'
     */
    private static function autoDetect($config) {

        if ($config instanceof HTMLPurifier_Config) {
            // once PHP DOM implements native line numbers, or we
            // hack out something using XSLT, remove this stipulation
            $line_numbers = $config->get('Core', 'MaintainLineNumbers');
            if (
                $line_numbers === true ||
                ($line_numbers === null && $config->get('Core', 'CollectErrors'))
            ) {
                return 'DirectLex';
            }
        }

        // check for DOM support, because, surprisingly enough,
        // it's *not* part of the core!
        return class_exists('DOMDocument') ? 'DOMLex' : 'DirectLex';

    }

    // -- CONVENIENCE MEMBERS ---------------------------------------------

    /**
     * Entity parser used by parseData() and normalize().
     *
     * Declared explicitly so it is not created as a dynamic property
     * (deprecated as of PHP 8.2). Protected so subclasses keep access.
     *
     * @var HTMLPurifier_EntityParser
     */
    protected $_entity_parser;

    /**
     * Sets up the entity parser used by parseData() and normalize().
     */
    public function __construct() {
        $this->_entity_parser = new HTMLPurifier_EntityParser();
    }

    /**
     * Lookup table mapping the most common entities to their characters;
     * applied with strtr() in parseData().
     *
     * @var array
     */
    protected $_special_entity2str =
            array(
                    '&quot;' => '"',
                    '&amp;'  => '&',
                    '&lt;'   => '<',
                    '&gt;'   => '>',
                    '&#39;'  => "'",
                    '&#039;' => "'",
                    '&#x27;' => "'"
            );

    /**
     * Translates entities in a piece of character data.
     *
     * The entities listed in $_special_entity2str are replaced with a
     * single strtr() call. Only if ampersands that could still begin an
     * entity remain afterwards is the string handed to the entity
     * parser's substituteSpecialEntities().
     *
     * @param string $string Character data to process.
     * @return string The data with entities translated.
     */
    public function parseData($string) {

        // following functions require at least one character
        if ($string === '') return '';

        // subtracts amps that cannot possibly be escaped: those followed
        // by a space and one sitting at the very end of the string
        $num_amp = substr_count($string, '&') - substr_count($string, '& ') -
            ($string[strlen($string)-1] === '&' ? 1 : 0);

        if (!$num_amp) return $string; // abort if no entities
        $num_esc_amp = substr_count($string, '&amp;');
        $string = strtr($string, $this->_special_entity2str);

        // code duplication for sake of optimization, see above
        $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
            ($string[strlen($string)-1] === '&' ? 1 : 0);

        // every '&amp;' left one literal '&' behind; if no more candidate
        // ampersands remain than that, there is nothing left to decode
        if ($num_amp_2 <= $num_esc_amp) return $string;

        // hmm... now we have some uncommon entities. Use the callback.
        $string = $this->_entity_parser->substituteSpecialEntities($string);
        return $string;
    }

    /**
     * Placeholder for the tokenizer; this base implementation only raises
     * an E_USER_ERROR ('Call to abstract class').
     *
     * @param string $string  HTML to tokenize.
     * @param mixed  $config  Configuration object (unused here).
     * @param mixed  $context Context object (unused here).
     */
    public function tokenizeHTML($string, $config, $context) {
        trigger_error('Call to abstract class', E_USER_ERROR);
    }

    /**
     * Replaces each <![CDATA[ ... ]]> section with its contents, escaped
     * by CDATACallback().
     *
     * The content group may be empty, so an empty section is replaced by
     * an empty string rather than being matched up to the terminator of
     * some later section.
     *
     * @param string $string HTML to process.
     * @return string HTML with CDATA sections escaped.
     */
    protected static function escapeCDATA($string) {
        return preg_replace_callback(
            '/<!\[CDATA\[(.*?)\]\]>/s',
            array('HTMLPurifier_Lexer', 'CDATACallback'),
            $string
        );
    }

    /**
     * Same as escapeCDATA(), but for the comment-wrapped form
     * <!--//--><![CDATA[//><!-- ... //--><!]]>.
     *
     * @param string $string HTML to process.
     * @return string HTML with comment-wrapped CDATA sections escaped.
     */
    protected static function escapeCommentedCDATA($string) {
        return preg_replace_callback(
            '#<!--//--><!\[CDATA\[//><!--(.*?)//--><!\]\]>#s',
            array('HTMLPurifier_Lexer', 'CDATACallback'),
            $string
        );
    }

    /**
     * preg_replace_callback() handler for the CDATA escaping methods.
     *
     * @param array $matches PCRE matches; index 1 holds the section's
     *        contents.
     * @return string The contents with HTML special characters escaped.
     */
    protected static function CDATACallback($matches) {
        // the charset is given explicitly so the result does not depend
        // on htmlspecialchars()'s default encoding
        return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8');
    }

    /**
     * Prepares raw HTML for tokenization.
     *
     * Steps, in order: optionally reduce a full document to its body
     * (%Core.ConvertDocumentToFragment), convert \r\n and \r to \n, escape
     * comment-wrapped CDATA (only when %HTML.Trusted is set), escape CDATA,
     * expand non-special entities, and clean the result as UTF-8.
     *
     * @param string $html    Raw HTML.
     * @param mixed  $config  Configuration object; read via get().
     * @param mixed  $context Context object (unused here).
     * @return string Normalized HTML.
     */
    public function normalize($html, $config, $context) {

        // extract body from document if applicable
        if ($config->get('Core', 'ConvertDocumentToFragment')) {
            $html = $this->extractBody($html);
        }

        // normalize newlines to \n
        $html = str_replace("\r\n", "\n", $html);
        $html = str_replace("\r", "\n", $html);

        if ($config->get('HTML', 'Trusted')) {
            // escape convoluted CDATA
            $html = $this->escapeCommentedCDATA($html);
        }

        // escape CDATA
        $html = $this->escapeCDATA($html);

        // expand entities that aren't the big five
        $html = $this->_entity_parser->substituteNonSpecialEntities($html);

        // clean into wellformed UTF-8 string for an SGML context: this has
        // to be done after entity expansion because the entities sometimes
        // represent non-SGML characters (horror, horror!)
        $html = HTMLPurifier_Encoder::cleanUTF8($html);

        return $html;
    }

    /**
     * Returns the contents of the first <body>...</body> pair in $html.
     *
     * Matching is case-insensitive and ends at the first closing tag. An
     * empty body yields an empty string.
     *
     * @param string $html HTML document or fragment.
     * @return string The body contents, or $html unchanged when no
     *         <body>...</body> pair is found.
     */
    public function extractBody($html) {
        $matches = array();
        // (.*?) rather than (.+?): an empty <body></body> must still match
        $result = preg_match('!<body[^>]*>(.*?)</body>!is', $html, $matches);
        if ($result) {
            return $matches[1];
        }
        return $html;
    }

}
00272 

Generated on Thu Jun 19 18:47:26 2008 for HTMLPurifier by  doxygen 1.5.3