HTMLPurifier 4.4.0
|
<?php

/**
 * Base lexer: factory for concrete lexer implementations plus shared
 * helpers for entity decoding and pre-tokenization HTML normalization.
 *
 * Concrete lexers (DOMLex, DirectLex, PH5P) are selected by create() and
 * are expected to provide their own tokenizeHTML(); the implementation in
 * this class only raises an error.
 */
class HTMLPurifier_Lexer
{

    /**
     * Whether this lexer implementation records line numbers. create()
     * refuses to return a lexer with this set to false when
     * %Core.MaintainLineNumbers or %Core.CollectErrors is enabled.
     */
    public $tracksLineNumbers = false;

    /**
     * Entity parser used by parseData() and normalize(); assigned in the
     * constructor. Declared explicitly so it is no longer created as a
     * dynamic property (deprecated since PHP 8.2); kept public so the
     * visibility matches what the dynamic property had.
     */
    public $_entity_parser;

    // -- STATIC ----------------------------------------------------------

    /**
     * Retrieves or instantiates a lexer.
     *
     * The lexer is taken from %Core.LexerImpl: an object is returned
     * as-is, a recognized name ('DOMLex', 'DirectLex', 'PH5P') is
     * instantiated, and null triggers auto-detection (DirectLex when line
     * tracking is required or DOM is unavailable, DOMLex otherwise).
     *
     * @param HTMLPurifier_Config|mixed $config Configuration object.
     *        Passing a lexer prototype or name directly is deprecated and
     *        emits an E_USER_WARNING.
     * @return HTMLPurifier_Lexer Concrete lexer instance.
     * @throws HTMLPurifier_Exception If the lexer name is unrecognized, no
     *         lexer could be instantiated, or the chosen lexer cannot
     *         track line numbers while tracking is required.
     */
    public static function create($config)
    {
        if (!($config instanceof HTMLPurifier_Config)) {
            $lexer = $config;
            trigger_error("Passing a prototype to HTMLPurifier_Lexer::create() is deprecated, please instead use %Core.LexerImpl", E_USER_WARNING);
            // A bare prototype carries no configuration, so there is
            // nothing to ask about line tracking. (Calling ->get() on it,
            // as the code previously did, is a fatal error.)
            $needs_tracking = false;
        } else {
            $lexer = $config->get('Core.LexerImpl');
            $needs_tracking =
                $config->get('Core.MaintainLineNumbers') ||
                $config->get('Core.CollectErrors');
        }

        $inst = null;
        if (is_object($lexer)) {
            $inst = $lexer;
        } else {

            if ($lexer === null) {
                // auto-detection algorithm
                if ($needs_tracking) {
                    $lexer = 'DirectLex';
                } elseif (
                    class_exists('DOMDocument') &&
                    method_exists('DOMDocument', 'loadHTML') &&
                    !extension_loaded('domxml')
                ) {
                    // check for DOM support, because while it's part of the
                    // core, it can be disabled compile time. Also, the PECL
                    // domxml extension overrides the default DOM, and is evil
                    // and nasty and we shan't bother to support it
                    $lexer = 'DOMLex';
                } else {
                    $lexer = 'DirectLex';
                }
            }

            // instantiate recognized string names
            switch ($lexer) {
                case 'DOMLex':
                    $inst = new HTMLPurifier_Lexer_DOMLex();
                    break;
                case 'DirectLex':
                    $inst = new HTMLPurifier_Lexer_DirectLex();
                    break;
                case 'PH5P':
                    $inst = new HTMLPurifier_Lexer_PH5P();
                    break;
                default:
                    throw new HTMLPurifier_Exception("Cannot instantiate unrecognized Lexer type " . htmlspecialchars($lexer));
            }
        }

        if (!$inst) {
            throw new HTMLPurifier_Exception('No lexer was instantiated');
        }

        // once PHP DOM implements native line numbers, or we
        // hack out something using XSLT, remove this stipulation
        if ($needs_tracking && !$inst->tracksLineNumbers) {
            throw new HTMLPurifier_Exception('Cannot use lexer that does not support line numbers with Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)');
        }

        return $inst;
    }

    // -- CONVENIENCE MEMBERS ---------------------------------------------

    /**
     * Sets up the entity parser used by parseData() and normalize().
     */
    public function __construct()
    {
        $this->_entity_parser = new HTMLPurifier_EntityParser();
    }

    /**
     * Most common entity to raw value conversion table for special
     * entities. The keys must stay in their escaped form: parseData()
     * feeds this table to strtr() to decode them.
     */
    protected $_special_entity2str =
        array(
            '&quot;' => '"',
            '&amp;'  => '&',
            '&lt;'   => '<',
            '&gt;'   => '>',
            '&#39;'  => "'",
            '&#039;' => "'",
            '&#x27;' => "'"
        );

    /**
     * Decodes the special entities in a string of character data.
     *
     * Fast path: the common entities in $_special_entity2str are replaced
     * with a single strtr() call. Only if ampersands remain that cannot be
     * accounted for by decoded "&amp;" sequences is the string handed to
     * the entity parser's substituteSpecialEntities().
     *
     * @param string $string Character data to decode.
     * @return string Decoded character data.
     */
    public function parseData($string)
    {
        // following functions require at least one character
        if ($string === '') {
            return '';
        }

        // subtracts amps that cannot possibly be escaped
        $num_amp = substr_count($string, '&') - substr_count($string, '& ') -
            ($string[strlen($string) - 1] === '&' ? 1 : 0);

        if (!$num_amp) {
            return $string; // abort if no entities
        }

        // every "&amp;" leaves one literal "&" behind after decoding, so
        // count them before strtr() rewrites the string
        $num_esc_amp = substr_count($string, '&amp;');
        $string = strtr($string, $this->_special_entity2str);

        // code duplication for sake of optimization, see above
        $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
            ($string[strlen($string) - 1] === '&' ? 1 : 0);

        // all remaining ampersands came from "&amp;": nothing left to do
        if ($num_amp_2 <= $num_esc_amp) {
            return $string;
        }

        // hmm... now we have some uncommon entities. Use the callback.
        $string = $this->_entity_parser->substituteSpecialEntities($string);
        return $string;
    }

    /**
     * Lexes an HTML string into tokens. This base implementation only
     * raises an E_USER_ERROR ('Call to abstract class').
     *
     * @param string $string HTML to tokenize.
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     */
    public function tokenizeHTML($string, $config, $context)
    {
        trigger_error('Call to abstract class', E_USER_ERROR);
    }

    /**
     * Replaces each CDATA section with its contents, HTML-escaped via
     * CDATACallback().
     *
     * @param string $string HTML string to process.
     * @return string HTML with CDATA sections escaped.
     */
    protected static function escapeCDATA($string)
    {
        return preg_replace_callback(
            '/<!\[CDATA\[(.+?)\]\]>/s',
            array('HTMLPurifier_Lexer', 'CDATACallback'),
            $string
        );
    }

    /**
     * Same as escapeCDATA(), but for CDATA sections wrapped in the
     * "<!--//--><![CDATA[//><!-- ... //--><!]]>" comment construct.
     *
     * @param string $string HTML string to process.
     * @return string HTML with commented CDATA sections escaped.
     */
    protected static function escapeCommentedCDATA($string)
    {
        return preg_replace_callback(
            '#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s',
            array('HTMLPurifier_Lexer', 'CDATACallback'),
            $string
        );
    }

    /**
     * Removes "<!--[if ...]> ... <![endif]-->" conditional comment blocks,
     * including their contents.
     *
     * @param string $string HTML string to process.
     * @return string HTML with the conditional blocks removed.
     */
    protected static function removeIEConditional($string)
    {
        return preg_replace(
            '#<!--\[if [^>]+\]>.*?<!\[endif\]-->#si', // probably should generalize for all strings
            '',
            $string
        );
    }

    /**
     * Callback for escapeCDATA() and escapeCommentedCDATA(): HTML-escapes
     * the captured section contents.
     *
     * @param array $matches PCRE matches array; $matches[1] holds the
     *        contents of the CDATA section.
     * @return string Escaped contents.
     */
    protected static function CDATACallback($matches)
    {
        // not exactly sure why the character set is needed, but whatever
        return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8');
    }

    /**
     * Performs the generic pre-tokenization clean-up of an HTML string:
     * newline normalization, CDATA escaping, conditional comment removal,
     * optional body extraction, non-special entity expansion, UTF-8
     * cleaning and optional processing-instruction removal.
     *
     * @param string $html HTML to normalize.
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return string Normalized HTML.
     */
    public function normalize($html, $config, $context)
    {
        // normalize newlines to \n
        if ($config->get('Core.NormalizeNewlines')) {
            $html = str_replace("\r\n", "\n", $html);
            $html = str_replace("\r", "\n", $html);
        }

        if ($config->get('HTML.Trusted')) {
            // escape convoluted CDATA
            $html = $this->escapeCommentedCDATA($html);
        }

        // escape CDATA
        $html = $this->escapeCDATA($html);

        $html = $this->removeIEConditional($html);

        // extract body from document if applicable
        if ($config->get('Core.ConvertDocumentToFragment')) {
            $e = false;
            if ($config->get('Core.CollectErrors')) {
                $e =& $context->get('ErrorCollector');
            }
            $new_html = $this->extractBody($html);
            if ($e && $new_html !== $html) {
                $e->send(E_WARNING, 'Lexer: Extracted body');
            }
            $html = $new_html;
        }

        // expand entities that aren't the big five
        $html = $this->_entity_parser->substituteNonSpecialEntities($html);

        // clean into wellformed UTF-8 string for an SGML context: this has
        // to be done after entity expansion because the entities sometimes
        // represent non-SGML characters (horror, horror!)
        $html = HTMLPurifier_Encoder::cleanUTF8($html);

        // if processing instructions are to be removed, remove them now
        if ($config->get('Core.RemoveProcessingInstructions')) {
            $html = preg_replace('#<\?.+?\?>#s', '', $html);
        }

        return $html;
    }

    /**
     * Returns the contents of the body element of an HTML document, or the
     * input unchanged when no <body>...</body> pair is found. The match is
     * greedy, so everything up to the last </body> is returned.
     *
     * @param string $html HTML document or fragment.
     * @return string Body contents, or the original string.
     */
    public function extractBody($html)
    {
        $matches = array();
        $result = preg_match('!<body[^>]*>(.*)</body>!is', $html, $matches);
        if ($result) {
            return $matches[1];
        }
        return $html;
    }

}

// vim: et sw=4 sts=4