00001 <?php
00002
/**
 * Base lexer: a factory for concrete lexer implementations plus the string
 * pre-processing helpers they share (entity decoding, CDATA escaping,
 * newline normalization, body extraction).
 *
 * tokenizeHTML() only raises an error here; concrete lexers are expected to
 * override it.
 */
class HTMLPurifier_Lexer
{

    /**
     * Entity parser used by parseData() and normalize(); assigned in the
     * constructor.
     *
     * Declared explicitly so that it is not created as a dynamic property.
     * Kept public because, as an undeclared property, it was publicly
     * reachable; treat it as internal.
     */
    public $_entity_parser;

    /**
     * Returns a lexer instance chosen from the configuration.
     *
     * Resolution order:
     *  - %Core.LexerImpl (or the deprecated prototype argument) is an object:
     *    it is returned as-is.
     *  - It is null: 'DirectLex' is picked when %Core.MaintainLineNumbers is
     *    true, or when it is null and %Core.CollectErrors is set; otherwise
     *    'DOMLex' when the DOMDocument class exists, else 'DirectLex'.
     *  - It is one of the strings 'DOMLex', 'DirectLex' or 'PH5P': the
     *    matching class is instantiated.
     *
     * Any other value raises an E_USER_ERROR; if the error handler lets
     * execution continue, null is returned.
     *
     * @param HTMLPurifier_Config|mixed $config Configuration object.
     *        Passing anything else is the deprecated "prototype" calling
     *        convention and raises an E_USER_WARNING.
     * @return HTMLPurifier_Lexer|object|null
     */
    public static function create($config)
    {
        if ($config instanceof HTMLPurifier_Config) {
            $lexer = $config->get('Core', 'LexerImpl');
        } else {
            // Deprecated calling convention: the argument itself is the lexer.
            $lexer = $config;
            trigger_error(
                "Passing a prototype to\n" .
                "HTMLPurifier_Lexer::create() is deprecated, please instead\n" .
                "use %Core.LexerImpl",
                E_USER_WARNING
            );
        }

        if (is_object($lexer)) {
            return $lexer;
        }

        if (is_null($lexer)) {
            // No explicit implementation requested: auto-detect.
            // NOTE(review): on the deprecated path $config may itself be
            // null here, in which case the ->get() call below fails; this
            // matches the previous behavior.
            $line_numbers = $config->get('Core', 'MaintainLineNumbers');
            if ($line_numbers === true
                || ($line_numbers === null && $config->get('Core', 'CollectErrors'))
            ) {
                // Presumably DirectLex is the implementation that can track
                // line numbers -- confirm against the concrete lexers.
                $lexer = 'DirectLex';
            } elseif (class_exists('DOMDocument')) {
                $lexer = 'DOMLex';
            } else {
                $lexer = 'DirectLex';
            }
        }

        switch ($lexer) {
            case 'DOMLex':
                return new HTMLPurifier_Lexer_DOMLex();
            case 'DirectLex':
                return new HTMLPurifier_Lexer_DirectLex();
            case 'PH5P':
                return new HTMLPurifier_Lexer_PH5P();
            default:
                trigger_error("Cannot instantiate unrecognized Lexer type " . htmlspecialchars($lexer), E_USER_ERROR);
        }
    }

    /**
     * Sets up the entity parser used by parseData() and normalize().
     */
    public function __construct()
    {
        $this->_entity_parser = new HTMLPurifier_EntityParser();
    }

    /**
     * Fast-path lookup table mapping the most common special entities to
     * their literal characters; applied with strtr() in parseData().
     *
     * NOTE(review): the keys were recovered from a listing in which the
     * entities had been decoded to their literal characters. The three
     * apostrophe spellings below are a reconstruction -- confirm them.
     */
    protected $_special_entity2str =
        array(
            '&quot;' => '"',
            '&amp;'  => '&',
            '&lt;'   => '<',
            '&gt;'   => '>',
            '&#39;'  => "'",
            '&#039;' => "'",
            '&#x27;' => "'"
        );

    /**
     * Decodes special entities in a chunk of character data.
     *
     * Strings without any ampersand that could start an entity are returned
     * untouched. Otherwise the common entities are replaced via the lookup
     * table, and the entity parser is only invoked when ampersands remain
     * that the table cannot account for.
     *
     * @param string $string Text to decode.
     * @return string Decoded text.
     */
    public function parseData($string)
    {
        // The checks below need at least one character.
        if ($string === '') {
            return '';
        }

        if (!self::countEntityCandidates($string)) {
            return $string; // nothing that could be an entity
        }

        // Each "&amp;" legitimately leaves one "&" behind after the table
        // replacement, so that many leftover ampersands are expected.
        $num_esc_amp = substr_count($string, '&amp;');
        $string = strtr($string, $this->_special_entity2str);

        if (self::countEntityCandidates($string) <= $num_esc_amp) {
            return $string;
        }

        // More ampersands remain than "&amp;" explains: hand the string to
        // the entity parser.
        return $this->_entity_parser->substituteSpecialEntities($string);
    }

    /**
     * Counts ampersands that could begin an entity: every "&" except those
     * followed by a space and one sitting at the very end of the string.
     *
     * @param string $string
     * @return int
     */
    private static function countEntityCandidates($string)
    {
        return substr_count($string, '&')
            - substr_count($string, '& ')
            - (substr($string, -1) === '&' ? 1 : 0);
    }

    /**
     * Placeholder for the tokenizer; always raises an E_USER_ERROR here.
     *
     * @param string $string
     * @param mixed  $config
     * @param mixed  $context
     */
    public function tokenizeHTML($string, $config, $context)
    {
        trigger_error('Call to abstract class', E_USER_ERROR);
    }

    /**
     * Replaces every <![CDATA[...]]> section with its contents run through
     * htmlspecialchars().
     *
     * @param string $string HTML to process.
     * @return string
     */
    protected static function escapeCDATA($string)
    {
        return preg_replace_callback(
            '/<!\[CDATA\[(.+?)\]\]>/s',
            array('HTMLPurifier_Lexer', 'CDATACallback'),
            $string
        );
    }

    /**
     * Same as escapeCDATA(), but for CDATA sections wrapped in the
     * comment-hiding idiom.
     *
     * NOTE(review): this pattern is a reconstruction; the listing it was
     * recovered from cut the literal off at its first double slash. Confirm
     * it against the authoritative copy of this file.
     *
     * @param string $string HTML to process.
     * @return string
     */
    protected static function escapeCommentedCDATA($string)
    {
        return preg_replace_callback(
            '#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s',
            array('HTMLPurifier_Lexer', 'CDATACallback'),
            $string
        );
    }

    /**
     * preg_replace_callback() handler for the CDATA helpers.
     *
     * @param array $matches Regex matches; index 1 holds the CDATA contents.
     * @return string Contents with HTML special characters escaped.
     */
    protected static function CDATACallback($matches)
    {
        // Double quotes are escaped, single quotes are left alone (ENT_COMPAT).
        return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8');
    }

    /**
     * Prepares raw HTML for tokenizing.
     *
     * Steps, in order: optional body extraction
     * (%Core.ConvertDocumentToFragment), newline normalization to "\n",
     * escaping of commented CDATA (only when %HTML.Trusted is set), escaping
     * of plain CDATA, substitution of non-special entities, and UTF-8
     * cleaning.
     *
     * @param string $html    Raw HTML.
     * @param mixed  $config  Configuration object providing get().
     * @param mixed  $context Not used by this method.
     * @return string Normalized HTML.
     */
    public function normalize($html, $config, $context)
    {
        if ($config->get('Core', 'ConvertDocumentToFragment')) {
            $html = $this->extractBody($html);
        }

        // Normalize newlines: CRLF first, so that a lone CR pass does not
        // turn one CRLF into two line breaks.
        $html = str_replace("\r\n", "\n", $html);
        $html = str_replace("\r", "\n", $html);

        if ($config->get('HTML', 'Trusted')) {
            $html = $this->escapeCommentedCDATA($html);
        }

        $html = $this->escapeCDATA($html);

        $html = $this->_entity_parser->substituteNonSpecialEntities($html);

        $html = HTMLPurifier_Encoder::cleanUTF8($html);

        return $html;
    }

    /**
     * Returns the contents of the first <body>...</body> element, or the
     * input unchanged when no body element with non-empty contents is found.
     *
     * @param string $html
     * @return string
     */
    public function extractBody($html)
    {
        $matches = array();
        if (preg_match('!<body[^>]*>(.+?)</body>!is', $html, $matches)) {
            return $matches[1];
        }
        return $html;
    }

}
00272