HTMLPurifier 4.4.0
|
<?php

/**
 * Lexer that tokenizes HTML directly with PHP string functions, scanning
 * for '<' and '>' and turning what lies between them into token objects
 * (HTMLPurifier_Token_Text, _Comment, _Start, _End and _Empty).
 *
 * normalize() and parseData() are called on $this but are not defined in
 * this class; they are expected to come from the parent HTMLPurifier_Lexer.
 */
class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
{

    /**
     * Flag advertising that tokens produced here can carry line numbers
     * (tokenizeHTML() records them through rawPosition() when enabled).
     */
    public $tracksLineNumbers = true;

    /**
     * Bytes treated as whitespace inside a tag: space, tab, CR and LF.
     */
    protected $_whitespace = "\x20\x09\x0D\x0A";

    /**
     * preg_replace_callback() handler that escapes the contents of a
     * <script> element.
     *
     * @param array $matches Regex groups: [1] opening tag, [2] script body,
     *                       [3] closing tag.
     * @return string The element with its body passed through
     *                htmlspecialchars().
     */
    protected function scriptCallback($matches) {
        return $matches[1] . htmlspecialchars($matches[2], ENT_COMPAT, 'UTF-8') . $matches[3];
    }

    /**
     * Splits an HTML string into a flat list of tokens.
     *
     * While running, the current line and column are registered in $context
     * as 'CurrentLine' and 'CurrentCol'; both are destroyed before returning.
     * They hold real numbers only when line tracking is enabled through
     * Core.MaintainLineNumbers (or, if that is null, Core.CollectErrors);
     * otherwise they are false.
     *
     * @param string $html    HTML to tokenize.
     * @param object $config  Configuration object; read through get().
     * @param object $context Context object; used through register(),
     *                        get() and destroy().
     * @return array List of HTMLPurifier_Token_* objects in document order.
     */
    public function tokenizeHTML($html, $config, $context) {

        // special normalization for script tags without any armor
        // our "armor" heuristic is a < sign any number of whitespaces after
        // the first script tag
        if ($config->get('HTML.Trusted')) {
            $html = preg_replace_callback('#(<script[^>]*>)(\s*[^<].+?)(</script>)#si',
                array($this, 'scriptCallback'), $html);
        }

        $html = $this->normalize($html, $config, $context);

        $cursor = 0; // our location in the text
        $inside_tag = false; // whether or not we're parsing the inside of a tag
        $array = array(); // result array

        // This is also treated to mean maintain *column* numbers too
        $maintain_line_numbers = $config->get('Core.MaintainLineNumbers');

        if ($maintain_line_numbers === null) {
            // automatically determine line numbering by checking
            // if error collection is on
            $maintain_line_numbers = $config->get('Core.CollectErrors');
        }

        if ($maintain_line_numbers) {
            $current_line = 1;
            $current_col = 0;
            $length = strlen($html);
        } else {
            $current_line = false;
            $current_col = false;
            $length = false;
        }
        $context->register('CurrentLine', $current_line);
        $context->register('CurrentCol', $current_col);
        $nl = "\n";
        // how often to manually recalculate. This will ALWAYS be right,
        // but it's pretty wasteful. Set to 0 to turn off
        $synchronize_interval = $config->get('Core.DirectLexLineNumberSyncInterval');

        $e = false;
        if ($config->get('Core.CollectErrors')) {
            $e =& $context->get('ErrorCollector');
        }

        // for testing synchronization
        $loops = 0;

        while (++$loops) {

            // $cursor is either at the start of a token, or inside of
            // a tag (i.e. there was a < immediately before it), as indicated
            // by $inside_tag

            if ($maintain_line_numbers) {

                // $rcursor, however, is always at the start of a token.
                $rcursor = $cursor - (int) $inside_tag;

                // Column number is cheap, so we calculate it every round.
                // We're interested at the *end* of the newline string, so
                // we need to add strlen($nl) == 1 to $nl_pos before subtracting it
                // from our "rcursor" position.
                $nl_pos = strrpos($html, $nl, $rcursor - $length);
                $current_col = $rcursor - (is_bool($nl_pos) ? 0 : $nl_pos + 1);

                // recalculate lines
                if (
                    $synchronize_interval &&  // synchronization is on
                    $cursor > 0 &&            // cursor is further than zero
                    $loops % $synchronize_interval === 0 // time to synchronize!
                ) {
                    $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
                }

            }

            $position_next_lt = strpos($html, '<', $cursor);
            $position_next_gt = strpos($html, '>', $cursor);

            // triggers on "<b>asdf</b>" but not "asdf <b></b>"
            // special case to set up context
            if ($position_next_lt === $cursor) {
                $inside_tag = true;
                $cursor++;
            }

            if (!$inside_tag && $position_next_lt !== false) {
                // We are not inside tag and there still is another tag to parse
                $token = new HTMLPurifier_Token_Text(
                    $this->parseData(
                        substr($html, $cursor, $position_next_lt - $cursor)
                    )
                );
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                    $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
                }
                $array[] = $token;
                $cursor = $position_next_lt + 1;
                $inside_tag = true;
                continue;
            } elseif (!$inside_tag) {
                // We are not inside tag but there are no more tags
                // If we're already at the end, break
                if ($cursor === strlen($html)) break;
                // Create Text of rest of string
                $token = new HTMLPurifier_Token_Text(
                    $this->parseData(
                        substr($html, $cursor)
                    )
                );
                if ($maintain_line_numbers) $token->rawPosition($current_line, $current_col);
                $array[] = $token;
                break;
            } elseif ($inside_tag && $position_next_gt !== false) {
                // We are in tag and it is well formed
                // Grab the internals of the tag
                $strlen_segment = $position_next_gt - $cursor;

                if ($strlen_segment < 1) {
                    // there's nothing to process!
                    // NOTE(review): this token is built but never appended to
                    // $array, and $inside_tag stays true, so the text after
                    // an empty "<>" is lexed as tag contents. Left unchanged
                    // because fixing it alters lexer output; confirm intent.
                    $token = new HTMLPurifier_Token_Text('<');
                    $cursor++;
                    continue;
                }

                $segment = substr($html, $cursor, $strlen_segment);

                if ($segment === false) {
                    // somehow, we attempted to access beyond the end of
                    // the string, defense-in-depth, reported by Nate Abele
                    break;
                }

                // Check if it's a comment
                if (
                    substr($segment, 0, 3) === '!--'
                ) {
                    // re-determine segment length, looking for -->
                    $position_comment_end = strpos($html, '-->', $cursor);
                    if ($position_comment_end === false) {
                        // uh oh, we have a comment that extends to
                        // infinity. Can't be helped: set comment
                        // end position to end of string
                        if ($e) $e->send(E_WARNING, 'Lexer: Unclosed comment');
                        $position_comment_end = strlen($html);
                        $end = true;
                    } else {
                        $end = false;
                    }
                    $strlen_segment = $position_comment_end - $cursor;
                    $segment = substr($html, $cursor, $strlen_segment);
                    $token = new HTMLPurifier_Token_Comment(
                        substr($segment, 3, $strlen_segment - 3)
                    );
                    if ($maintain_line_numbers) {
                        $token->rawPosition($current_line, $current_col);
                        $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
                    }
                    $array[] = $token;
                    // skip the closing "-->" only if it was actually found
                    $cursor = $end ? $position_comment_end : $position_comment_end + 3;
                    $inside_tag = false;
                    continue;
                }

                // Check if it's an end tag
                $is_end_tag = (strpos($segment, '/') === 0);
                if ($is_end_tag) {
                    $type = substr($segment, 1);
                    $token = new HTMLPurifier_Token_End($type);
                    if ($maintain_line_numbers) {
                        $token->rawPosition($current_line, $current_col);
                        $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                    }
                    $array[] = $token;
                    $inside_tag = false;
                    $cursor = $position_next_gt + 1;
                    continue;
                }

                // Check leading character is alnum, if not, we may
                // have accidentally grabbed an emoticon. Translate into
                // text and go our merry way
                if (!ctype_alpha($segment[0])) {
                    // XML:  $segment[0] !== '_' && $segment[0] !== ':'
                    if ($e) $e->send(E_NOTICE, 'Lexer: Unescaped lt');
                    $token = new HTMLPurifier_Token_Text('<');
                    if ($maintain_line_numbers) {
                        $token->rawPosition($current_line, $current_col);
                        $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                    }
                    $array[] = $token;
                    // $cursor already sits just past the '<', so the rest of
                    // the segment is re-read as ordinary text next round
                    $inside_tag = false;
                    continue;
                }

                // Check if it is explicitly self closing, if so, remove
                // trailing slash. Remember, we could have a tag like <br>, so
                // any later token processing scripts must convert improperly
                // classified EmptyTags from StartTags.
                $is_self_closing = (strrpos($segment, '/') === $strlen_segment - 1);
                if ($is_self_closing) {
                    $strlen_segment--;
                    $segment = substr($segment, 0, $strlen_segment);
                }

                // Check if there are any attributes
                $position_first_space = strcspn($segment, $this->_whitespace);

                if ($position_first_space >= $strlen_segment) {
                    if ($is_self_closing) {
                        $token = new HTMLPurifier_Token_Empty($segment);
                    } else {
                        $token = new HTMLPurifier_Token_Start($segment);
                    }
                    if ($maintain_line_numbers) {
                        $token->rawPosition($current_line, $current_col);
                        $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                    }
                    $array[] = $token;
                    $inside_tag = false;
                    $cursor = $position_next_gt + 1;
                    continue;
                }

                // Grab out all the data
                $type = substr($segment, 0, $position_first_space);
                $attribute_string = trim(
                    substr($segment, $position_first_space)
                );
                if ($attribute_string) {
                    $attr = $this->parseAttributeString(
                        $attribute_string, $config, $context
                    );
                } else {
                    $attr = array();
                }

                if ($is_self_closing) {
                    $token = new HTMLPurifier_Token_Empty($type, $attr);
                } else {
                    $token = new HTMLPurifier_Token_Start($type, $attr);
                }
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                    $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                }
                $array[] = $token;
                $cursor = $position_next_gt + 1;
                $inside_tag = false;
                continue;
            } else {
                // inside tag, but there's no ending > sign
                if ($e) $e->send(E_WARNING, 'Lexer: Missing gt');
                $token = new HTMLPurifier_Token_Text(
                    '<' .
                    $this->parseData(
                        substr($html, $cursor)
                    )
                );
                if ($maintain_line_numbers) $token->rawPosition($current_line, $current_col);
                // no cursor scroll? Hmm...
                $array[] = $token;
                break;
            }
            break;
        }

        $context->destroy('CurrentLine');
        $context->destroy('CurrentCol');
        return $array;
    }

    /**
     * Counts occurrences of $needle in the $length bytes of $haystack that
     * start at $offset.
     *
     * On PHP older than 5.1 the range is cut out with substr() first and
     * substr_count() is called on the copy; otherwise the offset and length
     * are passed straight to substr_count().
     *
     * @param string $haystack String to search in.
     * @param string $needle   Substring to count.
     * @param int    $offset   Start of the range.
     * @param int    $length   Size of the range.
     * @return int Number of occurrences found in the range.
     */
    protected function substrCount($haystack, $needle, $offset, $length) {
        static $oldVersion;
        if ($oldVersion === null) {
            // computed once per request and cached in the static
            $oldVersion = version_compare(PHP_VERSION, '5.1', '<');
        }
        if ($oldVersion) {
            $haystack = substr($haystack, $offset, $length);
            return substr_count($haystack, $needle);
        } else {
            return substr_count($haystack, $needle, $offset, $length);
        }
    }

    /**
     * Parses the attribute portion of a tag (everything after the tag name)
     * into an associative array.
     *
     * A boolean attribute (a name with no "=value") maps to its own name.
     * Values are passed through parseData(). When Core.CollectErrors is on,
     * problems are reported to the 'ErrorCollector' taken from $context.
     *
     * @param string $string  Attribute string, e.g. 'href="x" title=y'.
     * @param object $config  Configuration object; read through get().
     * @param object $context Context object; read through get().
     * @return array Map of attribute name => attribute value.
     */
    public function parseAttributeString($string, $config, $context) {
        $string = (string) $string; // quick typecast

        if ($string == '') return array(); // no attributes

        $e = false;
        if ($config->get('Core.CollectErrors')) {
            $e =& $context->get('ErrorCollector');
        }

        // let's see if we can abort as quickly as possible
        // one equal sign, no spaces => one attribute
        // NOTE(review): only U+0020 is tested here, and a space at offset 0
        // also counts as "no space"; tab/CR/LF separators therefore take the
        // fast path as well. Left unchanged; confirm whether that is intended.
        $num_equal = substr_count($string, '=');
        $has_space = strpos($string, ' ');
        if ($num_equal === 0 && !$has_space) {
            // bool attribute
            return array($string => $string);
        } elseif ($num_equal === 1 && !$has_space) {
            // only one attribute
            list($key, $quoted_value) = explode('=', $string);
            $quoted_value = trim($quoted_value);
            if (!$key) {
                if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
                return array();
            }
            if (!$quoted_value) return array($key => '');
            $first_char = @$quoted_value[0];
            $last_char  = @$quoted_value[strlen($quoted_value) - 1];

            $same_quote = ($first_char == $last_char);
            $open_quote = ($first_char == '"' || $first_char == "'");

            if ($same_quote && $open_quote) {
                // well behaved
                $value = substr($quoted_value, 1, strlen($quoted_value) - 2);
            } else {
                // not well behaved
                if ($open_quote) {
                    if ($e) $e->send(E_ERROR, 'Lexer: Missing end quote');
                    $value = substr($quoted_value, 1);
                } else {
                    $value = $quoted_value;
                }
            }
            if ($value === false) $value = '';
            return array($key => $this->parseData($value));
        }

        // setup loop environment
        $array  = array(); // return assoc array of attributes
        $cursor = 0; // current position in string (moves forward)
        $size   = strlen($string); // size of the string (stays the same)

        // if we have unquoted attributes, the parser expects a terminating
        // space, so let's guarantee that there's always a terminating space.
        $string .= ' ';

        while (true) {

            if ($cursor >= $size) {
                break;
            }

            // skip any whitespace in front of the key
            $cursor += strspn($string, $this->_whitespace, $cursor);
            // grab the key

            $key_begin = $cursor; //we're currently at the start of the key

            // scroll past all characters that are the key (not whitespace or =)
            $cursor += strcspn($string, $this->_whitespace . '=', $cursor);

            $key_end = $cursor; // now at the end of the key

            $key = substr($string, $key_begin, $key_end - $key_begin);

            if (!$key) {
                if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
                // Always step past the character that stopped us before
                // skipping ahead to the next whitespace. Without the
                // unconditional step the cursor does not move when that
                // character is directly followed by whitespace (e.g. the
                // input "= ="), and this loop never terminates.
                $cursor++;
                if ($cursor < $size) {
                    $cursor += strcspn($string, $this->_whitespace, $cursor);
                }
                continue; // empty key
            }

            // scroll past all whitespace
            $cursor += strspn($string, $this->_whitespace, $cursor);

            if ($cursor >= $size) {
                $array[$key] = $key;
                break;
            }

            // if the next character is an equal sign, we've got a regular
            // pair, otherwise, it's a bool attribute
            $first_char = @$string[$cursor];

            if ($first_char == '=') {
                // key="value"

                $cursor++;
                $cursor += strspn($string, $this->_whitespace, $cursor);

                if ($cursor === false) {
                    $array[$key] = '';
                    break;
                }

                // we might be in front of a quote right now

                $char = @$string[$cursor];

                if ($char == '"' || $char == "'") {
                    // it's quoted, end bound is $char
                    $cursor++;
                    $value_begin = $cursor;
                    $cursor = strpos($string, $char, $cursor);
                    $value_end = $cursor;
                } else {
                    // it's not quoted, end bound is whitespace
                    $value_begin = $cursor;
                    $cursor += strcspn($string, $this->_whitespace, $cursor);
                    $value_end = $cursor;
                }

                // we reached a premature end
                // (strpos() found no closing quote and returned false)
                if ($cursor === false) {
                    $cursor = $size;
                    $value_end = $cursor;
                }

                $value = substr($string, $value_begin, $value_end - $value_begin);
                if ($value === false) $value = '';
                $array[$key] = $this->parseData($value);
                $cursor++;

            } else {
                // boolattr
                if ($key !== '') {
                    $array[$key] = $key;
                } else {
                    // purely theoretical
                    if ($e) $e->send(E_ERROR, 'Lexer: Missing attribute key');
                }

            }
        }
        return $array;
    }

}

// vim: et sw=4 sts=4