00001 <?php
00002
/**
 * Lexer that parses markup with the bundled HTML5 tokenizer/tree builder and
 * then converts the resulting DOM into tokens via tokenizeDOM().
 */
class HTMLPurifier_Lexer_PH5P extends HTMLPurifier_Lexer_DOMLex
{
    /**
     * Turns an HTML string into a list of tokens.
     *
     * The input is passed through normalize() and wrapHTML(), parsed by
     * HTML5, and the first <div> inside the first <body> of the resulting
     * document is handed to tokenizeDOM(). If a DOMException is thrown while
     * parsing, the exception is registered in the context under 'PH5PError'
     * and the original, unmodified $html is tokenized by
     * HTMLPurifier_Lexer_DirectLex instead.
     *
     * @param string $html    Markup to tokenize.
     * @param mixed  $config  Passed through unchanged to normalize(),
     *                        wrapHTML() and the fallback lexer.
     * @param mixed  $context Passed through unchanged; receives 'PH5PError'
     *                        when the fallback is used.
     * @return array Tokens filled in by tokenizeDOM(), or the fallback
     *               lexer's result.
     */
    public function tokenizeHTML($html, $config, $context)
    {
        $prepared = $this->wrapHTML(
            $this->normalize($html, $config, $context),
            $config,
            $context
        );

        try {
            $builder = new HTML5($prepared);
            $document = $builder->save();
        } catch (DOMException $failure) {
            // Parsing failed: remember why, then fall back to DirectLex on
            // the caller's original markup.
            $fallback = new HTMLPurifier_Lexer_DirectLex();
            $context->register('PH5PError', $failure);
            return $fallback->tokenizeHTML($html, $config, $context);
        }

        // Descend html -> body -> div, taking the first match at each level.
        $htmlNode = $document->getElementsByTagName('html')->item(0);
        $bodyNode = $htmlNode->getElementsByTagName('body')->item(0);
        $wrapper = $bodyNode->getElementsByTagName('div')->item(0);

        $collected = array();
        $this->tokenizeDOM($wrapper, $collected);
        return $collected;
    }
}
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050
00051
00052
00053
00054
00055
00056
00057
00058
00059
00060
00061
00062
00063 class HTML5 {
// Input markup; assigned by the constructor after CR/CRLF pre-processing.
00064 private $data;
// Offset of the current character in $data; the constructor starts it at -1
// and state handlers advance it before reading.
00065 private $char;
// Length of $data (strlen), used as the end-of-input position.
00066 private $EOF;
// Name of the current tokenizer state; the constructor calls the method
// "<state>State" repeatedly until this becomes null.
00067 private $state;
// HTML5TreeConstructer instance created by the constructor.
00068 private $tree;
// Token currently being assembled by the state handlers (tag, comment or
// doctype) before it is passed to emitToken().
00069 private $token;
// One of the PCDATA/RCDATA/CDATA/PLAINTEXT constants; starts as PCDATA.
00070 private $content_model;
// Escape flag consulted by dataState() in RCDATA/CDATA content: while true,
// "<" is not treated as the start of a tag.
00071 private $escape = false;
// Entity names that entity() looks up with in_array(). Names are listed with
// a trailing ";" and, for some of them, also without it.
00072 private $entities = array('AElig;','AElig','AMP;','AMP','Aacute;','Aacute',
00073 'Acirc;','Acirc','Agrave;','Agrave','Alpha;','Aring;','Aring','Atilde;',
00074 'Atilde','Auml;','Auml','Beta;','COPY;','COPY','Ccedil;','Ccedil','Chi;',
00075 'Dagger;','Delta;','ETH;','ETH','Eacute;','Eacute','Ecirc;','Ecirc','Egrave;',
00076 'Egrave','Epsilon;','Eta;','Euml;','Euml','GT;','GT','Gamma;','Iacute;',
00077 'Iacute','Icirc;','Icirc','Igrave;','Igrave','Iota;','Iuml;','Iuml','Kappa;',
00078 'LT;','LT','Lambda;','Mu;','Ntilde;','Ntilde','Nu;','OElig;','Oacute;',
00079 'Oacute','Ocirc;','Ocirc','Ograve;','Ograve','Omega;','Omicron;','Oslash;',
00080 'Oslash','Otilde;','Otilde','Ouml;','Ouml','Phi;','Pi;','Prime;','Psi;',
00081 'QUOT;','QUOT','REG;','REG','Rho;','Scaron;','Sigma;','THORN;','THORN',
00082 'TRADE;','Tau;','Theta;','Uacute;','Uacute','Ucirc;','Ucirc','Ugrave;',
00083 'Ugrave','Upsilon;','Uuml;','Uuml','Xi;','Yacute;','Yacute','Yuml;','Zeta;',
00084 'aacute;','aacute','acirc;','acirc','acute;','acute','aelig;','aelig',
00085 'agrave;','agrave','alefsym;','alpha;','amp;','amp','and;','ang;','apos;',
00086 'aring;','aring','asymp;','atilde;','atilde','auml;','auml','bdquo;','beta;',
00087 'brvbar;','brvbar','bull;','cap;','ccedil;','ccedil','cedil;','cedil',
00088 'cent;','cent','chi;','circ;','clubs;','cong;','copy;','copy','crarr;',
00089 'cup;','curren;','curren','dArr;','dagger;','darr;','deg;','deg','delta;',
00090 'diams;','divide;','divide','eacute;','eacute','ecirc;','ecirc','egrave;',
00091 'egrave','empty;','emsp;','ensp;','epsilon;','equiv;','eta;','eth;','eth',
00092 'euml;','euml','euro;','exist;','fnof;','forall;','frac12;','frac12',
00093 'frac14;','frac14','frac34;','frac34','frasl;','gamma;','ge;','gt;','gt',
00094 'hArr;','harr;','hearts;','hellip;','iacute;','iacute','icirc;','icirc',
00095 'iexcl;','iexcl','igrave;','igrave','image;','infin;','int;','iota;',
00096 'iquest;','iquest','isin;','iuml;','iuml','kappa;','lArr;','lambda;','lang;',
00097 'laquo;','laquo','larr;','lceil;','ldquo;','le;','lfloor;','lowast;','loz;',
00098 'lrm;','lsaquo;','lsquo;','lt;','lt','macr;','macr','mdash;','micro;','micro',
00099 'middot;','middot','minus;','mu;','nabla;','nbsp;','nbsp','ndash;','ne;',
00100 'ni;','not;','not','notin;','nsub;','ntilde;','ntilde','nu;','oacute;',
00101 'oacute','ocirc;','ocirc','oelig;','ograve;','ograve','oline;','omega;',
00102 'omicron;','oplus;','or;','ordf;','ordf','ordm;','ordm','oslash;','oslash',
00103 'otilde;','otilde','otimes;','ouml;','ouml','para;','para','part;','permil;',
00104 'perp;','phi;','pi;','piv;','plusmn;','plusmn','pound;','pound','prime;',
00105 'prod;','prop;','psi;','quot;','quot','rArr;','radic;','rang;','raquo;',
00106 'raquo','rarr;','rceil;','rdquo;','real;','reg;','reg','rfloor;','rho;',
00107 'rlm;','rsaquo;','rsquo;','sbquo;','scaron;','sdot;','sect;','sect','shy;',
00108 'shy','sigma;','sigmaf;','sim;','spades;','sub;','sube;','sum;','sup1;',
00109 'sup1','sup2;','sup2','sup3;','sup3','sup;','supe;','szlig;','szlig','tau;',
00110 'there4;','theta;','thetasym;','thinsp;','thorn;','thorn','tilde;','times;',
00111 'times','trade;','uArr;','uacute;','uacute','uarr;','ucirc;','ucirc',
00112 'ugrave;','ugrave','uml;','uml','upsih;','upsilon;','uuml;','uuml','weierp;',
00113 'xi;','yacute;','yacute','yen;','yen','yuml;','yuml','zeta;','zwj;','zwnj;');
00114
// Content model flags stored in $content_model; the state handlers branch on
// them to decide whether "&" and "<" are significant.
00115 const PCDATA = 0;
00116 const RCDATA = 1;
00117 const CDATA = 2;
00118 const PLAINTEXT = 3;
00119
// Values used for the 'type' key of tokens passed to emitToken().
// CHARACTR (sic) is the spelling used throughout the class.
00120 const DOCTYPE = 0;
00121 const STARTTAG = 1;
00122 const ENDTAG = 2;
00123 const COMMENT = 3;
00124 const CHARACTR = 4;
00125 const EOF = 5;
00126
/**
 * Tokenizes the given markup immediately: the constructor initialises the
 * tokenizer state and then runs the state machine until the current state
 * becomes null.
 *
 * Line endings are pre-processed first: every CRLF pair becomes LF and any
 * remaining lone CR is removed.
 *
 * @param string $data Markup to tokenize.
 */
public function __construct($data) {
    // The replacement for a lone CR is '' rather than null: passing null as
    // str_replace()'s replacement is deprecated as of PHP 8.1, and '' removes
    // the character in exactly the same way. The pairs are applied in order,
    // so CRLF is handled before lone CR.
    $data = str_replace(array("\r\n", "\r"), array("\n", ''), $data);

    $this->data = $data;
    $this->char = -1;
    $this->EOF = strlen($data);
    $this->tree = new HTML5TreeConstructer;
    $this->content_model = self::PCDATA;

    $this->state = 'data';

    // Each state is handled by the method named "<state>State".
    while ($this->state !== null) {
        $this->{$this->state . 'State'}();
    }
}
00143
/**
 * Returns whatever the tree constructor's save() produces for the input that
 * was tokenized by the constructor.
 *
 * @return mixed Result of calling save() on $this->tree.
 */
public function save()
{
    $result = $this->tree->save();

    return $result;
}
00147
/**
 * Reads the character at the current cursor position.
 *
 * @return string|false The character at offset $this->char, or false once
 *                      the cursor has reached or passed the end of the input.
 */
private function char()
{
    if ($this->char >= $this->EOF) {
        return false;
    }

    return $this->data[$this->char];
}
00153
/**
 * Reads from the input at an arbitrary offset without moving the cursor.
 *
 * @param int $s Zero-based offset to read from.
 * @param int $l Number of characters to read; 0 (the default) reads a
 *               single character.
 * @return string|null The requested text, or null when the request does not
 *                     lie completely inside the input.
 */
private function character($s, $l = 0) {
    // A negative offset would be interpreted as "count from the end of the
    // string", which is never what a look-behind before the start means.
    if ($s < 0) {
        return null;
    }

    if ($l === 0) {
        return ($s < $this->EOF) ? $this->data[$s] : null;
    }

    // A slice of $l characters starting at $s ends at offset $s + $l, which
    // may legitimately equal the input length. The previous strict "<"
    // rejected text that ran right up to the end of the input.
    return ($s + $l <= $this->EOF) ? substr($this->data, $s, $l) : null;
}
00163
/**
 * Returns the longest run of characters from a character class that begins
 * at the given offset. The cursor is not moved.
 *
 * @param string $char_class Body of a PCRE character class (the text that
 *                           goes between "[" and "]"), e.g. 'A-Za-z' or '^>'.
 * @param int    $start      Offset in the input at which the run must begin.
 * @return string The matched run, or '' when the character at $start is not
 *                in the class or $start lies beyond the end of the input.
 */
private function characters($char_class, $start) {
    // substr() can return false for an out-of-range start on older PHP
    // versions; normalise that to an empty string.
    $rest = (string) substr($this->data, $start);

    // The earlier preg_replace('#^([...]+).*#s', '\\1', ...) left its subject
    // untouched when nothing matched, so callers received the entire
    // remainder of the input instead of an empty run.
    if (preg_match('#^[' . $char_class . ']+#', $rest, $run)) {
        return $run[0];
    }

    return '';
}
00167
/**
 * Data state: consumes the next character (or a run of ordinary text) and
 * either emits it as character data or switches to the state that deals
 * with "&" or "<".
 *
 * In RCDATA/CDATA content the escape flag is maintained here: it is set when
 * a "-" completes "<!--" and cleared when a ">" completes "-->". While it is
 * set, "<" does not open a tag.
 *
 * NOTE(review): the final branch consumes everything up to the next "<" or
 * "&" in one go, so "-" and ">" inside such a run never reach the escape
 * checks; they only run when one of those characters is the first character
 * handled by a call.
 */
private function dataState() {
    $this->char++;
    $char = $this->char();

    $rawText = $this->content_model === self::RCDATA
        || $this->content_model === self::CDATA;

    if ($char === '&' && ($this->content_model === self::PCDATA || $this->content_model === self::RCDATA)) {
        $this->state = 'entityData';

    } elseif ($char === '-') {
        // The four characters ENDING at the cursor must spell "<!--". The
        // earlier check inspected the four characters before the cursor
        // (and could start at offset -1), so it fired one character late.
        if ($rawText && $this->escape === false
            && $this->char >= 3
            && substr($this->data, $this->char - 3, 4) === '<!--') {
            $this->escape = true;
        }

        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => $char
        ));

    } elseif ($char === '<' && ($this->content_model === self::PCDATA ||
        ($rawText && $this->escape === false))) {
        $this->state = 'tagOpen';

    } elseif ($char === '>') {
        // The three characters ENDING at the cursor must spell "-->". The
        // earlier check read three characters starting at ">", which can
        // never equal "-->", so the flag could not be cleared.
        if ($rawText && $this->escape === true
            && $this->char >= 2
            && substr($this->data, $this->char - 2, 3) === '-->') {
            $this->escape = false;
        }

        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => $char
        ));

    } elseif ($this->char === $this->EOF) {
        $this->EOF();

    } elseif ($this->content_model === self::PLAINTEXT) {
        // PLAINTEXT: the whole remainder of the input is one text token.
        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => substr($this->data, $this->char)
        ));

        $this->EOF();

    } else {
        // Take everything up to the next "<" or "&" as a single text token
        // and leave the cursor on the last character of that run.
        $len = strcspn($this->data, '<&', $this->char);
        $char = substr($this->data, $this->char, $len);
        $this->char += $len - 1;

        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => $char
        ));

        $this->state = 'data';
    }
}
00267
/**
 * Entity data state: tries to consume an entity after "&" in text content
 * and emits the result as character data, then returns to the data state.
 *
 * When entity() does not produce a replacement, a literal "&" is emitted.
 */
private function entityDataState() {
    $entity = $this->entity();

    // Do not rely on plain truthiness: a replacement consisting of the
    // single character "0" is falsy in PHP and would be reported as "&".
    // NOTE(review): assumes entity() signals failure with a falsy value
    // other than the string "0" — confirm against entity().
    $resolved = ($entity === '0' || $entity);

    $this->emitToken(array(
        'type' => self::CHARACTR,
        'data' => $resolved ? $entity : '&'
    ));

    $this->state = 'data';
}
00283
/**
 * Tag open state: entered after the data state has seen "<"; decides what,
 * if anything, that "<" introduces.
 *
 * In RCDATA/CDATA content only "</" is significant; otherwise the "<" is
 * emitted as text. In PCDATA content the following character selects
 * between a markup declaration ("!"), an end tag ("/"), a start tag (a
 * letter), the literal text "<>", a bogus comment ("?") or a literal "<".
 */
private function tagOpenState()
{
    switch ($this->content_model) {
        case self::RCDATA:
        case self::CDATA:
            // Peek only; the cursor moves just when "</" is confirmed.
            $slashFollows = $this->character($this->char + 1) === '/';

            if (!$slashFollows) {
                $this->emitToken(array(
                    'type' => self::CHARACTR,
                    'data' => '<'
                ));
                $this->state = 'data';
                break;
            }

            $this->char++;
            $this->state = 'closeTagOpen';
            break;

        case self::PCDATA:
            $this->char++;
            $next = $this->char();

            if ($next === '!') {
                $this->state = 'markupDeclarationOpen';
                break;
            }

            if ($next === '/') {
                $this->state = 'closeTagOpen';
                break;
            }

            if (preg_match('/^[A-Za-z]$/', $next)) {
                // First letter of a start tag name, stored lower-cased.
                $this->token = array(
                    'name' => strtolower($next),
                    'type' => self::STARTTAG,
                    'attr' => array()
                );
                $this->state = 'tagName';
                break;
            }

            if ($next === '>') {
                // "<>" is passed through as text.
                $this->emitToken(array(
                    'type' => self::CHARACTR,
                    'data' => '<>'
                ));
                $this->state = 'data';
                break;
            }

            if ($next === '?') {
                $this->state = 'bogusComment';
                break;
            }

            // Anything else: the "<" is text, and the character just read is
            // handed back so the data state processes it again.
            $this->emitToken(array(
                'type' => self::CHARACTR,
                'data' => '<'
            ));
            $this->char--;
            $this->state = 'data';
            break;
    }
}
00368
/**
 * Close tag open state: entered after "</".
 *
 * In RCDATA/CDATA content the "</" is emitted as text unless the letters
 * that follow equal the nodeName of the last entry on the tree
 * constructor's stack and are followed by whitespace, ">" or "/".
 * Otherwise the next character decides between the start of an end tag
 * name, an ignored "</>", end of input, or a bogus comment.
 */
private function closeTagOpenState()
{
    $candidate = strtolower($this->characters('A-Za-z', $this->char + 1));
    $matchesCurrent = count($this->tree->stack) > 0
        && $candidate === end($this->tree->stack)->nodeName;

    $rawText = $this->content_model === self::RCDATA
        || $this->content_model === self::CDATA;

    // The name test is only evaluated when the candidate matches the
    // current node, exactly as short-circuiting dictates.
    if ($rawText && (
        !$matchesCurrent
        || !preg_match('/[\t\n\x0b\x0c >\/]/', $this->character($this->char + 1 + strlen($candidate)))
        || $this->EOF === $this->char
    )) {
        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => '</'
        ));
        $this->state = 'data';
        return;
    }

    $this->char++;
    $next = $this->char();

    if (preg_match('/^[A-Za-z]$/', $next)) {
        // First letter of an end tag name, stored lower-cased.
        $this->token = array(
            'name' => strtolower($next),
            'type' => self::ENDTAG
        );
        $this->state = 'tagName';
        return;
    }

    if ($next === '>') {
        // "</>" produces no token at all.
        $this->state = 'data';
        return;
    }

    if ($this->char === $this->EOF) {
        // Input ended after "</": emit it as text and step back so the data
        // state sees the end of input.
        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => '</'
        ));
        $this->char--;
        $this->state = 'data';
        return;
    }

    $this->state = 'bogusComment';
}
00441
/**
 * Tag name state: accumulates the name of the current start or end tag.
 *
 * Whitespace or "/" ends the name and moves on to attributes, ">" emits the
 * tag, end of input emits the tag and returns to the data state, and any
 * other character is lower-cased and appended to the name.
 */
private function tagNameState()
{
    $this->char++;
    $current = $this->character($this->char);

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $current) || $current === '/') {
        $this->state = 'beforeAttributeName';
        return;
    }

    if ($current === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    if ($this->char === $this->EOF) {
        // Step back so the data state encounters the end of input itself.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    $this->token['name'] .= strtolower($current);
    $this->state = 'tagName';
}
00485
/**
 * Before attribute name state: skips whitespace and "/" inside a tag until
 * an attribute name starts or the tag ends.
 *
 * ">" emits the tag; end of input emits the tag and returns to the data
 * state; any other character starts a new attribute whose name begins with
 * that character (lower-cased) and whose value is null.
 */
private function beforeAttributeNameState()
{
    $this->char++;
    $current = $this->character($this->char);

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $current) || $current === '/') {
        $this->state = 'beforeAttributeName';
        return;
    }

    if ($current === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    if ($this->char === $this->EOF) {
        // Step back so the data state encounters the end of input itself.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    $this->token['attr'][] = array(
        'name' => strtolower($current),
        'value' => null
    );
    $this->state = 'attributeName';
}
00534
/**
 * Attribute name state: accumulates the name of the most recently added
 * attribute of the current tag.
 *
 * Whitespace ends the name, "=" introduces a value, ">" emits the tag, a
 * "/" that is not directly followed by ">" returns to looking for another
 * attribute, and end of input emits the tag. Every other character
 * (including a "/" that is followed by ">") is lower-cased and appended to
 * the name.
 */
private function attributeNameState()
{
    $this->char++;
    $current = $this->character($this->char);

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $current)) {
        $this->state = 'afterAttributeName';
        return;
    }

    if ($current === '=') {
        $this->state = 'beforeAttributeValue';
        return;
    }

    if ($current === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    if ($current === '/' && $this->character($this->char + 1) !== '>') {
        $this->state = 'beforeAttributeName';
        return;
    }

    if ($this->char === $this->EOF) {
        // Step back so the data state encounters the end of input itself.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    $newest = count($this->token['attr']) - 1;
    $this->token['attr'][$newest]['name'] .= strtolower($current);
    $this->state = 'attributeName';
}
00585
/**
 * After attribute name state: entered when whitespace followed an attribute
 * name; waits for "=", the end of the tag, or the next attribute.
 *
 * Further whitespace is skipped, "=" introduces a value, ">" emits the tag,
 * a "/" that is not directly followed by ">" returns to looking for another
 * attribute, and end of input emits the tag. Any other character starts a
 * new attribute with a null value.
 */
private function afterAttributeNameState()
{
    $this->char++;
    $current = $this->character($this->char);

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $current)) {
        $this->state = 'afterAttributeName';
        return;
    }

    if ($current === '=') {
        $this->state = 'beforeAttributeValue';
        return;
    }

    if ($current === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    if ($current === '/' && $this->character($this->char + 1) !== '>') {
        $this->state = 'beforeAttributeName';
        return;
    }

    if ($this->char === $this->EOF) {
        // Step back so the data state encounters the end of input itself.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    $this->token['attr'][] = array(
        'name' => strtolower($current),
        'value' => null
    );
    $this->state = 'attributeName';
}
00639
/**
 * Before attribute value state: entered after "=" in a tag; decides how the
 * attribute value is delimited.
 *
 * Whitespace is skipped; a double or single quote starts a quoted value;
 * "&" is handed back so the unquoted-value state processes it; ">" emits
 * the tag; end of input emits the tag and returns to the data state; any
 * other character becomes the first character of an unquoted value.
 */
private function beforeAttributeValueState() {
    $this->char++;
    $char = $this->character($this->char);

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        $this->state = 'beforeAttributeValue';

    } elseif ($char === '"') {
        $this->state = 'attributeValueDoubleQuoted';

    } elseif ($char === '&') {
        // Re-read the "&" in the unquoted-value state.
        $this->char--;
        $this->state = 'attributeValueUnquoted';

    } elseif ($char === '\'') {
        $this->state = 'attributeValueSingleQuoted';

    } elseif ($char === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif ($this->char >= $this->EOF) {
        // End of input right after "=": finish the tag the same way the
        // sibling attribute states do, instead of treating the missing
        // character as the start of an unquoted value. The cursor is placed
        // on the last character so the data state reaches the end of input
        // on its next step.
        $this->emitToken($this->token);
        $this->char = $this->EOF - 1;
        $this->state = 'data';

    } else {
        $last = count($this->token['attr']) - 1;
        $this->token['attr'][$last]['value'] .= $char;

        $this->state = 'attributeValueUnquoted';
    }
}
00687
/**
 * Attribute value (double-quoted) state: accumulates the value of the most
 * recently added attribute until the closing double quote.
 *
 * "&" is delegated to entityInAttributeValueState(); end of input emits the
 * tag and returns to the data state; any other character is appended to the
 * value.
 */
private function attributeValueDoubleQuotedState()
{
    $this->char++;
    $current = $this->character($this->char);

    if ($current === '"') {
        $this->state = 'beforeAttributeName';
        return;
    }

    if ($current === '&') {
        $this->entityInAttributeValueState('double');
        return;
    }

    if ($this->char === $this->EOF) {
        // Step back so the data state encounters the end of input itself.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    $newest = count($this->token['attr']) - 1;
    $this->token['attr'][$newest]['value'] .= $current;
    $this->state = 'attributeValueDoubleQuoted';
}
00722
/**
 * Attribute value (single-quoted) state: accumulates the value of the most
 * recently added attribute until the closing single quote.
 *
 * "&" is delegated to entityInAttributeValueState(); end of input emits the
 * tag and returns to the data state; any other character is appended to the
 * value.
 */
private function attributeValueSingleQuotedState()
{
    $this->char++;
    $current = $this->character($this->char);

    if ($current === '\'') {
        $this->state = 'beforeAttributeName';
        return;
    }

    if ($current === '&') {
        $this->entityInAttributeValueState('single');
        return;
    }

    if ($this->char === $this->EOF) {
        // Step back so the data state encounters the end of input itself.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    $newest = count($this->token['attr']) - 1;
    $this->token['attr'][$newest]['value'] .= $current;
    $this->state = 'attributeValueSingleQuoted';
}
00757
/**
 * Attribute value (unquoted) state: accumulates the value of the most
 * recently added attribute until whitespace or ">".
 *
 * "&" is delegated to entityInAttributeValueState(); end of input emits the
 * tag and returns to the data state; any other character is appended to the
 * value.
 */
private function attributeValueUnquotedState() {
    $this->char++;
    $char = $this->character($this->char);

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        $this->state = 'beforeAttributeName';

    } elseif ($char === '&') {
        $this->entityInAttributeValueState();

    } elseif ($char === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif ($this->char >= $this->EOF) {
        // Without this branch an unterminated unquoted value (for example
        // input ending in "<a href=x") kept this state active forever: past
        // the end of the input no character can ever be whitespace or ">".
        // ">=" is used because the cursor may already be beyond the end
        // when this state is entered. The cursor is placed on the last
        // character so the data state reaches the end of input on its next
        // step.
        $this->emitToken($this->token);
        $this->char = $this->EOF - 1;
        $this->state = 'data';

    } else {
        $last = count($this->token['attr']) - 1;
        $this->token['attr'][$last]['value'] .= $char;

        $this->state = 'attributeValueUnquoted';
    }
}
00793
/**
 * Entity in attribute value: tries to consume an entity after "&" inside an
 * attribute value and appends the result to the value of the most recently
 * added attribute. The tokenizer state is left unchanged, so the calling
 * attribute-value state continues afterwards.
 *
 * Callers in the quoted-value states pass a 'double'/'single' hint; this
 * method declares no parameters and ignores it.
 *
 * When entity() does not produce a replacement, a literal "&" is appended.
 */
private function entityInAttributeValueState() {
    $entity = $this->entity();

    // Do not rely on plain truthiness: a replacement consisting of the
    // single character "0" is falsy in PHP and would be stored as "&".
    // NOTE(review): assumes entity() signals failure with a falsy value
    // other than the string "0" — confirm against entity().
    $char = ($entity === '0' || $entity)
        ? $entity
        : '&';

    $last = count($this->token['attr']) - 1;
    $this->token['attr'][$last]['value'] .= $char;
}
00808
/**
 * Bogus comment state: turns malformed markup into a comment token.
 *
 * The text returned by characters('^>', ...) from the cursor position
 * becomes the comment's data, the cursor is advanced by the length of that
 * text, and the tokenizer returns to the data state. If that lands the
 * cursor exactly on the end of the input, it is moved back by one so the
 * data state reaches the end of input on its next step.
 */
private function bogusCommentState()
{
    $comment = $this->characters('^>', $this->char);

    $token = array(
        'data' => $comment,
        'type' => self::COMMENT
    );
    $this->emitToken($token);

    $this->char += strlen($comment);
    $this->state = 'data';

    if ($this->char === $this->EOF) {
        $this->char = $this->EOF - 1;
    }
}
00834
/**
 * Markup declaration open state: entered after "<!".
 *
 * "--" starts a comment (an empty comment token is prepared), the
 * case-insensitive word "doctype" starts a DOCTYPE, and anything else is
 * handled as a bogus comment. The cursor is moved past whatever was
 * recognised.
 */
private function markupDeclarationOpenState()
{
    $lookahead = $this->char + 1;

    if ($this->character($lookahead, 2) === '--') {
        $this->token = array(
            'data' => null,
            'type' => self::COMMENT
        );
        $this->char += 2;
        $this->state = 'comment';
        return;
    }

    if (strtolower($this->character($lookahead, 7)) === 'doctype') {
        $this->char += 7;
        $this->state = 'doctype';
        return;
    }

    $this->char++;
    $this->state = 'bogusComment';
}
00862
/**
 * Comment state: accumulates comment text.
 *
 * "-" may begin the closing "-->" and switches to the comment dash state;
 * end of input emits the comment and returns to the data state; any other
 * character is appended to the comment's data.
 */
private function commentState()
{
    $this->char++;
    $current = $this->char();

    if ($current === '-') {
        $this->state = 'commentDash';
        return;
    }

    if ($this->char === $this->EOF) {
        // Step back so the data state encounters the end of input itself.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    $this->token['data'] .= $current;
}
00888
/**
 * Comment dash state: entered after a single "-" inside a comment.
 *
 * A second "-" moves to the comment end state; end of input emits the
 * comment and returns to the data state; any other character means the dash
 * was ordinary text, so "-" plus that character is appended to the comment
 * and the comment state resumes.
 */
private function commentDashState()
{
    $this->char++;
    $current = $this->char();

    if ($current === '-') {
        $this->state = 'commentEnd';
        return;
    }

    if ($this->char === $this->EOF) {
        // Step back so the data state encounters the end of input itself.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    $this->token['data'] .= '-' . $current;
    $this->state = 'comment';
}
00915
/**
 * Comment end state: entered after "--" inside a comment.
 *
 * ">" emits the comment; a further "-" adds one dash to the comment and
 * stays in this state; end of input emits the comment and returns to the
 * data state; any other character means the dashes were ordinary text, so
 * "--" plus that character is appended and the comment state resumes.
 */
private function commentEndState()
{
    $this->char++;
    $current = $this->char();

    if ($current === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    if ($current === '-') {
        $this->token['data'] .= '-';
        return;
    }

    if ($this->char === $this->EOF) {
        // Step back so the data state encounters the end of input itself.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    $this->token['data'] .= '--' . $current;
    $this->state = 'comment';
}
00938
/**
 * DOCTYPE state: entered after "<!doctype".
 *
 * One whitespace character is consumed if present; any other character is
 * handed back. Either way the before DOCTYPE name state follows.
 */
private function doctypeState()
{
    $this->char++;

    if (!preg_match('/^[\t\n\x0b\x0c ]$/', $this->char())) {
        // Not whitespace: let the next state read this character again.
        $this->char--;
    }

    $this->state = 'beforeDoctypeName';
}
00952
/**
 * Before DOCTYPE name state: skips whitespace until the DOCTYPE name starts.
 *
 * ">" or end of input emits a DOCTYPE token with a null name and the error
 * flag set. Any other character starts a new DOCTYPE token (error flag set)
 * whose name begins with that character; lower-case ASCII letters are
 * upper-cased.
 */
private function beforeDoctypeNameState()
{
    $this->char++;
    $current = $this->char();

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $current)) {
        // Leading whitespace is ignored; stay in this state.
        return;
    }

    $atEnd = $this->char === $this->EOF;

    if ($current === '>' || $atEnd) {
        $this->emitToken(array(
            'name' => null,
            'type' => self::DOCTYPE,
            'error' => true
        ));

        if ($atEnd) {
            // Step back so the data state encounters the end of input itself.
            $this->char--;
        }

        $this->state = 'data';
        return;
    }

    $firstLetter = preg_match('/^[a-z]$/', $current)
        ? strtoupper($current)
        : $current;

    $this->token = array(
        'name' => $firstLetter,
        'type' => self::DOCTYPE,
        'error' => true
    );
    $this->state = 'doctypeName';
}
00999
/**
 * DOCTYPE name state: accumulates the DOCTYPE name.
 *
 * Whitespace ends the name, ">" emits the token, end of input emits the
 * token and returns to the data state, lower-case ASCII letters are appended
 * upper-cased and every other character is appended unchanged. After each
 * step the token's error flag is recomputed: it is false only while the
 * name is exactly "HTML".
 */
private function doctypeNameState() {
    $this->char++;
    $char = $this->char();

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        // Spelled like every other state name. The former 'AfterDoctypeName'
        // only resolved to afterDoctypeNameState() because PHP looks up
        // method names case-insensitively.
        $this->state = 'afterDoctypeName';

    } elseif ($char === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif (preg_match('/^[a-z]$/', $char)) {
        $this->token['name'] .= strtoupper($char);

    } elseif ($this->char === $this->EOF) {
        // Step back so the data state encounters the end of input itself.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';

    } else {
        $this->token['name'] .= $char;
    }

    $this->token['error'] = ($this->token['name'] !== 'HTML');
}
01028
/**
 * After DOCTYPE name state: entered when whitespace followed the DOCTYPE
 * name.
 *
 * Further whitespace is skipped, ">" emits the token, end of input emits the
 * token and returns to the data state, and any other character flags the
 * token as erroneous and switches to the bogus DOCTYPE state.
 */
private function afterDoctypeNameState()
{
    $this->char++;
    $current = $this->char();

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $current)) {
        // Trailing whitespace is ignored; stay in this state.
        return;
    }

    if ($current === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    if ($this->char === $this->EOF) {
        // Step back so the data state encounters the end of input itself.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    $this->token['error'] = true;
    $this->state = 'bogusDoctype';
}
01051
/**
 * Bogus DOCTYPE state: discards characters until the DOCTYPE ends.
 *
 * ">" emits the token; end of input emits the token and returns to the data
 * state; every other character is skipped.
 */
private function bogusDoctypeState()
{
    $this->char++;
    $current = $this->char();

    if ($current === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    if ($this->char === $this->EOF) {
        // Step back so the data state encounters the end of input itself.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
    }

    // Any other character is ignored and the state stays the same.
}
01070
01071 private function entity() {
01072 $start = $this->char;
01073
01074
01075
01076
01077
01078
01079
01080 switch($this->character($this->char + 1)) {
01081
01082 case '#':
01083
01084
01085
01086 switch($this->character($this->char + 1)) {
01087
01088
01089 case 'x':
01090 case 'X':
01091
01092
01093
01094
01095
01096
01097 $char = 1;
01098 $char_class = '0-9A-Fa-f';
01099 break;
01100
01101
01102 default:
01103
01104
01105
01106 $char = 0;
01107 $char_class = '0-9';
01108 break;
01109 }
01110
01111
01112
01113 $this->char++;
01114 $e_name = $this->characters($char_class, $this->char + $char + 1);
01115 $entity = $this->character($start, $this->char);
01116 $cond = strlen($e_name) > 0;
01117
01118
01119 break;
01120
01121
01122 default:
01123
01124
01125
01126 $e_name = $this->characters('0-9A-Za-z;', $this->char + 1);
01127 $len = strlen($e_name);
01128
01129 for($c = 1; $c <= $len; $c++) {
01130 $id = substr($e_name, 0, $c);
01131 $this->char++;
01132
01133 if(in_array($id, $this->entities)) {
01134 if ($e_name[$c-1] !== ';') {
01135 if ($c < $len && $e_name[$c] == ';') {
01136 $this->char++;
01137 }
01138 }
01139 $entity = $id;
01140 break;
01141 }
01142 }
01143
01144 $cond = isset($entity);
01145
01146 break;