library/HTMLPurifier/Lexer/PH5P.php

Go to the documentation of this file.
00001 <?php
00002 
class HTMLPurifier_Lexer_PH5P extends HTMLPurifier_Lexer_DOMLex {

    /**
     * Tokenizes HTML by parsing it with the HTML5 class and walking the
     * resulting DOM.
     *
     * The input is first passed through normalize() and wrapHTML(). If
     * building or saving the HTML5 parse raises a DOMException, the exception
     * is registered in $context under 'PH5PError' and the ORIGINAL $html is
     * tokenized with HTMLPurifier_Lexer_DirectLex instead.
     *
     * @param string $html    HTML to tokenize
     * @param mixed  $config  Passed through to normalize(), wrapHTML() and the fallback lexer
     * @param mixed  $context Passed through likewise; receives 'PH5PError' on failure
     * @return array Tokens filled in by tokenizeDOM(), or the fallback lexer's result
     */
    public function tokenizeHTML($html, $config, $context) {
        $prepared = $this->wrapHTML(
            $this->normalize($html, $config, $context),
            $config,
            $context
        );

        try {
            $html5    = new HTML5($prepared);
            $document = $html5->save();
        } catch (DOMException $error) {
            // Parsing failed: hand the untouched input to DirectLex and keep
            // the exception around so callers can detect what happened.
            $fallback = new HTMLPurifier_Lexer_DirectLex();
            $context->register('PH5PError', $error);
            return $fallback->tokenizeHTML($html, $config, $context);
        }

        // Walk down <html> -> <body> -> first <div>
        // (presumably the wrapper added by wrapHTML() -- confirm in the parent class).
        $html_node = $document->getElementsByTagName('html')->item(0);
        $body_node = $html_node->getElementsByTagName('body')->item(0);
        $div_node  = $body_node->getElementsByTagName('div')->item(0);

        $tokens = array();
        $this->tokenizeDOM($div_node, $tokens);
        return $tokens;
    }

}
00037 
00038 /*
00039 
00040 Copyright 2007 Jeroen van der Meer <http://jero.net/> 
00041 
00042 Permission is hereby granted, free of charge, to any person obtaining a 
00043 copy of this software and associated documentation files (the 
00044 "Software"), to deal in the Software without restriction, including 
00045 without limitation the rights to use, copy, modify, merge, publish, 
00046 distribute, sublicense, and/or sell copies of the Software, and to 
00047 permit persons to whom the Software is furnished to do so, subject to 
00048 the following conditions: 
00049 
00050 The above copyright notice and this permission notice shall be included 
00051 in all copies or substantial portions of the Software. 
00052 
00053 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 
00054 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
00055 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
00056 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
00057 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
00058 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
00059 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
00060 
00061 */
00062 
00063 class HTML5 {
00064     private $data;
00065     private $char;
00066     private $EOF;
00067     private $state;
00068     private $tree;
00069     private $token;
00070     private $content_model;
00071     private $escape = false;
00072     private $entities = array('AElig;','AElig','AMP;','AMP','Aacute;','Aacute',
00073     'Acirc;','Acirc','Agrave;','Agrave','Alpha;','Aring;','Aring','Atilde;',
00074     'Atilde','Auml;','Auml','Beta;','COPY;','COPY','Ccedil;','Ccedil','Chi;',
00075     'Dagger;','Delta;','ETH;','ETH','Eacute;','Eacute','Ecirc;','Ecirc','Egrave;',
00076     'Egrave','Epsilon;','Eta;','Euml;','Euml','GT;','GT','Gamma;','Iacute;',
00077     'Iacute','Icirc;','Icirc','Igrave;','Igrave','Iota;','Iuml;','Iuml','Kappa;',
00078     'LT;','LT','Lambda;','Mu;','Ntilde;','Ntilde','Nu;','OElig;','Oacute;',
00079     'Oacute','Ocirc;','Ocirc','Ograve;','Ograve','Omega;','Omicron;','Oslash;',
00080     'Oslash','Otilde;','Otilde','Ouml;','Ouml','Phi;','Pi;','Prime;','Psi;',
00081     'QUOT;','QUOT','REG;','REG','Rho;','Scaron;','Sigma;','THORN;','THORN',
00082     'TRADE;','Tau;','Theta;','Uacute;','Uacute','Ucirc;','Ucirc','Ugrave;',
00083     'Ugrave','Upsilon;','Uuml;','Uuml','Xi;','Yacute;','Yacute','Yuml;','Zeta;',
00084     'aacute;','aacute','acirc;','acirc','acute;','acute','aelig;','aelig',
00085     'agrave;','agrave','alefsym;','alpha;','amp;','amp','and;','ang;','apos;',
00086     'aring;','aring','asymp;','atilde;','atilde','auml;','auml','bdquo;','beta;',
00087     'brvbar;','brvbar','bull;','cap;','ccedil;','ccedil','cedil;','cedil',
00088     'cent;','cent','chi;','circ;','clubs;','cong;','copy;','copy','crarr;',
00089     'cup;','curren;','curren','dArr;','dagger;','darr;','deg;','deg','delta;',
00090     'diams;','divide;','divide','eacute;','eacute','ecirc;','ecirc','egrave;',
00091     'egrave','empty;','emsp;','ensp;','epsilon;','equiv;','eta;','eth;','eth',
00092     'euml;','euml','euro;','exist;','fnof;','forall;','frac12;','frac12',
00093     'frac14;','frac14','frac34;','frac34','frasl;','gamma;','ge;','gt;','gt',
00094     'hArr;','harr;','hearts;','hellip;','iacute;','iacute','icirc;','icirc',
00095     'iexcl;','iexcl','igrave;','igrave','image;','infin;','int;','iota;',
00096     'iquest;','iquest','isin;','iuml;','iuml','kappa;','lArr;','lambda;','lang;',
00097     'laquo;','laquo','larr;','lceil;','ldquo;','le;','lfloor;','lowast;','loz;',
00098     'lrm;','lsaquo;','lsquo;','lt;','lt','macr;','macr','mdash;','micro;','micro',
00099     'middot;','middot','minus;','mu;','nabla;','nbsp;','nbsp','ndash;','ne;',
00100     'ni;','not;','not','notin;','nsub;','ntilde;','ntilde','nu;','oacute;',
00101     'oacute','ocirc;','ocirc','oelig;','ograve;','ograve','oline;','omega;',
00102     'omicron;','oplus;','or;','ordf;','ordf','ordm;','ordm','oslash;','oslash',
00103     'otilde;','otilde','otimes;','ouml;','ouml','para;','para','part;','permil;',
00104     'perp;','phi;','pi;','piv;','plusmn;','plusmn','pound;','pound','prime;',
00105     'prod;','prop;','psi;','quot;','quot','rArr;','radic;','rang;','raquo;',
00106     'raquo','rarr;','rceil;','rdquo;','real;','reg;','reg','rfloor;','rho;',
00107     'rlm;','rsaquo;','rsquo;','sbquo;','scaron;','sdot;','sect;','sect','shy;',
00108     'shy','sigma;','sigmaf;','sim;','spades;','sub;','sube;','sum;','sup1;',
00109     'sup1','sup2;','sup2','sup3;','sup3','sup;','supe;','szlig;','szlig','tau;',
00110     'there4;','theta;','thetasym;','thinsp;','thorn;','thorn','tilde;','times;',
00111     'times','trade;','uArr;','uacute;','uacute','uarr;','ucirc;','ucirc',
00112     'ugrave;','ugrave','uml;','uml','upsih;','upsilon;','uuml;','uuml','weierp;',
00113     'xi;','yacute;','yacute','yen;','yen','yuml;','yuml','zeta;','zwj;','zwnj;');
00114 
00115     const PCDATA    = 0;
00116     const RCDATA    = 1;
00117     const CDATA     = 2;
00118     const PLAINTEXT = 3;
00119 
00120     const DOCTYPE  = 0;
00121     const STARTTAG = 1;
00122     const ENDTAG   = 2;
00123     const COMMENT  = 3;
00124     const CHARACTR = 4;
00125     const EOF      = 5;
00126 
00127     public function __construct($data) {
00128         $data = str_replace("\r\n", "\n", $data);
00129         $data = str_replace("\r", null, $data);
00130 
00131         $this->data = $data;
00132         $this->char = -1;
00133         $this->EOF  = strlen($data);
00134         $this->tree = new HTML5TreeConstructer;
00135         $this->content_model = self::PCDATA;
00136 
00137         $this->state = 'data';
00138 
00139         while($this->state !== null) {
00140             $this->{$this->state.'State'}();
00141         }
00142     }
00143 
00144     public function save() {
00145         return $this->tree->save();
00146     }
00147 
00148     private function char() {
00149         return ($this->char < $this->EOF)
00150             ? $this->data[$this->char]
00151             : false;
00152     }
00153 
00154     private function character($s, $l = 0) {
00155         if($s + $l < $this->EOF) {
00156             if($l === 0) {
00157                 return $this->data[$s];
00158             } else {
00159                 return substr($this->data, $s, $l);
00160             }
00161         }
00162     }
00163 
00164     private function characters($char_class, $start) {
00165         return preg_replace('#^(['.$char_class.']+).*#s', '\\1', substr($this->data, $start));
00166     }
00167 
00168     private function dataState() {
00169         // Consume the next input character
00170         $this->char++;
00171         $char = $this->char();
00172 
00173         if($char === '&' && ($this->content_model === self::PCDATA || $this->content_model === self::RCDATA)) {
00174             /* U+0026 AMPERSAND (&)
00175             When the content model flag is set to one of the PCDATA or RCDATA
00176             states: switch to the entity data state. Otherwise: treat it as per
00177             the "anything else"    entry below. */
00178             $this->state = 'entityData';
00179 
00180         } elseif($char === '-') {
00181             /* If the content model flag is set to either the RCDATA state or
00182             the CDATA state, and the escape flag is false, and there are at
00183             least three characters before this one in the input stream, and the
00184             last four characters in the input stream, including this one, are
00185             U+003C LESS-THAN SIGN, U+0021 EXCLAMATION MARK, U+002D HYPHEN-MINUS,
00186             and U+002D HYPHEN-MINUS ("<!--"), then set the escape flag to true. */
00187             if(($this->content_model === self::RCDATA || $this->content_model ===
00188             self::CDATA) && $this->escape === false &&
00189             $this->char >= 3 && $this->character($this->char - 4, 4) === '<!--') {
00190                 $this->escape = true;
00191             }
00192 
00193             /* In any case, emit the input character as a character token. Stay
00194             in the data state. */
00195             $this->emitToken(array(
00196                 'type' => self::CHARACTR,
00197                 'data' => $char
00198             ));
00199 
00200         /* U+003C LESS-THAN SIGN (<) */
00201         } elseif($char === '<' && ($this->content_model === self::PCDATA ||
00202         (($this->content_model === self::RCDATA ||
00203         $this->content_model === self::CDATA) && $this->escape === false))) {
00204             /* When the content model flag is set to the PCDATA state: switch
00205             to the tag open state.
00206 
00207             When the content model flag is set to either the RCDATA state or
00208             the CDATA state and the escape flag is false: switch to the tag
00209             open state.
00210 
00211             Otherwise: treat it as per the "anything else" entry below. */
00212             $this->state = 'tagOpen';
00213 
00214         /* U+003E GREATER-THAN SIGN (>) */
00215         } elseif($char === '>') {
00216             /* If the content model flag is set to either the RCDATA state or
00217             the CDATA state, and the escape flag is true, and the last three
00218             characters in the input stream including this one are U+002D
00219             HYPHEN-MINUS, U+002D HYPHEN-MINUS, U+003E GREATER-THAN SIGN ("-->"),
00220             set the escape flag to false. */
00221             if(($this->content_model === self::RCDATA ||
00222             $this->content_model === self::CDATA) && $this->escape === true &&
00223             $this->character($this->char, 3) === '-->') {
00224                 $this->escape = false;
00225             }
00226 
00227             /* In any case, emit the input character as a character token.
00228             Stay in the data state. */
00229             $this->emitToken(array(
00230                 'type' => self::CHARACTR,
00231                 'data' => $char
00232             ));
00233 
00234         } elseif($this->char === $this->EOF) {
00235             /* EOF
00236             Emit an end-of-file token. */
00237             $this->EOF();
00238 
00239         } elseif($this->content_model === self::PLAINTEXT) {
00240             /* When the content model flag is set to the PLAINTEXT state
00241             THIS DIFFERS GREATLY FROM THE SPEC: Get the remaining characters of
00242             the text and emit it as a character token. */
00243             $this->emitToken(array(
00244                 'type' => self::CHARACTR,
00245                 'data' => substr($this->data, $this->char)
00246             ));
00247 
00248             $this->EOF();
00249 
00250         } else {
00251             /* Anything else
00252             THIS DIFFERS GREATLY FROM THE SPEC: Get as many character that
00253             otherwise would also be treated as a character token and emit it
00254             as a single character token. Stay in the data state. */
00255             $len  = strcspn($this->data, '<&', $this->char);
00256             $char = substr($this->data, $this->char, $len);
00257             $this->char += $len - 1;
00258 
00259             $this->emitToken(array(
00260                 'type' => self::CHARACTR,
00261                 'data' => $char
00262             ));
00263 
00264             $this->state = 'data';
00265         }
00266     }
00267 
00268     private function entityDataState() {
00269         // Attempt to consume an entity.
00270         $entity = $this->entity();
00271 
00272         // If nothing is returned, emit a U+0026 AMPERSAND character token.
00273         // Otherwise, emit the character token that was returned.
00274         $char = (!$entity) ? '&' : $entity;
00275         $this->emitToken(array(
00276             'type' => self::CHARACTR,
00277             'data' => $char
00278         ));
00279 
00280         // Finally, switch to the data state.
00281         $this->state = 'data';
00282     }
00283 
00284     private function tagOpenState() {
00285         switch($this->content_model) {
00286             case self::RCDATA:
00287             case self::CDATA:
00288                 /* If the next input character is a U+002F SOLIDUS (/) character,
00289                 consume it and switch to the close tag open state. If the next
00290                 input character is not a U+002F SOLIDUS (/) character, emit a
00291                 U+003C LESS-THAN SIGN character token and switch to the data
00292                 state to process the next input character. */
00293                 if($this->character($this->char + 1) === '/') {
00294                     $this->char++;
00295                     $this->state = 'closeTagOpen';
00296 
00297                 } else {
00298                     $this->emitToken(array(
00299                         'type' => self::CHARACTR,
00300                         'data' => '<'
00301                     ));
00302 
00303                     $this->state = 'data';
00304                 }
00305             break;
00306 
00307             case self::PCDATA:
00308                 // If the content model flag is set to the PCDATA state
00309                 // Consume the next input character:
00310                 $this->char++;
00311                 $char = $this->char();
00312 
00313                 if($char === '!') {
00314                     /* U+0021 EXCLAMATION MARK (!)
00315                     Switch to the markup declaration open state. */
00316                     $this->state = 'markupDeclarationOpen';
00317 
00318                 } elseif($char === '/') {
00319                     /* U+002F SOLIDUS (/)
00320                     Switch to the close tag open state. */
00321                     $this->state = 'closeTagOpen';
00322 
00323                 } elseif(preg_match('/^[A-Za-z]$/', $char)) {
00324                     /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
00325                     Create a new start tag token, set its tag name to the lowercase
00326                     version of the input character (add 0x0020 to the character's code
00327                     point), then switch to the tag name state. (Don't emit the token
00328                     yet; further details will be filled in before it is emitted.) */
00329                     $this->token = array(
00330                         'name'  => strtolower($char),
00331                         'type'  => self::STARTTAG,
00332                         'attr'  => array()
00333                     );
00334 
00335                     $this->state = 'tagName';
00336 
00337                 } elseif($char === '>') {
00338                     /* U+003E GREATER-THAN SIGN (>)
00339                     Parse error. Emit a U+003C LESS-THAN SIGN character token and a
00340                     U+003E GREATER-THAN SIGN character token. Switch to the data state. */
00341                     $this->emitToken(array(
00342                         'type' => self::CHARACTR,
00343                         'data' => '<>'
00344                     ));
00345 
00346                     $this->state = 'data';
00347 
00348                 } elseif($char === '?') {
00349                     /* U+003F QUESTION MARK (?)
00350                     Parse error. Switch to the bogus comment state. */
00351                     $this->state = 'bogusComment';
00352 
00353                 } else {
00354                     /* Anything else
00355                     Parse error. Emit a U+003C LESS-THAN SIGN character token and
00356                     reconsume the current input character in the data state. */
00357                     $this->emitToken(array(
00358                         'type' => self::CHARACTR,
00359                         'data' => '<'
00360                     ));
00361 
00362                     $this->char--;
00363                     $this->state = 'data';
00364                 }
00365             break;
00366         }
00367     }
00368 
00369     private function closeTagOpenState() {
00370         $next_node = strtolower($this->characters('A-Za-z', $this->char + 1));
00371         $the_same = count($this->tree->stack) > 0 && $next_node === end($this->tree->stack)->nodeName;
00372 
00373         if(($this->content_model === self::RCDATA || $this->content_model === self::CDATA) &&
00374         (!$the_same || ($the_same && (!preg_match('/[\t\n\x0b\x0c >\/]/',
00375         $this->character($this->char + 1 + strlen($next_node))) || $this->EOF === $this->char)))) {
00376             /* If the content model flag is set to the RCDATA or CDATA states then
00377             examine the next few characters. If they do not match the tag name of
00378             the last start tag token emitted (case insensitively), or if they do but
00379             they are not immediately followed by one of the following characters:
00380                 * U+0009 CHARACTER TABULATION
00381                 * U+000A LINE FEED (LF)
00382                 * U+000B LINE TABULATION
00383                 * U+000C FORM FEED (FF)
00384                 * U+0020 SPACE
00385                 * U+003E GREATER-THAN SIGN (>)
00386                 * U+002F SOLIDUS (/)
00387                 * EOF
00388             ...then there is a parse error. Emit a U+003C LESS-THAN SIGN character
00389             token, a U+002F SOLIDUS character token, and switch to the data state
00390             to process the next input character. */
00391             $this->emitToken(array(
00392                 'type' => self::CHARACTR,
00393                 'data' => '</'
00394             ));
00395 
00396             $this->state = 'data';
00397 
00398         } else {
00399             /* Otherwise, if the content model flag is set to the PCDATA state,
00400             or if the next few characters do match that tag name, consume the
00401             next input character: */
00402             $this->char++;
00403             $char = $this->char();
00404 
00405             if(preg_match('/^[A-Za-z]$/', $char)) {
00406                 /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
00407                 Create a new end tag token, set its tag name to the lowercase version
00408                 of the input character (add 0x0020 to the character's code point), then
00409                 switch to the tag name state. (Don't emit the token yet; further details
00410                 will be filled in before it is emitted.) */
00411                 $this->token = array(
00412                     'name'  => strtolower($char),
00413                     'type'  => self::ENDTAG
00414                 );
00415 
00416                 $this->state = 'tagName';
00417 
00418             } elseif($char === '>') {
00419                 /* U+003E GREATER-THAN SIGN (>)
00420                 Parse error. Switch to the data state. */
00421                 $this->state = 'data';
00422 
00423             } elseif($this->char === $this->EOF) {
00424                 /* EOF
00425                 Parse error. Emit a U+003C LESS-THAN SIGN character token and a U+002F
00426                 SOLIDUS character token. Reconsume the EOF character in the data state. */
00427                 $this->emitToken(array(
00428                     'type' => self::CHARACTR,
00429                     'data' => '</'
00430                 ));
00431 
00432                 $this->char--;
00433                 $this->state = 'data';
00434 
00435             } else {
00436                 /* Parse error. Switch to the bogus comment state. */
00437                 $this->state = 'bogusComment';
00438             }
00439         }
00440     }
00441 
00442     private function tagNameState() {
00443         // Consume the next input character:
00444         $this->char++;
00445         $char = $this->character($this->char);
00446 
00447         if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
00448             /* U+0009 CHARACTER TABULATION
00449             U+000A LINE FEED (LF)
00450             U+000B LINE TABULATION
00451             U+000C FORM FEED (FF)
00452             U+0020 SPACE
00453             Switch to the before attribute name state. */
00454             $this->state = 'beforeAttributeName';
00455 
00456         } elseif($char === '>') {
00457             /* U+003E GREATER-THAN SIGN (>)
00458             Emit the current tag token. Switch to the data state. */
00459             $this->emitToken($this->token);
00460             $this->state = 'data';
00461 
00462         } elseif($this->char === $this->EOF) {
00463             /* EOF
00464             Parse error. Emit the current tag token. Reconsume the EOF
00465             character in the data state. */
00466             $this->emitToken($this->token);
00467 
00468             $this->char--;
00469             $this->state = 'data';
00470 
00471         } elseif($char === '/') {
00472             /* U+002F SOLIDUS (/)
00473             Parse error unless this is a permitted slash. Switch to the before
00474             attribute name state. */
00475             $this->state = 'beforeAttributeName';
00476 
00477         } else {
00478             /* Anything else
00479             Append the current input character to the current tag token's tag name.
00480             Stay in the tag name state. */
00481             $this->token['name'] .= strtolower($char);
00482             $this->state = 'tagName';
00483         }
00484     }
00485 
00486     private function beforeAttributeNameState() {
00487         // Consume the next input character:
00488         $this->char++;
00489         $char = $this->character($this->char);
00490 
00491         if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
00492             /* U+0009 CHARACTER TABULATION
00493             U+000A LINE FEED (LF)
00494             U+000B LINE TABULATION
00495             U+000C FORM FEED (FF)
00496             U+0020 SPACE
00497             Stay in the before attribute name state. */
00498             $this->state = 'beforeAttributeName';
00499 
00500         } elseif($char === '>') {
00501             /* U+003E GREATER-THAN SIGN (>)
00502             Emit the current tag token. Switch to the data state. */
00503             $this->emitToken($this->token);
00504             $this->state = 'data';
00505 
00506         } elseif($char === '/') {
00507             /* U+002F SOLIDUS (/)
00508             Parse error unless this is a permitted slash. Stay in the before
00509             attribute name state. */
00510             $this->state = 'beforeAttributeName';
00511 
00512         } elseif($this->char === $this->EOF) {
00513             /* EOF
00514             Parse error. Emit the current tag token. Reconsume the EOF
00515             character in the data state. */
00516             $this->emitToken($this->token);
00517 
00518             $this->char--;
00519             $this->state = 'data';
00520 
00521         } else {
00522             /* Anything else
00523             Start a new attribute in the current tag token. Set that attribute's
00524             name to the current input character, and its value to the empty string.
00525             Switch to the attribute name state. */
00526             $this->token['attr'][] = array(
00527                 'name'  => strtolower($char),
00528                 'value' => null
00529             );
00530 
00531             $this->state = 'attributeName';
00532         }
00533     }
00534 
00535     private function attributeNameState() {
00536         // Consume the next input character:
00537         $this->char++;
00538         $char = $this->character($this->char);
00539 
00540         if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
00541             /* U+0009 CHARACTER TABULATION
00542             U+000A LINE FEED (LF)
00543             U+000B LINE TABULATION
00544             U+000C FORM FEED (FF)
00545             U+0020 SPACE
00546             Stay in the before attribute name state. */
00547             $this->state = 'afterAttributeName';
00548 
00549         } elseif($char === '=') {
00550             /* U+003D EQUALS SIGN (=)
00551             Switch to the before attribute value state. */
00552             $this->state = 'beforeAttributeValue';
00553 
00554         } elseif($char === '>') {
00555             /* U+003E GREATER-THAN SIGN (>)
00556             Emit the current tag token. Switch to the data state. */
00557             $this->emitToken($this->token);
00558             $this->state = 'data';
00559 
00560         } elseif($char === '/' && $this->character($this->char + 1) !== '>') {
00561             /* U+002F SOLIDUS (/)
00562             Parse error unless this is a permitted slash. Switch to the before
00563             attribute name state. */
00564             $this->state = 'beforeAttributeName';
00565 
00566         } elseif($this->char === $this->EOF) {
00567             /* EOF
00568             Parse error. Emit the current tag token. Reconsume the EOF
00569             character in the data state. */
00570             $this->emitToken($this->token);
00571 
00572             $this->char--;
00573             $this->state = 'data';
00574 
00575         } else {
00576             /* Anything else
00577             Append the current input character to the current attribute's name.
00578             Stay in the attribute name state. */
00579             $last = count($this->token['attr']) - 1;
00580             $this->token['attr'][$last]['name'] .= strtolower($char);
00581 
00582             $this->state = 'attributeName';
00583         }
00584     }
00585 
00586     private function afterAttributeNameState() {
00587         // Consume the next input character:
00588         $this->char++;
00589         $char = $this->character($this->char);
00590 
00591         if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
00592             /* U+0009 CHARACTER TABULATION
00593             U+000A LINE FEED (LF)
00594             U+000B LINE TABULATION
00595             U+000C FORM FEED (FF)
00596             U+0020 SPACE
00597             Stay in the after attribute name state. */
00598             $this->state = 'afterAttributeName';
00599 
00600         } elseif($char === '=') {
00601             /* U+003D EQUALS SIGN (=)
00602             Switch to the before attribute value state. */
00603             $this->state = 'beforeAttributeValue';
00604 
00605         } elseif($char === '>') {
00606             /* U+003E GREATER-THAN SIGN (>)
00607             Emit the current tag token. Switch to the data state. */
00608             $this->emitToken($this->token);
00609             $this->state = 'data';
00610 
00611         } elseif($char === '/' && $this->character($this->char + 1) !== '>') {
00612             /* U+002F SOLIDUS (/)
00613             Parse error unless this is a permitted slash. Switch to the
00614             before attribute name state. */
00615             $this->state = 'beforeAttributeName';
00616 
00617         } elseif($this->char === $this->EOF) {
00618             /* EOF
00619             Parse error. Emit the current tag token. Reconsume the EOF
00620             character in the data state. */
00621             $this->emitToken($this->token);
00622 
00623             $this->char--;
00624             $this->state = 'data';
00625 
00626         } else {
00627             /* Anything else
00628             Start a new attribute in the current tag token. Set that attribute's
00629             name to the current input character, and its value to the empty string.
00630             Switch to the attribute name state. */
00631             $this->token['attr'][] = array(
00632                 'name'  => strtolower($char),
00633                 'value' => null
00634             );
00635 
00636             $this->state = 'attributeName';
00637         }
00638     }
00639 
00640     private function beforeAttributeValueState() {
00641         // Consume the next input character:
00642         $this->char++;
00643         $char = $this->character($this->char);
00644 
00645         if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
00646             /* U+0009 CHARACTER TABULATION
00647             U+000A LINE FEED (LF)
00648             U+000B LINE TABULATION
00649             U+000C FORM FEED (FF)
00650             U+0020 SPACE
00651             Stay in the before attribute value state. */
00652             $this->state = 'beforeAttributeValue';
00653 
00654         } elseif($char === '"') {
00655             /* U+0022 QUOTATION MARK (")
00656             Switch to the attribute value (double-quoted) state. */
00657             $this->state = 'attributeValueDoubleQuoted';
00658 
00659         } elseif($char === '&') {
00660             /* U+0026 AMPERSAND (&)
00661             Switch to the attribute value (unquoted) state and reconsume
00662             this input character. */
00663             $this->char--;
00664             $this->state = 'attributeValueUnquoted';
00665 
00666         } elseif($char === '\'') {
00667             /* U+0027 APOSTROPHE (')
00668             Switch to the attribute value (single-quoted) state. */
00669             $this->state = 'attributeValueSingleQuoted';
00670 
00671         } elseif($char === '>') {
00672             /* U+003E GREATER-THAN SIGN (>)
00673             Emit the current tag token. Switch to the data state. */
00674             $this->emitToken($this->token);
00675             $this->state = 'data';
00676 
00677         } else {
00678             /* Anything else
00679             Append the current input character to the current attribute's value.
00680             Switch to the attribute value (unquoted) state. */
00681             $last = count($this->token['attr']) - 1;
00682             $this->token['attr'][$last]['value'] .= $char;
00683 
00684             $this->state = 'attributeValueUnquoted';
00685         }
00686     }
00687 
00688     private function attributeValueDoubleQuotedState() {
00689         // Consume the next input character:
00690         $this->char++;
00691         $char = $this->character($this->char);
00692 
00693         if($char === '"') {
00694             /* U+0022 QUOTATION MARK (")
00695             Switch to the before attribute name state. */
00696             $this->state = 'beforeAttributeName';
00697 
00698         } elseif($char === '&') {
00699             /* U+0026 AMPERSAND (&)
00700             Switch to the entity in attribute value state. */
00701             $this->entityInAttributeValueState('double');
00702 
00703         } elseif($this->char === $this->EOF) {
00704             /* EOF
00705             Parse error. Emit the current tag token. Reconsume the character
00706             in the data state. */
00707             $this->emitToken($this->token);
00708 
00709             $this->char--;
00710             $this->state = 'data';
00711 
00712         } else {
00713             /* Anything else
00714             Append the current input character to the current attribute's value.
00715             Stay in the attribute value (double-quoted) state. */
00716             $last = count($this->token['attr']) - 1;
00717             $this->token['attr'][$last]['value'] .= $char;
00718 
00719             $this->state = 'attributeValueDoubleQuoted';
00720         }
00721     }
00722 
00723     private function attributeValueSingleQuotedState() {
00724         // Consume the next input character:
00725         $this->char++;
00726         $char = $this->character($this->char);
00727 
00728         if($char === '\'') {
00729             /* U+0022 QUOTATION MARK (')
00730             Switch to the before attribute name state. */
00731             $this->state = 'beforeAttributeName';
00732 
00733         } elseif($char === '&') {
00734             /* U+0026 AMPERSAND (&)
00735             Switch to the entity in attribute value state. */
00736             $this->entityInAttributeValueState('single');
00737 
00738         } elseif($this->char === $this->EOF) {
00739             /* EOF
00740             Parse error. Emit the current tag token. Reconsume the character
00741             in the data state. */
00742             $this->emitToken($this->token);
00743 
00744             $this->char--;
00745             $this->state = 'data';
00746 
00747         } else {
00748             /* Anything else
00749             Append the current input character to the current attribute's value.
00750             Stay in the attribute value (single-quoted) state. */
00751             $last = count($this->token['attr']) - 1;
00752             $this->token['attr'][$last]['value'] .= $char;
00753 
00754             $this->state = 'attributeValueSingleQuoted';
00755         }
00756     }
00757 
00758     private function attributeValueUnquotedState() {
00759         // Consume the next input character:
00760         $this->char++;
00761         $char = $this->character($this->char);
00762 
00763         if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
00764             /* U+0009 CHARACTER TABULATION
00765             U+000A LINE FEED (LF)
00766             U+000B LINE TABULATION
00767             U+000C FORM FEED (FF)
00768             U+0020 SPACE
00769             Switch to the before attribute name state. */
00770             $this->state = 'beforeAttributeName';
00771 
00772         } elseif($char === '&') {
00773             /* U+0026 AMPERSAND (&)
00774             Switch to the entity in attribute value state. */
00775             $this->entityInAttributeValueState();
00776 
00777         } elseif($char === '>') {
00778             /* U+003E GREATER-THAN SIGN (>)
00779             Emit the current tag token. Switch to the data state. */
00780             $this->emitToken($this->token);
00781             $this->state = 'data';
00782 
00783         } else {
00784             /* Anything else
00785             Append the current input character to the current attribute's value.
00786             Stay in the attribute value (unquoted) state. */
00787             $last = count($this->token['attr']) - 1;
00788             $this->token['attr'][$last]['value'] .= $char;
00789 
00790             $this->state = 'attributeValueUnquoted';
00791         }
00792     }
00793 
00794     private function entityInAttributeValueState() {
00795         // Attempt to consume an entity.
00796         $entity = $this->entity();
00797 
00798         // If nothing is returned, append a U+0026 AMPERSAND character to the
00799         // current attribute's value. Otherwise, emit the character token that
00800         // was returned.
00801         $char = (!$entity)
00802             ? '&'
00803             : $entity;
00804 
00805         $last = count($this->token['attr']) - 1;
00806         $this->token['attr'][$last]['value'] .= $char;
00807     }
00808 
00809     private function bogusCommentState() {
00810         /* Consume every character up to the first U+003E GREATER-THAN SIGN
00811         character (>) or the end of the file (EOF), whichever comes first. Emit
00812         a comment token whose data is the concatenation of all the characters
00813         starting from and including the character that caused the state machine
00814         to switch into the bogus comment state, up to and including the last
00815         consumed character before the U+003E character, if any, or up to the
00816         end of the file otherwise. (If the comment was started by the end of
00817         the file (EOF), the token is empty.) */
00818         $data = $this->characters('^>', $this->char);
00819         $this->emitToken(array(
00820             'data' => $data,
00821             'type' => self::COMMENT
00822         ));
00823 
00824         $this->char += strlen($data);
00825 
00826         /* Switch to the data state. */
00827         $this->state = 'data';
00828 
00829         /* If the end of the file was reached, reconsume the EOF character. */
00830         if($this->char === $this->EOF) {
00831             $this->char = $this->EOF - 1;
00832         }
00833     }
00834 
00835     private function markupDeclarationOpenState() {
00836         /* If the next two characters are both U+002D HYPHEN-MINUS (-)
00837         characters, consume those two characters, create a comment token whose
00838         data is the empty string, and switch to the comment state. */
00839         if($this->character($this->char + 1, 2) === '--') {
00840             $this->char += 2;
00841             $this->state = 'comment';
00842             $this->token = array(
00843                 'data' => null,
00844                 'type' => self::COMMENT
00845             );
00846 
00847         /* Otherwise if the next seven chacacters are a case-insensitive match
00848         for the word "DOCTYPE", then consume those characters and switch to the
00849         DOCTYPE state. */
00850         } elseif(strtolower($this->character($this->char + 1, 7)) === 'doctype') {
00851             $this->char += 7;
00852             $this->state = 'doctype';
00853 
00854         /* Otherwise, is is a parse error. Switch to the bogus comment state.
00855         The next character that is consumed, if any, is the first character
00856         that will be in the comment. */
00857         } else {
00858             $this->char++;
00859             $this->state = 'bogusComment';
00860         }
00861     }
00862 
00863     private function commentState() {
00864         /* Consume the next input character: */
00865         $this->char++;
00866         $char = $this->char();
00867 
00868         /* U+002D HYPHEN-MINUS (-) */
00869         if($char === '-') {
00870             /* Switch to the comment dash state  */
00871             $this->state = 'commentDash';
00872 
00873         /* EOF */
00874         } elseif($this->char === $this->EOF) {
00875             /* Parse error. Emit the comment token. Reconsume the EOF character
00876             in the data state. */
00877             $this->emitToken($this->token);
00878             $this->char--;
00879             $this->state = 'data';
00880 
00881         /* Anything else */
00882         } else {
00883             /* Append the input character to the comment token's data. Stay in
00884             the comment state. */
00885             $this->token['data'] .= $char;
00886         }
00887     }
00888 
00889     private function commentDashState() {
00890         /* Consume the next input character: */
00891         $this->char++;
00892         $char = $this->char();
00893 
00894         /* U+002D HYPHEN-MINUS (-) */
00895         if($char === '-') {
00896             /* Switch to the comment end state  */
00897             $this->state = 'commentEnd';
00898 
00899         /* EOF */
00900         } elseif($this->char === $this->EOF) {
00901             /* Parse error. Emit the comment token. Reconsume the EOF character
00902             in the data state. */
00903             $this->emitToken($this->token);
00904             $this->char--;
00905             $this->state = 'data';
00906 
00907         /* Anything else */
00908         } else {
00909             /* Append a U+002D HYPHEN-MINUS (-) character and the input
00910             character to the comment token's data. Switch to the comment state. */
00911             $this->token['data'] .= '-'.$char;
00912             $this->state = 'comment';
00913         }
00914     }
00915 
00916     private function commentEndState() {
00917         /* Consume the next input character: */
00918         $this->char++;
00919         $char = $this->char();
00920 
00921         if($char === '>') {
00922             $this->emitToken($this->token);
00923             $this->state = 'data';
00924 
00925         } elseif($char === '-') {
00926             $this->token['data'] .= '-';
00927 
00928         } elseif($this->char === $this->EOF) {
00929             $this->emitToken($this->token);
00930             $this->char--;
00931             $this->state = 'data';
00932 
00933         } else {
00934             $this->token['data'] .= '--'.$char;
00935             $this->state = 'comment';
00936         }
00937     }
00938 
00939     private function doctypeState() {
00940         /* Consume the next input character: */
00941         $this->char++;
00942         $char = $this->char();
00943 
00944         if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
00945             $this->state = 'beforeDoctypeName';
00946 
00947         } else {
00948             $this->char--;
00949             $this->state = 'beforeDoctypeName';
00950         }
00951     }
00952 
00953     private function beforeDoctypeNameState() {
00954         /* Consume the next input character: */
00955         $this->char++;
00956         $char = $this->char();
00957 
00958         if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
00959             // Stay in the before DOCTYPE name state.
00960 
00961         } elseif(preg_match('/^[a-z]$/', $char)) {
00962             $this->token = array(
00963                 'name' => strtoupper($char),
00964                 'type' => self::DOCTYPE,
00965                 'error' => true
00966             );
00967 
00968             $this->state = 'doctypeName';
00969 
00970         } elseif($char === '>') {
00971             $this->emitToken(array(
00972                 'name' => null,
00973                 'type' => self::DOCTYPE,
00974                 'error' => true
00975             ));
00976 
00977             $this->state = 'data';
00978 
00979         } elseif($this->char === $this->EOF) {
00980             $this->emitToken(array(
00981                 'name' => null,
00982                 'type' => self::DOCTYPE,
00983                 'error' => true
00984             ));
00985 
00986             $this->char--;
00987             $this->state = 'data';
00988 
00989         } else {
00990             $this->token = array(
00991                 'name' => $char,
00992                 'type' => self::DOCTYPE,
00993                 'error' => true
00994             );
00995 
00996             $this->state = 'doctypeName';
00997         }
00998     }
00999 
01000     private function doctypeNameState() {
01001         /* Consume the next input character: */
01002         $this->char++;
01003         $char = $this->char();
01004 
01005         if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
01006             $this->state = 'AfterDoctypeName';
01007 
01008         } elseif($char === '>') {
01009             $this->emitToken($this->token);
01010             $this->state = 'data';
01011 
01012         } elseif(preg_match('/^[a-z]$/', $char)) {
01013             $this->token['name'] .= strtoupper($char);
01014 
01015         } elseif($this->char === $this->EOF) {
01016             $this->emitToken($this->token);
01017             $this->char--;
01018             $this->state = 'data';
01019 
01020         } else {
01021             $this->token['name'] .= $char;
01022         }
01023 
01024         $this->token['error'] = ($this->token['name'] === 'HTML')
01025             ? false
01026             : true;
01027     }
01028 
01029     private function afterDoctypeNameState() {
01030         /* Consume the next input character: */
01031         $this->char++;
01032         $char = $this->char();
01033 
01034         if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
01035             // Stay in the DOCTYPE name state.
01036 
01037         } elseif($char === '>') {
01038             $this->emitToken($this->token);
01039             $this->state = 'data';
01040 
01041         } elseif($this->char === $this->EOF) {
01042             $this->emitToken($this->token);
01043             $this->char--;
01044             $this->state = 'data';
01045 
01046         } else {
01047             $this->token['error'] = true;
01048             $this->state = 'bogusDoctype';
01049         }
01050     }
01051 
01052     private function bogusDoctypeState() {
01053         /* Consume the next input character: */
01054         $this->char++;
01055         $char = $this->char();
01056 
01057         if($char === '>') {
01058             $this->emitToken($this->token);
01059             $this->state = 'data';
01060 
01061         } elseif($this->char === $this->EOF) {
01062             $this->emitToken($this->token);
01063             $this->char--;
01064             $this->state = 'data';
01065 
01066         } else {
01067             // Stay in the bogus DOCTYPE state.
01068         }
01069     }
01070 
01071     private function entity() {
01072         $start = $this->char;
01073 
01074         // This section defines how to consume an entity. This definition is
01075         // used when parsing entities in text and in attributes.
01076 
01077         // The behaviour depends on the identity of the next character (the
01078         // one immediately after the U+0026 AMPERSAND character): 
01079 
01080         switch($this->character($this->char + 1)) {
01081             // U+0023 NUMBER SIGN (#)
01082             case '#':
01083 
01084                 // The behaviour further depends on the character after the
01085                 // U+0023 NUMBER SIGN:
01086                 switch($this->character($this->char + 1)) {
01087                     // U+0078 LATIN SMALL LETTER X
01088                     // U+0058 LATIN CAPITAL LETTER X
01089                     case 'x':
01090                     case 'X':
01091                         // Follow the steps below, but using the range of
01092                         // characters U+0030 DIGIT ZERO through to U+0039 DIGIT
01093                         // NINE, U+0061 LATIN SMALL LETTER A through to U+0066
01094                         // LATIN SMALL LETTER F, and U+0041 LATIN CAPITAL LETTER
01095                         // A, through to U+0046 LATIN CAPITAL LETTER F (in other
01096                         // words, 0-9, A-F, a-f).
01097                         $char = 1;
01098                         $char_class = '0-9A-Fa-f';
01099                     break;
01100 
01101                     // Anything else
01102                     default:
01103                         // Follow the steps below, but using the range of
01104                         // characters U+0030 DIGIT ZERO through to U+0039 DIGIT
01105                         // NINE (i.e. just 0-9).
01106                         $char = 0;
01107                         $char_class = '0-9';
01108                     break;
01109                 }
01110 
01111                 // Consume as many characters as match the range of characters
01112                 // given above.
01113                 $this->char++;
01114                 $e_name = $this->characters($char_class, $this->char + $char + 1);
01115                 $entity = $this->character($start, $this->char);
01116                 $cond = strlen($e_name) > 0;
01117 
01118                 // The rest of the parsing happens bellow.
01119             break;
01120 
01121             // Anything else
01122             default:
01123                 // Consume the maximum number of characters possible, with the
01124                 // consumed characters case-sensitively matching one of the
01125                 // identifiers in the first column of the entities table.
01126                 $e_name = $this->characters('0-9A-Za-z;', $this->char + 1);
01127                 $len = strlen($e_name);
01128 
01129                 for($c = 1; $c <= $len; $c++) {
01130                     $id = substr($e_name, 0, $c);
01131                     $this->char++;
01132 
01133                     if(in_array($id, $this->entities)) {
01134                         if ($e_name[$c-1] !== ';') {
01135                             if ($c < $len && $e_name[$c] == ';') {
01136                                 $this->char++; // consume extra semicolon
01137                             }
01138                         }
01139                         $entity = $id;
01140                         break;
01141                     }
01142                 }
01143 
01144                 $cond = isset($entity);
01145                 // The rest of the parsing happens bellow.
01146             break;