HTMLPurifier 4.4.0
/home/ezyang/Dev/htmlpurifier/library/HTMLPurifier/Injector/AutoParagraph.php
Go to the documentation of this file.
00001 <?php
00002 
/**
 * Injector that automatically wraps text and inline content in <p>
 * elements, treating a double newline ("\n\n") inside a text node as a
 * paragraph separator.
 *
 * handleText() and handleElement() are the two entry points; each receives
 * the current token by reference and either leaves it untouched or replaces
 * it with an array of tokens. They are presumably invoked by the injector
 * machinery in HTMLPurifier_Injector / MakeWellFormed for every text and
 * start-element token -- confirm against the parent class.
 *
 * The "State N.M" comments inside the handlers are small diagrams of the
 * token stream; the dashes underline the token currently being handled.
 */
class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
{

    /** Name of this injector. */
    public $name = 'AutoParagraph';

    /**
     * Elements this injector creates (it only ever emits <p> tokens).
     * Presumably checked by the parent class before the injector is
     * enabled -- confirm in HTMLPurifier_Injector.
     */
    public $needed = array('p');

    /**
     * Builds a fresh <p> start token.
     *
     * The token is armored with 'MakeWellFormed_TagClosedError'; presumably
     * this stops MakeWellFormed from reporting an error when it auto-closes
     * a paragraph that was generated here -- confirm against MakeWellFormed.
     *
     * @return HTMLPurifier_Token_Start
     */
    private function _pStart() {
        $par = new HTMLPurifier_Token_Start('p');
        $par->armor['MakeWellFormed_TagClosedError'] = true;
        return $par;
    }

    /**
     * Handles a text token, wrapping and/or splitting it into paragraphs
     * where the surrounding context calls for it.
     *
     * @param HTMLPurifier_Token_Text $token Text token, by reference. It is
     *        read through ->data and ->is_whitespace and, when paragraphs
     *        are injected, overwritten with an array of replacement tokens.
     */
    public function handleText(&$token) {
        $text = $token->data;
        // Does the current parent allow <p> tags?
        if ($this->allowsElement('p')) {
            if (empty($this->currentNesting) || strpos($text, "\n\n") !== false) {
                // Note that we have differing behavior when dealing with text
                // in the anonymous root node, or a node inside the document.
                // If the text has a double-newline, the treatment is the same;
                // if it doesn't, see the next if-block if you're in the document.

                $i = $nesting = null;
                if (!$this->forwardUntilEndToken($i, $current, $nesting) && $token->is_whitespace) {
                    // State 1.1: ...    ^ (whitespace, then document end)
                    //               ----
                    // This is a degenerate case
                } else {
                    // $current is only inspected when the token is pure
                    // whitespace; in that case the forward scan above must
                    // have succeeded, otherwise State 1.1 would have applied.
                    if (!$token->is_whitespace || $this->_isInline($current)) {
                        // State 1.2: PAR1
                        //            ----

                        // State 1.3: PAR1\n\nPAR2
                        //            ------------

                        // State 1.4: <div>PAR1\n\nPAR2 (see State 2)
                        //                 ------------
                        $token = array($this->_pStart());
                        $this->_splitText($text, $token);
                    } else {
                        // State 1.5: \n<hr />
                        //            --
                    }
                }
            } else {
                // State 2:   <div>PAR1... (similar to 1.4)
                //                 ----

                // We're in an element that allows paragraph tags, but we're not
                // sure if we're going to need them.
                if ($this->_pLookAhead()) {
                    // State 2.1: <div>PAR1<b>PAR1\n\nPAR2
                    //                 ----
                    // Note: This will always be the first child, since any
                    // previous inline element would have triggered this very
                    // same routine, and found the double newline. One possible
                    // exception would be a comment.
                    $token = array($this->_pStart(), $token);
                } else {
                    // State 2.2.1: <div>PAR1<div>
                    //                   ----

                    // State 2.2.2: <div>PAR1<b>PAR1</b></div>
                    //                   ----
                }
            }
        // Is the current parent a <p> tag?
        } elseif (
            !empty($this->currentNesting) &&
            $this->currentNesting[count($this->currentNesting)-1]->name === 'p'
        ) {
            // State 3.1: ...<p>PAR1
            //                  ----

            // State 3.2: ...<p>PAR1\n\nPAR2
            //                  ------------

            // An empty result array tells _splitText() that no <p> start
            // token was injected by this call.
            $token = array();
            $this->_splitText($text, $token);
        // Abort!
        } else {
            // State 4.1: ...<b>PAR1
            //                  ----

            // State 4.2: ...<b>PAR1\n\nPAR2
            //                  ------------
        }
    }

    /**
     * Handles an element token, opening a paragraph in front of inline
     * elements where one is needed and, at the root level, making sure the
     * element is preceded by a "\n\n" text token.
     *
     * @param HTMLPurifier_Token $token Element token, by reference. Either
     *        left untouched or overwritten with an array of tokens ending
     *        in the original token.
     */
    public function handleElement(&$token) {
        // We don't have to check if we're already in a <p> tag for block
        // tokens, because the tag would have been autoclosed by MakeWellFormed.
        if ($this->allowsElement('p')) {
            if (!empty($this->currentNesting)) {
                if ($this->_isInline($token)) {
                    // State 1: <div>...<b>
                    //                  ---

                    // Check if this token is adjacent to the parent token
                    // (seek backwards until token isn't whitespace)
                    $i = null;
                    $this->backward($i, $prev);

                    if (!$prev instanceof HTMLPurifier_Token_Start) {
                        // Token wasn't adjacent

                        if (
                            $prev instanceof HTMLPurifier_Token_Text &&
                            substr($prev->data, -2) === "\n\n"
                        ) {
                            // State 1.1.4: <div><p>PAR1</p>\n\n<b>
                            //                                  ---

                            // Quite frankly, this should be handled by splitText
                            $token = array($this->_pStart(), $token);
                        } else {
                            // State 1.1.1: <div><p>PAR1</p><b>
                            //                              ---

                            // State 1.1.2: <div><br /><b>
                            //                         ---

                            // State 1.1.3: <div>PAR<b>
                            //                      ---
                        }

                    } else {
                        // State 1.2.1: <div><b>
                        //                   ---

                        // Lookahead to see if <p> is needed.
                        if ($this->_pLookAhead()) {
                            // State 1.3.1: <div><b>PAR1\n\nPAR2
                            //                   ---
                            $token = array($this->_pStart(), $token);
                        } else {
                            // State 1.3.2: <div><b>PAR1</b></div>
                            //                   ---

                            // State 1.3.3: <div><b>PAR1</b><div></div>\n\n</div>
                            //                   ---
                        }
                    }
                } else {
                    // State 2.3: ...<div>
                    //               -----
                }
            } else {
                if ($this->_isInline($token)) {
                    // State 3.1: <b>
                    //            ---
                    // This is where the {p} tag is inserted, not reflected in
                    // inputTokens yet, however.
                    $token = array($this->_pStart(), $token);
                } else {
                    // State 3.2: <div>
                    //            -----
                }

                $i = null;
                if ($this->backward($i, $prev)) {
                    if (
                        !$prev instanceof HTMLPurifier_Token_Text
                    ) {
                        // State 3.1.1: ...</p>{p}<b>
                        //                        ---

                        // State 3.2.1: ...</p><div>
                        //                     -----

                        // $token may already be an array (State 3.1); either
                        // way the separator goes in front of everything.
                        if (!is_array($token)) $token = array($token);
                        array_unshift($token, new HTMLPurifier_Token_Text("\n\n"));
                    } else {
                        // State 3.1.2: ...</p>\n\n{p}<b>
                        //                            ---

                        // State 3.2.2: ...</p>\n\n<div>
                        //                         -----

                        // Note: PAR<ELEM> cannot occur because PAR would have been
                        // wrapped in <p> tags.
                    }
                }
            }
        } else {
            // State 2.2: <ul><li>
            //                ----

            // State 2.4: <p><b>
            //               ---
        }
    }

    /**
     * Splits text on double newlines and appends the resulting paragraph
     * tokens (text, </p>, "\n\n", <p>) to $result.
     *
     * Whether $result is empty on entry matters: an empty array means the
     * caller did not inject a <p> start token, so a leading "\n\n" closes
     * the paragraph that is already open; a non-empty array means a
     * paragraph was just opened, so a leading "\n\n" is only re-emitted in
     * front of it. The final paragraph is left open unless $data ends with
     * a double newline.
     *
     * @param string $data   Text to split into paragraphs.
     * @param array  $result Token array to append to, by reference. Must be
     *                       initialized by the caller (see handleText()).
     */
    private function _splitText($data, &$result) {
        $raw_paragraphs = explode("\n\n", $data);
        $paragraphs  = array(); // without empty paragraphs
        $needs_start = false;
        $needs_end   = false;

        $c = count($raw_paragraphs);
        if ($c === 1) {
            // There were no double-newlines, abort quickly. In theory this
            // should never happen.
            $result[] = new HTMLPurifier_Token_Text($data);
            return;
        }
        for ($i = 0; $i < $c; $i++) {
            $par = $raw_paragraphs[$i];
            if (trim($par) !== '') {
                $paragraphs[] = $par;
            } else {
                if ($i === 0) {
                    // Double newline at the front
                    if (empty($result)) {
                        // The empty result indicates that the AutoParagraph
                        // injector did not add any start paragraph tokens.
                        // This means that we have been in a paragraph for
                        // a while, and the newline means we should start a new one.
                        $result[] = new HTMLPurifier_Token_End('p');
                        $result[] = new HTMLPurifier_Token_Text("\n\n");
                        // However, the start token should only be added if
                        // there is more processing to be done (i.e. there are
                        // real paragraphs in here). If there are none, the
                        // next start paragraph tag will be handled by the
                        // next call to the injector
                        $needs_start = true;
                    } else {
                        // We just started a new paragraph!
                        // Reinstate a double-newline for presentation's sake, since
                        // it was in the source code.
                        array_unshift($result, new HTMLPurifier_Token_Text("\n\n"));
                    }
                } elseif ($i + 1 === $c) {
                    // Double newline at the end
                    // There should be a trailing </p> when we're finally done.
                    $needs_end = true;
                }
            }
        }

        // Check if this was just a giant blob of whitespace. Move this earlier,
        // perhaps?
        if (empty($paragraphs)) {
            return;
        }

        // Add the start tag indicated by \n\n at the beginning of $data
        if ($needs_start) {
            $result[] = $this->_pStart();
        }

        // Append the paragraphs onto the result
        foreach ($paragraphs as $par) {
            $result[] = new HTMLPurifier_Token_Text($par);
            $result[] = new HTMLPurifier_Token_End('p');
            $result[] = new HTMLPurifier_Token_Text("\n\n");
            $result[] = $this->_pStart();
        }

        // Remove trailing start token; Injector will handle this later if
        // it was indeed needed. This prevents from needing to do a lookahead,
        // at the cost of a lookbehind later.
        array_pop($result);

        // If there is no need for an end tag, remove all of it and let
        // MakeWellFormed close it later.
        if (!$needs_end) {
            array_pop($result); // removes \n\n
            array_pop($result); // removes </p>
        }

    }

    /**
     * Tells whether a token names an element that is allowed as a child of
     * <p> (i.e. is inline as far as paragraphs are concerned).
     *
     * @param object|null $token Token to test.
     * @return bool False for anything without a name, or whose name is not
     *              listed in the <p> definition's child elements.
     */
    private function _isInline($token) {
        // handleText() passes whatever token forwardUntilEndToken() stopped
        // on, and nothing visible here guarantees that token carries a
        // ->name. Guarding the read keeps the answer the same (false) while
        // avoiding an undefined-property notice.
        return isset($token->name)
            && isset($this->htmlDefinition->info['p']->child->elements[$token->name]);
    }

    /**
     * Scans forward from the current token to decide whether a <p> should
     * be opened in front of it.
     *
     * @return bool True if _checkNeedsP() finds a text token containing a
     *              double newline before it finds a non-inline start tag
     *              and before forwardUntilEndToken() stops the scan.
     */
    private function _pLookAhead() {
        // null presumably makes current() start from the injector's current
        // position, as with the other cursor helpers -- confirm in parent.
        $i = null;
        $this->current($i, $current);
        // If we are sitting on a start tag, begin one level deep.
        // NOTE(review): this looks intended to keep the scan going past
        // that tag's own end token -- confirm in forwardUntilEndToken().
        if ($current instanceof HTMLPurifier_Token_Start) $nesting = 1;
        else $nesting = 0;
        $ok = false;
        while ($this->forwardUntilEndToken($i, $current, $nesting)) {
            $result = $this->_checkNeedsP($current);
            if ($result !== null) {
                $ok = $result;
                break;
            }
        }
        return $ok;
    }

    /**
     * Classifies one look-ahead token for _pLookAhead().
     *
     * @param object $current Token reached by the forward scan.
     * @return bool|null True if the token is text containing a double
     *                   newline, false if it is a non-inline start tag,
     *                   null if the token is inconclusive.
     */
    private function _checkNeedsP($current) {
        if ($current instanceof HTMLPurifier_Token_Start){
            if (!$this->_isInline($current)) {
                // <div>PAR1<div>
                //      ----
                // Terminate early, since we hit a block element
                return false;
            }
        } elseif ($current instanceof HTMLPurifier_Token_Text) {
            if (strpos($current->data, "\n\n") !== false) {
                // <div>PAR1<b>PAR1\n\nPAR2
                //      ----
                return true;
            } else {
                // <div>PAR1<b>PAR1...
                //      ----
            }
        }
        return null;
    }

}
00344 
00345 // vim: et sw=4 sts=4