HTMLPurifier 4.4.0
|
<?php

/**
 * Injector that automatically wraps text and inline elements in <p> tags,
 * starting a new paragraph wherever the text contains a double newline.
 *
 * The "State x.y" comments throughout this class sketch the token stream for
 * each branch; the row of dashes underneath marks the token that is being
 * handled (the alignment of those markers was reconstructed and is only
 * indicative).
 */
class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
{

    public $name = 'AutoParagraph';
    public $needed = array('p');

    /**
     * Creates the <p> start token inserted by this injector.
     *
     * The token carries the 'MakeWellFormed_TagClosedError' armor flag
     * (presumably so MakeWellFormed does not report an error when it closes
     * this auto-generated tag -- confirm against MakeWellFormed).
     *
     * @return HTMLPurifier_Token_Start
     */
    private function _pStart() {
        $par = new HTMLPurifier_Token_Start('p');
        $par->armor['MakeWellFormed_TagClosedError'] = true;
        return $par;
    }

    /**
     * Handles a text token, replacing it with paragraph tokens when needed.
     *
     * @param HTMLPurifier_Token_Text $token Token to process; replaced
     *        in-place with an array of tokens when paragraphs are inserted
     *        or split, otherwise left untouched.
     */
    public function handleText(&$token) {
        $text = $token->data;
        // Does the current parent allow <p> tags?
        if ($this->allowsElement('p')) {
            if (empty($this->currentNesting) || strpos($text, "\n\n") !== false) {
                // Note that we have differing behavior when dealing with text
                // in the anonymous root node, or a node inside the document.
                // If the text has a double-newline, the treatment is the same;
                // if it doesn't, see the next if-block if you're in the document.

                // Initialised explicitly; all three are filled in by reference.
                $i = $current = $nesting = null;
                if (!$this->forwardUntilEndToken($i, $current, $nesting) && $token->is_whitespace) {
                    // State 1.1: ...    ^ (whitespace, then document end)
                    //               ----
                    // This is a degenerate case
                } else {
                    if (!$token->is_whitespace || $this->_isInline($current)) {
                        // State 1.2: PAR1
                        //            ----

                        // State 1.3: PAR1\n\nPAR2
                        //            ------------

                        // State 1.4: <div>PAR1\n\nPAR2 (see State 2)
                        //                 ------------
                        $token = array($this->_pStart());
                        $this->_splitText($text, $token);
                    } else {
                        // State 1.5: \n<hr />
                        //            --
                    }
                }
            } else {
                // State 2:   <div>PAR1... (similar to 1.4)
                //                 ----

                // We're in an element that allows paragraph tags, but we're not
                // sure if we're going to need them.
                if ($this->_pLookAhead()) {
                    // State 2.1: <div>PAR1<b>PAR1\n\nPAR2
                    //                 ----
                    // Note: This will always be the first child, since any
                    // previous inline element would have triggered this very
                    // same routine, and found the double newline. One possible
                    // exception would be a comment.
                    $token = array($this->_pStart(), $token);
                } else {
                    // State 2.2.1: <div>PAR1<div>
                    //                   ----

                    // State 2.2.2: <div>PAR1<b>PAR1</b></div>
                    //                   ----
                }
            }
        // Is the current parent a <p> tag?
        } elseif (
            !empty($this->currentNesting) &&
            $this->currentNesting[count($this->currentNesting) - 1]->name === 'p'
        ) {
            // State 3.1: ...<p>PAR1
            //                  ----

            // State 3.2: ...<p>PAR1\n\nPAR2
            //                  ------------
            $token = array();
            $this->_splitText($text, $token);
        // Abort!
        } else {
            // State 4.1: ...<b>PAR1
            //                  ----

            // State 4.2: ...<b>PAR1\n\nPAR2
            //                  ------------
        }
    }

    /**
     * Handles a start/empty element token, inserting a <p> start token
     * (and, in the root node, a separating "\n\n" text token) when needed.
     *
     * @param HTMLPurifier_Token $token Element token to process; replaced
     *        in-place with an array of tokens when something is inserted,
     *        otherwise left untouched.
     */
    public function handleElement(&$token) {
        // We don't have to check if we're already in a <p> tag for block
        // tokens, because the tag would have been autoclosed by MakeWellFormed.
        if ($this->allowsElement('p')) {
            if (!empty($this->currentNesting)) {
                if ($this->_isInline($token)) {
                    // State 1: <div>...<b>
                    //                  ---

                    // Check if this token is adjacent to the parent token
                    // (seek backwards until token isn't whitespace)
                    $i = null;
                    $prev = null; // stays null if there is no previous token
                    $this->backward($i, $prev);

                    if (!$prev instanceof HTMLPurifier_Token_Start) {
                        // Token wasn't adjacent

                        if (
                            $prev instanceof HTMLPurifier_Token_Text &&
                            substr($prev->data, -2) === "\n\n"
                        ) {
                            // State 1.1.4: <div><p>PAR1</p>\n\n<b>
                            //                                  ---

                            // Quite frankly, this should be handled by splitText
                            $token = array($this->_pStart(), $token);
                        } else {
                            // State 1.1.1: <div><p>PAR1</p><b>
                            //                              ---

                            // State 1.1.2: <div><br /><b>
                            //                         ---

                            // State 1.1.3: <div>PAR<b>
                            //                      ---
                        }

                    } else {
                        // State 1.2.1: <div><b>
                        //                   ---

                        // Lookahead to see if <p> is needed.
                        if ($this->_pLookAhead()) {
                            // State 1.3.1: <div><b>PAR1\n\nPAR2
                            //                   ---
                            $token = array($this->_pStart(), $token);
                        } else {
                            // State 1.3.2: <div><b>PAR1</b></div>
                            //                   ---

                            // State 1.3.3: <div><b>PAR1</b><div></div>\n\n</div>
                            //                   ---
                        }
                    }
                } else {
                    // State 2.3: ...<div>
                    //               -----
                }
            } else {
                if ($this->_isInline($token)) {
                    // State 3.1: <b>
                    //            ---
                    // This is where the {p} tag is inserted, not reflected in
                    // inputTokens yet, however.
                    $token = array($this->_pStart(), $token);
                } else {
                    // State 3.2: <div>
                    //            -----
                }

                $i = null;
                $prev = null;
                if ($this->backward($i, $prev)) {
                    if (
                        !$prev instanceof HTMLPurifier_Token_Text
                    ) {
                        // State 3.1.1: ...</p>{p}<b>
                        //                        ---

                        // State 3.2.1: ...</p><div>
                        //                     -----

                        // $token may still be a single token here (block
                        // element case); normalise to an array before
                        // prepending the separator.
                        if (!is_array($token)) {
                            $token = array($token);
                        }
                        array_unshift($token, new HTMLPurifier_Token_Text("\n\n"));
                    } else {
                        // State 3.1.2: ...</p>\n\n{p}<b>
                        //                            ---

                        // State 3.2.2: ...</p>\n\n<div>
                        //                         -----

                        // Note: PAR<ELEM> cannot occur because PAR would have been
                        // wrapped in <p> tags.
                    }
                }
            }
        } else {
            // State 2.2: <ul><li>
            //                ----

            // State 2.4: <p><b>
            //               ---
        }
    }

    /**
     * Splits text on double newlines into paragraph tokens and appends them
     * to the token array that will replace the original text token.
     *
     * For every non-blank paragraph a text token followed by </p>, "\n\n"
     * and <p> is appended; the trailing <p> is always removed again, and the
     * trailing </p> and "\n\n" are removed as well unless the data ended
     * with a double newline.
     *
     * @param string $data   Text data to be processed into paragraphs.
     * @param array  $result Reference to the array of tokens the paragraph
     *        tokens are appended onto. An empty array on entry is taken to
     *        mean that the caller did not open a paragraph itself.
     */
    private function _splitText($data, &$result) {
        $raw_paragraphs = explode("\n\n", $data);
        $paragraphs  = array(); // without empty paragraphs
        $needs_start = false;
        $needs_end   = false;

        $c = count($raw_paragraphs);
        if ($c === 1) {
            // There were no double-newlines, abort quickly. In theory this
            // should never happen.
            $result[] = new HTMLPurifier_Token_Text($data);
            return;
        }
        for ($i = 0; $i < $c; $i++) {
            $par = $raw_paragraphs[$i];
            if (trim($par) !== '') {
                $paragraphs[] = $par;
            } else {
                if ($i === 0) {
                    // Double newline at the front
                    if (empty($result)) {
                        // The empty result indicates that the AutoParagraph
                        // injector did not add any start paragraph tokens.
                        // This means that we have been in a paragraph for
                        // a while, and the newline means we should start a new one.
                        $result[] = new HTMLPurifier_Token_End('p');
                        $result[] = new HTMLPurifier_Token_Text("\n\n");
                        // However, the start token should only be added if
                        // there is more processing to be done (i.e. there are
                        // real paragraphs in here). If there are none, the
                        // next start paragraph tag will be handled by the
                        // next call to the injector
                        $needs_start = true;
                    } else {
                        // We just started a new paragraph!
                        // Reinstate a double-newline for presentation's sake, since
                        // it was in the source code.
                        array_unshift($result, new HTMLPurifier_Token_Text("\n\n"));
                    }
                } elseif ($i + 1 === $c) {
                    // Double newline at the end
                    // There should be a trailing </p> when we're finally done.
                    $needs_end = true;
                }
            }
        }

        // Check if this was just a giant blob of whitespace. Move this earlier,
        // perhaps?
        if (empty($paragraphs)) {
            return;
        }

        // Add the start tag indicated by \n\n at the beginning of $data
        if ($needs_start) {
            $result[] = $this->_pStart();
        }

        // Append the paragraphs onto the result
        foreach ($paragraphs as $par) {
            $result[] = new HTMLPurifier_Token_Text($par);
            $result[] = new HTMLPurifier_Token_End('p');
            $result[] = new HTMLPurifier_Token_Text("\n\n");
            $result[] = $this->_pStart();
        }

        // Remove trailing start token; Injector will handle this later if
        // it was indeed needed. This prevents from needing to do a lookahead,
        // at the cost of a lookbehind later.
        array_pop($result);

        // If there is no need for an end tag, remove all of it and let
        // MakeWellFormed close it later.
        if (!$needs_end) {
            array_pop($result); // removes \n\n
            array_pop($result); // removes </p>
        }

    }

    /**
     * Returns true if the passed token is inline, i.e. its name is listed
     * among the child elements allowed inside <p> by the HTML definition.
     *
     * @param HTMLPurifier_Token $token Token whose name is looked up.
     * @return bool
     */
    private function _isInline($token) {
        return isset($this->htmlDefinition->info['p']->child->elements[$token->name]);
    }

    /**
     * Looks ahead in the token list and determines whether or not a <p> tag
     * needs to be inserted before the current token.
     *
     * Scanning starts with a nesting level of 1 when the current token is a
     * start tag and 0 otherwise, and stops at the first token for which
     * _checkNeedsP() gives a definite answer.
     *
     * @return bool True if a paragraph is needed; false if a block element
     *         was hit first or the scan ended without a verdict.
     */
    private function _pLookAhead() {
        $i = null;
        $current = null;
        $this->current($i, $current);
        if ($current instanceof HTMLPurifier_Token_Start) {
            $nesting = 1;
        } else {
            $nesting = 0;
        }
        $ok = false;
        while ($this->forwardUntilEndToken($i, $current, $nesting)) {
            $result = $this->_checkNeedsP($current);
            if ($result !== null) {
                $ok = $result;
                break;
            }
        }
        return $ok;
    }

    /**
     * Determines whether a token seen during look-ahead settles the question
     * of wrapping an earlier inline token in a paragraph.
     *
     * @param HTMLPurifier_Token $current Token encountered while scanning.
     * @return bool|null False for a non-inline start tag, true for text
     *         containing a double newline, null when the token is
     *         inconclusive and scanning should continue.
     */
    private function _checkNeedsP($current) {
        if ($current instanceof HTMLPurifier_Token_Start) {
            if (!$this->_isInline($current)) {
                // <div>PAR1<div>
                //      ----
                // Terminate early, since we hit a block element
                return false;
            }
        } elseif ($current instanceof HTMLPurifier_Token_Text) {
            if (strpos($current->data, "\n\n") !== false) {
                // <div>PAR1<b>PAR1\n\nPAR2
                //      ----
                return true;
            } else {
                // <div>PAR1<b>PAR1...
                //      ----
            }
        }
        return null;
    }

}

// vim: et sw=4 sts=4