|
HTMLPurifier 4.4.0
|
<?php

// why is this a top level function? Because PHP 5.2.0 doesn't seem to
// understand how to interpret this filter if it's a static method.
// It's all really silly, but if we go this route it might be reasonable
// to coalesce all of these methods into one.

/**
 * No-op error handler. cleanCSS() installs it around the CSS parser call
 * so that errors raised while parsing are muted.
 */
function htmlpurifier_filter_extractstyleblocks_muteerrorhandler() {}

/**
 * Filter that pulls <style> blocks out of an HTML document, stores their
 * contents in the context under the name 'StyleBlocks' and, when a CSS
 * tidy implementation is available, cleans each block with cleanCSS().
 *
 * Selectors are restricted to a small subset of CSS (see the long comment
 * inside cleanCSS()) and declarations are validated against the CSS
 * definition obtained from the configuration.
 */
class HTMLPurifier_Filter_ExtractStyleBlocks extends HTMLPurifier_Filter
{

    /** Name of this filter. */
    public $name = 'ExtractStyleBlocks';

    /** Contents of the <style> blocks collected by styleCallback(). */
    private $_styleMatches = array();

    /** CSS parser/printer; a csstidy instance unless overridden by config. */
    private $_tidy;

    /** Validator used for the part of a selector following '#'. */
    private $_id_attrdef;

    /** Validator used for the part of a selector following '.'. */
    private $_class_attrdef;

    /** Validator used for the part of a selector following ':'. */
    private $_enum_attrdef;

    /**
     * Creates the default CSS tidy instance and the validators used for
     * the id, class and pseudo components of selectors.
     */
    public function __construct() {
        $this->_tidy = new csstidy();
        $this->_id_attrdef = new HTMLPurifier_AttrDef_HTML_ID(true);
        $this->_class_attrdef = new HTMLPurifier_AttrDef_CSS_Ident();
        $this->_enum_attrdef = new HTMLPurifier_AttrDef_Enum(array('first-child', 'link', 'visited', 'active', 'hover', 'focus'));
    }

    /**
     * preg_replace_callback() callback: records the contents of one
     * <style> block.
     *
     * Nothing is returned, so the matched block is replaced with an
     * empty string in the HTML.
     *
     * @param array $matches Regex matches; index 1 is the block contents.
     */
    protected function styleCallback($matches) {
        $this->_styleMatches[] = $matches[1];
    }

    /**
     * Removes <style> blocks from the HTML, registers their contents in
     * the context as 'StyleBlocks' and cleans them if a tidy
     * implementation is set.
     *
     * @param string $html    HTML to process.
     * @param object $config  Configuration object; must provide get().
     * @param object $context Context object; must provide register().
     * @return string The HTML with the matched <style> blocks removed.
     */
    public function preFilter($html, $config, $context) {
        $tidy = $config->get('Filter.ExtractStyleBlocks.TidyImpl');
        if ($tidy !== null) $this->_tidy = $tidy;
        // The U modifier makes the match ungreedy, so adjacent <style>
        // blocks are captured separately. (.+) requires at least one
        // character, so an empty <style></style> is not matched.
        $html = preg_replace_callback('#<style(?:\s.*)?>(.+)</style>#isU', array($this, 'styleCallback'), $html);
        $style_blocks = $this->_styleMatches;
        $this->_styleMatches = array(); // reset
        // NOTE(review): the blocks are cleaned *after* being registered,
        // which only reaches the context if register() stores its argument
        // by reference -- confirm against the context implementation.
        $context->register('StyleBlocks', $style_blocks); // $context must not be reused
        if ($this->_tidy) {
            foreach ($style_blocks as &$style) {
                $style = $this->cleanCSS($style, $config, $context);
            }
            // break the reference left behind by the by-reference foreach
            unset($style);
        }
        return $html;
    }

    /**
     * Takes CSS (the contents of a style block), parses it with the tidy
     * implementation, keeps only selectors and declarations that pass
     * validation, and returns the re-serialized result.
     *
     * @param string $css     CSS styling to clean.
     * @param object $config  Configuration object; must provide get() and
     *                        getDefinition().
     * @param object $context Context object passed on to the validators.
     * @return string Cleaned CSS.
     * @throws HTMLPurifier_Exception If a selector component is reached
     *                                without a preceding '#', '.' or ':'.
     */
    public function cleanCSS($css, $config, $context) {
        // prepare scope
        $scope = $config->get('Filter.ExtractStyleBlocks.Scope');
        if ($scope !== null) {
            $scopes = array_map('trim', explode(',', $scope));
        } else {
            $scopes = array();
        }
        // remove comments from CSS
        $css = trim($css);
        if (strncmp('<!--', $css, 4) === 0) {
            $css = substr($css, 4);
        }
        if (strlen($css) > 3 && substr($css, -3) == '-->') {
            $css = substr($css, 0, -3);
        }
        $css = trim($css);
        set_error_handler('htmlpurifier_filter_extractstyleblocks_muteerrorhandler');
        try {
            $this->_tidy->parse($css);
        } catch (Exception $e) {
            // Make sure the muting handler does not stay installed if the
            // parser throws; then let the exception continue to the caller.
            restore_error_handler();
            throw $e;
        }
        restore_error_handler();
        $css_definition = $config->getDefinition('CSS');
        $html_definition = $config->getDefinition('HTML');
        $new_css = array();
        foreach ($this->_tidy->css as $k => $decls) {
            // $decls are all CSS declarations inside an @ selector
            $new_decls = array();
            foreach ($decls as $selector => $style) {
                $selector = trim($selector);
                if ($selector === '') continue; // should not happen
                // Parse the selector
                // Here is the relevant part of the CSS grammar:
                //
                // ruleset
                //   : selector [ ',' S* selector ]* '{' ...
                // selector
                //   : simple_selector [ combinator selector | S+ [ combinator? selector ]? ]?
                // combinator
                //   : '+' S*
                //   | '>' S*
                // simple_selector
                //   : element_name [ HASH | class | attrib | pseudo ]*
                //   | [ HASH | class | attrib | pseudo ]+
                // element_name
                //   : IDENT | '*'
                //   ;
                // class
                //   : '.' IDENT
                //   ;
                // attrib
                //   : '[' S* IDENT S* [ [ '=' | INCLUDES | DASHMATCH ] S*
                //     [ IDENT | STRING ] S* ]? ']'
                //   ;
                // pseudo
                //   : ':' [ IDENT | FUNCTION S* [IDENT S*]? ')' ]
                //   ;
                //
                // For reference, here are the relevant tokens:
                //
                // HASH         #{name}
                // IDENT        {ident}
                // INCLUDES     ~=
                // DASHMATCH    |=
                // STRING       {string}
                // FUNCTION     {ident}\(
                //
                // And the lexical scanner tokens
                //
                // name         {nmchar}+
                // nmchar       [_a-z0-9-]|{nonascii}|{escape}
                // nonascii     [\240-\377]
                // escape       {unicode}|\\[^\r\n\f0-9a-f]
                // unicode      \\{h}{1,6}(\r\n|[ \t\r\n\f])?
                // ident        -?{nmstart}{nmchar}*
                // nmstart      [_a-z]|{nonascii}|{escape}
                // string       {string1}|{string2}
                // string1      \"([^\n\r\f\\"]|\\{nl}|{escape})*\"
                // string2      \'([^\n\r\f\\']|\\{nl}|{escape})*\'
                //
                // We'll implement a subset (in order to reduce attack
                // surface); in particular:
                //
                //      - No Unicode support
                //      - No escapes support
                //      - No string support (by proxy no attrib support)
                //      - element_name is matched against allowed
                //        elements (some people might find this
                //        annoying...)
                //      - Pseudo-elements one of :first-child, :link,
                //        :visited, :active, :hover, :focus

                // handle ruleset
                $selectors = array_map('trim', explode(',', $selector));
                $new_selectors = array();
                foreach ($selectors as $sel) {
                    // split on +, > and spaces
                    $basic_selectors = preg_split('/\s*([+> ])\s*/', $sel, -1, PREG_SPLIT_DELIM_CAPTURE);
                    // even indices are chunks, odd indices are
                    // delimiters
                    $nsel = null;
                    $delim = null; // guaranteed to be non-null after
                                   // two loop iterations
                    for ($i = 0, $c = count($basic_selectors); $i < $c; $i++) {
                        $x = $basic_selectors[$i];
                        if ($i % 2) {
                            // delimiter
                            if ($x === ' ') {
                                $delim = ' ';
                            } else {
                                $delim = ' ' . $x . ' ';
                            }
                        } else {
                            // simple selector
                            $components = preg_split('/([#.:])/', $x, -1, PREG_SPLIT_DELIM_CAPTURE);
                            $sdelim = null;
                            $nx = null;
                            for ($j = 0, $cc = count($components); $j < $cc; $j ++) {
                                $y = $components[$j];
                                if ($j === 0) {
                                    // element name: '*' or an element known
                                    // to the HTML definition (lowercased)
                                    if ($y === '*' || isset($html_definition->info[$y = strtolower($y)])) {
                                        $nx = $y;
                                    } else {
                                        // $nx stays null; this matters
                                        // if we don't manage to find
                                        // any valid selector content,
                                        // in which case we ignore the
                                        // outer $delim
                                    }
                                } elseif ($j % 2) {
                                    // set delimiter
                                    $sdelim = $y;
                                } else {
                                    // pick the validator matching the
                                    // delimiter that preceded this chunk
                                    $attrdef = null;
                                    if ($sdelim === '#') {
                                        $attrdef = $this->_id_attrdef;
                                    } elseif ($sdelim === '.') {
                                        $attrdef = $this->_class_attrdef;
                                    } elseif ($sdelim === ':') {
                                        $attrdef = $this->_enum_attrdef;
                                    } else {
                                        throw new HTMLPurifier_Exception('broken invariant sdelim and preg_split');
                                    }
                                    $r = $attrdef->validate($y, $config, $context);
                                    if ($r !== false) {
                                        // a non-boolean result replaces the
                                        // original component text
                                        if ($r !== true) {
                                            $y = $r;
                                        }
                                        if ($nx === null) {
                                            $nx = '';
                                        }
                                        $nx .= $sdelim . $y;
                                    }
                                }
                            }
                            if ($nx !== null) {
                                if ($nsel === null) {
                                    $nsel = $nx;
                                } else {
                                    $nsel .= $delim . $nx;
                                }
                            } else {
                                // delimiters to the left of invalid
                                // basic selector ignored
                            }
                        }
                    }
                    if ($nsel !== null) {
                        if (!empty($scopes)) {
                            // emit one copy of the selector per scope
                            foreach ($scopes as $s) {
                                $new_selectors[] = "$s $nsel";
                            }
                        } else {
                            $new_selectors[] = $nsel;
                        }
                    }
                }
                // drop the whole rule if no selector survived
                if (empty($new_selectors)) continue;
                $selector = implode(', ', $new_selectors);
                foreach ($style as $name => $value) {
                    // properties unknown to the CSS definition are removed
                    if (!isset($css_definition->info[$name])) {
                        unset($style[$name]);
                        continue;
                    }
                    $def = $css_definition->info[$name];
                    $ret = $def->validate($value, $config, $context);
                    if ($ret === false) unset($style[$name]);
                    else $style[$name] = $ret;
                }
                $new_decls[$selector] = $style;
            }
            $new_css[$k] = $new_decls;
        }
        // remove stuff that shouldn't be used, could be reenabled
        // after security risks are analyzed
        $this->_tidy->css = $new_css;
        $this->_tidy->import = array();
        $this->_tidy->charset = null;
        $this->_tidy->namespace = null;
        $css = $this->_tidy->print->plain();
        // we are going to escape any special characters <>& to ensure
        // that no funny business occurs (i.e. </style> in a font-family prop).
        if ($config->get('Filter.ExtractStyleBlocks.Escaping')) {
            $css = str_replace(
                array('<', '>', '&'),
                array('\3C ', '\3E ', '\26 '),
                $css
            );
        }
        return $css;
    }

}

// vim: et sw=4 sts=4