HTMLPurifier 4.4.0
/home/ezyang/Dev/htmlpurifier/library/HTMLPurifier/Filter/ExtractStyleBlocks.php
Go to the documentation of this file.
00001 <?php
00002 
/**
 * Error handler that deliberately does nothing.
 *
 * It is installed by name via set_error_handler() while style-block CSS is
 * being parsed, so that errors raised during parsing are swallowed.
 *
 * Why a top level function rather than a static method? PHP 5.2.0 doesn't
 * seem to understand how to interpret this filter if the handler is a
 * static method. It's all really silly, but if we go this route it might
 * be reasonable to coalesce all such helpers into one.
 */
function htmlpurifier_filter_extractstyleblocks_muteerrorhandler()
{
}
00008 
/**
 * Filter that removes <style> blocks from an HTML document, sanitizes the
 * CSS they contain and publishes the result in the context variable
 * 'StyleBlocks'.
 *
 * Sanitizing is done by cleanCSS(): the CSS is parsed with the configured
 * tidy object (a csstidy instance by default), selectors are reduced to a
 * conservative subset of the CSS grammar, and declarations are validated
 * against the CSS definition of the configuration.
 *
 * Configuration directives read by this class:
 *  - Filter.ExtractStyleBlocks.TidyImpl  replacement tidy object; any
 *    non-null value overrides the default, and a falsy value disables
 *    cleaning altogether
 *  - Filter.ExtractStyleBlocks.Scope     comma separated selectors that are
 *    prepended to every rule
 *  - Filter.ExtractStyleBlocks.Escaping  whether <, > and & are escaped in
 *    the generated CSS
 */
class HTMLPurifier_Filter_ExtractStyleBlocks extends HTMLPurifier_Filter
{

    /** Name of this filter. */
    public $name = 'ExtractStyleBlocks';

    /** Contents of the <style> blocks collected during one preFilter() run. */
    private $_styleMatches = array();

    /** Object used to parse and print CSS; see the constructor and preFilter(). */
    private $_tidy;

    /** Validator applied to the part of a selector following '#'. */
    private $_id_attrdef;

    /** Validator applied to the part of a selector following '.'. */
    private $_class_attrdef;

    /** Validator applied to the part of a selector following ':'. */
    private $_enum_attrdef;

    /**
     * Sets up the default CSS parser and the validators used for the
     * individual selector components.
     */
    public function __construct() {
        $this->_tidy = new csstidy();
        $this->_id_attrdef = new HTMLPurifier_AttrDef_HTML_ID(true);
        $this->_class_attrdef = new HTMLPurifier_AttrDef_CSS_Ident();
        // the only pseudo-classes that survive cleaning
        $this->_enum_attrdef = new HTMLPurifier_AttrDef_Enum(array('first-child', 'link', 'visited', 'active', 'hover', 'focus'));
    }

    /**
     * Callback for preg_replace_callback() that records the contents of a
     * matched <style> block.
     *
     * Nothing is returned, so the matched block is replaced by the empty
     * string, i.e. it is removed from the document.
     *
     * @param array $matches Match data; index 1 holds the block contents
     */
    protected function styleCallback($matches) {
        $this->_styleMatches[] = $matches[1];
    }

    /**
     * Strips all <style> blocks from the HTML, registers their contents in
     * the context as 'StyleBlocks' and, when a tidy object is available,
     * cleans each of them with cleanCSS().
     *
     * @param string $html    HTML to process
     * @param object $config  Configuration object; get() is called here and
     *                        getDefinition() in cleanCSS()
     * @param object $context Context object; must offer register()
     * @return string The HTML with the <style> blocks removed
     */
    public function preFilter($html, $config, $context) {
        $tidy = $config->get('Filter.ExtractStyleBlocks.TidyImpl');
        if ($tidy !== null) $this->_tidy = $tidy;
        // The match must stay ungreedy (U modifier): given
        //      <style>foo</style> <style>bar</style>
        // we must not grab "foo</style> <style>bar". The contents are
        // matched with (.*) rather than (.+) so that an empty
        // <style></style> is matched on its own; with (.+) the match would
        // run on to the next </style> and swallow the markup in between.
        $html = preg_replace_callback('#<style(?:\s.*)?>(.*)</style>#isU', array($this, 'styleCallback'), $html);
        $style_blocks = $this->_styleMatches;
        $this->_styleMatches = array(); // reset
        // NOTE(review): the blocks are cleaned only after registration, so
        // this presumably relies on register() keeping a reference to
        // $style_blocks -- confirm against the context implementation.
        $context->register('StyleBlocks', $style_blocks); // $context must not be reused
        if ($this->_tidy) {
            foreach ($style_blocks as &$style) {
                $style = $this->cleanCSS($style, $config, $context);
            }
            // release the reference left behind by the by-reference loop
            unset($style);
        }
        return $html;
    }

    /**
     * Sanitizes the CSS of a single style block.
     *
     * A surrounding HTML comment wrapper is removed, the CSS is parsed by
     * the tidy object, every selector is rebuilt from its valid components
     * (invalid components are dropped silently, rules without any valid
     * selector are discarded), declarations are validated against the CSS
     * definition, and the result is printed back as plain CSS.
     *
     * @param string $css     CSS to clean
     * @param object $config  Configuration object; get() and getDefinition()
     *                        are called on it
     * @param object $context Context object, handed on to the validators
     * @return string The cleaned CSS
     * @throws HTMLPurifier_Exception If selector splitting yields a
     *         component without a known delimiter
     */
    public function cleanCSS($css, $config, $context) {
        // prepare scope
        $scope = $config->get('Filter.ExtractStyleBlocks.Scope');
        if ($scope !== null) {
            $scopes = array_map('trim', explode(',', $scope));
        } else {
            $scopes = array();
        }
        // strip an HTML comment wrapper (<!-- ... -->) around the CSS
        $css = trim($css);
        if (strncmp('<!--', $css, 4) === 0) {
            $css = substr($css, 4);
        }
        if (strlen($css) > 3 && substr($css, -3) === '-->') {
            $css = substr($css, 0, -3);
        }
        $css = trim($css);
        // mute PHP errors for the duration of the parse
        set_error_handler('htmlpurifier_filter_extractstyleblocks_muteerrorhandler');
        try {
            $this->_tidy->parse($css);
        } catch (Exception $e) {
            // do not leave the muting handler installed if parsing throws
            restore_error_handler();
            throw $e;
        }
        restore_error_handler();
        $css_definition = $config->getDefinition('CSS');
        $html_definition = $config->getDefinition('HTML');
        $new_css = array();
        foreach ($this->_tidy->css as $k => $decls) {
            // $decls are all CSS declarations inside an @ selector
            $new_decls = array();
            foreach ($decls as $selector => $style) {
                $selector = trim($selector);
                if ($selector === '') continue; // should not happen
                // Parse the selector
                // Here is the relevant part of the CSS grammar:
                //
                // ruleset
                //   : selector [ ',' S* selector ]* '{' ...
                // selector
                //   : simple_selector [ combinator selector | S+ [ combinator? selector ]? ]?
                // combinator
                //   : '+' S*
                //   | '>' S*
                // simple_selector
                //   : element_name [ HASH | class | attrib | pseudo ]*
                //   | [ HASH | class | attrib | pseudo ]+
                // element_name
                //   : IDENT | '*'
                //   ;
                // class
                //   : '.' IDENT
                //   ;
                // attrib
                //   : '[' S* IDENT S* [ [ '=' | INCLUDES | DASHMATCH ] S*
                //     [ IDENT | STRING ] S* ]? ']'
                //   ;
                // pseudo
                //   : ':' [ IDENT | FUNCTION S* [IDENT S*]? ')' ]
                //   ;
                //
                // For reference, here are the relevant tokens:
                //
                // HASH         #{name}
                // IDENT        {ident}
                // INCLUDES     ~=
                // DASHMATCH    |=
                // STRING       {string}
                // FUNCTION     {ident}\(
                //
                // And the lexical scanner tokens
                //
                // name         {nmchar}+
                // nmchar       [_a-z0-9-]|{nonascii}|{escape}
                // nonascii     [\240-\377]
                // escape       {unicode}|\\[^\r\n\f0-9a-f]
                // unicode      \\{h}{1,6}(\r\n|[ \t\r\n\f])?
                // ident        -?{nmstart}{nmchar}*
                // nmstart      [_a-z]|{nonascii}|{escape}
                // string       {string1}|{string2}
                // string1      \"([^\n\r\f\\"]|\\{nl}|{escape})*\"
                // string2      \'([^\n\r\f\\']|\\{nl}|{escape})*\'
                //
                // We'll implement a subset (in order to reduce attack
                // surface); in particular:
                //
                //      - No Unicode support
                //      - No escapes support
                //      - No string support (by proxy no attrib support)
                //      - element_name is matched against allowed
                //        elements (some people might find this
                //        annoying...)
                //      - Pseudo-classes limited to :first-child, :link,
                //        :visited, :active, :hover, :focus

                // handle ruleset
                $selectors = array_map('trim', explode(',', $selector));
                $new_selectors = array();
                foreach ($selectors as $sel) {
                    // split on +, > and spaces
                    $basic_selectors = preg_split('/\s*([+> ])\s*/', $sel, -1, PREG_SPLIT_DELIM_CAPTURE);
                    // even indices are chunks, odd indices are
                    // delimiters
                    $nsel = null;
                    $delim = null; // guaranteed to be non-null after
                                   // two loop iterations
                    for ($i = 0, $c = count($basic_selectors); $i < $c; $i++) {
                        $x = $basic_selectors[$i];
                        if ($i % 2) {
                            // delimiter; normalize the surrounding whitespace
                            if ($x === ' ') {
                                $delim = ' ';
                            } else {
                                $delim = ' ' . $x . ' ';
                            }
                        } else {
                            // simple selector: element name followed by
                            // #id, .class and :pseudo components
                            $components = preg_split('/([#.:])/', $x, -1, PREG_SPLIT_DELIM_CAPTURE);
                            $sdelim = null;
                            $nx = null;
                            for ($j = 0, $cc = count($components); $j < $cc; $j ++) {
                                $y = $components[$j];
                                if ($j === 0) {
                                    // element name: '*' or an element known
                                    // to the HTML definition (lowercased)
                                    if ($y === '*' || isset($html_definition->info[$y = strtolower($y)])) {
                                        $nx = $y;
                                    } else {
                                        // $nx stays null; this matters
                                        // if we don't manage to find
                                        // any valid selector content,
                                        // in which case we ignore the
                                        // outer $delim
                                    }
                                } elseif ($j % 2) {
                                    // set delimiter
                                    $sdelim = $y;
                                } else {
                                    // pick the validator matching the
                                    // delimiter that introduced this part
                                    $attrdef = null;
                                    if ($sdelim === '#') {
                                        $attrdef = $this->_id_attrdef;
                                    } elseif ($sdelim === '.') {
                                        $attrdef = $this->_class_attrdef;
                                    } elseif ($sdelim === ':') {
                                        $attrdef = $this->_enum_attrdef;
                                    } else {
                                        throw new HTMLPurifier_Exception('broken invariant sdelim and preg_split');
                                    }
                                    $r = $attrdef->validate($y, $config, $context);
                                    if ($r !== false) {
                                        // a non-boolean result replaces the
                                        // original text of the component
                                        if ($r !== true) {
                                            $y = $r;
                                        }
                                        if ($nx === null) {
                                            $nx = '';
                                        }
                                        $nx .= $sdelim . $y;
                                    }
                                    // components failing validation are
                                    // dropped; the rest of the simple
                                    // selector is kept
                                }
                            }
                            if ($nx !== null) {
                                if ($nsel === null) {
                                    $nsel = $nx;
                                } else {
                                    $nsel .= $delim . $nx;
                                }
                            } else {
                                // delimiters to the left of invalid
                                // basic selector ignored
                            }
                        }
                    }
                    if ($nsel !== null) {
                        if (!empty($scopes)) {
                            // emit the selector once per configured scope
                            foreach ($scopes as $s) {
                                $new_selectors[] = "$s $nsel";
                            }
                        } else {
                            $new_selectors[] = $nsel;
                        }
                    }
                }
                // drop the rule if none of its selectors survived
                if (empty($new_selectors)) continue;
                $selector = implode(', ', $new_selectors);
                // keep only properties known to the CSS definition whose
                // values validate; store the validated value
                foreach ($style as $name => $value) {
                    if (!isset($css_definition->info[$name])) {
                        unset($style[$name]);
                        continue;
                    }
                    $def = $css_definition->info[$name];
                    $ret = $def->validate($value, $config, $context);
                    if ($ret === false) unset($style[$name]);
                    else $style[$name] = $ret;
                }
                $new_decls[$selector] = $style;
            }
            $new_css[$k] = $new_decls;
        }
        // remove stuff that shouldn't be used, could be reenabled
        // after security risks are analyzed
        $this->_tidy->css = $new_css;
        $this->_tidy->import = array();
        $this->_tidy->charset = null;
        $this->_tidy->namespace = null;
        $css = $this->_tidy->print->plain();
        // we are going to escape any special characters <>& to ensure
        // that no funny business occurs (i.e. </style> in a font-family prop).
        if ($config->get('Filter.ExtractStyleBlocks.Escaping')) {
            $css = str_replace(
                array('<',    '>',    '&'),
                array('\3C ', '\3E ', '\26 '),
                $css
            );
        }
        return $css;
    }

}
00287 
00288 // vim: et sw=4 sts=4