00001 <?php
00002
/**
 * Static helper collection for cleaning UTF-8 and converting text between
 * UTF-8 and the encoding named by the Core.Encoding configuration value.
 *
 * Every public method is static; the constructor is private and raises an
 * error if it is ever reached.
 */
class HTMLPurifier_Encoder
{

    /**
     * Private constructor: the class is a static-only utility, so reaching
     * this method raises an E_USER_ERROR.
     */
    private function __construct() {
        trigger_error('Cannot instantiate encoder, call methods statically', E_USER_ERROR);
    }

    /**
     * No-op error handler. Installed with set_error_handler() around calls
     * whose notices/warnings should be discarded instead of using "@".
     */
    private static function muteErrorHandler() {}

    /**
     * Removes byte sequences that are not well-formed UTF-8 or that decode
     * to a code point outside the accepted set.
     *
     * Accepted code points (same set on both code paths): U+0009, U+000A,
     * U+000D, U+0020-U+007E, U+00A0-U+D7FF, U+E000-U+FFFD and
     * U+10000-U+10FFFF. The byte-wise path additionally drops U+FEFF (BOM).
     *
     * @param string $str       Input to clean.
     * @param bool   $force_php Accepted for interface compatibility; this
     *                          implementation does not consult it.
     * @return string Cleaned string. The input is returned untouched when it
     *                already consists solely of accepted code points.
     */
    public static function cleanUTF8($str, $force_php = false) {

        // Fast path: PCRE in UTF-8 mode validates the whole string at once.
        // preg_match() yields 0 (disallowed character) or false (malformed
        // UTF-8) otherwise, and both fall through to the byte-wise cleaner.
        if (preg_match('/^[\x{9}\x{A}\x{D}\x{20}-\x{7E}\x{A0}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]*$/Du', $str)) {
            return $str;
        }

        $mState = 0; // continuation bytes still expected for the current sequence
        $mUcs4  = 0; // code point assembled so far
        $mBytes = 1; // declared total length of the current sequence

        $out  = ''; // cleaned result
        $char = ''; // raw bytes of the sequence currently being read

        $len = strlen($str);
        for ($i = 0; $i < $len; $i++) {
            // Square-bracket offsets: the "{}" offset form no longer parses on PHP 8.
            $in = ord($str[$i]);
            $char .= $str[$i];
            if (0 == $mState) {
                // Expecting the first byte of a sequence.
                if (0 == (0x80 & ($in))) {
                    // Single-byte (ASCII) character.
                    if (($in <= 31 || $in == 127) &&
                        !($in == 9 || $in == 13 || $in == 10)
                    ) {
                        // Control character other than tab, CR or LF: drop it.
                    } else {
                        $out .= $char;
                    }
                    $char = '';
                    $mBytes = 1;
                } elseif (0xC0 == (0xE0 & ($in))) {
                    // Lead byte of a 2-byte sequence.
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x1F) << 6;
                    $mState = 1;
                    $mBytes = 2;
                } elseif (0xE0 == (0xF0 & ($in))) {
                    // Lead byte of a 3-byte sequence.
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x0F) << 12;
                    $mState = 2;
                    $mBytes = 3;
                } elseif (0xF0 == (0xF8 & ($in))) {
                    // Lead byte of a 4-byte sequence.
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x07) << 18;
                    $mState = 3;
                    $mBytes = 4;
                } elseif (0xF8 == (0xFC & ($in))) {
                    // Lead byte of a 5-byte sequence. It is still consumed as
                    // a unit so its continuation bytes are swallowed with it;
                    // the "4 < $mBytes" test below then rejects the result.
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x03) << 24;
                    $mState = 4;
                    $mBytes = 5;
                } elseif (0xFC == (0xFE & ($in))) {
                    // Lead byte of a 6-byte sequence; handled like the 5-byte case.
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 1) << 30;
                    $mState = 5;
                    $mBytes = 6;
                } else {
                    // Not a valid first byte (stray continuation byte, 0xFE
                    // or 0xFF): discard it and start over.
                    $mState = 0;
                    $mUcs4 = 0;
                    $mBytes = 1;
                    $char = '';
                }
            } else {
                // Inside a multi-byte sequence: a continuation byte is required.
                if (0x80 == (0xC0 & ($in))) {
                    $shift = ($mState - 1) * 6;
                    $tmp = $in;
                    $tmp = ($tmp & 0x0000003F) << $shift;
                    $mUcs4 |= $tmp;

                    if (0 == --$mState) {
                        // Sequence complete; decide whether to keep it.
                        if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                            ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                            ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                            (4 < $mBytes) ||
                            // UTF-16 surrogate halves
                            (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                            // beyond the Unicode range
                            ($mUcs4 > 0x10FFFF)
                        ) {
                            // Overlong form, over-length sequence, surrogate
                            // or out-of-range value: drop it.
                        } elseif (0xFEFF != $mUcs4 && // drop the BOM
                            (
                                0x9 == $mUcs4 ||
                                0xA == $mUcs4 ||
                                0xD == $mUcs4 ||
                                (0x20 <= $mUcs4 && 0x7E >= $mUcs4) ||
                                // 0x7F-0x9F is intentionally not listed
                                (0xA0 <= $mUcs4 && 0xD7FF >= $mUcs4) ||
                                // Same range the fast-path regex accepts; without
                                // it U+E000-U+FFFD was lost on this path only.
                                (0xE000 <= $mUcs4 && 0xFFFD >= $mUcs4) ||
                                (0x10000 <= $mUcs4 && 0x10FFFF >= $mUcs4)
                            )
                        ) {
                            $out .= $char;
                        }
                        // Reset the decoder for the next character.
                        $mState = 0;
                        $mUcs4 = 0;
                        $mBytes = 1;
                        $char = '';
                    }
                } else {
                    // Sequence interrupted by a non-continuation byte. The
                    // partial sequence and the current byte are both discarded.
                    $mState = 0;
                    $mUcs4 = 0;
                    $mBytes = 1;
                    $char = '';
                }
            }
        }
        return $out;
    }

    /**
     * Encodes a Unicode code point as a UTF-8 byte string.
     *
     * @param int $code Code point.
     * @return string One to four bytes, or '' when $code is negative, above
     *                0x10FFFF, or a surrogate (0xD800-0xDFFF).
     */
    public static function unichr($code) {
        if ($code > 1114111 || $code < 0 ||
            ($code >= 55296 && $code <= 57343)) {
            // Out of range or surrogate half: nothing to encode.
            return '';
        }

        // $x is the last byte; $y, $z, $w are the preceding bytes when present.
        $x = $y = $z = $w = 0;
        if ($code < 128) {
            // Plain ASCII.
            $x = $code;
        } else {
            // Low six bits always form the final continuation byte.
            $x = ($code & 63) | 128;
            if ($code < 2048) {
                $y = (($code & 2047) >> 6) | 192;
            } else {
                $y = (($code & 4032) >> 6) | 128;
                if ($code < 65536) {
                    $z = (($code >> 12) & 15) | 224;
                } else {
                    $z = (($code >> 12) & 63) | 128;
                    $w = (($code >> 18) & 7) | 240;
                }
            }
        }

        // Emit from the lead byte down; unused slots stayed 0 and are skipped.
        $ret = '';
        if ($w) $ret .= chr($w);
        if ($z) $ret .= chr($z);
        if ($y) $ret .= chr($y);
        $ret .= chr($x);

        return $ret;
    }

    /**
     * Converts a string from the configured Core.Encoding to UTF-8.
     *
     * Uses iconv when the function exists and Test.ForceNoIconv is not set;
     * otherwise only 'iso-8859-1' is handled, via utf8_encode(). Any other
     * situation raises an E_USER_ERROR.
     *
     * @param string $str     Text in the configured encoding.
     * @param object $config  Object whose get($namespace, $key) returns the
     *                        configuration values read here.
     * @param mixed  $context Not used by this method.
     * @return string|null UTF-8 text; '' after an 'Invalid encoding' error;
     *                     null if execution continues after the
     *                     'Encoding not supported' error.
     */
    public static function convertToUTF8($str, $config, $context) {
        $encoding = $config->get('Core', 'Encoding');
        if ($encoding === 'utf-8') return $str;
        static $iconv = null;
        if ($iconv === null) $iconv = function_exists('iconv');
        set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
        if ($iconv && !$config->get('Test', 'ForceNoIconv')) {
            $str = iconv($encoding, 'utf-8//IGNORE', $str);
            $ascii_fix = self::testEncodingSupportsASCII($encoding);
            if (!is_array($ascii_fix)) {
                // iconv could not convert even a plain 'a' into $encoding.
                // Report it instead of handing a non-array to strtr().
                restore_error_handler();
                trigger_error('Invalid encoding ' . $encoding, E_USER_ERROR);
                return '';
            }
            // Encodings that lack some ASCII characters yield stand-in UTF-8
            // sequences; map those back to the intended ASCII characters.
            $str = strtr($str, $ascii_fix);
            restore_error_handler();
            return $str;
        } elseif ($encoding === 'iso-8859-1') {
            $str = utf8_encode($str);
            restore_error_handler();
            return $str;
        }
        // Remove the muting handler first: otherwise it would swallow the
        // error below and remain installed after this method returns.
        restore_error_handler();
        trigger_error('Encoding not supported', E_USER_ERROR);
    }

    /**
     * Converts a UTF-8 string to the configured Core.Encoding.
     *
     * When Core.EscapeNonASCIICharacters is set, non-ASCII characters are
     * first replaced by numeric entities. Uses iconv when the function
     * exists and Test.ForceNoIconv is not set; otherwise only 'iso-8859-1'
     * is handled, via utf8_decode(). Any other situation raises an
     * E_USER_ERROR.
     *
     * @param string $str     UTF-8 text.
     * @param object $config  Object whose get($namespace, $key) returns the
     *                        configuration values read here.
     * @param mixed  $context Not used by this method.
     * @return string|false|null Text in the target encoding (whatever iconv
     *                           returned); '' after an 'Invalid encoding'
     *                           error; null if execution continues after the
     *                           'Encoding not supported' error.
     */
    public static function convertFromUTF8($str, $config, $context) {
        $encoding = $config->get('Core', 'Encoding');
        if ($encoding === 'utf-8') return $str;
        static $iconv = null;
        if ($iconv === null) $iconv = function_exists('iconv');
        if ($escape = $config->get('Core', 'EscapeNonASCIICharacters')) {
            $str = self::convertToASCIIDumbLossless($str);
        }
        set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
        if ($iconv && !$config->get('Test', 'ForceNoIconv')) {
            // Map of UTF-8 sequence => ASCII character for the ASCII
            // characters the target encoding cannot represent directly.
            $ascii_fix = self::testEncodingSupportsASCII($encoding);
            if (!is_array($ascii_fix)) {
                // iconv could not convert even a plain 'a' into $encoding.
                restore_error_handler();
                trigger_error('Invalid encoding ' . $encoding, E_USER_ERROR);
                return '';
            }
            if (!$escape && !empty($ascii_fix)) {
                // Strip the stand-in sequences already present in the text
                // before the substitution below introduces them.
                $clear_fix = array();
                foreach ($ascii_fix as $utf8 => $native) $clear_fix[$utf8] = '';
                $str = strtr($str, $clear_fix);
            }
            // Replace the affected ASCII characters by their stand-in sequences.
            $str = strtr($str, array_flip($ascii_fix));

            $str = iconv('utf-8', $encoding . '//IGNORE', $str);
            restore_error_handler();
            return $str;
        } elseif ($encoding === 'iso-8859-1') {
            $str = utf8_decode($str);
            restore_error_handler();
            return $str;
        }
        // Remove the muting handler first: otherwise it would swallow the
        // error below and remain installed after this method returns.
        restore_error_handler();
        trigger_error('Encoding not supported', E_USER_ERROR);
    }

    /**
     * Replaces every multi-byte UTF-8 character with a decimal numeric
     * entity ("&#NNN;") and copies bytes up to 0x7F through unchanged.
     *
     * The input is not validated: the decoder trusts lead bytes for the
     * sequence length and emits an entity whenever the continuation
     * counter reaches zero or below.
     *
     * @param string $str UTF-8 text.
     * @return string Text containing only bytes in the 0x00-0x7F range.
     */
    public static function convertToASCIIDumbLossless($str) {
        $bytesleft = 0; // continuation bytes still expected
        $result = '';
        $working = 0;   // code point assembled so far
        $len = strlen($str);
        for ($i = 0; $i < $len; $i++) {
            $bytevalue = ord($str[$i]);
            if ($bytevalue <= 0x7F) {
                // ASCII byte: copy as-is.
                $result .= chr($bytevalue);
                $bytesleft = 0;
            } elseif ($bytevalue <= 0xBF) {
                // Continuation byte: append its six payload bits.
                $working = $working << 6;
                $working += ($bytevalue & 0x3F);
                $bytesleft--;
                if ($bytesleft <= 0) {
                    $result .= "&#" . $working . ";";
                }
            } elseif ($bytevalue <= 0xDF) {
                // Lead byte of a 2-byte sequence.
                $working = $bytevalue & 0x1F;
                $bytesleft = 1;
            } elseif ($bytevalue <= 0xEF) {
                // Lead byte of a 3-byte sequence.
                $working = $bytevalue & 0x0F;
                $bytesleft = 2;
            } else {
                // Everything else is treated as the lead byte of a 4-byte sequence.
                $working = $bytevalue & 0x07;
                $bytesleft = 3;
            }
        }
        return $result;
    }

    /**
     * Determines which printable ASCII characters (0x20-0x7E) an encoding
     * cannot represent, by probing iconv.
     *
     * Results of a probe are cached per encoding name for the lifetime of
     * the process. Unless $bypass is set, 'shift_jis', 'johab' and names
     * starting with 'iso-8859-' are answered from fixed tables instead.
     *
     * @param string $encoding Encoding name understood by iconv.
     * @param bool   $bypass   Skip the cache and the fixed tables and always
     *                         probe iconv.
     * @return array|false Map of UTF-8 byte sequence => ASCII character for
     *                     each unsupported character (empty when all are
     *                     supported), or false when iconv cannot convert to
     *                     $encoding at all.
     */
    public static function testEncodingSupportsASCII($encoding, $bypass = false) {
        static $encodings = array();
        if (!$bypass) {
            if (isset($encodings[$encoding])) return $encodings[$encoding];
            $lenc = strtolower($encoding);
            switch ($lenc) {
                case 'shift_jis':
                    return array("\xC2\xA5" => '\\', "\xE2\x80\xBE" => '~');
                case 'johab':
                    return array("\xE2\x82\xA9" => '\\');
            }
            if (strpos($lenc, 'iso-8859-') === 0) return array();
        }
        $ret = array();
        set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
        if (iconv('UTF-8', $encoding, 'a') === false) {
            // Pop the muting handler before leaving; the early return used to
            // leave it installed for the rest of the request.
            restore_error_handler();
            return false;
        }
        for ($i = 0x20; $i <= 0x7E; $i++) {
            $c = chr($i);
            if (iconv('UTF-8', "$encoding//IGNORE", $c) === '') {
                // The character vanished on the way into $encoding. Decode the
                // same byte from $encoding to learn which UTF-8 sequence it
                // stands for there.
                $ret[iconv($encoding, 'UTF-8//IGNORE', $c)] = $c;
            }
        }
        restore_error_handler();
        $encodings[$encoding] = $ret;
        return $ret;
    }


}
00413