14 trigger_error(
'Cannot instantiate encoder, call methods statically', E_USER_ERROR);
26 set_error_handler(array(
'HTMLPurifier_Encoder',
'muteErrorHandler'));
27 $r =
iconv($in, $out, $text);
28 restore_error_handler();
35 public static function iconv($in, $out, $text, $max_chunk_size = 8000) {
36 $code = self::testIconvTruncateBug();
37 if ($code == self::ICONV_OK) {
38 return self::unsafeIconv($in, $out, $text);
39 } elseif ($code == self::ICONV_TRUNCATES) {
43 if ($max_chunk_size < 4) {
44 trigger_error(
'max_chunk_size is too small', E_USER_WARNING);
49 if (($c = strlen($text)) <= $max_chunk_size) {
50 return self::unsafeIconv($in, $out, $text);
55 if (
$i + $max_chunk_size >= $c) {
56 $r .= self::unsafeIconv($in, $out, substr($text,
$i));
60 if (0x80 != (0xC0 & ord($text[
$i + $max_chunk_size]))) {
61 $chunk_size = $max_chunk_size;
62 } elseif (0x80 != (0xC0 & ord($text[
$i + $max_chunk_size - 1]))) {
63 $chunk_size = $max_chunk_size - 1;
64 } elseif (0x80 != (0xC0 & ord($text[
$i + $max_chunk_size - 2]))) {
65 $chunk_size = $max_chunk_size - 2;
66 } elseif (0x80 != (0xC0 & ord($text[
$i + $max_chunk_size - 3]))) {
67 $chunk_size = $max_chunk_size - 3;
71 $chunk = substr($text,
$i, $chunk_size);
72 $r .= self::unsafeIconv($in, $out, $chunk);
109 public static function cleanUTF8($str, $force_php =
false) {
116 if (preg_match(
'/^[\x{9}\x{A}\x{D}\x{20}-\x{7E}\x{A0}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]*$/Du', $str)) {
135 for(
$i = 0;
$i < $len;
$i++) {
141 if (0 == (0x80 & ($in))) {
143 if (($in <= 31 || $in == 127) &&
144 !($in == 9 || $in == 13 || $in == 10)
153 } elseif (0xC0 == (0xE0 & ($in))) {
156 $mUcs4 = ($mUcs4 & 0x1F) << 6;
159 } elseif (0xE0 == (0xF0 & ($in))) {
162 $mUcs4 = ($mUcs4 & 0x0F) << 12;
165 } elseif (0xF0 == (0xF8 & ($in))) {
168 $mUcs4 = ($mUcs4 & 0x07) << 18;
171 } elseif (0xF8 == (0xFC & ($in))) {
182 $mUcs4 = ($mUcs4 & 0x03) << 24;
185 } elseif (0xFC == (0xFE & ($in))) {
189 $mUcs4 = ($mUcs4 & 1) << 30;
203 if (0x80 == (0xC0 & ($in))) {
205 $shift = ($mState - 1) * 6;
207 $tmp = ($tmp & 0x0000003F) << $shift;
210 if (0 == --$mState) {
217 if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
218 ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
219 ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
222 (($mUcs4 & 0xFFFFF800) == 0xD800) ||
227 } elseif (0xFEFF != $mUcs4 &&
233 (0x20 <= $mUcs4 && 0x7E >= $mUcs4) ||
236 (0xA0 <= $mUcs4 && 0xD7FF >= $mUcs4) ||
237 (0x10000 <= $mUcs4 && 0x10FFFF >= $mUcs4)
289 if($code > 1114111 or $code < 0 or
290 ($code >= 55296 and $code <= 57343) ) {
296 $x = $y = $z = $w = 0;
302 $x = ($code & 63) | 128;
304 $y = (($code & 2047) >> 6) | 192;
306 $y = (($code & 4032) >> 6) | 128;
308 $z = (($code >> 12) & 15) | 224;
310 $z = (($code >> 12) & 63) | 128;
311 $w = (($code >> 18) & 7) | 240;
317 if($w)
$ret .= chr($w);
318 if($z)
$ret .= chr($z);
319 if($y)
$ret .= chr($y);
326 static $iconv = null;
327 if ($iconv === null) {
328 $iconv = function_exists(
'iconv') && self::testIconvTruncateBug() != self::ICONV_UNUSABLE;
337 $encoding =
$config->get(
'Core.Encoding');
338 if ($encoding ===
'utf-8')
return $str;
339 static $iconv = null;
340 if ($iconv === null) $iconv = self::iconvAvailable();
341 if ($iconv && !
$config->get(
'Test.ForceNoIconv')) {
343 $str = self::unsafeIconv($encoding,
'utf-8//IGNORE', $str);
344 if ($str ===
false) {
346 trigger_error(
'Invalid encoding ' . $encoding, E_USER_ERROR);
352 $str = strtr($str, self::testEncodingSupportsASCII($encoding));
354 } elseif ($encoding ===
'iso-8859-1') {
355 $str = utf8_encode($str);
358 trigger_error(
'Encoding not supported, please install iconv', E_USER_ERROR);
367 $encoding =
$config->get(
'Core.Encoding');
368 if ($escape =
$config->get(
'Core.EscapeNonASCIICharacters')) {
369 $str = self::convertToASCIIDumbLossless($str);
371 if ($encoding ===
'utf-8')
return $str;
372 static $iconv = null;
373 if ($iconv === null) $iconv = self::iconvAvailable();
374 if ($iconv && !
$config->get(
'Test.ForceNoIconv')) {
376 $ascii_fix = self::testEncodingSupportsASCII($encoding);
377 if (!$escape && !empty($ascii_fix)) {
378 $clear_fix = array();
379 foreach ($ascii_fix as $utf8 => $native) $clear_fix[$utf8] =
'';
380 $str = strtr($str, $clear_fix);
382 $str = strtr($str, array_flip($ascii_fix));
384 $str = self::iconv(
'utf-8', $encoding .
'//IGNORE', $str);
386 } elseif ($encoding ===
'iso-8859-1') {
387 $str = utf8_decode($str);
390 trigger_error(
'Encoding not supported', E_USER_ERROR);
418 for(
$i = 0;
$i < $len;
$i++ ) {
419 $bytevalue = ord( $str[
$i] );
420 if( $bytevalue <= 0x7F ) {
423 } elseif( $bytevalue <= 0xBF ) {
424 $working = $working << 6;
425 $working += ($bytevalue & 0x3F);
427 if( $bytesleft <= 0 ) {
428 $result .=
"&#" . $working .
";";
430 } elseif( $bytevalue <= 0xDF ) {
431 $working = $bytevalue & 0x1F;
433 } elseif( $bytevalue <= 0xEF ) {
434 $working = $bytevalue & 0x0F;
437 $working = $bytevalue & 0x07;
471 if ($code === null) {
473 $r = self::unsafeIconv(
'utf-8',
'ascii//IGNORE',
"\xCE\xB1" . str_repeat(
'a', 9000));
475 $code = self::ICONV_UNUSABLE;
476 } elseif (($c = strlen($r)) < 9000) {
477 $code = self::ICONV_TRUNCATES;
478 } elseif ($c > 9000) {
479 trigger_error(
'Your copy of iconv is extremely buggy. Please notify HTML Purifier maintainers: include your iconv version as per phpversion()', E_USER_ERROR);
481 $code = self::ICONV_OK;
504 static $encodings = array();
506 if (isset($encodings[$encoding]))
return $encodings[$encoding];
507 $lenc = strtolower($encoding);
510 return array(
"\xC2\xA5" =>
'\\',
"\xE2\x80\xBE" =>
'~');
512 return array(
"\xE2\x82\xA9" =>
'\\');
514 if (strpos($lenc,
'iso-8859-') === 0)
return array();
517 if (self::unsafeIconv(
'UTF-8', $encoding,
'a') ===
false)
return false;
518 for (
$i = 0x20;
$i <= 0x7E;
$i++) {
520 $r = self::unsafeIconv(
'UTF-8',
"$encoding//IGNORE", $c);
525 ($r === $c && self::unsafeIconv($encoding,
'UTF-8//IGNORE', $r) !== $c)
530 $ret[self::unsafeIconv($encoding,
'UTF-8//IGNORE', $c)] = $c;
533 $encodings[$encoding] =
$ret;