| 1: | <?php
|
| 2: |
|
| 3: | |
| 4: | |
| 5: | |
| 6: |
|
/**
 * Static helper for cleaning UTF-8 text and converting it to and from
 * other character encodings, working around known iconv defects.
 *
 * All methods are static; the class is not meant to be instantiated.
 */
class HTMLPurifier_Encoder
{

    /**
     * Private so the class cannot be instantiated from outside; raises an
     * error if it is ever reached from inside the class.
     */
    private function __construct()
    {
        trigger_error('Cannot instantiate encoder, call methods statically', E_USER_ERROR);
    }

    /**
     * Error handler that deliberately does nothing; installed temporarily
     * by unsafeIconv() so that iconv diagnostics are swallowed.
     */
    public static function muteErrorHandler()
    {
    }

    /**
     * Calls iconv() with PHP error reporting muted.
     *
     * No workaround for the truncation bug is applied here; use iconv()
     * (the method on this class) when the input may be long.
     *
     * @param string $in   Source encoding name.
     * @param string $out  Target encoding name (may carry //IGNORE etc.).
     * @param string $text Text to convert.
     * @return string|false Whatever iconv() returned.
     */
    public static function unsafeIconv($in, $out, $text)
    {
        set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
        $r = iconv($in, $out, $text);
        restore_error_handler();
        return $r;
    }

    /**
     * iconv wrapper that feeds the text through in chunks when the local
     * iconv has been detected (by testIconvTruncateBug()) to truncate
     * long output.
     *
     * Chunking is only implemented for UTF-8 input, because chunk
     * boundaries are chosen by inspecting UTF-8 continuation bytes.
     *
     * @param string $in             Source encoding; must be exactly 'utf-8'
     *                               for the chunked path to be taken.
     * @param string $out            Target encoding.
     * @param string $text           Text to convert.
     * @param int    $max_chunk_size Maximum chunk length in bytes (>= 4).
     * @return string|false Converted text, or false when conversion is not
     *                      possible (unusable iconv, non-UTF-8 input on a
     *                      truncating iconv, too-small chunk size, or
     *                      malformed UTF-8 at a chunk boundary).
     */
    public static function iconv($in, $out, $text, $max_chunk_size = 8000)
    {
        $code = self::testIconvTruncateBug();
        // Loose comparison on purpose: a null probe result compares equal
        // to ICONV_OK, exactly as in the original implementation.
        if ($code == self::ICONV_OK) {
            return self::unsafeIconv($in, $out, $text);
        } elseif ($code == self::ICONV_TRUNCATES) {
            if ($in == 'utf-8') {
                // A UTF-8 character is at most 4 bytes, so smaller chunks
                // could never be guaranteed to hold a whole character.
                if ($max_chunk_size < 4) {
                    trigger_error('max_chunk_size is too small', E_USER_WARNING);
                    return false;
                }
                // Short input fits in a single call.
                if (($c = strlen($text)) <= $max_chunk_size) {
                    return self::unsafeIconv($in, $out, $text);
                }
                $r = '';
                $i = 0;
                while (true) {
                    // Remainder fits in one chunk: convert it and stop.
                    if ($i + $max_chunk_size >= $c) {
                        $r .= self::unsafeIconv($in, $out, substr($text, $i));
                        break;
                    }
                    // Back off (up to 3 bytes) until the byte that would
                    // start the NEXT chunk is not a continuation byte
                    // (10xxxxxx), so no character is split in two.
                    if (0x80 != (0xC0 & ord($text[$i + $max_chunk_size]))) {
                        $chunk_size = $max_chunk_size;
                    } elseif (0x80 != (0xC0 & ord($text[$i + $max_chunk_size - 1]))) {
                        $chunk_size = $max_chunk_size - 1;
                    } elseif (0x80 != (0xC0 & ord($text[$i + $max_chunk_size - 2]))) {
                        $chunk_size = $max_chunk_size - 2;
                    } elseif (0x80 != (0xC0 & ord($text[$i + $max_chunk_size - 3]))) {
                        $chunk_size = $max_chunk_size - 3;
                    } else {
                        // Four continuation bytes in a row: not valid UTF-8.
                        return false;
                    }
                    $chunk = substr($text, $i, $chunk_size);
                    $r .= self::unsafeIconv($in, $out, $chunk);
                    $i += $chunk_size;
                }
                return $r;
            } else {
                return false;
            }
        } else {
            return false;
        }
    }

    /**
     * Removes malformed UTF-8 sequences and disallowed code points from a
     * string.
     *
     * A single regular expression first checks whether the whole string
     * already consists of allowed characters; if so it is returned
     * untouched. Otherwise the string is walked byte by byte with a small
     * state machine and only complete, shortest-form sequences that decode
     * to an allowed code point are kept. Allowed code points are tab, LF,
     * CR, U+0020-U+007E, U+00A0-U+D7FF, U+E000-U+FFFD (except U+FEFF on
     * the byte-wise path) and U+10000-U+10FFFF.
     *
     * @param string $str       String to clean.
     * @param bool   $force_php Accepted for interface compatibility; it is
     *                          not read by this implementation.
     * @return string Cleaned string.
     */
    public static function cleanUTF8($str, $force_php = false)
    {
        // Fast path. preg_match() yields a non-truthy result both for
        // "no match" and for invalid UTF-8 (the /u modifier), so either
        // case falls through to the byte-wise cleaner.
        if (preg_match(
            '/^[\x{9}\x{A}\x{D}\x{20}-\x{7E}\x{A0}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]*$/Du',
            $str
        )) {
            return $str;
        }

        $mState = 0; // number of continuation bytes still expected
        $mUcs4 = 0;  // code point accumulated so far
        $mBytes = 1; // total length of the sequence being decoded

        $out = '';
        $char = '';  // raw bytes of the sequence being decoded

        $len = strlen($str);
        for ($i = 0; $i < $len; $i++) {
            $in = ord($str[$i]);
            $char .= $str[$i];
            if (0 == $mState) {
                // Expecting the first byte of a sequence.
                if (0 == (0x80 & ($in))) {
                    // Single-byte (ASCII) character: drop control
                    // characters other than tab, CR and LF.
                    if (($in <= 31 || $in == 127) &&
                        !($in == 9 || $in == 13 || $in == 10)
                    ) {
                        // disallowed control character, discarded
                    } else {
                        $out .= $char;
                    }
                    $char = '';
                    $mBytes = 1;
                } elseif (0xC0 == (0xE0 & ($in))) {
                    // Lead byte of a 2-byte sequence.
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x1F) << 6;
                    $mState = 1;
                    $mBytes = 2;
                } elseif (0xE0 == (0xF0 & ($in))) {
                    // Lead byte of a 3-byte sequence.
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x0F) << 12;
                    $mState = 2;
                    $mBytes = 3;
                } elseif (0xF0 == (0xF8 & ($in))) {
                    // Lead byte of a 4-byte sequence.
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x07) << 18;
                    $mState = 3;
                    $mBytes = 4;
                } elseif (0xF8 == (0xFC & ($in))) {
                    // Lead byte of a 5-byte sequence. Such sequences are
                    // decoded so that the whole run is consumed, then
                    // rejected below by the (4 < $mBytes) check.
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x03) << 24;
                    $mState = 4;
                    $mBytes = 5;
                } elseif (0xFC == (0xFE & ($in))) {
                    // Lead byte of a 6-byte sequence; rejected later in
                    // the same way as 5-byte sequences.
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 1) << 30;
                    $mState = 5;
                    $mBytes = 6;
                } else {
                    // Byte cannot start a sequence (stray continuation
                    // byte, 0xFE or 0xFF): discard it.
                    $mState = 0;
                    $mUcs4 = 0;
                    $mBytes = 1;
                    $char = '';
                }
            } else {
                // Inside a multi-byte sequence: a continuation byte
                // (10xxxxxx) is required.
                if (0x80 == (0xC0 & ($in))) {
                    $shift = ($mState - 1) * 6;
                    $tmp = $in;
                    $tmp = ($tmp & 0x0000003F) << $shift;
                    $mUcs4 |= $tmp;

                    if (0 == --$mState) {
                        // Sequence complete; decide whether to keep it.
                        if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                            ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                            ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                            (4 < $mBytes) ||
                            // UTF-16 surrogate range
                            (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                            // beyond the last Unicode code point
                            ($mUcs4 > 0x10FFFF)
                        ) {
                            // overlong, too long, surrogate or out of
                            // range: discarded
                        } elseif (0xFEFF != $mUcs4 && // drop U+FEFF (BOM)
                            (
                                0x9 == $mUcs4 ||
                                0xA == $mUcs4 ||
                                0xD == $mUcs4 ||
                                (0x20 <= $mUcs4 && 0x7E >= $mUcs4) ||
                                (0xA0 <= $mUcs4 && 0xD7FF >= $mUcs4) ||
                                (0xE000 <= $mUcs4 && 0xFFFD >= $mUcs4) ||
                                (0x10000 <= $mUcs4 && 0x10FFFF >= $mUcs4)
                            )
                        ) {
                            $out .= $char;
                        }
                        // Reset for the next character.
                        $mState = 0;
                        $mUcs4 = 0;
                        $mBytes = 1;
                        $char = '';
                    }
                } else {
                    // Sequence was cut short. The partial sequence AND the
                    // current byte are both discarded.
                    $mState = 0;
                    $mUcs4 = 0;
                    $mBytes = 1;
                    $char ='';
                }
            }
        }
        return $out;
    }

    /**
     * Encodes a Unicode code point as a UTF-8 byte string.
     *
     * @param int $code Code point.
     * @return string UTF-8 bytes, or '' when $code is negative, above
     *                U+10FFFF, or a surrogate (U+D800-U+DFFF).
     */
    public static function unichr($code)
    {
        if ($code > 1114111 or $code < 0 or
          ($code >= 55296 and $code <= 57343) ) {
            return '';
        }

        // $x is the last byte of the sequence; $y, $z and $w are the
        // preceding bytes, with $w being the first byte of a 4-byte form.
        $x = $y = $z = $w = 0;
        if ($code < 128) {
            // ASCII: the byte is the code point itself.
            $x = $code;
        } else {
            // Lowest six bits always go into a continuation byte.
            $x = ($code & 63) | 128;
            if ($code < 2048) {
                $y = (($code & 2047) >> 6) | 192;
            } else {
                $y = (($code & 4032) >> 6) | 128;
                if ($code < 65536) {
                    $z = (($code >> 12) & 15) | 224;
                } else {
                    $z = (($code >> 12) & 63) | 128;
                    $w = (($code >> 18) & 7) | 240;
                }
            }
        }

        // Emit from the leading byte down to the last one.
        $ret = '';
        if ($w) {
            $ret .= chr($w);
        }
        if ($z) {
            $ret .= chr($z);
        }
        if ($y) {
            $ret .= chr($y);
        }
        $ret .= chr($x);

        return $ret;
    }

    /**
     * Reports whether iconv exists and is not classified as unusable.
     * The answer is computed once and cached for the rest of the request.
     *
     * @return bool
     */
    public static function iconvAvailable()
    {
        static $iconv = null;
        if ($iconv === null) {
            $iconv = function_exists('iconv') && self::testIconvTruncateBug() != self::ICONV_UNUSABLE;
        }
        return $iconv;
    }

    /**
     * Converts a string from the configured Core.Encoding to UTF-8.
     *
     * @param string $str     Text in the configured encoding.
     * @param object $config  Configuration object; read via get() for
     *                        'Core.Encoding' and 'Test.ForceNoIconv'.
     * @param mixed  $context Not read by this method.
     * @return string|null UTF-8 text; '' after reporting an invalid
     *                     encoding. When no conversion route exists an
     *                     error is triggered and nothing is returned.
     */
    public static function convertToUTF8($str, $config, $context)
    {
        $encoding = $config->get('Core.Encoding');
        if ($encoding === 'utf-8') {
            return $str;
        }
        static $iconv = null;
        if ($iconv === null) {
            $iconv = self::iconvAvailable();
        }
        if ($iconv && !$config->get('Test.ForceNoIconv')) {
            $str = self::unsafeIconv($encoding, 'utf-8//IGNORE', $str);
            if ($str === false) {
                trigger_error('Invalid encoding ' . $encoding, E_USER_ERROR);
                return '';
            }
            // Some encodings reuse ASCII byte values for other characters;
            // map the resulting UTF-8 characters back to the ASCII ones.
            // The lookup returns false when iconv cannot convert INTO the
            // encoding; strtr() must not be handed that value.
            $ascii_fix = self::testEncodingSupportsASCII($encoding);
            if (is_array($ascii_fix)) {
                $str = strtr($str, $ascii_fix);
            }
            return $str;
        } elseif ($encoding === 'iso-8859-1' && function_exists('mb_convert_encoding')) {
            $str = mb_convert_encoding($str, 'UTF-8', 'ISO-8859-1');
            return $str;
        }
        // Only probe iconv when it exists: the probe calls iconv() and
        // would otherwise die with an "undefined function" error instead
        // of reaching the message below.
        if (!function_exists('iconv') || self::testIconvTruncateBug() == self::ICONV_OK) {
            trigger_error('Encoding not supported, please install iconv', E_USER_ERROR);
        } else {
            trigger_error(
                'You have a buggy version of iconv, see https://bugs.php.net/bug.php?id=48147 ' .
                'and http://sourceware.org/bugzilla/show_bug.cgi?id=13541',
                E_USER_ERROR
            );
        }
    }

    /**
     * Converts a UTF-8 string to the configured Core.Encoding.
     *
     * When 'Core.EscapeNonASCIICharacters' is truthy, non-ASCII characters
     * are first turned into numeric entities.
     *
     * @param string $str     UTF-8 text.
     * @param object $config  Configuration object; read via get() for
     *                        'Core.Encoding', 'Core.EscapeNonASCIICharacters'
     *                        and 'Test.ForceNoIconv'.
     * @param mixed  $context Not read by this method.
     * @return string|false|null Converted text (false if iconv failed);
     *                     '' after reporting an invalid encoding. When no
     *                     conversion route exists an error is triggered
     *                     and nothing is returned.
     */
    public static function convertFromUTF8($str, $config, $context)
    {
        $encoding = $config->get('Core.Encoding');
        if ($escape = $config->get('Core.EscapeNonASCIICharacters')) {
            $str = self::convertToASCIIDumbLossless($str);
        }
        if ($encoding === 'utf-8') {
            return $str;
        }
        static $iconv = null;
        if ($iconv === null) {
            $iconv = self::iconvAvailable();
        }
        if ($iconv && !$config->get('Test.ForceNoIconv')) {
            // Undo the ASCII substitution performed by convertToUTF8().
            $ascii_fix = self::testEncodingSupportsASCII($encoding);
            if ($ascii_fix === false) {
                // iconv cannot convert into this encoding. Report it the
                // same way convertToUTF8() does instead of passing false
                // to array_flip(), which is a TypeError on PHP 8.
                trigger_error('Invalid encoding ' . $encoding, E_USER_ERROR);
                return '';
            }
            if (!$escape && !empty($ascii_fix)) {
                // Without entity escaping, strip the UTF-8 characters that
                // are about to be used as stand-ins for ASCII bytes.
                $clear_fix = array();
                foreach ($ascii_fix as $utf8 => $native) {
                    $clear_fix[$utf8] = '';
                }
                $str = strtr($str, $clear_fix);
            }
            $str = strtr($str, array_flip($ascii_fix));
            $str = self::iconv('utf-8', $encoding . '//IGNORE', $str);
            return $str;
        } elseif ($encoding === 'iso-8859-1' && function_exists('mb_convert_encoding')) {
            $str = mb_convert_encoding($str, 'ISO-8859-1', 'UTF-8');
            return $str;
        }
        trigger_error('Encoding not supported', E_USER_ERROR);
    }

    /**
     * Replaces every multi-byte UTF-8 character with a decimal numeric
     * character reference (&#NNN;) and leaves ASCII bytes untouched.
     *
     * No validation is performed; the output is only meaningful for
     * well-formed UTF-8 input.
     *
     * @param string $str UTF-8 text.
     * @return string ASCII-only text.
     */
    public static function convertToASCIIDumbLossless($str)
    {
        $bytesleft = 0; // continuation bytes still expected
        $result = '';
        $working = 0;   // code point being assembled
        $len = strlen($str);
        for ($i = 0; $i < $len; $i++) {
            $bytevalue = ord($str[$i]);
            if ($bytevalue <= 0x7F) { // 0xxx xxxx: ASCII
                $result .= chr($bytevalue);
                $bytesleft = 0;
            } elseif ($bytevalue <= 0xBF) { // 10xx xxxx: continuation
                $working = $working << 6;
                $working += ($bytevalue & 0x3F);
                $bytesleft--;
                if ($bytesleft <= 0) {
                    $result .= "&#" . $working . ";";
                }
            } elseif ($bytevalue <= 0xDF) { // 110x xxxx: 2-byte lead
                $working = $bytevalue & 0x1F;
                $bytesleft = 1;
            } elseif ($bytevalue <= 0xEF) { // 1110 xxxx: 3-byte lead
                $working = $bytevalue & 0x0F;
                $bytesleft = 2;
            } else { // 1111 0xxx: 4-byte lead
                $working = $bytevalue & 0x07;
                $bytesleft = 3;
            }
        }
        return $result;
    }

    /** The iconv probe produced output of the expected length. */
    const ICONV_OK = 0;

    /** The iconv probe produced shortened output; chunking is required. */
    const ICONV_TRUNCATES = 1;

    /** The iconv probe failed outright, or iconv() does not exist. */
    const ICONV_UNUSABLE = 2;

    /**
     * Probes the local iconv by converting a two-byte UTF-8 character
     * followed by 9000 'a' characters to 'ascii//IGNORE' and looking at
     * the length of the result. The classification is cached.
     *
     * @return int|null One of the ICONV_* constants. When the output is
     *                  LONGER than 9000 bytes an error is triggered and
     *                  null is returned (and the probe is retried on the
     *                  next call).
     */
    public static function testIconvTruncateBug()
    {
        static $code = null;
        if ($code === null) {
            // Without the extension the probe itself would be a fatal
            // "undefined function" error; classify that as unusable.
            if (!function_exists('iconv')) {
                $code = self::ICONV_UNUSABLE;
                return $code;
            }
            $r = self::unsafeIconv('utf-8', 'ascii//IGNORE', "\xCE\xB1" . str_repeat('a', 9000));
            if ($r === false) {
                $code = self::ICONV_UNUSABLE;
            } elseif (($c = strlen($r)) < 9000) {
                $code = self::ICONV_TRUNCATES;
            } elseif ($c > 9000) {
                trigger_error(
                    'Your copy of iconv is extremely buggy. Please notify HTML Purifier maintainers: ' .
                    'include your iconv version as per phpversion()',
                    E_USER_ERROR
                );
            } else {
                $code = self::ICONV_OK;
            }
        }
        return $code;
    }

    /**
     * Determines which printable ASCII characters (0x20-0x7E) an encoding
     * does not represent as themselves.
     *
     * Unless $bypass is set, previously computed results, the hard-coded
     * tables for 'shift_jis' and 'johab', and the blanket empty answer for
     * any 'iso-8859-*' name are used instead of probing iconv.
     *
     * @param string $encoding Encoding name.
     * @param bool   $bypass   Skip the cache and the hard-coded answers
     *                         and always probe iconv.
     * @return array|false Map of UTF-8 character => ASCII character it
     *                     stands in for; false when iconv cannot convert
     *                     into $encoding at all.
     */
    public static function testEncodingSupportsASCII($encoding, $bypass = false)
    {
        static $encodings = array();
        if (!$bypass) {
            if (isset($encodings[$encoding])) {
                return $encodings[$encoding];
            }
            $lenc = strtolower($encoding);
            switch ($lenc) {
                case 'shift_jis':
                    return array("\xC2\xA5" => '\\', "\xE2\x80\xBE" => '~');
                case 'johab':
                    return array("\xE2\x82\xA9" => '\\');
            }
            if (strpos($lenc, 'iso-8859-') === 0) {
                return array();
            }
        }
        $ret = array();
        // Bail out early when the encoding is not usable as a target.
        if (self::unsafeIconv('UTF-8', $encoding, 'a') === false) {
            return false;
        }
        for ($i = 0x20; $i <= 0x7E; $i++) {
            $c = chr($i);
            $r = self::unsafeIconv('UTF-8', "$encoding//IGNORE", $c);
            if ($r === '' ||
                // Also catches iconv builds that pass the byte through
                // unchanged even though it does not round-trip.
                ($r === $c && self::unsafeIconv($encoding, 'UTF-8//IGNORE', $r) !== $c)
            ) {
                // Record what this byte decodes to in the encoding, keyed
                // by that UTF-8 result.
                $ret[self::unsafeIconv($encoding, 'UTF-8//IGNORE', $c)] = $c;
            }
        }
        $encodings[$encoding] = $ret;
        return $ret;
    }
}
|
| 616: |
|
| 617: |
|
| 618: | |