1: | <?php
|
2: |
|
3: | |
4: | |
5: | |
6: |
|
/**
 * Static helper collection for cleaning UTF-8 and for converting text
 * between UTF-8 and the encoding configured in `Core.Encoding`.
 *
 * All methods are static; the class cannot be instantiated.
 */
class HTMLPurifier_Encoder
{
    /**
     * Private so that `new HTMLPurifier_Encoder()` is impossible from outside;
     * raises a user error if it is ever reached from inside the class.
     */
    private function __construct()
    {
        trigger_error('Cannot instantiate encoder, call methods statically', E_USER_ERROR);
    }

    /**
     * Error handler that deliberately does nothing.
     *
     * Installed temporarily by unsafeIconv() so that diagnostics raised by
     * iconv() are swallowed instead of being reported.
     */
    public static function muteErrorHandler()
    {
    }

    /**
     * Calls iconv() with error reporting muted.
     *
     * No workaround for the truncation bug is applied here; use iconv()
     * (the method of this class) when the input may be long.
     *
     * @param string $in   Source encoding name.
     * @param string $out  Target encoding name (may carry an iconv suffix such as //IGNORE).
     * @param string $text Text to convert.
     * @return string|false Whatever the native iconv() returned.
     */
    public static function unsafeIconv($in, $out, $text)
    {
        set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
        $r = iconv($in, $out, $text);
        restore_error_handler();
        return $r;
    }

    /**
     * iconv wrapper that works around the truncation bug detected by
     * testIconvTruncateBug().
     *
     * When the local iconv truncates long inputs, UTF-8 input is split into
     * chunks of at most $max_chunk_size bytes, each ending on a character
     * boundary, and converted piecewise. Non-UTF-8 input cannot be chunked
     * here, so false is returned for it in that situation.
     *
     * @param string $in             Source encoding; chunking only happens when this is exactly 'utf-8'.
     * @param string $out            Target encoding.
     * @param string $text           Text to convert.
     * @param int    $max_chunk_size Maximum chunk size in bytes; must be at least 4.
     * @return string|false Converted text, or false when conversion is not possible.
     */
    public static function iconv($in, $out, $text, $max_chunk_size = 8000)
    {
        $code = self::testIconvTruncateBug();
        if ($code == self::ICONV_OK) {
            return self::unsafeIconv($in, $out, $text);
        } elseif ($code == self::ICONV_TRUNCATES) {
            // We can only split safely when we know how to find character
            // boundaries, i.e. when the input is UTF-8.
            if ($in == 'utf-8') {
                if ($max_chunk_size < 4) {
                    // A UTF-8 character may need up to four bytes.
                    trigger_error('max_chunk_size is too small', E_USER_WARNING);
                    return false;
                }
                // Short input fits in a single call.
                if (($c = strlen($text)) <= $max_chunk_size) {
                    return self::unsafeIconv($in, $out, $text);
                }
                $r = '';
                $i = 0;
                while (true) {
                    if ($i + $max_chunk_size >= $c) {
                        // Remainder fits in one chunk: convert it and stop.
                        $r .= self::unsafeIconv($in, $out, substr($text, $i));
                        break;
                    }
                    // Find the largest chunk that does not end in the middle
                    // of a character: the byte right after the chunk must not
                    // be a continuation byte (10xxxxxx).
                    if (0x80 != (0xC0 & ord($text[$i + $max_chunk_size]))) {
                        $chunk_size = $max_chunk_size;
                    } elseif (0x80 != (0xC0 & ord($text[$i + $max_chunk_size - 1]))) {
                        $chunk_size = $max_chunk_size - 1;
                    } elseif (0x80 != (0xC0 & ord($text[$i + $max_chunk_size - 2]))) {
                        $chunk_size = $max_chunk_size - 2;
                    } elseif (0x80 != (0xC0 & ord($text[$i + $max_chunk_size - 3]))) {
                        $chunk_size = $max_chunk_size - 3;
                    } else {
                        // Four continuation bytes in a row: not valid UTF-8.
                        return false;
                    }
                    $chunk = substr($text, $i, $chunk_size);
                    $r .= self::unsafeIconv($in, $out, $chunk);
                    $i += $chunk_size;
                }
                return $r;
            } else {
                return false;
            }
        } else {
            return false;
        }
    }

    /**
     * Removes byte sequences that are not well-formed UTF-8 or that encode
     * code points outside the accepted set.
     *
     * Accepted code points are U+0009, U+000A, U+000D, U+0020-U+007E,
     * U+00A0-U+D7FF, U+E000-U+FFFD and U+10000-U+10FFFF. A string consisting
     * only of those is returned untouched by the regular-expression fast
     * path. Otherwise the string is walked byte by byte and every invalid or
     * disallowed sequence is dropped. Note that the byte-wise path also drops
     * U+FEFF, which the fast path lets through.
     *
     * @param string $str       String to clean.
     * @param bool   $force_php Currently unused: the fast path is always tried first.
     * @return string Cleaned string.
     */
    public static function cleanUTF8($str, $force_php = false)
    {
        // Fast path: with the `u` modifier the match fails for anything that
        // is not valid UTF-8, so a successful match means nothing to clean.
        if (preg_match(
            '/^[\x{9}\x{A}\x{D}\x{20}-\x{7E}\x{A0}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]*$/Du',
            $str
        )) {
            return $str;
        }

        // Number of continuation bytes still expected for the current character.
        $mState = 0;
        // Code point accumulated so far for the current character.
        $mUcs4 = 0;
        // Total number of bytes the current sequence announced in its lead byte.
        $mBytes = 1;

        // $out collects accepted characters; $char collects the raw bytes of
        // the character currently being decoded.
        $out = '';
        $char = '';

        $len = strlen($str);
        for ($i = 0; $i < $len; $i++) {
            $in = ord($str[$i]);
            $char .= $str[$i];
            if (0 == $mState) {
                // Expecting the first byte of a character.
                if (0 == (0x80 & ($in))) {
                    // Single-byte (ASCII) character.
                    if (($in <= 31 || $in == 127) &&
                        !($in == 9 || $in == 13 || $in == 10)
                    ) {
                        // Control character other than tab, CR, LF: drop it.
                    } else {
                        $out .= $char;
                    }
                    // Reset the byte buffer for the next character.
                    $char = '';
                    $mBytes = 1;
                } elseif (0xC0 == (0xE0 & ($in))) {
                    // Lead byte of a two-byte sequence.
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x1F) << 6;
                    $mState = 1;
                    $mBytes = 2;
                } elseif (0xE0 == (0xF0 & ($in))) {
                    // Lead byte of a three-byte sequence.
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x0F) << 12;
                    $mState = 2;
                    $mBytes = 3;
                } elseif (0xF0 == (0xF8 & ($in))) {
                    // Lead byte of a four-byte sequence.
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x07) << 18;
                    $mState = 3;
                    $mBytes = 4;
                } elseif (0xF8 == (0xFC & ($in))) {
                    // Lead byte of a five-byte sequence. Such sequences are
                    // always rejected below (4 < $mBytes), but they are still
                    // decoded so the whole sequence is consumed and dropped
                    // as a unit.
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x03) << 24;
                    $mState = 4;
                    $mBytes = 5;
                } elseif (0xFC == (0xFE & ($in))) {
                    // Lead byte of a six-byte sequence; handled like the
                    // five-byte case.
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 1) << 30;
                    $mState = 5;
                    $mBytes = 6;
                } else {
                    // Not a valid first byte (e.g. a stray continuation
                    // byte): drop it and start over.
                    $mState = 0;
                    $mUcs4 = 0;
                    $mBytes = 1;
                    $char = '';
                }
            } else {
                // Inside a multi-byte sequence: a continuation byte is expected.
                if (0x80 == (0xC0 & ($in))) {
                    // Merge the six payload bits into the code point.
                    $shift = ($mState - 1) * 6;
                    $tmp = $in;
                    $tmp = ($tmp & 0x0000003F) << $shift;
                    $mUcs4 |= $tmp;

                    if (0 == --$mState) {
                        // Sequence complete: validate the decoded code point.
                        if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                            ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                            ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                            (4 < $mBytes) ||
                            // Surrogate range U+D800-U+DFFF.
                            (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                            // Beyond the last Unicode code point.
                            ($mUcs4 > 0x10FFFF)
                        ) {
                            // Overlong form, over-long sequence, surrogate or
                            // out-of-range value: drop silently.
                        } elseif (0xFEFF != $mUcs4 &&
                            // U+FEFF is dropped; everything else must be in
                            // the accepted set.
                            (
                                0x9 == $mUcs4 ||
                                0xA == $mUcs4 ||
                                0xD == $mUcs4 ||
                                (0x20 <= $mUcs4 && 0x7E >= $mUcs4) ||
                                // U+007F-U+009F are excluded.
                                (0xA0 <= $mUcs4 && 0xD7FF >= $mUcs4) ||
                                (0xE000 <= $mUcs4 && 0xFFFD >= $mUcs4) ||
                                (0x10000 <= $mUcs4 && 0x10FFFF >= $mUcs4)
                            )
                        ) {
                            $out .= $char;
                        }
                        // Reset for the next character.
                        $mState = 0;
                        $mUcs4 = 0;
                        $mBytes = 1;
                        $char = '';
                    }
                } else {
                    // Sequence ended prematurely. The partial sequence and
                    // the byte that interrupted it are both dropped.
                    $mState = 0;
                    $mUcs4 = 0;
                    $mBytes = 1;
                    $char ='';
                }
            }
        }
        return $out;
    }

    /**
     * Encodes a Unicode code point as a UTF-8 byte string.
     *
     * @param int $code Code point.
     * @return string UTF-8 bytes, or '' when $code is negative, above
     *                U+10FFFF or inside the surrogate range U+D800-U+DFFF.
     */
    public static function unichr($code)
    {
        if ($code > 1114111 || $code < 0 ||
            ($code >= 55296 && $code <= 57343)) {
            // Out of range or a surrogate: nothing sensible to emit.
            return '';
        }

        // $x is the last byte, $y/$z/$w the bytes before it (when present).
        $x = $y = $z = $w = 0;
        if ($code < 128) {
            // Plain ASCII: a single byte.
            $x = $code;
        } else {
            // Lowest six bits always go into a continuation byte.
            $x = ($code & 63) | 128;
            if ($code < 2048) {
                // Two bytes: 110xxxxx 10xxxxxx
                $y = (($code & 2047) >> 6) | 192;
            } else {
                $y = (($code & 4032) >> 6) | 128;
                if ($code < 65536) {
                    // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
                    $z = (($code >> 12) & 15) | 224;
                } else {
                    // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                    $z = (($code >> 12) & 63) | 128;
                    $w = (($code >> 18) & 7) | 240;
                }
            }
        }

        // Emit from the most significant byte down.
        $ret = '';
        if ($w) {
            $ret .= chr($w);
        }
        if ($z) {
            $ret .= chr($z);
        }
        if ($y) {
            $ret .= chr($y);
        }
        $ret .= chr($x);

        return $ret;
    }

    /**
     * Reports whether a usable iconv is present.
     *
     * The answer is computed once and cached for the rest of the request.
     *
     * @return bool True when iconv() exists and the truncation probe did not
     *              classify it as unusable.
     */
    public static function iconvAvailable()
    {
        static $iconv = null;
        if ($iconv === null) {
            $iconv = function_exists('iconv') && self::testIconvTruncateBug() != self::ICONV_UNUSABLE;
        }
        return $iconv;
    }

    /**
     * Converts a string from the configured `Core.Encoding` to UTF-8.
     *
     * @param string $str     Text in the configured encoding.
     * @param object $config  Configuration object; read through get() for
     *                        `Core.Encoding` and `Test.ForceNoIconv`.
     * @param mixed  $context Not used by this method.
     * @return string|null UTF-8 text; '' when iconv rejects the encoding;
     *                     null (after raising an error) when no conversion
     *                     backend is available.
     */
    public static function convertToUTF8($str, $config, $context)
    {
        $encoding = $config->get('Core.Encoding');
        if ($encoding === 'utf-8') {
            return $str;
        }
        static $iconv = null;
        if ($iconv === null) {
            $iconv = self::iconvAvailable();
        }
        if ($iconv && !$config->get('Test.ForceNoIconv')) {
            // unsafeIconv is acceptable here because the target is UTF-8.
            $str = self::unsafeIconv($encoding, 'utf-8//IGNORE', $str);
            if ($str === false) {
                trigger_error('Invalid encoding ' . $encoding, E_USER_ERROR);
                return '';
            }
            // Some encodings give a different meaning to bytes in the ASCII
            // range; map the resulting characters back to plain ASCII. The
            // probe may return false (encoding not writable by iconv), in
            // which case there is nothing to map and strtr() must not be
            // handed a non-array.
            $ascii_fix = self::testEncodingSupportsASCII($encoding);
            if (is_array($ascii_fix)) {
                $str = strtr($str, $ascii_fix);
            }
            return $str;
        } elseif ($encoding === 'iso-8859-1' && function_exists('mb_convert_encoding')) {
            $str = mb_convert_encoding($str, 'UTF-8', 'ISO-8859-1');
            return $str;
        }
        // No backend could handle the request. Check for a missing iconv
        // before running the probe, so the "install iconv" hint is what the
        // user actually sees in that case.
        if (!function_exists('iconv') || self::testIconvTruncateBug() == self::ICONV_OK) {
            trigger_error('Encoding not supported, please install iconv', E_USER_ERROR);
        } else {
            trigger_error(
                'You have a buggy version of iconv, see https://bugs.php.net/bug.php?id=48147 ' .
                'and http://sourceware.org/bugzilla/show_bug.cgi?id=13541',
                E_USER_ERROR
            );
        }
    }

    /**
     * Converts a UTF-8 string to the configured `Core.Encoding`.
     *
     * When `Core.EscapeNonASCIICharacters` is set, non-ASCII characters are
     * first replaced by numeric character references.
     *
     * @param string $str     UTF-8 text.
     * @param object $config  Configuration object; read through get() for
     *                        `Core.Encoding`, `Core.EscapeNonASCIICharacters`
     *                        and `Test.ForceNoIconv`.
     * @param mixed  $context Not used by this method.
     * @return string|false|null Converted text; false when iconv cannot
     *                           perform the conversion; null (after raising
     *                           an error) when no backend is available.
     */
    public static function convertFromUTF8($str, $config, $context)
    {
        $encoding = $config->get('Core.Encoding');
        $escape = $config->get('Core.EscapeNonASCIICharacters');
        if ($escape) {
            $str = self::convertToASCIIDumbLossless($str);
        }
        if ($encoding === 'utf-8') {
            return $str;
        }
        static $iconv = null;
        if ($iconv === null) {
            $iconv = self::iconvAvailable();
        }
        if ($iconv && !$config->get('Test.ForceNoIconv')) {
            $ascii_fix = self::testEncodingSupportsASCII($encoding);
            if (!is_array($ascii_fix)) {
                // The probe returns false when iconv cannot convert to this
                // encoding. Fall through with an empty map so that the
                // failure is reported by the conversion below (false)
                // instead of array_flip()/strtr() choking on a boolean.
                $ascii_fix = array();
            }
            if (!$escape && !empty($ascii_fix)) {
                // Remove characters that would collide with the remapped
                // ASCII bytes of the target encoding.
                $clear_fix = array();
                foreach ($ascii_fix as $utf8 => $native) {
                    $clear_fix[$utf8] = '';
                }
                $str = strtr($str, $clear_fix);
            }
            // Swap ASCII characters for the UTF-8 characters that iconv will
            // turn into the corresponding bytes of the target encoding.
            $str = strtr($str, array_flip($ascii_fix));
            // The chunking wrapper is used because the source is UTF-8.
            $str = self::iconv('utf-8', $encoding . '//IGNORE', $str);
            return $str;
        } elseif ($encoding === 'iso-8859-1' && function_exists('mb_convert_encoding')) {
            $str = mb_convert_encoding($str, 'ISO-8859-1', 'UTF-8');
            return $str;
        }
        trigger_error('Encoding not supported', E_USER_ERROR);
    }

    /**
     * Replaces every non-ASCII character of a UTF-8 string with a decimal
     * numeric character reference (`&#NNN;`), leaving ASCII bytes untouched.
     *
     * The decoder is intentionally simple: it does not validate its input.
     * Every lead byte from 0xF0 upwards is treated as starting a four-byte
     * sequence, and a continuation byte that arrives when none is expected
     * still produces an entity. Feed it well-formed UTF-8 (e.g. the output
     * of cleanUTF8()) for meaningful results.
     *
     * @param string $str UTF-8 text.
     * @return string ASCII-only text with numeric character references.
     */
    public static function convertToASCIIDumbLossless($str)
    {
        $bytesleft = 0;
        $result = '';
        $working = 0;
        $len = strlen($str);
        for ($i = 0; $i < $len; $i++) {
            $bytevalue = ord($str[$i]);
            if ($bytevalue <= 0x7F) {
                // 0xxx xxxx: ASCII, copied verbatim.
                $result .= chr($bytevalue);
                $bytesleft = 0;
            } elseif ($bytevalue <= 0xBF) {
                // 10xx xxxx: continuation byte, append its six payload bits.
                $working = $working << 6;
                $working += ($bytevalue & 0x3F);
                $bytesleft--;
                if ($bytesleft <= 0) {
                    $result .= "&#" . $working . ";";
                }
            } elseif ($bytevalue <= 0xDF) {
                // 110x xxxx: lead byte of a two-byte sequence.
                $working = $bytevalue & 0x1F;
                $bytesleft = 1;
            } elseif ($bytevalue <= 0xEF) {
                // 1110 xxxx: lead byte of a three-byte sequence.
                $working = $bytevalue & 0x0F;
                $bytesleft = 2;
            } else {
                // 1111 0xxx (and anything above): treated as four-byte lead.
                $working = $bytevalue & 0x07;
                $bytesleft = 3;
            }
        }
        return $result;
    }

    /** Probe result: iconv converted the probe string without truncating it. */
    const ICONV_OK = 0;

    /** Probe result: iconv returned fewer bytes than expected (long input truncated). */
    const ICONV_TRUNCATES = 1;

    /** Probe result: iconv failed on the probe string or is not installed. */
    const ICONV_UNUSABLE = 2;

    /**
     * Probes the local iconv for the long-input truncation bug.
     *
     * A two-byte UTF-8 character followed by 9000 ASCII letters is converted
     * to `ascii//IGNORE`; a correct iconv returns exactly the 9000 letters.
     * The result is cached for the rest of the request.
     *
     * @return int|null One of the ICONV_* constants. Null only when the probe
     *                  returned more than 9000 bytes, after raising an error.
     */
    public static function testIconvTruncateBug()
    {
        static $code = null;
        if ($code === null) {
            if (!function_exists('iconv')) {
                // Without the extension the probe would die on an undefined
                // function; report it as unusable instead.
                $code = self::ICONV_UNUSABLE;
                return $code;
            }
            // unsafeIconv is used on purpose: the wrapper depends on this probe.
            $r = self::unsafeIconv('utf-8', 'ascii//IGNORE', "\xCE\xB1" . str_repeat('a', 9000));
            if ($r === false) {
                $code = self::ICONV_UNUSABLE;
            } elseif (($c = strlen($r)) < 9000) {
                $code = self::ICONV_TRUNCATES;
            } elseif ($c > 9000) {
                trigger_error(
                    'Your copy of iconv is extremely buggy. Please notify HTML Purifier maintainers: ' .
                    'include your iconv version as per phpversion()',
                    E_USER_ERROR
                );
            } else {
                $code = self::ICONV_OK;
            }
        }
        return $code;
    }

    /**
     * Determines which printable ASCII characters an encoding does not
     * represent with their usual byte values.
     *
     * Unless $bypass is set, previously computed answers are reused,
     * hard-coded answers are returned for `shift_jis` and `johab`, and every
     * `iso-8859-*` encoding is assumed to be fully ASCII compatible.
     * Otherwise each character from 0x20 to 0x7E is round-tripped through
     * iconv to find the deviations.
     *
     * @param string $encoding Encoding name as understood by iconv.
     * @param bool   $bypass   Skip the cache and the hard-coded answers and
     *                         always run the iconv probe.
     * @return array|false Map from the UTF-8 character the encoding assigns
     *                     to an ASCII byte value to the ASCII character
     *                     itself; empty when nothing deviates; false when
     *                     iconv cannot convert to $encoding at all.
     */
    public static function testEncodingSupportsASCII($encoding, $bypass = false)
    {
        // Cache of probe results, keyed by the encoding name as given.
        static $encodings = array();
        if (!$bypass) {
            if (isset($encodings[$encoding])) {
                return $encodings[$encoding];
            }
            $lenc = strtolower($encoding);
            switch ($lenc) {
                case 'shift_jis':
                    // U+00A5 and U+203E take the places of backslash and tilde.
                    return array("\xC2\xA5" => '\\', "\xE2\x80\xBE" => '~');
                case 'johab':
                    // U+20A9 takes the place of backslash.
                    return array("\xE2\x82\xA9" => '\\');
            }
            if (strpos($lenc, 'iso-8859-') === 0) {
                return array();
            }
        }
        $ret = array();
        // Bail out early when iconv does not know the encoding.
        if (self::unsafeIconv('UTF-8', $encoding, 'a') === false) {
            return false;
        }
        for ($i = 0x20; $i <= 0x7E; $i++) {
            $c = chr($i);
            $r = self::unsafeIconv('UTF-8', "$encoding//IGNORE", $c);
            if ($r === '' ||
                // The byte survived, but decoding it back yields a different
                // character: the encoding reuses this byte value.
                ($r === $c && self::unsafeIconv($encoding, 'UTF-8//IGNORE', $r) !== $c)
            ) {
                // Record which UTF-8 character the encoding stores at this
                // ASCII byte value.
                $ret[self::unsafeIconv($encoding, 'UTF-8//IGNORE', $c)] = $c;
            }
        }
        $encodings[$encoding] = $ret;
        return $ret;
    }
}
|
616: |
|
617: |
|
618: | |