1: <?php
2:
3: /**
4: * A UTF-8 specific character encoder that handles cleaning and transforming.
5: * @note All functions in this class should be static.
6: */
class HTMLPurifier_Encoder
{

    /**
     * Constructor throws fatal error if you attempt to instantiate class
     */
    private function __construct()
    {
        trigger_error('Cannot instantiate encoder, call methods statically', E_USER_ERROR);
    }

    /**
     * Error-handler that mutes errors, alternative to shut-up operator.
     * Intentionally empty: installing it swallows whatever is raised
     * while it is the active handler.
     */
    public static function muteErrorHandler()
    {
    }

    /**
     * iconv wrapper which mutes errors, but doesn't work around bugs.
     * @param string $in Input encoding
     * @param string $out Output encoding
     * @param string $text The text to convert
     * @return string|false Converted text, or false if iconv() failed
     */
    public static function unsafeIconv($in, $out, $text)
    {
        set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
        $r = iconv($in, $out, $text);
        restore_error_handler();
        return $r;
    }

    /**
     * iconv wrapper which mutes errors and works around bugs.
     * @param string $in Input encoding
     * @param string $out Output encoding
     * @param string $text The text to convert
     * @param int $max_chunk_size Largest number of bytes handed to iconv
     *        at once when the truncation workaround is active (minimum 4)
     * @return string|false Converted text, or false when iconv is unusable,
     *         when the workaround is needed but $in is not 'utf-8', when
     *         $max_chunk_size is too small, or when no character boundary
     *         could be found near a chunk edge.
     */
    public static function iconv($in, $out, $text, $max_chunk_size = 8000)
    {
        $code = self::testIconvTruncateBug();
        if ($code == self::ICONV_OK) {
            return self::unsafeIconv($in, $out, $text);
        }
        // we can only work around the truncation bug, and only if the
        // input character set is utf-8
        if ($code != self::ICONV_TRUNCATES || $in != 'utf-8') {
            return false;
        }
        if ($max_chunk_size < 4) {
            trigger_error('max_chunk_size is too small', E_USER_WARNING);
            return false;
        }
        // split into chunks of at most $max_chunk_size bytes, but be
        // careful to handle multibyte boundaries properly
        if (($c = strlen($text)) <= $max_chunk_size) {
            return self::unsafeIconv($in, $out, $text);
        }
        $r = '';
        $i = 0;
        while (true) {
            if ($i + $max_chunk_size >= $c) {
                // the remainder fits into a single chunk
                $r .= self::unsafeIconv($in, $out, substr($text, $i));
                break;
            }
            // wibble the boundary: a chunk may end just before any byte
            // that is not a continuation byte (10xxxxxx). A UTF-8
            // character has at most three continuation bytes, so we look
            // at the byte following the chunk and up to three before it.
            $chunk_size = null;
            for ($back = 0; $back < 4; $back++) {
                if (0x80 != (0xC0 & ord($text[$i + $max_chunk_size - $back]))) {
                    $chunk_size = $max_chunk_size - $back;
                    break;
                }
            }
            if ($chunk_size === null) {
                return false; // rather confusing UTF-8...
            }
            $chunk = substr($text, $i, $chunk_size); // substr doesn't mind overlong lengths
            $r .= self::unsafeIconv($in, $out, $chunk);
            $i += $chunk_size;
        }
        return $r;
    }

    /**
     * Cleans a UTF-8 string for well-formedness and SGML validity
     *
     * It will parse according to UTF-8 and return a valid UTF8 string, with
     * non-SGML codepoints excluded.
     *
     * Specifically, it will permit:
     * \x{9}\x{A}\x{D}\x{20}-\x{7E}\x{A0}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}
     * Source: https://www.w3.org/TR/REC-xml/#NT-Char
     * Arguably this function should be modernized to the HTML5 set
     * of allowed characters:
     * https://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
     * which simultaneously expand and restrict the set of allowed characters.
     *
     * @param string $str The string to clean
     * @param bool $force_php Not consulted by this implementation
     * @return string
     *
     * @note Just for reference, the non-SGML code points are 0 to 31 and
     *       127 to 159, inclusive.  However, we allow code points 9, 10
     *       and 13, which are the tab, line feed and carriage return
     *       respectively. 128 and above the code points map to multibyte
     *       UTF-8 representations.
     *
     * @note Fallback code adapted from utf8ToUnicode by Henri Sivonen and
     *       hsivonen@iki.fi at <http://iki.fi/hsivonen/php-utf8/> under the
     *       LGPL license.  Notes on what changed are inside, but in general,
     *       the original code transformed UTF-8 text into an array of integer
     *       Unicode codepoints. Understandably, transforming that back to
     *       a string would be somewhat expensive, so the function was modded to
     *       directly operate on the string.  However, this discourages code
     *       reuse, and the logic enumerated here would be useful for any
     *       function that needs to be able to understand UTF-8 characters.
     *       As of right now, only smart lossless character encoding converters
     *       would need that, and I'm probably not going to implement them.
     *
     * @note In the fallback parser, a byte that interrupts an unfinished
     *       multi-octet sequence is discarded together with that sequence.
     */
    public static function cleanUTF8($str, $force_php = false)
    {
        // UTF-8 validity is checked since PHP 4.3.5
        // This is an optimization: if the string is already valid UTF-8, no
        // need to do PHP stuff. 99% of the time, this will be the case.
        if (preg_match(
            '/^[\x{9}\x{A}\x{D}\x{20}-\x{7E}\x{A0}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]*$/Du',
            $str
        )) {
            return $str;
        }

        $mState = 0; // cached expected number of octets after the current octet
                     // until the beginning of the next UTF8 character sequence
        $mUcs4  = 0; // cached Unicode character
        $mBytes = 1; // cached expected number of octets in the current sequence

        // original code involved an $out that was an array of Unicode
        // codepoints.  Instead of having to convert back into UTF-8, we've
        // decided to directly append valid UTF-8 characters onto a string
        // $out once they're done.  $char accumulates raw bytes, while $mUcs4
        // turns into the Unicode code point, so there's some redundancy.

        $out = '';
        $char = '';

        $len = strlen($str);
        for ($i = 0; $i < $len; $i++) {
            $in = ord($str[$i]);
            $char .= $str[$i]; // append byte to char
            if (0 == $mState) {
                // When mState is zero we expect either a US-ASCII character
                // or a multi-octet sequence.
                if (0 == (0x80 & ($in))) {
                    // US-ASCII, pass straight through.
                    if (($in <= 31 || $in == 127) &&
                        !($in == 9 || $in == 13 || $in == 10) // save \r\t\n
                    ) {
                        // control characters, remove
                    } else {
                        $out .= $char;
                    }
                    // reset
                    $char = '';
                    $mBytes = 1;
                } elseif (0xC0 == (0xE0 & ($in))) {
                    // First octet of 2 octet sequence
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x1F) << 6;
                    $mState = 1;
                    $mBytes = 2;
                } elseif (0xE0 == (0xF0 & ($in))) {
                    // First octet of 3 octet sequence
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x0F) << 12;
                    $mState = 2;
                    $mBytes = 3;
                } elseif (0xF0 == (0xF8 & ($in))) {
                    // First octet of 4 octet sequence
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x07) << 18;
                    $mState = 3;
                    $mBytes = 4;
                } elseif (0xF8 == (0xFC & ($in))) {
                    // First octet of 5 octet sequence.
                    //
                    // This is illegal because the encoded codepoint must be
                    // either:
                    // (a) not the shortest form or
                    // (b) outside the Unicode range of 0-0x10FFFF.
                    // Rather than trying to resynchronize, we will carry on
                    // until the end of the sequence and let the later error
                    // handling code catch it.
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 0x03) << 24;
                    $mState = 4;
                    $mBytes = 5;
                } elseif (0xFC == (0xFE & ($in))) {
                    // First octet of 6 octet sequence, see comments for 5
                    // octet sequence.
                    $mUcs4 = ($in);
                    $mUcs4 = ($mUcs4 & 1) << 30;
                    $mState = 5;
                    $mBytes = 6;
                } else {
                    // Current octet is neither in the US-ASCII range nor a
                    // legal first octet of a multi-octet sequence.
                    $mState = 0;
                    $mUcs4  = 0;
                    $mBytes = 1;
                    $char = '';
                }
            } else {
                // When mState is non-zero, we expect a continuation of the
                // multi-octet sequence
                if (0x80 == (0xC0 & ($in))) {
                    // Legal continuation.
                    $shift = ($mState - 1) * 6;
                    $tmp = $in;
                    $tmp = ($tmp & 0x0000003F) << $shift;
                    $mUcs4 |= $tmp;

                    if (0 == --$mState) {
                        // End of the multi-octet sequence. mUcs4 now contains
                        // the final Unicode codepoint to be output

                        // Check for illegal sequences and codepoints.
                        $illegal =
                            // From Unicode 3.1, non-shortest form is illegal
                            ((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                            ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                            ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                            (4 < $mBytes) ||
                            // From Unicode 3.2, surrogate characters = illegal
                            (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                            // Codepoints outside the Unicode range are illegal
                            ($mUcs4 > 0x10FFFF);

                        if (!$illegal &&
                            0xFEFF != $mUcs4 && // omit BOM
                            // check for valid Char unicode codepoints
                            (
                                0x9 == $mUcs4 ||
                                0xA == $mUcs4 ||
                                0xD == $mUcs4 ||
                                (0x20 <= $mUcs4 && 0x7E >= $mUcs4) ||
                                // 7F-9F is not strictly prohibited by XML,
                                // but it is non-SGML, and thus we don't allow it
                                (0xA0 <= $mUcs4 && 0xD7FF >= $mUcs4) ||
                                (0xE000 <= $mUcs4 && 0xFFFD >= $mUcs4) ||
                                (0x10000 <= $mUcs4 && 0x10FFFF >= $mUcs4)
                            )
                        ) {
                            $out .= $char;
                        }
                        // initialize UTF8 cache (reset)
                        $mState = 0;
                        $mUcs4  = 0;
                        $mBytes = 1;
                        $char = '';
                    }
                } else {
                    // ((0xC0 & (*in) != 0x80) && (mState != 0))
                    // Incomplete multi-octet sequence.
                    // used to result in complete fail, but we'll reset
                    $mState = 0;
                    $mUcs4  = 0;
                    $mBytes = 1;
                    $char ='';
                }
            }
        }
        return $out;
    }

    // +----------+----------+----------+----------+
    // | 33222222 | 22221111 | 111111   |          |
    // | 10987654 | 32109876 | 54321098 | 76543210 | bit
    // +----------+----------+----------+----------+
    // |          |          |          | 0xxxxxxx | 1 byte 0x00000000..0x0000007F
    // |          |          | 110yyyyy | 10xxxxxx | 2 byte 0x00000080..0x000007FF
    // |          | 1110zzzz | 10yyyyyy | 10xxxxxx | 3 byte 0x00000800..0x0000FFFF
    // | 11110www | 10wwzzzz | 10yyyyyy | 10xxxxxx | 4 byte 0x00010000..0x0010FFFF
    // +----------+----------+----------+----------+
    // | 00000000 | 00011111 | 11111111 | 11111111 | Theoretical upper limit of legal scalars: 2097151 (0x001FFFFF)
    // | 00000000 | 00010000 | 11111111 | 11111111 | Defined upper limit of legal scalar codes
    // +----------+----------+----------+----------+

    /**
     * Translates a Unicode codepoint into its corresponding UTF-8 character.
     * @param int $code Unicode codepoint
     * @return string UTF-8 encoding of the codepoint, or an empty string
     *         if it is negative, above 0x10FFFF or a surrogate
     *         (0xD800-0xDFFF).
     * @note Based on Feyd's function at
     *       <http://forums.devnetwork.net/viewtopic.php?p=191404#191404>,
     *       which is in public domain.
     * @note While we're going to do code point parsing anyway, a good
     *       optimization would be to refuse to translate code points that
     *       are non-SGML characters.  However, this could lead to duplication.
     * @note This is very similar to the unichr function in
     *       maintenance/generate-entity-file.php (although this is superior,
     *       due to its sanity checks).
     */
    public static function unichr($code)
    {
        if ($code > 1114111 or $code < 0 or
          ($code >= 55296 and $code <= 57343) ) {
            // bits are set outside the "valid" range as defined
            // by UNICODE 4.1.0
            return '';
        }

        // $x is the trailing (or only) byte; $y, $z, $w are the bytes
        // preceding it, left at 0 when the encoding is shorter.
        $x = $y = $z = $w = 0;
        if ($code < 128) {
            // regular ASCII character
            $x = $code;
        } else {
            // set up bits for UTF-8
            $x = ($code & 63) | 128;
            if ($code < 2048) {
                $y = (($code & 2047) >> 6) | 192;
            } else {
                $y = (($code & 4032) >> 6) | 128;
                if ($code < 65536) {
                    $z = (($code >> 12) & 15) | 224;
                } else {
                    $z = (($code >> 12) & 63) | 128;
                    $w = (($code >> 18) & 7)  | 240;
                }
            }
        }
        // set up the actual character; lead/continuation bytes always have
        // their high bit set, so a zero value means "byte not used"
        $ret = '';
        if ($w) {
            $ret .= chr($w);
        }
        if ($z) {
            $ret .= chr($z);
        }
        if ($y) {
            $ret .= chr($y);
        }
        $ret .= chr($x);

        return $ret;
    }

    /**
     * Reports whether iconv exists and is not classified as unusable.
     * The answer is computed once and cached for the rest of the request.
     * @return bool
     */
    public static function iconvAvailable()
    {
        static $iconv = null;
        if ($iconv === null) {
            $iconv = function_exists('iconv') && self::testIconvTruncateBug() != self::ICONV_UNUSABLE;
        }
        return $iconv;
    }

    /**
     * Convert a string to UTF-8 based on configuration.
     * @param string $str The string to convert
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return string
     */
    public static function convertToUTF8($str, $config, $context)
    {
        $encoding = $config->get('Core.Encoding');
        if ($encoding === 'utf-8') {
            return $str;
        }
        static $iconv = null;
        if ($iconv === null) {
            $iconv = self::iconvAvailable();
        }
        if ($iconv && !$config->get('Test.ForceNoIconv')) {
            // unaffected by bugs, since UTF-8 support all characters
            $str = self::unsafeIconv($encoding, 'utf-8//IGNORE', $str);
            if ($str === false) {
                // $encoding is not a valid encoding
                trigger_error('Invalid encoding ' . $encoding, E_USER_ERROR);
                return '';
            }
            // If the string is bjorked by Shift_JIS or a similar encoding
            // that doesn't support all of ASCII, convert the naughty
            // characters to their true byte-wise ASCII/UTF-8 equivalents.
            // testEncodingSupportsASCII() may return false (conversion to
            // $encoding unsupported); strtr() must only see a real map.
            $ascii_fix = self::testEncodingSupportsASCII($encoding);
            if (!empty($ascii_fix)) {
                $str = strtr($str, $ascii_fix);
            }
            return $str;
        } elseif ($encoding === 'iso-8859-1' && function_exists('mb_convert_encoding')) {
            $str = mb_convert_encoding($str, 'UTF-8', 'ISO-8859-1');
            return $str;
        }
        // Only probe iconv when it exists: the probe calls iconv() directly
        // and would otherwise die with "undefined function" instead of
        // producing the intended message.
        if (!function_exists('iconv') || self::testIconvTruncateBug() == self::ICONV_OK) {
            trigger_error('Encoding not supported, please install iconv', E_USER_ERROR);
        } else {
            trigger_error(
                'You have a buggy version of iconv, see https://bugs.php.net/bug.php?id=48147 ' .
                'and http://sourceware.org/bugzilla/show_bug.cgi?id=13541',
                E_USER_ERROR
            );
        }
    }

    /**
     * Converts a string from UTF-8 based on configuration.
     * @param string $str The string to convert
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return string
     * @note Currently, this is a lossy conversion, with unexpressable
     *       characters being omitted.
     */
    public static function convertFromUTF8($str, $config, $context)
    {
        $encoding = $config->get('Core.Encoding');
        if ($escape = $config->get('Core.EscapeNonASCIICharacters')) {
            $str = self::convertToASCIIDumbLossless($str);
        }
        if ($encoding === 'utf-8') {
            return $str;
        }
        static $iconv = null;
        if ($iconv === null) {
            $iconv = self::iconvAvailable();
        }
        if ($iconv && !$config->get('Test.ForceNoIconv')) {
            // Undo our previous fix in convertToUTF8, otherwise iconv will barf
            $ascii_fix = self::testEncodingSupportsASCII($encoding);
            if ($ascii_fix === false) {
                // iconv cannot convert to $encoding at all; report it the
                // same way convertToUTF8() does rather than feeding false
                // into array_flip()/strtr().
                trigger_error('Invalid encoding ' . $encoding, E_USER_ERROR);
                return '';
            }
            if (!empty($ascii_fix)) {
                if (!$escape) {
                    // drop the UTF-8 characters that are the keys of the map
                    // before the reverse substitution below introduces them
                    $clear_fix = array();
                    foreach ($ascii_fix as $utf8 => $native) {
                        $clear_fix[$utf8] = '';
                    }
                    $str = strtr($str, $clear_fix);
                }
                $str = strtr($str, array_flip($ascii_fix));
            }
            // Normal stuff
            $str = self::iconv('utf-8', $encoding . '//IGNORE', $str);
            return $str;
        } elseif ($encoding === 'iso-8859-1' && function_exists('mb_convert_encoding')) {
            $str = mb_convert_encoding($str, 'ISO-8859-1', 'UTF-8');
            return $str;
        }
        trigger_error('Encoding not supported', E_USER_ERROR);
        // You might be tempted to assume that the ASCII representation
        // might be OK, however, this is *not* universally true over all
        // encodings.  So we take the conservative route here, rather
        // than forcibly turn on %Core.EscapeNonASCIICharacters
    }

    /**
     * Lossless (character-wise) conversion of HTML to ASCII
     * @param string $str UTF-8 string to be converted to ASCII
     * @return string ASCII encoded string with non-ASCII character entity-ized
     * @warning Adapted from MediaWiki, claiming fair use: this is a common
     *       algorithm. If you disagree with this license fudgery,
     *       implement it yourself.
     * @note Uses decimal numeric entities since they are best supported.
     * @note This is a DUMB function: it has no concept of keeping
     *       character entities that the projected character encoding
     *       can allow. We could possibly implement a smart version
     *       but that would require it to also know which Unicode
     *       codepoints the charset supported (not an easy task).
     * @note Similar to cleanUTF8(), but it assumes that $str is
     *       well-formed UTF-8
     */
    public static function convertToASCIIDumbLossless($str)
    {
        $bytesleft = 0; // continuation bytes still expected
        $result = '';
        $working = 0;   // codepoint being assembled
        $len = strlen($str);
        for ($i = 0; $i < $len; $i++) {
            $bytevalue = ord($str[$i]);
            if ($bytevalue <= 0x7F) { //0xxx xxxx
                $result .= chr($bytevalue);
                $bytesleft = 0;
            } elseif ($bytevalue <= 0xBF) { //10xx xxxx
                $working = $working << 6;
                $working += ($bytevalue & 0x3F);
                $bytesleft--;
                if ($bytesleft <= 0) {
                    $result .= "&#" . $working . ";";
                }
            } elseif ($bytevalue <= 0xDF) { //110x xxxx
                $working = $bytevalue & 0x1F;
                $bytesleft = 1;
            } elseif ($bytevalue <= 0xEF) { //1110 xxxx
                $working = $bytevalue & 0x0F;
                $bytesleft = 2;
            } else { //1111 0xxx
                $working = $bytevalue & 0x07;
                $bytesleft = 3;
            }
        }
        return $result;
    }

    /** No bugs detected in iconv. */
    const ICONV_OK = 0;

    /** Iconv truncates output if converting from UTF-8 to another
     *  character set with //IGNORE, and a non-encodable character is found */
    const ICONV_TRUNCATES = 1;

    /** Iconv does not support //IGNORE, making it unusable for
     *  transcoding purposes */
    const ICONV_UNUSABLE = 2;

    /**
     * glibc iconv has a known bug where it doesn't handle the magic
     * //IGNORE stanza correctly.  In particular, rather than ignore
     * characters, it will return an EILSEQ after consuming some number
     * of characters, and expect you to restart iconv as if it were
     * an E2BIG.  Old versions of PHP did not respect the errno, and
     * returned the fragment, so as a result you would see iconv
     * mysteriously truncating output. We can work around this by
     * manually chopping our input into segments of about 8000
     * characters, as long as PHP ignores the error code.  If PHP starts
     * paying attention to the error code, iconv becomes unusable.
     *
     * @return int Error code indicating severity of bug. ICONV_UNUSABLE is
     *         also reported when the iconv() function does not exist.
     */
    public static function testIconvTruncateBug()
    {
        static $code = null;
        if ($code === null) {
            if (!function_exists('iconv')) {
                // nothing to probe; calling iconv() would be a fatal error
                $code = self::ICONV_UNUSABLE;
                return $code;
            }
            // better not use iconv, otherwise infinite loop!
            $r = self::unsafeIconv('utf-8', 'ascii//IGNORE', "\xCE\xB1" . str_repeat('a', 9000));
            if ($r === false) {
                $code = self::ICONV_UNUSABLE;
            } elseif (($c = strlen($r)) < 9000) {
                $code = self::ICONV_TRUNCATES;
            } elseif ($c > 9000) {
                trigger_error(
                    'Your copy of iconv is extremely buggy. Please notify HTML Purifier maintainers: ' .
                    'include your iconv version as per phpversion()',
                    E_USER_ERROR
                );
            } else {
                $code = self::ICONV_OK;
            }
        }
        return $code;
    }

    /**
     * This expensive function tests whether or not a given character
     * encoding supports ASCII. 7/8-bit encodings like Shift_JIS will
     * fail this test, and require special processing. Variable width
     * encodings shouldn't ever fail.
     *
     * @param string $encoding Encoding name to test, as per iconv format
     * @param bool $bypass Whether or not to bypass the precompiled arrays.
     * @return array|false Array of UTF-8 characters to their corresponding
     *         ASCII, which can be used to "undo" any overzealous iconv
     *         action; false if iconv cannot convert UTF-8 to $encoding.
     */
    public static function testEncodingSupportsASCII($encoding, $bypass = false)
    {
        // All calls to iconv here are unsafe, proof by case analysis:
        // If ICONV_OK, no difference.
        // If ICONV_TRUNCATE, all calls involve one character inputs,
        // so bug is not triggered.
        // If ICONV_UNUSABLE, this call is irrelevant
        static $encodings = array();
        if (!$bypass) {
            if (isset($encodings[$encoding])) {
                return $encodings[$encoding];
            }
            $lenc = strtolower($encoding);
            switch ($lenc) {
                case 'shift_jis':
                    return array("\xC2\xA5" => '\\', "\xE2\x80\xBE" => '~');
                case 'johab':
                    return array("\xE2\x82\xA9" => '\\');
            }
            if (strpos($lenc, 'iso-8859-') === 0) {
                return array();
            }
        }
        $ret = array();
        if (self::unsafeIconv('UTF-8', $encoding, 'a') === false) {
            return false;
        }
        for ($i = 0x20; $i <= 0x7E; $i++) { // all printable ASCII chars
            $c = chr($i); // UTF-8 char
            $r = self::unsafeIconv('UTF-8', "$encoding//IGNORE", $c); // initial conversion
            if ($r === '' ||
                // This line is needed for iconv implementations that do not
                // omit characters that do not exist in the target character set
                ($r === $c && self::unsafeIconv($encoding, 'UTF-8//IGNORE', $r) !== $c)
            ) {
                // Reverse engineer: what's the UTF-8 equiv of this byte
                // sequence? This assumes that there's no variable width
                // encoding that doesn't support ASCII.
                $ret[self::unsafeIconv($encoding, 'UTF-8//IGNORE', $c)] = $c;
            }
        }
        $encodings[$encoding] = $ret;
        return $ret;
    }
}
616:
617: // vim: et sw=4 sts=4
618: