| 1: | <?php |
| 2: | |
/**
 * Validates a font family list according to CSS spec.
 *
 * The input is split on commas; each entry is checked independently and
 * entries that fail validation are dropped rather than failing the whole
 * list. Names that are not purely alphanumeric are re-emitted wrapped in
 * single quotes.
 */
class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
{

    /**
     * Every byte that may appear in a non-generic font name: underscore,
     * dash, space, ASCII letters and digits, and the bytes 0x80-0xFF.
     * Used with strspn() in validate().
     * @type string
     */
    protected $mask = null;

    /**
     * Builds the character mask used by validate().
     */
    public function __construct()
    {
        // The ranges are generated with range() rather than a
        // "for ($c = 'a'; $c <= 'z'; $c++)" loop: incrementing the string
        // 'z' yields 'aa', which still compares <= 'z', so such a loop
        // keeps running up to 'yz' and appends hundreds of redundant
        // multi-character strings to the mask.
        $this->mask = '_- '
            . implode('', range('a', 'z'))
            . implode('', range('A', 'Z'))
            . implode('', range(0, 9))
            // special bytes used by UTF-8.
            // We don't bother excluding invalid bytes in this range,
            // because our restriction of well-formed UTF-8 will
            // prevent these from ever occurring.
            . implode('', array_map('chr', range(0x80, 0xFF)));

        /*
            PHP's internal strcspn implementation is
            O(length of string * length of mask), making it inefficient
            for large masks.  However, it's still faster than
            preg_match 8)
              for (p = s1;;) {
                spanp = s2;
                do {
                  if (*spanp == c || p == s1_end) {
                    return p - s1;
                  }
                } while (spanp++ < (s2_end - 1));
                c = *++p;
              }
         */
        // possible optimization: invert the mask.
    }

    /**
     * Filters a comma-separated font-family list.
     *
     * @param string $string Raw font-family value
     * @param HTMLPurifier_Config $config Read for the 'CSS.AllowedFonts'
     *        directive; null there means no whitelist is applied
     * @param HTMLPurifier_Context $context Not used by this method
     * @return bool|string The accepted names joined by ', ', or false
     *         when no name was accepted
     */
    public function validate($string, $config, $context)
    {
        static $generic_names = array(
            'serif' => true,
            'sans-serif' => true,
            'monospace' => true,
            'fantasy' => true,
            'cursive' => true
        );
        $allowed_fonts = $config->get('CSS.AllowedFonts');

        // Accepted names, already in their output form (bare or quoted).
        $accepted = array();

        // assume that no font names contain commas in them
        foreach (explode(',', $string) as $font) {
            $font = trim($font);
            if ($font === '') {
                continue;
            }
            // match a generic name
            if (isset($generic_names[$font])) {
                if ($allowed_fonts === null || isset($allowed_fonts[$font])) {
                    $accepted[] = $font;
                }
                continue;
            }
            // match a quoted name
            if ($font[0] === '"' || $font[0] === "'") {
                $length = strlen($font);
                if ($length <= 2) {
                    // a lone quote character or an empty quoted string
                    continue;
                }
                $quote = $font[0];
                if ($font[$length - 1] !== $quote) {
                    // closing quote is missing or of the other kind
                    continue;
                }
                $font = substr($font, 1, $length - 2);
            }

            $font = $this->expandCSSEscape($font);

            // $font is a pure representation of the font name

            if ($allowed_fonts !== null && !isset($allowed_fonts[$font])) {
                continue;
            }

            if (ctype_alnum($font) && $font !== '') {
                // very simple font, allow it in unharmed
                $accepted[] = $font;
                continue;
            }

            // bugger out on whitespace.  form feed (0C) really
            // shouldn't show up regardless
            $font = str_replace(array("\n", "\t", "\r", "\x0C"), ' ', $font);

            // Here, there are various classes of characters which need
            // to be treated differently:
            //  - Alphanumeric characters are essentially safe.  We
            //    handled these above.
            //  - Spaces require quoting, though most parsers will do
            //    the right thing if there aren't any characters that
            //    can be misinterpreted
            //  - Dashes rarely occur, but they are fairly unproblematic
            //    for parsing/rendering purposes.
            //  The above characters cover the majority of Western font
            //  names.
            //  - Arbitrary Unicode characters not in ASCII.  Because
            //    most parsers give little thought to Unicode, treatment
            //    of these codepoints is basically uniform, even for
            //    punctuation-like codepoints.  These characters can
            //    show up in non-Western pages and are supported by most
            //    major browsers, for example: "MS 明朝" is a
            //    legitimate font-name
            //    <http://ja.wikipedia.org/wiki/MS_明朝>.  See
            //    the CSS3 spec for more examples:
            //    <http://www.w3.org/TR/2011/WD-css3-fonts-20110324/localizedfamilynames.png>
            //    You can see live samples of these on the Internet:
            //    <http://www.google.co.jp/search?q=font-family+MS+明朝|ゴシック>
            //    However, most of these fonts have ASCII equivalents:
            //    for example, 'MS Mincho', and it's considered
            //    professional to use ASCII font names instead of
            //    Unicode font names.  Thanks Takeshi Terada for
            //    providing this information.
            //  The following characters, to my knowledge, have not been
            //  used to name font names.
            //  - Single quote.  While theoretically you might find a
            //    font name that has a single quote in its name (serving
            //    as an apostrophe, e.g. Dave's Scribble), I haven't
            //    been able to find any actual examples of this.
            //    Internet Explorer's cssText translation (which I
            //    believe is invoked by innerHTML) normalizes any
            //    quoting to single quotes, and fails to escape single
            //    quotes.  (Note that this is not IE's behavior for all
            //    CSS properties, just some sort of special casing for
            //    font-family).  So a single quote *cannot* be used
            //    safely in the font-family context if there will be an
            //    innerHTML/cssText translation.  Note that Firefox 3.x
            //    does this too.
            //  - Double quote.  In IE, these get normalized to
            //    single-quotes, no matter what the encoding.  (Fun
            //    fact, in IE8, the 'content' CSS property gained
            //    support, where they special cased to preserve encoded
            //    double quotes, but still translate unadorned double
            //    quotes into single quotes.)  So, because their
            //    fixpoint behavior is identical to single quotes, they
            //    cannot be allowed either.  Firefox 3.x displays
            //    single-quote style behavior.
            //  - Backslashes are reduced by one (so \\ -> \) every
            //    iteration, so they cannot be used safely.  This shows
            //    up in IE7, IE8 and FF3
            //  - Semicolons, commas and backticks are handled properly.
            //  - The rest of the ASCII punctuation is handled properly.
            // We haven't checked what browsers do to unadorned
            // versions, but this is not important as long as the
            // browser doesn't /remove/ surrounding quotes (as IE does
            // for HTML).
            //
            // With these results in hand, we conclude that there are
            // various levels of safety:
            //  - Paranoid: alphanumeric, spaces and dashes(?)
            //  - International: Paranoid + non-ASCII Unicode
            //  - Edgy: Everything except quotes, backslashes
            //  - NoJS: Standards compliance, e.g. sod IE. Note that
            //    with some judicious character escaping (since certain
            //    types of escaping doesn't work) this is theoretically
            //    OK as long as innerHTML/cssText is not called.
            // We believe that international is a reasonable default
            // (that we will implement now), and once we do more
            // extensive research, we may feel comfortable with dropping
            // it down to edgy.

            // International (plus underscores): alphanumeric, spaces,
            // dashes, underscores and Unicode.  Use of str(c)spn assumes
            // that the string was already well formed Unicode (which of
            // course it is).
            if (strspn($font, $this->mask) !== strlen($font)) {
                // at least one byte falls outside the mask
                continue;
            }

            // Historical:
            // In the absence of innerHTML/cssText, these ugly
            // transforms don't pose a security risk (as \\ and \"
            // might--these escapes are not supported by most browsers).
            // We could try to be clever and use single-quote wrapping
            // when there is a double quote present, but I have chosen
            // not to implement that.  (NOTE: you can reduce the amount
            // of escapes by one depending on what quoting style you use)
            // $font = str_replace('\\', '\\5C ', $font);
            // $font = str_replace('"',  '\\22 ', $font);
            // $font = str_replace("'",  '\\27 ', $font);

            // font possibly with spaces, requires quoting
            $accepted[] = "'$font'";
        }

        if ($accepted === array()) {
            return false;
        }
        return implode(', ', $accepted);
    }

}
| 218: | |
| 219: | // vim: et sw=4 sts=4 |
| 220: |