| 1: | <?php
|
| 2: |
|
| 3: |
|
| 4: |
|
| 5: |
|
| 6: |
|
| 7: | |
| 8: | |
| 9: |
|
/**
 * Replaces character entity references (hexadecimal, decimal and named)
 * found in a string with their substitutions.
 *
 * Named entities are resolved through the `table` member of
 * HTMLPurifier_EntityLookup; numeric entities are converted with
 * HTMLPurifier_Encoder::unichr().
 */
class HTMLPurifier_EntityParser
{
    /**
     * Entity lookup object, created on first use by a callback.
     * @type HTMLPurifier_EntityLookup
     */
    protected $_entity_lookup;

    /**
     * Regex used by substituteTextEntities().
     * @type string
     */
    protected $_textEntitiesRegex;

    /**
     * Regex used by substituteAttrEntities().
     * @type string
     */
    protected $_attrEntitiesRegex;

    /**
     * Regex matching a semicolon-optional entity name right after an
     * ampersand; used to retry an unknown named entity by its prefix.
     * @type string
     */
    protected $_semiOptionalPrefixRegex;

    /**
     * Regex shared by substituteNonSpecialEntities() and
     * substituteSpecialEntities(). Group 1 is the hex digits, group 2 the
     * decimal digits (leading zeros dropped), group 3 the entity name.
     * @type string
     */
    protected $_substituteEntitiesRegex =
        '/&(?:[#]x([a-fA-F0-9]+)|[#]0*(\d+)|([A-Za-z_:][A-Za-z0-9.\-_:]*));?/';

    /**
     * Code points treated as "special", mapped to their literal character.
     * @type array
     */
    protected $_special_dec2str = array(
        34 => '"',
        38 => '&',
        39 => "'",
        60 => '<',
        62 => '>'
    );

    /**
     * Entity names treated as "special", mapped to their code point.
     * @type array
     */
    protected $_special_ent2dec = array(
        'quot' => 34,
        'amp' => 38,
        'lt' => 60,
        'gt' => 62
    );

    /**
     * Builds the text, attribute and prefix regexes.
     *
     * All three capture into the same group positions so that
     * entityCallback() can serve every one of them:
     *   1 = hex digits, 2 = decimal digits,
     *   3 = name followed by a mandatory semicolon,
     *   4 = name from the semicolon-optional list.
     */
    public function __construct()
    {
        // Entity names that are matched even without a trailing semicolon.
        $semi_optional = "quot|QUOT|lt|LT|gt|GT|amp|AMP|AElig|Aacute|Acirc|Agrave|Aring|Atilde|Auml|COPY|Ccedil|ETH|Eacute|Ecirc|Egrave|Euml|Iacute|Icirc|Igrave|Iuml|Ntilde|Oacute|Ocirc|Ograve|Oslash|Otilde|Ouml|REG|THORN|Uacute|Ucirc|Ugrave|Uuml|Yacute|aacute|acirc|acute|aelig|agrave|aring|atilde|auml|brvbar|ccedil|cedil|cent|copy|curren|deg|divide|eacute|ecirc|egrave|eth|euml|frac12|frac14|frac34|iacute|icirc|iexcl|igrave|iquest|iuml|laquo|macr|micro|middot|nbsp|not|ntilde|oacute|ocirc|ograve|ordf|ordm|oslash|otilde|ouml|para|plusmn|pound|raquo|reg|sect|shy|sup1|sup2|sup3|szlig|thorn|times|uacute|ucirc|ugrave|uml|uuml|yacute|yen|yuml";

        // Three empty groups pad the capture numbering so the name lands
        // in group 4, exactly as in the two full regexes below.
        $this->_semiOptionalPrefixRegex = "/&()()()($semi_optional)/";

        // Alternatives common to text and attribute context. The
        // semicolon-terminated name comes before the semicolon-optional
        // one so that it is tried first.
        $shared =
            '[#]x([a-fA-F0-9]+);?|' .              // hexadecimal
            '[#]0*(\d+);?|' .                      // decimal
            '([A-Za-z_:][A-Za-z0-9.\-_:]*);|';     // name, semicolon required

        $this->_textEntitiesRegex =
            '/&(?:' . $shared . "($semi_optional)" . ')/';

        // In attributes the semicolon-less form is rejected when it is
        // followed by '=', ';' or an alphanumeric character.
        $this->_attrEntitiesRegex =
            '/&(?:' . $shared . "($semi_optional)(?![=;A-Za-z0-9])" . ')/';
    }

    /**
     * Substitutes entities in $string using the text-context regex.
     *
     * @param string $string String to have entities parsed.
     * @return string Parsed string.
     */
    public function substituteTextEntities($string)
    {
        return preg_replace_callback(
            $this->_textEntitiesRegex,
            array($this, 'entityCallback'),
            $string
        );
    }

    /**
     * Substitutes entities in $string using the attribute-context regex.
     *
     * @param string $string String to have entities parsed.
     * @return string Parsed string.
     */
    public function substituteAttrEntities($string)
    {
        return preg_replace_callback(
            $this->_attrEntitiesRegex,
            array($this, 'entityCallback'),
            $string
        );
    }

    /**
     * Callback for the text, attribute and prefix regexes.
     *
     * @param array $matches PCRE match array: 0 = whole entity, 1 = hex
     *              digits, 2 = decimal digits, 3 = semicolon-terminated
     *              name, 4 = semicolon-optional name.
     * @return string Replacement for the entity, or the entity unchanged
     *                when its name is not in the lookup table.
     */
    protected function entityCallback($matches)
    {
        $entity = $matches[0];

        // preg_replace_callback() leaves out trailing groups that did not
        // take part in the match, so the offsets are tested with isset()
        // instead of being read under the @ operator (which still invokes
        // user-installed error handlers).
        if (isset($matches[1]) && $matches[1] !== '') {
            return HTMLPurifier_Encoder::unichr(hexdec($matches[1]));
        }
        if (isset($matches[2]) && $matches[2] !== '') {
            return HTMLPurifier_Encoder::unichr((int) $matches[2]);
        }

        $has_strict_name = !empty($matches[3]);
        if ($has_strict_name) {
            $named_part = $matches[3];
        } else {
            $named_part = empty($matches[4]) ? "" : $matches[4];
        }

        $lookup = $this->entityLookup();
        if (isset($lookup->table[$named_part])) {
            return $lookup->table[$named_part];
        }

        // An unknown semicolon-terminated name may still begin with a
        // semicolon-optional entity; substitute that prefix and keep the
        // remainder of the text. The nested call cannot recurse further
        // because the prefix regex always leaves group 3 empty.
        if ($has_strict_name) {
            return preg_replace_callback(
                $this->_semiOptionalPrefixRegex,
                array($this, 'entityCallback'),
                $entity
            );
        }
        return $entity;
    }

    /**
     * Substitutes every entity except the special ones (the code points in
     * $_special_dec2str and the names in $_special_ent2dec), which are
     * returned untouched.
     *
     * @param string $string String to have non-special entities parsed.
     * @return string Parsed string.
     */
    public function substituteNonSpecialEntities($string)
    {
        return preg_replace_callback(
            $this->_substituteEntitiesRegex,
            array($this, 'nonSpecialEntityCallback'),
            $string
        );
    }

    /**
     * Callback for substituteNonSpecialEntities().
     *
     * @param array $matches PCRE match array: 0 = whole entity, 1 = hex
     *              digits, 2 = decimal digits, 3 = entity name.
     * @return string Replacement, or the entity unchanged when it is
     *                special or its name is not in the lookup table.
     */
    protected function nonSpecialEntityCallback($matches)
    {
        $entity = $matches[0];

        // Any match is at least two characters long, and a numeric match
        // at least three, so these offsets always exist.
        if ($entity[1] === '#') {
            $code = ($entity[2] === 'x') ? hexdec($matches[1]) : (int) $matches[2];
            if (isset($this->_special_dec2str[$code])) {
                return $entity;
            }
            return HTMLPurifier_Encoder::unichr($code);
        }

        $name = $matches[3];
        if (isset($this->_special_ent2dec[$name])) {
            return $entity;
        }
        $lookup = $this->entityLookup();
        if (isset($lookup->table[$name])) {
            return $lookup->table[$name];
        }
        return $entity;
    }

    /**
     * Substitutes only the special entities (the code points in
     * $_special_dec2str and the names in $_special_ent2dec); every other
     * entity is returned untouched.
     *
     * @param string $string String to have special entities parsed.
     * @return string Parsed string.
     */
    public function substituteSpecialEntities($string)
    {
        return preg_replace_callback(
            $this->_substituteEntitiesRegex,
            array($this, 'specialEntityCallback'),
            $string
        );
    }

    /**
     * Callback for substituteSpecialEntities().
     *
     * @param array $matches PCRE match array: 0 = whole entity, 1 = hex
     *              digits, 2 = decimal digits, 3 = entity name.
     * @return string Literal character for a special entity, otherwise
     *                the entity unchanged.
     */
    protected function specialEntityCallback($matches)
    {
        $entity = $matches[0];

        if ($entity[1] === '#') {
            $int = ($entity[2] === 'x') ? hexdec($matches[1]) : (int) $matches[2];
            if (isset($this->_special_dec2str[$int])) {
                return $this->_special_dec2str[$int];
            }
            return $entity;
        }

        $name = $matches[3];
        if (isset($this->_special_ent2dec[$name])) {
            return $this->_special_dec2str[$this->_special_ent2dec[$name]];
        }
        return $entity;
    }

    /**
     * Returns the entity lookup object, creating it on first use.
     *
     * @return HTMLPurifier_EntityLookup
     */
    private function entityLookup()
    {
        if (!$this->_entity_lookup) {
            $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
        }
        return $this->_entity_lookup;
    }
}
|
| 284: |
|
| 285: |
|
| 286: | |