| 1: | <?php
|
| 2: |
|
| 3: | |
| 4: | |
| 5: | |
| 6: | |
| 7: | |
| 8: | |
| 9: | |
| 10: | |
| 11: | |
| 12: | |
| 13: | |
| 14: | |
| 15: | |
| 16: | |
| 17: | |
| 18: | |
| 19: | |
| 20: | |
| 21: | |
| 22: | |
| 23: | |
| 24: | |
| 25: | |
| 26: | |
| 27: | |
| 28: | |
| 29: | |
| 30: | |
| 31: | |
| 32: | |
| 33: | |
| 34: | |
| 35: | |
| 36: | |
| 37: | |
| 38: | |
| 39: | |
| 40: | |
| 41: |
|
/**
 * Base class for HTML lexers: selects a concrete lexer implementation,
 * decodes character entities in text/attribute data, and normalizes raw
 * HTML before it is tokenized.
 *
 * Concrete lexers are expected to override tokenizeHTML(); the
 * implementation here only raises an error.
 */
class HTMLPurifier_Lexer
{

    /**
     * Whether this lexer records line numbers. create() refuses to return
     * a lexer with this flag unset when line tracking is required.
     */
    public $tracksLineNumbers = false;

    /**
     * HTMLPurifier_EntityParser instance used by parseData() and
     * normalize() for entities that the fast path cannot handle.
     */
    private $_entity_parser;

    /**
     * Returns a lexer instance chosen from the configuration.
     *
     * %Core.LexerImpl may be an object (used as-is), one of the names
     * 'DOMLex', 'DirectLex' or 'PH5P', or null for automatic selection.
     *
     * Passing a lexer prototype instead of a configuration object is
     * deprecated and triggers E_USER_WARNING.
     *
     * @param HTMLPurifier_Config|mixed $config Configuration object, or
     *        (deprecated) a lexer prototype / implementation name.
     * @return HTMLPurifier_Lexer
     * @throws HTMLPurifier_Exception If the lexer name is not recognized,
     *         nothing was instantiated, or the chosen lexer cannot track
     *         line numbers although the configuration requires it.
     */
    public static function create($config)
    {
        $is_config = $config instanceof HTMLPurifier_Config;

        if (!$is_config) {
            $lexer = $config;
            trigger_error(
                "Passing a prototype to\n" .
                "HTMLPurifier_Lexer::create() is deprecated, please instead\n" .
                "use %Core.LexerImpl",
                E_USER_WARNING
            );
        } else {
            $lexer = $config->get('Core.LexerImpl');
        }

        // Only a real configuration object can be asked for options. On the
        // deprecated prototype path $config is the lexer itself, and calling
        // get() on it would abort with an undefined-method error.
        $needs_tracking = $is_config && (
            $config->get('Core.MaintainLineNumbers') ||
            $config->get('Core.CollectErrors')
        );

        $inst = null;
        if (is_object($lexer)) {
            $inst = $lexer;
        } else {
            if (is_null($lexer)) {
                // Automatic selection.
                if ($needs_tracking) {
                    // DirectLex is the implementation the error message
                    // below recommends for line-number support.
                    $lexer = 'DirectLex';
                } elseif (class_exists('DOMDocument', false) &&
                    method_exists('DOMDocument', 'loadHTML') &&
                    !extension_loaded('domxml')
                ) {
                    // class_exists(..., false) avoids invoking autoloaders.
                    // NOTE(review): the domxml check presumably guards
                    // against the legacy extension clashing with DOM --
                    // confirm before relying on it.
                    $lexer = 'DOMLex';
                } else {
                    $lexer = 'DirectLex';
                }
            }

            switch ($lexer) {
                case 'DOMLex':
                    $inst = new HTMLPurifier_Lexer_DOMLex();
                    break;
                case 'DirectLex':
                    $inst = new HTMLPurifier_Lexer_DirectLex();
                    break;
                case 'PH5P':
                    $inst = new HTMLPurifier_Lexer_PH5P();
                    break;
                default:
                    throw new HTMLPurifier_Exception(
                        "Cannot instantiate unrecognized Lexer type " .
                        htmlspecialchars($lexer)
                    );
            }
        }

        if (!$inst) {
            throw new HTMLPurifier_Exception('No lexer was instantiated');
        }

        // A user-supplied lexer object may not support line tracking even
        // though the configuration asks for it.
        if ($needs_tracking && !$inst->tracksLineNumbers) {
            throw new HTMLPurifier_Exception(
                'Cannot use lexer that does not support line numbers with ' .
                'Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)'
            );
        }

        return $inst;
    }

    /**
     * Sets up the entity parser used for entity substitution.
     */
    public function __construct()
    {
        $this->_entity_parser = new HTMLPurifier_EntityParser();
    }

    /**
     * Fast-path lookup table: the most common special entities mapped to
     * the literal characters they stand for. Used with strtr() in
     * parseData().
     */
    protected $_special_entity2str =
        array(
            '&quot;' => '"',
            '&amp;' => '&',
            '&lt;' => '<',
            '&gt;' => '>',
            '&#39;' => "'",
            '&#039;' => "'",
            '&#x27;' => "'"
        );

    /**
     * Decodes entities in character data found in text content.
     *
     * @param string $string
     * @param HTMLPurifier_Config $config
     * @return string
     */
    public function parseText($string, $config) {
        return $this->parseData($string, false, $config);
    }

    /**
     * Decodes entities in character data found in an attribute value.
     *
     * @param string $string
     * @param HTMLPurifier_Config $config
     * @return string
     */
    public function parseAttr($string, $config) {
        return $this->parseData($string, true, $config);
    }

    /**
     * Decodes character entities in a piece of text or attribute data.
     *
     * Cheap checks run first so that strings without entities, or with
     * only the entities listed in $_special_entity2str, never reach the
     * full entity parser.
     *
     * @param string $string Data to decode.
     * @param bool $is_attr True for attribute values, false for text.
     * @param HTMLPurifier_Config $config Only consulted when the fast
     *        paths do not apply.
     * @return string Decoded data.
     */
    public function parseData($string, $is_attr, $config)
    {
        // Nothing to do; this also keeps the last-character lookup safe.
        if ($string === '') {
            return '';
        }

        // Count ampersands that could begin an entity: every '&' except
        // those followed by a space or sitting at the very end.
        $num_amp = substr_count($string, '&') - substr_count($string, '& ') -
            ($string[strlen($string) - 1] === '&' ? 1 : 0);

        if (!$num_amp) {
            return $string;
        }

        // Each '&amp;' leaves one literal '&' behind after the table
        // substitution; remember how many to expect.
        $num_esc_amp = substr_count($string, '&amp;');
        $string = strtr($string, $this->_special_entity2str);

        // Recount candidates after the common entities were replaced.
        $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
            ($string[strlen($string) - 1] === '&' ? 1 : 0);

        // If no more candidates remain than '&amp;' produced, treat the
        // string as fully decoded.
        if ($num_amp_2 <= $num_esc_amp) {
            return $string;
        }

        // Slow path: hand the remaining entities to the entity parser.
        if ($config->get('Core.LegacyEntityDecoder')) {
            $string = $this->_entity_parser->substituteSpecialEntities($string);
        } else {
            if ($is_attr) {
                $string = $this->_entity_parser->substituteAttrEntities($string);
            } else {
                $string = $this->_entity_parser->substituteTextEntities($string);
            }
        }
        return $string;
    }

    /**
     * Turns an HTML string into tokens. This base implementation only
     * raises E_USER_ERROR; concrete lexers supply the real behavior.
     *
     * @param string $string
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     */
    public function tokenizeHTML($string, $config, $context)
    {
        trigger_error('Call to abstract class', E_USER_ERROR);
    }

    /**
     * Replaces each CDATA section with its HTML-escaped contents.
     *
     * @param string $string HTML possibly containing CDATA sections.
     * @return string
     */
    protected static function escapeCDATA($string)
    {
        return preg_replace_callback(
            '/<!\[CDATA\[(.+?)\]\]>/s',
            array('HTMLPurifier_Lexer', 'CDATACallback'),
            $string
        );
    }

    /**
     * Replaces each comment-wrapped CDATA section (the form hidden behind
     * HTML and script comments) with its HTML-escaped contents.
     *
     * @param string $string
     * @return string
     */
    protected static function escapeCommentedCDATA($string)
    {
        return preg_replace_callback(
            '#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s',
            array('HTMLPurifier_Lexer', 'CDATACallback'),
            $string
        );
    }

    /**
     * Removes Internet Explorer conditional-comment blocks, including
     * everything between the opening "if" and the closing "endif".
     *
     * @param string $string
     * @return string
     */
    protected static function removeIEConditional($string)
    {
        return preg_replace(
            '#<!--\[if [^>]+\]>.*?<!\[endif\]-->#si',
            '',
            $string
        );
    }

    /**
     * preg_replace_callback() handler for the CDATA helpers.
     *
     * @param array $matches PCRE match array; index 1 holds the section
     *        contents.
     * @return string The contents escaped as HTML (ENT_COMPAT: double
     *         quotes are escaped, single quotes are left alone).
     */
    protected static function CDATACallback($matches)
    {
        return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8');
    }

    /**
     * Applies the pre-tokenization clean-up steps to raw HTML, each one
     * governed by the configuration where a directive exists.
     *
     * @param string $html
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context Used to fetch 'ErrorCollector'
     *        when %Core.CollectErrors is set.
     * @return string Normalized HTML.
     */
    public function normalize($html, $config, $context)
    {
        // Convert CRLF and lone CR line endings to LF.
        if ($config->get('Core.NormalizeNewlines')) {
            $html = str_replace("\r\n", "\n", (string)$html);
            $html = str_replace("\r", "\n", (string)$html);
        }

        if ($config->get('HTML.Trusted')) {
            // Comment-wrapped CDATA is only unwrapped for trusted HTML.
            $html = $this->escapeCommentedCDATA($html);
        }

        // Plain CDATA sections become escaped text.
        $html = $this->escapeCDATA($html);

        $html = $this->removeIEConditional($html);

        // Reduce a full document to the contents of its body element.
        if ($config->get('Core.ConvertDocumentToFragment')) {
            $e = false;
            if ($config->get('Core.CollectErrors')) {
                $e =& $context->get('ErrorCollector');
            }
            $new_html = $this->extractBody($html);
            if ($e && $new_html != $html) {
                $e->send(E_WARNING, 'Lexer: Extracted body');
            }
            $html = $new_html;
        }

        // The legacy decoder expands non-special entities up front.
        if ($config->get('Core.LegacyEntityDecoder')) {
            $html = $this->_entity_parser->substituteNonSpecialEntities($html);
        }

        // Runs after entity substitution so that characters produced by
        // that step are cleaned as well.
        $html = HTMLPurifier_Encoder::cleanUTF8($html);

        // Strip processing instructions.
        if ($config->get('Core.RemoveProcessingInstructions')) {
            $html = preg_replace('#<\?.+?\?>#s', '', $html);
        }

        // Drop script elements outright, but only for untrusted HTML when
        // script contents are to be removed and "script" is listed among
        // the hidden elements.
        $hidden_elements = $config->get('Core.HiddenElements');
        if ($config->get('Core.AggressivelyRemoveScript') &&
            !($config->get('HTML.Trusted') || !$config->get('Core.RemoveScriptContents')
            || empty($hidden_elements["script"]))) {
            $html = preg_replace('#<script[^>]*>.*?</script>#i', '', $html);
        }

        return $html;
    }

    /**
     * Returns the contents of the body element of an HTML document, or the
     * input unchanged when no body element is found or the opening body
     * tag appears to sit inside an unterminated comment.
     *
     * @param string $html
     * @return string
     */
    public function extractBody($html)
    {
        $matches = array();
        $result = preg_match('|(.*?)<body[^>]*>(.*)</body>|is', $html, $matches);
        if ($result) {
            // Inspect the text before the body tag: if the last comment
            // opener there has no closer after it, the tag is commented
            // out and must not be treated as the real body.
            $comment_start = strrpos($matches[1], '<!--');
            $comment_end = strrpos($matches[1], '-->');
            if ($comment_start === false ||
                ($comment_end !== false && $comment_end > $comment_start)) {
                return $matches[2];
            }
        }
        return $html;
    }
}
|
| 386: |
|
| 387: |
|
| 388: | |