1: <?php
2:
/**
 * Parses a URI into the components and fragment identifier as specified
 * by RFC 3986.
 */
class HTMLPurifier_URIParser
{

    /**
     * Instance of HTMLPurifier_PercentEncoder to do normalization with.
     */
    protected $percentEncoder;

    public function __construct()
    {
        $this->percentEncoder = new HTMLPurifier_PercentEncoder();
    }

    /**
     * Parses a URI.
     *
     * The input is first passed through the percent encoder's normalize()
     * method and then split with the generic regular expression from
     * RFC 3986, Appendix B. When an authority is present it is split
     * further into userinfo, host and port.
     *
     * @param $uri string URI to parse
     * @return HTMLPurifier_URI representation of URI, or false if the
     *         top-level pattern could not be applied. This representation
     *         has not been validated yet and may not conform to RFC.
     */
    public function parse($uri)
    {
        $uri = $this->percentEncoder->normalize($uri);

        // Regexp is as per Appendix B.
        // Note that ["<>] are an addition to the RFC's recommended
        // characters, because they represent external delimiters.
        $r_URI = '!'.
            '(([a-zA-Z0-9\.\+\-]+):)?'. // 2. Scheme
            '(//([^/?#"<>]*))?'. // 4. Authority
            '([^?#"<>]*)'.       // 5. Path
            '(\?([^#"<>]*))?'.   // 7. Query
            '(#([^"<>]*))?'.     // 8. Fragment
            '!';

        $matches = array();
        $result = preg_match($r_URI, $uri, $matches);

        if (!$result) {
            return false; // *really* invalid URI
        }

        // Separate out parts. Each optional component is detected through
        // its outer group, which includes the delimiter (":", "//", "?",
        // "#"); that lets a present-but-empty component (e.g. "?" with
        // nothing after it) come out as '' while an absent one is null.
        $scheme    = !empty($matches[1]) ? $matches[2] : null;
        $authority = !empty($matches[3]) ? $matches[4] : null;
        $path      = $matches[5]; // always present, can be empty
        $query     = !empty($matches[6]) ? $matches[7] : null;
        $fragment  = !empty($matches[8]) ? $matches[9] : null;

        // further parse authority
        if ($authority !== null) {
            $r_authority = "/^((.+?)@)?(\[[^\]]+\]|[^:]*)(:(\d*))?/";
            $matches = array();
            preg_match($r_authority, $authority, $matches);
            $userinfo = !empty($matches[1]) ? $matches[2] : null;
            // The host group carries no delimiter, so it can be exactly
            // "0". empty("0") is true in PHP, which would discard that
            // host; test for presence with isset() instead.
            $host     = isset($matches[3]) ? $matches[3] : '';
            // Group 4 includes the ':' delimiter; a ':' followed by no
            // digits casts to 0.
            $port     = !empty($matches[4]) ? (int) $matches[5] : null;
        } else {
            $port = $host = $userinfo = null;
        }

        return new HTMLPurifier_URI(
            $scheme, $userinfo, $host, $port, $path, $query, $fragment);
    }

}
70:
71: // vim: et sw=4 sts=4
72: