1: <?php
2: namespace Geekwright\RegDom;
3:
4: /**
5: * Class RegisteredDomain
6: *
7: * Determine the registrable domain portion of a URL, respecting the public suffix list conventions
8: *
9: * @package Geekwright\RegDom
10: * @author Florian Sager, 06.08.2008, <sager@agitos.de>
11: * @author Marcus Bointon (https://github.com/Synchro/regdom-php)
12: * @author Richard Griffith <richard@geekwright.com>
13: * @license Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
14: */
class RegisteredDomain
{
    /**
     * @var array|null public suffix tree, reloaded from $psl on every getRegisteredDomain() call
     */
    protected $tree;

    /**
     * @var PublicSuffixList provider of the public suffix tree
     */
    protected $psl;

    /**
     * RegisteredDomain constructor.
     *
     * @param PublicSuffixList|null $psl PublicSuffixList object, or null to use defaults
     */
    public function __construct(PublicSuffixList $psl = null)
    {
        if (null === $psl) {
            $psl = new PublicSuffixList();
        }
        $this->psl = $psl;
    }

    /**
     * Given a URL or bare host name, return a normalized host name, converting punycode to UTF-8
     * and converting to lower case
     *
     * Anything containing a '/' is treated as a URL and run through parse_url(); if no host can
     * be extracted the result is an empty string. Empty labels at the very start of the host
     * are skipped when the labels are joined back together (existing behaviour, kept as is).
     *
     * @param string $url URL or host name
     *
     * @return string
     */
    protected function normalizeHost($url)
    {
        // null (or any other non-string scalar) must not reach strpos()/explode() untouched
        $url = (string) $url;

        $host = $url;
        if (false !== strpos($url, '/')) {
            // parse_url() yields null when there is no host and false on a malformed URL
            $parsed = parse_url($url, PHP_URL_HOST);
            $host = is_string($parsed) ? $parsed : '';
        }

        $utf8Host = '';
        foreach (explode('.', $host) as $part) {
            $label = $this->convertPunycode($part);
            $utf8Host .= (($utf8Host === '') ? '' : '.') . $label;
        }

        // the labels are UTF-8 at this point, so do not depend on mbstring's internal encoding
        return mb_strtolower($utf8Host, 'UTF-8');
    }

    /**
     * Convert a punycode string to UTF-8 if needed
     *
     * The ACE prefix is matched case-insensitively. If the label cannot be converted it is
     * returned unchanged rather than being dropped.
     *
     * @param string $part host component
     *
     * @return string host component as UTF-8
     */
    protected function convertPunycode($part)
    {
        if (strncasecmp($part, 'xn--', 4) !== 0) {
            return $part;
        }

        if (function_exists('idn_to_utf8')) {
            if (defined('INTL_IDNA_VARIANT_UTS46')) { // PHP 7.2
                $converted = idn_to_utf8($part, 0, INTL_IDNA_VARIANT_UTS46);
            } else {
                $converted = idn_to_utf8($part);
            }
            // idn_to_utf8() returns false on failure; keep the original label in that case
            return (is_string($converted) && $converted !== '') ? $converted : $part;
        }

        return $this->decodePunycode($part);
    }

    /**
     * convert punycode to UTF-8 (the hard way) Used only if idn_to_utf8() is not available
     *
     * This fallback adapted from https://ckon.wordpress.com/2010/08/24/punycode-to-unicode-converter-php/
     *
     * Input that does not start with the ACE prefix, has nothing after it, contains a character
     * that is not a punycode digit, or ends in the middle of a number is returned unchanged.
     *
     * @param string $encoded
     * @return string
     */
    protected function decodePunycode($encoded)
    {
        $prefix = 'xn--';
        $prefix_len = strlen($prefix);
        $safe_char = "\xEF\xBF\xBC"; // U+FFFC encoded as UTF-8, used for unencodable code points
        $base = 36;
        $tmin = 1;
        $tmax = 26;
        $skew = 38;
        $damp = 700;

        if (strncasecmp($encoded, $prefix, $prefix_len) !== 0
            || trim((string) substr($encoded, $prefix_len)) === ''
        ) {
            return $encoded;
        }

        $is_first = true;
        $bias = 72;
        $idx = 0;
        $char = 0x80;
        $decoded = array();
        $output = '';

        // everything between the prefix and the last '-' is copied literally
        $delim_pos = strrpos($encoded, '-');
        if ($delim_pos > $prefix_len) {
            for ($k = $prefix_len; $k < $delim_pos; ++$k) {
                $decoded[] = ord($encoded[$k]);
            }
        }
        $deco_len = count($decoded);
        $enco_len = strlen($encoded);

        // the prefix always contains a '-', so $delim_pos is never false here
        for ($enco_idx = $delim_pos + 1; $enco_idx < $enco_len; ++$deco_len) {
            // decode one variable length integer into $idx
            for ($old_idx = $idx, $w = 1, $k = $base; 1; $k += $base) {
                if ($enco_idx >= $enco_len) {
                    return $encoded; // input ended in the middle of a number
                }
                $cp = ord($encoded[$enco_idx++]);
                if ($cp >= 48 && $cp <= 57) {         // '0'..'9' => 26..35
                    $digit = $cp - 22;
                } elseif ($cp >= 65 && $cp <= 90) {   // 'A'..'Z' => 0..25
                    $digit = $cp - 65;
                } elseif ($cp >= 97 && $cp <= 122) {  // 'a'..'z' => 0..25
                    $digit = $cp - 97;
                } else {
                    return $encoded; // not a punycode digit
                }
                $idx += $digit * $w;
                $t = ($k <= $bias) ? $tmin : (($k >= $bias + $tmax) ? $tmax : ($k - $bias));
                if ($digit < $t) {
                    break;
                }
                $w = (int)($w * ($base - $t));
            }

            // bias adaptation
            $delta = $idx - $old_idx;
            $delta = (int) ($is_first ? ($delta / $damp) : ($delta / 2));
            $delta += (int) ($delta / ($deco_len + 1));
            for ($k = 0; $delta > (($base - $tmin) * $tmax) / 2; $k += $base) {
                $delta = (int) ($delta / ($base - $tmin));
            }
            $bias = (int) ($k + ($base - $tmin + 1) * $delta / ($delta + $skew));
            $is_first = false;

            // $idx carries both the code point increment and the insert position
            $char += (int)($idx / ($deco_len + 1));
            $idx %= ($deco_len + 1);
            if ($deco_len > 0) {
                // make room at $idx by shifting the tail one slot to the right
                for ($i = $deco_len; $i > $idx; $i--) {
                    $decoded[$i] = $decoded[($i - 1)];
                }
            }
            $decoded[$idx++] = $char;
        }

        foreach ($decoded as $v) {
            if ($v < 128) {
                $output .= chr($v);
            } // 7bit are transferred literally
            elseif ($v < (1 << 11)) {
                $output .= chr(192 + ($v >> 6)) . chr(128 + ($v & 63));
            } // 2 bytes
            elseif ($v < (1 << 16)) {
                $output .= chr(224 + ($v >> 12)) . chr(128 + (($v >> 6) & 63)) . chr(128 + ($v & 63));
            } // 3 bytes
            elseif ($v <= 0x10FFFF) {
                $output .= chr(240 + ($v >> 18)) . chr(128 + (($v >> 12) & 63)) . chr(128 + (($v >> 6) & 63)) . chr(128 + ($v & 63));
            } // 4 bytes
            else {
                $output .= $safe_char;
            } // beyond U+10FFFF, cannot be represented in UTF-8
        }
        return $output;
    }

    /**
     * Determine the registered domain portion of the supplied host string
     *
     * @param string $host a host name or URL containing a host name
     *
     * @return string|null shortest registrable domain portion of the supplied host or null if invalid
     */
    public function getRegisteredDomain($host)
    {
        $this->tree = $this->psl->getTree();

        $signingDomain = $this->normalizeHost($host);
        $signingDomainParts = explode('.', $signingDomain);

        $result = $this->findRegisteredDomain($signingDomainParts, $this->tree);

        // explicit checks instead of empty(), which would also reject the label "0"
        if (!is_string($result) || $result === '') {
            // this is an invalid domain name
            return null;
        }

        // assure there is at least 1 TLD in the stripped signing domain
        if (strpos($result, '.') === false) {
            $cnt = count($signingDomainParts);
            if ($cnt === 1 || $signingDomainParts[$cnt - 2] === '') {
                return null;
            }
            return $signingDomainParts[$cnt - 2] . '.' . $signingDomainParts[$cnt - 1];
        }
        return $result;
    }

    /**
     * Recursive helper method to query the PSL tree
     *
     * Labels are consumed from the right. Return values: '' when the current node carries
     * the '!' marker, null when the labels run out before a registrable label is found,
     * otherwise the labels collected so far joined with '.'.
     *
     * @param string[] $remainingSigningDomainParts parts of domain being queried
     * @param string[] $treeNode                    subset of tree array by reference
     *
     * @return null|string
     */
    protected function findRegisteredDomain($remainingSigningDomainParts, &$treeNode)
    {
        $sub = array_pop($remainingSigningDomainParts);

        if (isset($treeNode['!'])) {
            return '';
        }

        if ($sub === null) {
            // no labels left: nothing registrable below this node
            return null;
        }

        if (is_array($treeNode) && array_key_exists($sub, $treeNode)) {
            $result = $this->findRegisteredDomain($remainingSigningDomainParts, $treeNode[$sub]);
        } elseif (is_array($treeNode) && array_key_exists('*', $treeNode)) {
            $result = $this->findRegisteredDomain($remainingSigningDomainParts, $treeNode['*']);
        } else {
            // first label that is not part of the public suffix
            return $sub;
        }

        if ($result === '') {
            return $sub;
        }
        if (is_string($result)) {
            return $result . '.' . $sub;
        }
        return null;
    }
}
224: