1: <?php
2: namespace Geekwright\RegDom;
3:
4: /**
5: * Manage the Public Suffix List (PSL) data. This includes downloading the list, converting it to an
6: * array tree structure for access in PHP, and caching the results.
7: *
8: * @package Geekwright\RegDom
9: * @author Florian Sager, 06.08.2008, <sager@agitos.de>
10: * @author Marcus Bointon (https://github.com/Synchro/regdom-php)
11: * @author Richard Griffith <richard@geekwright.com>
12: * @license Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
13: */
class PublicSuffixList
{
    /** @var string remote location of the Public Suffix List, used when no other source is available */
    protected $sourceURL = 'https://publicsuffix.org/list/public_suffix_list.dat';

    /** @var string local copy of the PSL; setLocalPSLName() rewrites this as a path relative to __DIR__ */
    protected $localPSL = 'public_suffix_list.dat';

    /** @var string file name prefix that marks serialized tree caches inside the data directory */
    protected $cachedPrefix = 'cached_';

    /** @var array|null the parsed PSL tree, or null while it has not been (successfully) loaded */
    protected $tree;

    /** @var string|null URL, or file name relative to __DIR__, of the PSL source; null selects the default */
    protected $url;

    /** @var string directory holding the local PSL copy and the caches */
    protected $dataDir = '/../data/'; // relative to __DIR__

    /**
     * PublicSuffixList constructor.
     *
     * @param string|null $url URL for the PSL or null to use default
     */
    public function __construct($url = null)
    {
        $this->setURL($url);
    }

    /**
     * Set the URL, and clear any existing tree so the next getTree() reloads from the new source
     *
     * @param string|null $url URL for the PSL or null to use default
     *
     * @return void
     */
    public function setURL($url)
    {
        $this->url = $url;
        $this->tree = null;
    }

    /**
     * Set a fallback (default) for the URL. If we have a locally saved version, prefer it, but use a
     * remote URL if there is no local source. An explicitly set URL is left untouched.
     *
     * @return void
     */
    protected function setFallbackURL()
    {
        // always refresh the local file name so a later download is saved under the matching name
        $this->setLocalPSLName($this->url);
        if (null !== $this->url) {
            return;
        }
        $this->url = file_exists(__DIR__ . $this->localPSL) ? $this->localPSL : $this->sourceURL;
    }

    /**
     * Load the PSL tree, automatically handling caches
     *
     * The serialized cache for the current source is used when available. Otherwise the list is
     * read, parsed and cached. When the list cannot be read, the tree is left unloaded (null) so
     * that a later getTree() tries again instead of returning an empty tree.
     *
     * @return void (results in $this->tree)
     *
     * @throws \RuntimeException if the PSL cannot be read
     */
    protected function loadTree()
    {
        $this->setFallbackURL();

        $cachedTree = $this->readCachedPSL($this->url);
        if (false !== $cachedTree) {
            $this->tree = $cachedTree;
            return;
        }

        $list = $this->readPSL();
        if (false === $list) {
            $this->tree = null;
            throw new \RuntimeException('Cannot read ' . $this->url);
        }

        $this->tree = array();
        $this->parsePSL($list);
        $this->cachePSL($this->url);
    }

    /**
     * Parse the PSL data
     *
     * Blank lines and lines starting with "//" are skipped. Following the PSL format, a rule line
     * is only read up to the first whitespace, which also discards the "\r" of CRLF line endings.
     *
     * @param string $fileData the PSL data
     *
     * @return void (results in $this->tree)
     */
    protected function parsePSL($fileData)
    {
        if (!is_array($this->tree)) {
            $this->tree = array();
        }

        foreach (explode("\n", $fileData) as $line) {
            $line = trim($line);
            if ('' === $line || $this->startsWith($line, '//')) {
                continue;
            }

            // keep only the rule itself; anything after the first whitespace is not part of it
            $rule = substr($line, 0, strcspn($line, " \t\r\n\x0B\x0C"));

            // this line should be a TLD
            $this->buildSubDomain($this->tree, explode('.', $rule));
        }
    }

    /**
     * Does $search start with $startString?
     *
     * @param string $search      the string to test
     * @param string $startString the starting string to match
     *
     * @return bool
     */
    protected function startsWith($search, $startString)
    {
        return (0 === strpos($search, $startString));
    }

    /**
     * Add domains to tree
     *
     * Labels are consumed from the right (the TLD first), creating one nested array level per
     * label. A label prefixed with "!" is stored without the prefix and, when newly created,
     * is given a "!" => "" entry to mark it; nothing is added below such a label.
     *
     * @param array    $node     tree array by reference
     * @param string[] $tldParts array of domain parts
     *
     * @return void - changes made to $node by reference
     */
    protected function buildSubDomain(&$node, $tldParts)
    {
        if (0 === count($tldParts)) {
            return;
        }

        $dom = trim(array_pop($tldParts));

        $isNotDomain = $this->startsWith($dom, '!');
        if ($isNotDomain) {
            $dom = substr($dom, 1);
        }

        if (!array_key_exists($dom, $node)) {
            $node[$dom] = $isNotDomain ? array('!' => '') : array();
        }

        if (!$isNotDomain && count($tldParts) > 0) {
            $this->buildSubDomain($node[$dom], $tldParts);
        }
    }

    /**
     * Return the current tree, loading it if needed
     *
     * @return array the PSL tree
     * @throws \RuntimeException if PSL cannot be loaded
     */
    public function getTree()
    {
        if (null === $this->tree) {
            $this->loadTree();
        }
        return $this->tree;
    }

    /**
     * Read PSL from the URL or file specified in $this->url.
     * If we process a remote URL, save a local copy.
     *
     * A source with a scheme or host is treated as remote; anything else is a file name relative
     * to __DIR__. cURL is only tried for remote sources, after file_get_contents() has failed.
     *
     * @return string|false PSL file contents or false on error
     */
    protected function readPSL()
    {
        $parts = parse_url($this->url);
        $remote = is_array($parts) && (isset($parts['scheme']) || isset($parts['host']));

        if (!$remote) {
            $localFile = __DIR__ . $this->url;
            // check first, so that a missing local file fails quietly instead of raising a warning
            if (!is_file($localFile) || !is_readable($localFile)) {
                return false;
            }
            return file_get_contents($localFile);
        }

        $newPSL = file_get_contents($this->url);
        if (false === $newPSL) {
            // e.g. allow_url_fopen is disabled; try again with curl
            $newPSL = $this->readWithCurl($this->url);
        }
        if (false === $newPSL) {
            return false;
        }

        $this->saveLocalPSL($newPSL);
        return $newPSL;
    }

    /**
     * Fetch a remote resource with the curl extension, if it is available
     *
     * @param string $url URL to fetch
     *
     * @return string|false response body, or false if curl is unavailable or the request failed
     */
    private function readWithCurl($url)
    {
        if (!function_exists('curl_init')) {
            return false;
        }
        $curlHandle = curl_init();
        if (false === $curlHandle) {
            return false;
        }
        curl_setopt($curlHandle, CURLOPT_URL, $url);
        curl_setopt($curlHandle, CURLOPT_FAILONERROR, true);
        curl_setopt($curlHandle, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($curlHandle, CURLOPT_CONNECTTIMEOUT, 5);
        $curlReturn = curl_exec($curlHandle);
        curl_close($curlHandle);

        return $curlReturn;
    }

    /**
     * Determine cache file name for a specified source
     *
     * @param string $url URL/filename of source PSL
     *
     * @return string cache file name for given resource
     */
    protected function getCacheFileName($url)
    {
        return __DIR__ . $this->dataDir . $this->cachedPrefix . md5($url);
    }

    /**
     * Attempt to load a cached Public Suffix List tree for a given source
     *
     * A cache file that is missing, unreadable, empty, or that does not unserialize to an array
     * is treated as a cache miss.
     *
     * @param string $url URL/filename of source PSL
     *
     * @return false|array PSL tree, or false if there is no usable cache
     */
    protected function readCachedPSL($url)
    {
        $cacheFile = $this->getCacheFileName($url);
        if (!file_exists($cacheFile)) {
            return false;
        }

        $cachedTree = file_get_contents($cacheFile);
        if (false === $cachedTree || '' === $cachedTree) {
            return false;
        }

        // the options argument of unserialize() only exists as of PHP 7
        if ((int) PHP_VERSION_ID < 70000) {
            $tree = unserialize($cachedTree);
        } else {
            $tree = unserialize($cachedTree, array('allowed_classes' => false));
        }

        return is_array($tree) ? $tree : false;
    }

    /**
     * Cache the current Public Suffix List tree and associate with the specified source
     *
     * @param string $url URL/filename of source PSL
     *
     * @return bool|int the number of bytes that were written to the file, or false on failure
     */
    protected function cachePSL($url)
    {
        return file_put_contents($this->getCacheFileName($url), serialize($this->tree));
    }

    /**
     * Save a local copy of a retrieved Public Suffix List
     *
     * @param string $fileContents contents of the retrieved PSL
     *
     * @return bool|int the number of bytes that were written to the file, or false on failure
     */
    protected function saveLocalPSL($fileContents)
    {
        return file_put_contents(__DIR__ . $this->localPSL, $fileContents);
    }

    /**
     * Set localPSL name based on URL
     *
     * The name is the last path segment of the URL, placed in the data directory. If the URL has
     * no usable path (for example "https://example.com"), the file name of the default source URL
     * is used instead, so that localPSL never names the data directory itself.
     *
     * @param null|string $url the URL for the PSL
     *
     * @return void (sets $this->localPSL)
     */
    protected function setLocalPSLName($url)
    {
        if (null === $url) {
            $url = $this->sourceURL;
        }

        $fileName = $this->fileNameFromURL($url);
        if ('' === $fileName) {
            $fileName = $this->fileNameFromURL($this->sourceURL);
        }

        $this->localPSL = $this->dataDir . $fileName;
    }

    /**
     * Extract the file name (last path segment) from a URL or path
     *
     * @param string $url URL or path
     *
     * @return string the file name, or '' if the URL has no path or cannot be parsed
     */
    private function fileNameFromURL($url)
    {
        $path = parse_url($url, PHP_URL_PATH);

        return is_string($path) ? basename($path) : '';
    }

    /**
     * Delete files in the data directory
     *
     * Only entries whose filetype() is 'file' are removed; directories are never touched.
     *
     * @param bool $cacheOnly true to limit clearing to cached serialized PSLs, false to clear all
     *
     * @return void
     */
    public function clearDataDirectory($cacheOnly = false)
    {
        $dir = __DIR__ . $this->dataDir;
        if (!is_dir($dir)) {
            return;
        }

        $dirHandle = opendir($dir);
        if (!$dirHandle) {
            return;
        }

        while (false !== ($file = readdir($dirHandle))) {
            if ('file' !== filetype($dir . $file)) {
                continue;
            }
            if (false === $cacheOnly || $this->startsWith($file, $this->cachedPrefix)) {
                unlink($dir . $file);
            }
        }
        closedir($dirHandle);
    }
}
301: