1: <?php
2: /*
3: You may not change or alter any portion of this comment or credits
4: of supporting developers from this source code or any supporting source code
5: which is considered copyrighted (c) material of the original comment or credit authors.
6:
7: This program is distributed in the hope that it will be useful,
8: but WITHOUT ANY WARRANTY; without even the implied warranty of
9: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
10: */
11:
12: namespace Xmf;
13:
14: /**
15: * Metagen facilitates generating and assigning page meta tags
16: *
17: * @category Xmf\Metagen
18: * @package Xmf
19: * @author Richard Griffith <richard@geekwright.com>
20: * @author trabis <lusopoemas@gmail.com>
21: * @copyright 2011-2018 XOOPS Project (https://xoops.org)
22: * @license GNU GPL 2.0 or later (https://www.gnu.org/licenses/gpl-2.0.html)
23: * @link https://xoops.org
24: */
class Metagen
{
    /**
     * Character encoding passed to every mbstring call in this class
     */
    const ENCODING = 'UTF-8';

    /**
     * Marker added to a string where text has been omitted.
     */
    const ELLIPSIS = "...";

    /**
     * assignTitle set the page title
     *
     * The title is reduced to plain text and assigned to the
     * 'xoops_pagetitle' template variable.
     *
     * @param string $title page title
     *
     * @return void
     */
    public static function assignTitle($title)
    {
        $title = static::asPlainText(trim((string) $title));
        static::assignTemplateVar('xoops_pagetitle', $title);
    }

    /**
     * assignKeywords set the meta keywords tag
     *
     * Nothing is assigned unless $keywords is a non-empty array.
     *
     * @param string[] $keywords keywords list
     *
     * @return void
     */
    public static function assignKeywords($keywords)
    {
        if (!empty($keywords) && \is_array($keywords)) {
            $keyword_tag = implode(', ', $keywords);
            static::assignThemeMeta('keywords', $keyword_tag);
        }
    }

    /**
     * assignDescription set the meta description tag
     *
     * Nothing is assigned when the trimmed description is an empty string.
     *
     * @param string $description page description
     *
     * @return void
     */
    public static function assignDescription($description)
    {
        $description = trim((string) $description);
        // strict comparison so that a description of "0" is still assigned
        if ('' !== $description) {
            static::assignThemeMeta('description', $description);
        }
    }

    /**
     * assign meta variables in the theme
     *
     * Uses the Xoops object when that class is already loaded, otherwise
     * the legacy global $xoTheme. If the global is not an object the call
     * is skipped instead of raising a fatal error.
     *
     * @param string $name  meta name (keywords, description)
     * @param string $value meta value
     *
     * @return void
     */
    protected static function assignThemeMeta($name, $value)
    {
        if (class_exists('Xoops', false)) {
            \Xoops::getInstance()->theme()->addMeta('meta', $name, $value);
        } else {
            global $xoTheme;
            if (is_object($xoTheme)) {
                $xoTheme->addMeta('meta', $name, $value);
            }
        }
    }

    /**
     * assign a variable in the template engine
     *
     * Uses the Xoops object when that class is already loaded, otherwise
     * the legacy global $xoopsTpl. If the global is not an object the call
     * is skipped instead of raising a fatal error.
     *
     * @param string $name  variable name (i.e. xoops_pagetitle)
     * @param string $value variable value
     *
     * @return void
     */
    protected static function assignTemplateVar($name, $value)
    {
        if (class_exists('Xoops', false)) {
            \Xoops::getInstance()->tpl()->assign($name, $value);
        } else {
            global $xoopsTpl;
            if (is_object($xoopsTpl)) {
                $xoopsTpl->assign($name, $value);
            }
        }
    }

    /**
     * generateKeywords builds a set of keywords from text body
     *
     * The body is converted to lower case plain text and split into words.
     * Words accepted by the StopWords object are split again on apostrophes,
     * and each accepted part of at least $minLength characters is counted.
     * Forced keywords are given a count of 999999 so they sort first.
     *
     * @param string        $body      text to extract keywords from
     * @param integer       $count     number of keywords to use
     * @param integer       $minLength minimum length of word to consider as a keyword
     * @param string[]|null $forceKeys array of keywords to force use, or null for none
     *
     * @return array of keywords, most frequent first
     */
    public static function generateKeywords(
        $body,
        $count = 20,
        $minLength = 4,
        $forceKeys = null
    ) {
        $keyCount = array();
        if (!is_array($forceKeys)) {
            $forceKeys = array();
        }

        $text = static::lowerCase(static::asPlainText($body));

        $originalKeywords = preg_split(
            '/[^\w\']+/u',
            $text,
            -1,
            PREG_SPLIT_NO_EMPTY
        );
        if (false === $originalKeywords) {
            // preg_split() fails with the /u modifier when $text is not valid UTF-8
            $originalKeywords = array();
        }

        $stopWords = static::stopWordsObject();
        foreach ($originalKeywords as $originalKeyword) {
            if (!$stopWords->check($originalKeyword)) {
                continue;
            }
            foreach (explode("'", $originalKeyword) as $part) {
                // length is measured in characters, not bytes, when mbstring is available
                if ($stopWords->check($part) && static::textLength($part) >= $minLength) {
                    $keyCount[$part] = isset($keyCount[$part]) ? $keyCount[$part] + 1 : 1;
                }
            }
        }

        // processed last-to-first, matching the original array_pop() order
        foreach (array_reverse($forceKeys) as $forcedKey) {
            $keyCount[static::lowerCase((string) $forcedKey)] = 999999;
        }

        arsort($keyCount, SORT_NUMERIC);

        return array_slice(array_keys($keyCount), 0, $count);
    }

    /**
     * generateDescription - generate a short description from a body of text
     *
     * The body is converted to plain text and cut to at most $wordCount - 1
     * space separated words. If the result contains no period an ellipsis is
     * appended. If the result is longer than 100 characters and its last
     * period lies within the final 30 characters, the result ends at that
     * period. An empty body yields an empty string.
     *
     * @param string  $body      body text
     * @param integer $wordCount upper bound for the word count of the description
     *
     * @return string
     */
    public static function generateDescription($body, $wordCount = 100)
    {
        $text = static::asPlainText($body);
        if ('' === $text) {
            // nothing to describe; a bare ellipsis would defeat the empty check in assignDescription()
            return '';
        }

        $words = explode(" ", $text);
        $newWords = array_slice($words, 0, max((int) $wordCount - 1, 0));
        $ret = implode(' ', $newWords);

        $useMb = function_exists('mb_strlen');
        $len = static::textLength($ret);
        $lastPeriod = $useMb
            ? mb_strrpos($ret, '.', 0, static::ENCODING)
            : strrpos($ret, '.');

        if (false === $lastPeriod) {
            $ret .= static::ELLIPSIS;
        } elseif ($len > 100 && ($len - $lastPeriod) < 30) {
            // end on the last complete sentence
            $ret = $useMb
                ? mb_substr($ret, 0, $lastPeriod + 1, static::ENCODING)
                : substr($ret, 0, $lastPeriod + 1);
        }

        return $ret;
    }

    /**
     * generateMetaTags - generate and assign all meta tags
     *
     * Keywords found in the title (minimum length 3) are forced into the
     * keyword list generated from the body.
     *
     * @param string        $title     title
     * @param string        $body      body text
     * @param int           $count     maximum keywords to use
     * @param int           $minLength minimum length of word to consider as keyword
     * @param int           $wordCount upper bound for the word count of the description summary
     * @param string[]|null $forceKeys array of keywords to force use, or null for none
     *
     * @return void
     */
    public static function generateMetaTags(
        $title,
        $body,
        $count = 20,
        $minLength = 4,
        $wordCount = 100,
        $forceKeys = null
    ) {
        $title_keywords = static::generateKeywords($title, $count, 3, $forceKeys);
        $keywords = static::generateKeywords($body, $count, $minLength, $title_keywords);
        $description = static::generateDescription($body, $wordCount);
        static::assignTitle($title);
        static::assignKeywords($keywords);
        static::assignDescription($description);
    }

    /**
     * Return true if the string is length > 0
     *
     * @param string $var to test
     *
     * @return boolean
     *
     * @author psylove
     */
    protected static function nonEmptyString($var)
    {
        return (strlen((string) $var) > 0);
    }

    /**
     * Create a title for the short_url field of an article
     *
     * Every character that is not a Unicode letter or number becomes a
     * hyphen. Empty segments and segments rejected by the StopWords object
     * are dropped, and the remaining segments are joined with hyphens.
     * The extension is appended only when the resulting title is not empty.
     *
     * @param string $title     title of the article
     * @param string $extension extension to add
     *
     * @return string short_url for the article
     *
     * @author psylove
     */
    public static function generateSeoTitle($title = '', $extension = '')
    {
        // preg_replace() returns null if $title is not valid UTF-8; treat that as empty
        $title = (string) preg_replace("/[^\p{N}\p{L}]/u", "-", (string) $title);

        $stopWords = static::stopWordsObject();
        $segments = array();
        foreach (explode("-", $title) as $segment) {
            if (static::nonEmptyString($segment) && $stopWords->check($segment)) {
                $segments[] = $segment;
            }
        }
        $title = implode("-", $segments);

        // strict comparison so that a title of "0" is kept
        return ('' === $title) ? '' : $title . $extension;
    }

    /**
     * getSearchSummary splits a string into string no larger than a
     * specified length, and centered around the first occurrence
     * of any of an array of needles, or starting at the beginning
     * of the string if no needles are specified or found.
     *
     * The string will be broken on spaces and an ellipsis will be
     * added to the string when broken. The ellipsis is not counted
     * in $length.
     *
     * @param string $haystack the string to summarize
     * @param mixed  $needles  search term, array of search terms, or null
     * @param int    $length   maximum character length for the summary
     *
     * @return string a substring of haystack
     */
    public static function getSearchSummary($haystack, $needles = null, $length = 120)
    {
        $haystack = static::asPlainText($haystack);
        $pos = static::getNeedlePositions($haystack, $needles);

        $start = empty($pos) ? 0 : min($pos);

        // back up half the window so the match sits near the middle
        $start = max($start - (int) ($length / 2), 0);

        $pre = ($start > 0); // need an ellipsis in front?
        if (function_exists('mb_strlen')) {
            if ($pre) {
                // we are not at the beginning so find first blank
                $temp = mb_strpos($haystack, ' ', $start, static::ENCODING);
                $start = ($temp === false) ? $start : $temp;
                $haystack = mb_substr(
                    $haystack,
                    $start,
                    mb_strlen($haystack, static::ENCODING),
                    static::ENCODING
                );
            }

            // only text longer than the window is cut; an exact fit is kept whole
            $post = mb_strlen($haystack, static::ENCODING) > $length; // need an ellipsis in back?
            if ($post) {
                $haystack = mb_substr($haystack, 0, $length, static::ENCODING);
                $end = mb_strrpos($haystack, ' ', 0, static::ENCODING);
                if ($end) {
                    $haystack = mb_substr($haystack, 0, $end, static::ENCODING);
                }
            }
        } else {
            if ($pre) {
                // we are not at the beginning so find first blank
                $temp = strpos($haystack, ' ', $start);
                $start = ($temp === false) ? $start : $temp;
                $haystack = substr($haystack, $start);
            }

            // only text longer than the window is cut; an exact fit is kept whole
            $post = strlen($haystack) > $length; // need an ellipsis in back?
            if ($post) {
                $haystack = substr($haystack, 0, $length);
                $end = strrpos($haystack, ' ', 0);
                if ($end) {
                    $haystack = substr($haystack, 0, $end);
                }
            }
        }

        return ($pre ? static::ELLIPSIS : '') . trim($haystack) . ($post ? static::ELLIPSIS : '');
    }

    /**
     * asPlainText - clean string to be plain text, without control characters
     * such as newlines, html markup, or leading trailing or repeating spaces.
     *
     * @param string $rawText a text string to be cleaned
     *
     * @return string
     */
    protected static function asPlainText($rawText)
    {
        $text = static::html2text($rawText);
        $text = static::purifyText($text);

        $text = str_replace(array("\n", "\r"), ' ', $text);
        // collapse runs of spaces into a single space
        $text = preg_replace('/[ ]* [ ]*/', ' ', $text);

        return trim($text);
    }

    /**
     * Length of a string, in characters when mbstring is available,
     * otherwise in bytes.
     *
     * @param string $text string to measure
     *
     * @return int
     */
    protected static function textLength($text)
    {
        if (function_exists('mb_strlen')) {
            return mb_strlen($text, static::ENCODING);
        }
        return strlen($text);
    }

    /**
     * Lower case a string, using mbstring when it is available.
     *
     * @param string $text string to convert
     *
     * @return string
     */
    protected static function lowerCase($text)
    {
        if (function_exists('mb_strtolower')) {
            return mb_strtolower($text, static::ENCODING);
        }
        return strtolower($text);
    }

    /**
     * getNeedlePositions - Essentially this is a strpos() for an array of needles.
     * Given a haystack and an array of needles, return an array of all initial
     * positions, if any, of those needles in that haystack. The search is
     * case insensitive. Empty needles are ignored.
     *
     * @param string $haystack the string to search
     * @param mixed  $needles  search term, array of search terms, or null
     *
     * @return integer[] array of initial positions of substring of haystack
     */
    protected static function getNeedlePositions($haystack, $needles)
    {
        $pos = array();
        $needles = empty($needles) ? array() : (array) $needles;
        foreach ($needles as $needle) {
            if (null === $needle || '' === $needle) {
                // an empty needle warns on PHP < 8 and matches position 0 on PHP 8
                continue;
            }
            if (function_exists('mb_stripos')) {
                $i = mb_stripos($haystack, $needle, 0, static::ENCODING);
            } else {
                $i = stripos($haystack, $needle, 0);
            }
            if ($i !== false) {
                $pos[] = $i; // only store matches
            }
        }
        return $pos;
    }

    /**
     * purifyText
     *
     * Strips tags, decodes HTML entities and replaces a fixed set of
     * punctuation characters with spaces.
     *
     * @param string  $text    text to clean
     * @param boolean $keyword also replace period, comma and apostrophe with white space
     *
     * @return string cleaned text
     */
    protected static function purifyText($text, $keyword = false)
    {
        $text = str_replace(array('&nbsp;', '<br />', '<br/>', '<br'), ' ', $text);
        $text = strip_tags($text);
        // explicit flags and encoding: the defaults differ between PHP versions and ini settings
        $text = html_entity_decode($text, ENT_QUOTES, static::ENCODING);
        $text = htmlspecialchars_decode($text, ENT_QUOTES);

        // array form of str_replace() applies the pairs one after another, in this order.
        // '\n' is single quoted, so it is the two characters backslash and n;
        // real newlines are handled by asPlainText()
        $text = str_replace(
            array(')', '(', ':', '&euro', '&hellip', '&rsquo', '!', '?', '"', '-', '\n', '&#8213;'),
            array(' ', ' ', ' ', ' euro ', '...', ' ', ' ', ' ', ' ', ' ', ' ', ' '),
            $text
        );

        if ($keyword) {
            $text = str_replace(array('.', ',', '\''), ' ', $text);
        }
        $text = str_replace(';', ' ', $text);

        return $text;
    }

    /**
     * html2text
     * This will remove HTML tags, javascript sections and white space. It will also
     * convert some common HTML entities and all decimal numeric entities to their
     * text equivalent in the class encoding. Credits to newbb2
     *
     * @param string $document HTML to be converted
     *
     * @return string Text version of $document parameter
     */
    protected static function html2text($document)
    {
        $document = (string) $document;

        $search = array(
            "'<script[^>]*?>.*?</script>'si", // Strip out javascript
            "'<img.*?/>'si",                  // Strip out img tags
            "'<[\/\!]*?[^<>]*?>'si",          // Strip out HTML tags
            "'([\r\n])[\s]+'",                // Strip out white space
            "'&(quot|#34);'i",                // Replace HTML entities
            "'&(amp|#38);'i",
            "'&(lt|#60);'i",
            "'&(gt|#62);'i",
            "'&(nbsp|#160);'i",
            "'&(iexcl|#161);'i",
            "'&(cent|#162);'i",
            "'&(pound|#163);'i",
            "'&(copy|#169);'i"
        );

        // the last four are the UTF-8 byte sequences for U+00A1, U+00A2, U+00A3, U+00A9;
        // chr() would yield single bytes that are not valid UTF-8
        $replace = array(
            "",
            "",
            "",
            "\\1",
            "\"",
            "&",
            "<",
            ">",
            " ",
            "\xC2\xA1",
            "\xC2\xA2",
            "\xC2\xA3",
            "\xC2\xA9"
        );

        $text = preg_replace($search, $replace, $document);
        if (null === $text) {
            // PCRE error; fall back to the unmodified input
            $text = $document;
        }

        // decode remaining decimal numeric entities; entities that
        // html_entity_decode() cannot decode are left unchanged
        $encoding = static::ENCODING;
        $decoded = preg_replace_callback(
            '/&#(\d+);/',
            function ($matches) use ($encoding) {
                return html_entity_decode($matches[0], ENT_QUOTES, $encoding);
            },
            $text
        );

        return (null === $decoded) ? $text : $decoded;
    }

    /**
     * checkStopWords - look up a word in a list of stop words and
     * classify it as a significant word or a stop word.
     *
     * @param string $key the word to check
     *
     * @return bool True if word is significant, false if it is a stop word
     * @deprecated since v1.2.0 - use Xmf\StopWords::check()
     */
    public static function checkStopWords($key)
    {
        return static::stopWordsObject()->check($key);
    }

    /**
     * Get a StopWords object
     *
     * The object is created on first use and reused on later calls.
     *
     * @return StopWords
     */
    protected static function stopWordsObject()
    {
        static $object;
        if (null === $object) {
            $object = new StopWords();
        }
        return $object;
    }
}
515: