1: <?php
2: /*
3: You may not change or alter any portion of this comment or credits
4: of supporting developers from this source code or any supporting source code
5: which is considered copyrighted (c) material of the original comment or credit authors.
6:
7: This program is distributed in the hope that it will be useful,
8: but WITHOUT ANY WARRANTY; without even the implied warranty of
9: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
10: */
11:
12: /**
13: * Class protector_postcommon_post_language_match
14: *
15: * Check post content conformance to the system language. Requires UTF-8 environment with mbstring.
16: *
17: * This filter compares post data to the characters that define the current system language.
18: * If the number of characters that are not normally used in the language exceeds a threshold,
19: * the post will be rejected.
20: *
21: * The threshold can be adjusted in $maximumTolerance.
22: *
23: * A value of 0.02 (2% non-language characters) can often discriminate between multiple Latin languages,
24: * while values approaching 1.0 (100% non-language) indicate totally different alphabets, such as comparing
25: * English (Latin) to Russian (Cyrillic.) Some commonalities are always possible with "loanwords," so this
26: * number always represents tendency, not absolutes.
27: *
28: * Certain ranges are common to all languages, whitespace, punctuations, currency symbols, emoji, etc.
29: * These are automatically excluded from the analysis.
30: *
31: * If site requirements are for multiple languages concurrently, a $customRange can be set to include
32: * the requirements of both languages.
33: *
34: * Ranges are in regular expression format as used in preg_replace()
35: *
* If the language filter detects a mismatch, the post is denied. If the mismatch is more than double (2 times)
* the configured threshold, the account is also deactivated.
38: *
39: * @category Protector\Filter
40: * @package Protector
41: * @author Richard Griffith <richard@geekwright.com>
42: * @copyright 2016 XOOPS Project (https://xoops.org)
43: * @license GNU GPL 2.0 or later (https://www.gnu.org/licenses/gpl-2.0.html)
44: * @link https://xoops.org
45: */
class Protector_postcommon_post_language_match extends ProtectorFilterAbstract
{
    /** @var int after this number of posts by the user, skip this filter */
    protected $minPosts = 10;

    /** @var float maximum proportion of off-language characters to accept */
    protected $maximumTolerance = 0.02;

    /** @var string|null custom character range to match, null to use default for current language */
    protected $customRange = null;

    /** @var int do not run analysis if input length (in characters) is less than this */
    protected $minLength = 15;

    /** @var string[] script names we do NOT want to process */
    protected $skipThese = array('edituser.php', 'register.php', 'search.php', 'user.php', 'lostpass.php');

    /**
     * Map of XOOPS language name => regex character-class fragment (no enclosing brackets).
     *
     * Each value is spliced into a character class of a /u pattern, so it may mix
     * literal characters, ranges (A-Z) and Unicode script properties (\p{...}).
     *
     * https://php.net/manual/en/regexp.reference.unicode.php
     * http://www.regular-expressions.info/unicode.html
     * http://www.localizingjapan.com/blog/2012/01/20/regular-expressions-for-japanese-text/
     *
     * NOTE: the croatian, polish, slovak and slovenian entries previously contained
     * mis-encoded (mojibake) characters; they now list the real accented letters.
     *
     * @var string[]
     */
    protected $scriptCodes = array(
        'arabic'       => '\p{Arabic}',
        // uppercase tilde letters (Ã, Õ) pair with ã, õ
        'brazilian'    => 'A-Za-zÁáÂâÃãÀàÇçÉéÊêÍíÓóÔôÕõÚú',
        'bulgarian'    => '\p{Cyrillic}',
        'chinese_zh'   => '\p{Han}',
        // no Q, W, X, Y in the Croatian alphabet, but plain Z is a letter
        'croatian'     => 'A-PR-VZa-pr-vzĆćČčĐđŠšŽž',
        'czech'        => 'A-Za-zÁáČčĎďÉéĚěÍíŇňÓóŘřŠšŤťÚúŮůÝýŽž',
        'danish'       => 'A-Za-zÆØÅæøå',
        'dutch'        => 'A-Za-zIJij',
        'english'      => 'A-Za-z',
        'french'       => 'A-Za-zÀàÂâÆæÇçÈèÉéÊêËëÎîÏïÔôŒœÙùÛûÜü',
        'german'       => 'A-Za-zÄäÉéÖöÜüß',
        'greek'        => '\p{Greek}',
        'hebrew'       => '\p{Hebrew}',
        'hungarian'    => '\p{Latin}',
        'italian'      => 'A-IL-VZa-il-vzÀÈÉÌÒÙàèéìòù',
        'japanese'     => '\p{Han}\p{Hiragana}\p{Katakana}',
        'korean'       => '\p{Hangul}',
        'malaysian'    => 'A-Za-z',
        'norwegian'    => 'A-Za-zÆØÅæøå',
        'persian'      => '\p{Arabic}',
        'polish'       => 'A-Za-zĄąĘęÓóĆćŁłŃńŚśŹźŻż',
        'portuguesebr' => 'A-Za-zÁáÂâÃãÀàÇçÉéÊêÍíÓóÔôÕõÚú',
        'portuguese'   => 'A-Za-zÁáÂâÃãÀàÇçÉéÊêÍíÓóÔôÕõÚú',
        'russian'      => '\p{Cyrillic}',
        'schinese'     => '\p{Han}',
        'slovak'       => 'A-Za-zÁáČčĎďÉéÍíĹĺĽľŇňÓóÔôŔŕŠšŤťÚúÝýŽž',
        'slovenian'    => 'A-PR-VZa-pr-vzČčŠšŽž',
        'spanish'      => 'A-Za-zÁáÉéÍíÑñÓóÚúÜü',
        'swedish'      => 'A-Za-zÅåÄäÖö',
        'tchinese'     => '\p{Han}',
        'thai'         => '\p{Thai}',
        // includes dotless lowercase ı (U+0131); plain i is already covered by a-p
        'turkish'      => 'A-PR-VYZÇĞİÖŞÜÂÎÛa-pr-vyzçğıöşüâîû',
        'vietnamese'   => 'A-Za-zàÀảẢãÃáÁạẠăĂằẰẳẲẵẴắẮặẶâÂầẦẩẨẫẪấẤậẬđĐèÈẻẺẽẼéÉẹẸêÊềỀểỂễỄếẾệỆìÌỉỈĩĨíÍịỊòÒỏỎõÕóÓọỌôÔồỒổỔỗỖốỐộỘơƠờỜởỞỡỠớỚợỢùÙủỦũŨúÚụỤưƯừỪửỬữỮứỨựỰỳỲỷỶỹỸýÝỵỴ',
    );

    /**
     * stripEmoji - remove pictographic characters, i.e. emoji and dingbats from a string
     *
     * @param string $string UTF-8 encoded string
     *
     * @return string without pictographs; the input is returned unchanged if the
     *                regular expression engine reports an error (e.g. invalid UTF-8)
     */
    protected function stripEmoji($string)
    {
        $stripped = preg_replace('/([0-9#][\x{20E3}])|[\x{00ae}\x{00a9}\x{203C}\x{2047}\x{2048}\x{2049}\x{3030}\x{303D}\x{2139}\x{2122}\x{3297}\x{3299}][\x{FE00}-\x{FEFF}]?|[\x{2190}-\x{21FF}][\x{FE00}-\x{FEFF}]?|[\x{2300}-\x{23FF}][\x{FE00}-\x{FEFF}]?|[\x{2460}-\x{24FF}][\x{FE00}-\x{FEFF}]?|[\x{25A0}-\x{25FF}][\x{FE00}-\x{FEFF}]?|[\x{2600}-\x{27BF}][\x{FE00}-\x{FEFF}]?|[\x{2900}-\x{297F}][\x{FE00}-\x{FEFF}]?|[\x{2B00}-\x{2BF0}][\x{FE00}-\x{FEFF}]?|[\x{1F000}-\x{1F6FF}][\x{FE00}-\x{FEFF}]?/u', '', $string);

        // preg_replace() yields null on failure; never hand null back to the caller
        return ($stripped === null) ? $string : $stripped;
    }

    /**
     * Execute the filter
     *
     * Concatenates all string values in $_POST, removes emoji, common characters and the
     * characters of the configured language, and rejects the request (outputs a message and
     * exits) when the proportion of remaining characters exceeds $maximumTolerance.
     *
     * @return bool true when the request is allowed to continue; does not return on rejection
     */
    public function execute()
    {
        /** @var XoopsUser $xoopsUser */
        global $xoopsUser;

        // the analysis is meaningless without multibyte string support
        if (!function_exists('mb_strlen')) {
            return true;
        }

        // we only check POST transactions
        if (!isset($_SERVER['REQUEST_METHOD']) || $_SERVER['REQUEST_METHOD'] !== 'POST') {
            return true;
        }

        // don't process for admin and experienced users
        if (is_object($xoopsUser) && ($xoopsUser->isAdmin() || $this->minPosts < $xoopsUser->posts())) {
            return true;
        }

        $uid = is_object($xoopsUser) ? $xoopsUser->uid() : 0;

        // skip register.php and edituser.php updates (your name is your name)
        $script = isset($_SERVER['SCRIPT_FILENAME']) ? basename($_SERVER['SCRIPT_FILENAME']) : '';
        if (in_array($script, $this->skipThese, true)) {
            return true;
        }

        // get all strings from $_POST
        $testString = '';
        foreach ($_POST as $postData) {
            // dare to ignore arrays/objects
            if (is_string($postData)) {
                $testString .= $postData;
            }
        }

        // not big enough to analyse effectively
        if (mb_strlen($testString, 'UTF-8') < $this->minLength) {
            return true;
        }

        // custom range wins; otherwise use the range for the system language, falling
        // back to the whole Latin script when the language is not in the map
        if (!empty($this->customRange)) {
            $range = $this->customRange;
        } else {
            $language = isset($GLOBALS['xoopsConfig']['language']) ? $GLOBALS['xoopsConfig']['language'] : '';
            $range = isset($this->scriptCodes[$language]) ? $this->scriptCodes[$language] : '\p{Latin}';
        }

        // remove emoji from computations (a smilie cat is universal)
        $testString = $this->stripEmoji($testString);

        // what is left after removing common and in-language characters is "off-language"
        $reduced = preg_replace('/[\p{Common}' . $range . ']+/u', '', $testString);
        if ($reduced === null) {
            // PCRE could not process the input (e.g. invalid UTF-8 or a bad custom range);
            // as before, such input is not scored and the request is let through
            return true;
        }

        $remainingLength = (float) mb_strlen($reduced, 'UTF-8');
        $fullLength = (float) mb_strlen($testString, 'UTF-8');
        $percent = ($fullLength > 0) ? $remainingLength / $fullLength : 0.0;

        if ($percent > $this->maximumTolerance) {
            $report = array(
                'score' => $percent,
                'uri'   => isset($_SERVER['REQUEST_URI']) ? $_SERVER['REQUEST_URI'] : '',
                'post'  => $_POST,
            );
            $this->protector->message = json_encode($report);
            $this->protector->output_log('SPAM Language Map', $uid);
            // a logged-in user exceeding twice the tolerance is deactivated, otherwise just purge
            if ($uid > 0 && $percent > (2.0 * $this->maximumTolerance)) {
                $this->protector->deactivateCurrentUser();
                $this->protector->_should_be_banned_time0 = true;
            } else {
                $this->protector->purgeNoExit();
            }
            // write any message as you like
            echo 'Your post has been denied. '
                . 'If you feel this is in error, please contact the site administrator.';
            exit;
        }

        return true;
    }
}
198: