|
| 1 | +<?php |
/**
 * Collapses text into a canonical lowercase form for dictionary matching.
 *
 * Steps, in order:
 *  1. lowercase the input (UTF-8 aware);
 *  2. fold 'ё' into 'е', and replace Latin look-alike letters and common
 *     digit/symbol stand-ins with Cyrillic letters;
 *  3. remove every character that is not a Unicode letter or number, which
 *     covers whitespace, punctuation and any remaining symbols.
 *
 * NOTE(review): the targets for 'j', 'q' and '!' are kept exactly as they were
 * and do not look like letters of the Russian alphabet — confirm the intent.
 * Latin letters without an entry below (for example 'u', 's', 'r', 'z') are
 * left untouched.
 *
 * @param string $text Raw text, expected to be UTF-8 encoded.
 *
 * @return string The normalized text, consisting of letters and digits only.
 *
 * @throws \RuntimeException If the final PCRE cleanup step fails.
 */
function normalizeRussianText(string $text): string
{
    // Lowercasing comes first, so only lowercase keys are needed in the map.
    $text = mb_strtolower($text, 'UTF-8');

    // Applied in a single strtr() pass: strtr() never rescans its own output
    // and no value below is also a key, so this equals applying the groups
    // one after another.
    $substitutions = [
        'ё' => 'е',

        // Latin letters that resemble (or stand in for) Cyrillic ones.
        'a' => 'а',
        'b' => 'б',
        'c' => 'с',
        'd' => 'д',
        'e' => 'е',
        'f' => 'ф',
        'g' => 'г',
        'h' => 'н',
        'i' => 'и',
        'j' => 'ј',
        'k' => 'к',
        'l' => 'л',
        'm' => 'м',
        'n' => 'п',
        'o' => 'о',
        'p' => 'р',
        'q' => 'қ',
        't' => 'т',
        'v' => 'в',
        'w' => 'ш',
        'x' => 'х',
        'y' => 'у',

        // Digits and symbols commonly used in place of letters.
        '@' => 'а',
        '€' => 'е',
        '£' => 'л',
        '₽' => 'р',
        '0' => 'о',
        '3' => 'з',
        '4' => 'ч',
        '6' => 'б',
        '1' => 'л',
        // Cyrillic 'с': a Latin 's' here could never match Cyrillic entries.
        '$' => 'с',
        '|' => 'л',
        '!' => 'і',
    ];
    $text = strtr($text, $substitutions);

    // Plain punctuation needs no map entry: everything that is not a letter
    // or a number is removed here.
    $clean = preg_replace('/[^\\p{L}\\p{N}]+/u', '', $text);
    if ($clean === null) {
        // preg_replace() signals failure with null, which would otherwise
        // violate the declared string return type.
        throw new \RuntimeException(
            'Failed to normalize text (PCRE error code ' . preg_last_error() . ').'
        );
    }

    return $clean;
}
| 74 | + |
/**
 * Finds dictionary entries that occur in the given text.
 *
 * The text is passed through normalizeRussianText(), then every non-empty
 * entry returned by the ru.php file located next to this one is looked up as
 * a substring of the normalized text. When at least one entry matches, a
 * summary line is written with error_log().
 *
 * NOTE(review): entries are compared verbatim against the normalized text, so
 * ru.php presumably stores them already in normalized form (lowercase, 'е'
 * instead of 'ё', no punctuation) — confirm.
 *
 * @param string $text Text to scan.
 * @param string $path Optional file name to mention in the log message.
 *
 * @return array|null Matched entries without duplicates, indexed from 0, or
 *                    null when nothing matched.
 *
 * @throws \RuntimeException If ru.php cannot be loaded as an array.
 */
function filterRussianProfanity(string $text, string $path = ''): ?array
{
    $profanities = include __DIR__ . '/ru.php';
    if (!is_array($profanities)) {
        // include yields false when the file is missing; without this check
        // the text would be reported as clean.
        throw new \RuntimeException(
            "Profanity dictionary '" . __DIR__ . "/ru.php' did not return an array."
        );
    }

    $normalizedText = normalizeRussianText($text);

    $found = [];
    foreach ($profanities as $badWord) {
        if ($badWord !== '' && mb_strpos($normalizedText, $badWord) !== false) {
            $found[] = $badWord;
        }
    }

    if ($found === []) {
        return null;
    }

    // array_unique() keeps the original keys; re-index so callers receive a
    // gap-free list.
    $uniqueWords = array_values(array_unique($found));

    $message = "Profanity detected";
    if ($path !== '') {
        $message .= " in file '{$path}'";
    }
    $message .= ": [" . implode(', ', $uniqueWords) . "]";

    error_log($message);

    return $uniqueWords;
}
| 102 | + |
/**
 * Fails when any line of a file contains a dictionary entry.
 *
 * Each line of the file is passed through normalizeRussianText() and checked
 * for every non-empty entry returned by the ru.php file located next to this
 * one. The 1-based numbers of all offending lines are collected and reported
 * together in a single exception.
 *
 * @param string $filePath Path of the file to check.
 *
 * @return void
 *
 * @throws \Exception If the file cannot be read, if ru.php cannot be loaded
 *                    as an array, or if at least one line matches an entry.
 */
function assertNoRussianProfanity(string $filePath): void
{
    // file() returns false for an unreadable path; without this check the
    // assertion would pass without having inspected anything.
    $lines = is_readable($filePath) ? file($filePath) : false;
    if ($lines === false) {
        throw new \Exception("Unable to read '{$filePath}' for the profanity check.");
    }

    $badWords = include __DIR__ . '/ru.php';
    if (!is_array($badWords)) {
        throw new \Exception(
            "Profanity dictionary '" . __DIR__ . "/ru.php' did not return an array."
        );
    }

    $offenses = [];
    foreach ($lines as $num => $line) {
        $normalizedLine = normalizeRussianText($line);
        foreach ($badWords as $bad) {
            if ($bad !== '' && mb_strpos($normalizedLine, $bad) !== false) {
                // Report each line once, numbered from 1.
                $offenses[] = $num + 1;
                break;
            }
        }
    }

    if ($offenses !== []) {
        $lineList = implode(', ', $offenses);
        $message = "Expecting '{$filePath}' to not use profanity.\nat {$filePath}:{$lineList}";
        throw new \Exception($message);
    }
}
0 commit comments