|
18 | 18 | *
|
19 | 19 | * To generate a regular expression pattern:
|
20 | 20 | * php -q extractor.php stopwords_en_US.txt -p
|
| 21 | + * |
| 22 | + * To generate a regular expression pattern from a PHP array: |
| 23 | + * php -q extractor.php en_US.php -p |
21 | 24 | */
|
22 | 25 |
|
23 | 26 | /**
|
@@ -105,6 +108,11 @@ function load_stopwords($stopwords_file)
|
105 | 108 | return array_fill_keys($stopwords, true);
|
106 | 109 | }
|
107 | 110 |
|
| 111 | + if ($ext === 'php') { |
| 112 | + $stopwords = require $stopwords_file; |
| 113 | + return array_fill_keys($stopwords, true); |
| 114 | + } |
| 115 | + |
108 | 116 | return [];
|
109 | 117 | }
|
110 | 118 |
|
@@ -155,10 +163,20 @@ function render_pattern_output(array $stopwords)
|
155 | 163 | $regex = [];
|
156 | 164 |
|
157 | 165 | foreach ($stopwords as $word) {
|
158 |
| - $regex[] = '\b' . $word . '\b'; |
| 166 | + if (mb_strlen($word) === 1) { |
| 167 | + // This pattern allows for words such as a-class and j'aimerais; however, |
| 168 | + // words such as day-z will be broken up into day- and the z will go |
| 169 | + // missing. A possible workaround is to set the pattern as: |
| 170 | + // '\b(?!-)' . $word . '(?!(-|\'))\b' |
| 171 | + // but then two-character words such as WA will also be stripped out. |
| 172 | + $regex[] = '\b' . $word . '(?!(-|\'))\b'; |
| 173 | + // $regex[] = '\b(?!-)' . $word . '(?!(-|\'))\b'; |
| 174 | + } else { |
| 175 | + $regex[] = '\b' . $word . '\b'; |
| 176 | + } |
159 | 177 | }
|
160 | 178 |
|
161 |
| - echo '/' . implode('|', $regex) . '/iu' . "\n"; |
| 179 | + echo '/' . implode('|', $regex) . '/i' . "\n"; |
162 | 180 | }
|
163 | 181 |
|
164 | 182 | /**
|
|
0 commit comments