|
111 | 111 | becomes u, ç becomes c. |
112 | 112 | --trim Enables removing newline representations from end and beginning. Newline |
113 | 113 | representations detected are '\\n', '\\r', '\n', '\r', '<br>', and '<br />'. |
| 114 | + --transliterate <language> Transliterate a string to Latin script, for example "իպսում" becomes "ipsum". Language is the |
| 115 | + ISO 639-1 two-letter code of the source language. Examples: ru, sr, uk |
114 | 116 |
|
115 | 117 | Add modules (Modify a line, but keep the original as well): |
116 | 118 | --add-lower If a line contains a capital letter this will add the lower case variant |
|
169 | 171 | from nltk import str2tuple |
170 | 172 | from nltk.tokenize import WhitespaceTokenizer |
171 | 173 | from tqdm import tqdm |
| 174 | +from transliterate import translit |
172 | 175 | from unidecode import unidecode |
173 | 176 |
|
174 | 177 |
|
175 | | -version = '4.5.1' |
| 178 | +version = '4.6.0' |
176 | 179 |
|
177 | 180 | # Search from start to finish for the string $HEX[], with block of a-f0-9 with even number |
178 | 181 | # of hex chars. The first match group is repeated. |
@@ -680,6 +683,23 @@ def clean_cut(line, delimiters, fields): |
680 | 683 | return False, line |
681 | 684 |
|
682 | 685 |
|
| 686 | +def clean_transliterate(line, language): |
| 687 | + """Transliterate a line from the script of the given language into Latin characters |
| 688 | +
|
| 689 | + Params: |
| 690 | + line (Unicode): the line to transliterate |
| 691 | + language (str): language code passed to translit, for example 'ru' |
| 692 | +
|
| 693 | + Returns: |
| 694 | + (bool, Unicode): True and the transliterated line if it changed, otherwise False and the original line |
| 695 | + """ |
| 696 | + cleaned_line = translit(line, language, reversed=True) |
| 697 | + if line != cleaned_line: |
| 698 | + return True, cleaned_line |
| 699 | + else: |
| 700 | + return False, line |
| 701 | + |
| 702 | + |
683 | 703 | def clean_non_ascii(line): |
684 | 704 | """Replace non-ASCII chars with their ASCII representation. |
685 | 705 |
|
@@ -1129,6 +1149,12 @@ def clean_up(lines): |
1129 | 1149 | if status and config['debug']: |
1130 | 1150 | log.append(f'Clean_umlaut; umlaut replaced; {line_decoded}{linesep}') |
1131 | 1151 |
|
| 1152 | + # Transliterate |
| 1153 | + if config.get('transliterate') and not stop: |
| 1154 | + status, line_decoded = clean_transliterate(line_decoded, config.get('transliterate')) |
| 1155 | + if status and config['debug']: |
| 1156 | + log.append(f'Clean_transliterate; translitatered; {line_decoded}{linesep}') |
| 1157 | + |
1132 | 1158 | # Replace non-ascii |
1133 | 1159 | if config.get('non-ascii') and not stop: |
1134 | 1160 | status, line_decoded = clean_non_ascii(line_decoded) |
@@ -1409,6 +1435,7 @@ def main(): |
1409 | 1435 | 'umlaut': False, |
1410 | 1436 | 'non-ascii': False, |
1411 | 1437 | 'title_case': False, |
| 1438 | + 'transliterate': False, |
1412 | 1439 |
|
1413 | 1440 | # Check |
1414 | 1441 | 'length': False, |
@@ -1543,6 +1570,9 @@ def main(): |
1543 | 1570 | if arguments.get('--trim'): |
1544 | 1571 | config['trim'] = True |
1545 | 1572 |
|
| 1573 | + if arguments.get('--transliterate'): |
| 1574 | + config['transliterate'] = arguments.get('--transliterate') |
| 1575 | + |
1546 | 1576 | # Check modules |
1547 | 1577 | if arguments.get('--check-min-length'): |
1548 | 1578 | config['check-length'] = True |
|
0 commit comments