Skip to content

Commit fbea76c

Browse files
authored
Adding transliterate feature (#69)
* Adding transliterate feature
1 parent 9804cc2 commit fbea76c

File tree

5 files changed

+63
-1
lines changed

5 files changed

+63
-1
lines changed

bin/demeuk.py

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,8 @@
111111
becomes u, ç becomes c.
112112
--trim Enables removing newlines representations from end and beginning. Newline
113113
representations detected are '\\n', '\\r', '\n', '\r', '<br>', and '<br />'.
114+
--transliterate <language> Transliterate a string to Latin script, for example "իպսում" becomes "ipsum". Language is iso
115+
2 letter code. Examples: ru, sr, uk
114116
115117
Add modules (Modify a line, but keep the original as well):
116118
--add-lower If a line contains a capital letter this will add the lower case variant
@@ -169,10 +171,11 @@
169171
from nltk import str2tuple
170172
from nltk.tokenize import WhitespaceTokenizer
171173
from tqdm import tqdm
174+
from transliterate import translit
172175
from unidecode import unidecode
173176

174177

175-
version = '4.5.1'
178+
version = '4.6.0'
176179

177180
# Search from start to finish for the string $HEX[], with block of a-f0-9 with even number
178181
# of hex chars. The first match group is repeated.
@@ -680,6 +683,23 @@ def clean_cut(line, delimiters, fields):
680683
return False, line
681684

682685

686+
def clean_transliterate(line, language):
    """Transliterate a line from the script of the given language.

    The call passes reversed=True to translit, i.e. the line is converted
    from the language's own script to its Latin representation.

    Params:
        line (Unicode)
        language (str): language code handed to translit

    Returns:
        (bool, Unicode): True and the transliterated line when the text
        changed, otherwise False and the untouched line
    """
    transliterated = translit(line, language, reversed=True)
    changed = transliterated != line
    return changed, (transliterated if changed else line)
701+
702+
683703
def clean_non_ascii(line):
684704
"""Replace non ascii chars with there ascii representation.
685705
@@ -1129,6 +1149,12 @@ def clean_up(lines):
11291149
if status and config['debug']:
11301150
log.append(f'Clean_umlaut; umlaut replaced; {line_decoded}{linesep}')
11311151

1152+
# Transliterate
1153+
if config.get('transliterate') and not stop:
1154+
status, line_decoded = clean_transliterate(line_decoded, config.get('transliterate'))
1155+
if status and config['debug']:
1156+
log.append(f'Clean_transliterate; translitatered; {line_decoded}{linesep}')
1157+
11321158
# Replace non-ascii
11331159
if config.get('non-ascii') and not stop:
11341160
status, line_decoded = clean_non_ascii(line_decoded)
@@ -1409,6 +1435,7 @@ def main():
14091435
'umlaut': False,
14101436
'non-ascii': False,
14111437
'title_case': False,
1438+
'transliterate': False,
14121439

14131440
# Check
14141441
'length': False,
@@ -1543,6 +1570,9 @@ def main():
15431570
if arguments.get('--trim'):
15441571
config['trim'] = True
15451572

1573+
if arguments.get('--transliterate'):
1574+
config['transliterate'] = arguments.get('--transliterate')
1575+
15461576
# Check modules
15471577
if arguments.get('--check-min-length'):
15481578
config['check-length'] = True

docs/usage.rst

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -418,6 +418,17 @@ https://pypi.org/project/Unidecode/
418418

419419
For example a line like 'kožušček' is replaced to kozuscek.
420420

421+
transliterate
422+
~~~~~~~~~~~~~
423+
Replaces characters from a non-Latin script (for example Cyrillic) with their Latin equivalents. For example, жута becomes žuta. To take this even further,
424+
combine it with --non-ascii to convert this to zuta.
425+
426+
The following languages are supported: ka, sr, l1, ru, mn, uk, mk, el, hy and bg.
427+
428+
--transliterate ru
429+
430+
Check https://pypi.org/project/transliterate/ for more details.
431+
421432
lowercase
422433
~~~~~~~~~~
423434
Replace lines like 'Test Test Test' to 'test test test'. Basically lowercasing all

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,4 @@ nltk
44
ftfy
55
unidecode
66
tqdm
7+
transliterate

tests/conftest.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -397,3 +397,7 @@
397397
with open('testdata/input54', 'w') as file:
398398
file.write(f'Golf Trip{linesep}')
399399
file.write(f'Sequences{linesep}')
400+
401+
with open('testdata/input55', 'w') as file:
402+
file.write(f'здраво пријатељу{linesep}')
403+
file.write(f'жута банана{linesep}')

tests/test_app.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1003,3 +1003,19 @@ def test_infinite_loop():
10031003
assert 'Sequences' in filecontent
10041004
assert 'golf trip' in filecontent
10051005
assert 'sequences' in filecontent
1006+
1007+
1008+
def test_transliterate():
    """Run demeuk with --transliterate sr and --non-ascii on input55.

    Both lines of the input are expected to show up in the output file as
    plain ASCII Latin text.
    """
    argv = [
        'demeuk',
        '-i', 'testdata/input55',
        '-o', 'testdata/output55',
        '-l', 'testdata/log55',
        '--transliterate', 'sr',
        '--non-ascii',
    ]

    with patch.object(sys, 'argv', argv):
        main()

    with open('testdata/output55') as output_file:
        produced = output_file.read()

    for expected in ('zdravo prijatelju', 'zuta banana'):
        assert expected in produced

0 commit comments

Comments
 (0)