Adding transliterate feature (#69)

zyronix · web-flow · commit fbea76ce89c5 · 2025-07-24T10:33:32.000+02:00
* Adding transliterate feature
diff --git a/bin/demeuk.py b/bin/demeuk.py
@@ -111,6 +111,8 @@
                                         becomes u, ç becomes c.
         --trim                          Enables removing newlines representations from end and beginning. Newline
                                         representations detected are '\\n', '\\r', '\n', '\r', '<br>', and '<br />'.
+        --transliterate <language>      Transliterate a string to Latin script, for example "իպսում" becomes "ipsum".
+                                        Language is a two-letter language code. Examples: ru, sr, uk
 
     Add modules (Modify a line, but keep the original as well):
         --add-lower                     If a line contains a capital letter this will add the lower case variant
@@ -169,10 +171,11 @@
 from nltk import str2tuple
 from nltk.tokenize import WhitespaceTokenizer
 from tqdm import tqdm
+from transliterate import translit
 from unidecode import unidecode
 
 
-version = '4.5.1'
+version = '4.6.0'
 
 # Search from start to finish for the string $HEX[], with block of a-f0-9 with even number
 # of hex chars. The first match group is repeated.
@@ -680,6 +683,23 @@ def clean_cut(line, delimiters, fields):
         return False, line
 
 
+def clean_transliterate(line, language):
+    """Transliterate a string
+
+    Params:
+        line (Unicode)
+        language (str)
+
+    Returns:
+        (bool, Unicode): True and the transliterated line if it changed, otherwise False and the original line
+    """
+    cleaned_line = translit(line, language, reversed=True)
+    if line != cleaned_line:
+        return True, cleaned_line
+    else:
+        return False, line
+
+
 def clean_non_ascii(line):
     """Replace non ascii chars with there ascii representation.
 
@@ -1129,6 +1149,12 @@ def clean_up(lines):
             if status and config['debug']:
                 log.append(f'Clean_umlaut; umlaut replaced; {line_decoded}{linesep}')
 
+        # Transliterate
+        if config.get('transliterate') and not stop:
+            status, line_decoded = clean_transliterate(line_decoded, config.get('transliterate'))
+            if status and config['debug']:
+                log.append(f'Clean_transliterate; translitatered; {line_decoded}{linesep}')
+
         # Replace non-ascii
         if config.get('non-ascii') and not stop:
             status, line_decoded = clean_non_ascii(line_decoded)
@@ -1409,6 +1435,7 @@ def main():
         'umlaut': False,
         'non-ascii': False,
         'title_case': False,
+        'transliterate': False,
 
         # Check
         'length': False,
@@ -1543,6 +1570,9 @@ def main():
     if arguments.get('--trim'):
         config['trim'] = True
 
+    if arguments.get('--transliterate'):
+        config['transliterate'] = arguments.get('--transliterate')
+
     # Check modules
     if arguments.get('--check-min-length'):
         config['check-length'] = True
diff --git a/docs/usage.rst b/docs/usage.rst
@@ -418,6 +418,17 @@ https://pypi.org/project/Unidecode/
 
 For example a line like 'kožušček' is replaced to kozuscek.
 
+transliterate
+~~~~~~~~~~~~~
+Replaces characters of a non-Latin script (such as Cyrillic) with their Latin equivalents. For example, жута becomes
+žuta. To take this even further, combine it with --non-ascii to convert this to zuta.
+
+The following languages are supported: ka, sr, l1, ru, mn, uk, mk, el, hy and bg
+
+--transliterate ru
+
+Check https://pypi.org/project/transliterate/ for more details.
+
 lowercase
 ~~~~~~~~~~
 Replace lines like 'Test Test Test' to 'test test test'. Basically lowercasing all
diff --git a/requirements.txt b/requirements.txt
@@ -4,3 +4,4 @@ nltk
 ftfy
 unidecode
 tqdm
+transliterate
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -397,3 +397,7 @@
 with open('testdata/input54', 'w') as file:
     file.write(f'Golf Trip{linesep}')
     file.write(f'Sequences{linesep}')
+
+with open('testdata/input55', 'w') as file:
+    file.write(f'здраво пријатељу{linesep}')
+    file.write(f'жута банана{linesep}')
diff --git a/tests/test_app.py b/tests/test_app.py
@@ -1003,3 +1003,19 @@ def test_infinite_loop():
     assert 'Sequences' in filecontent
     assert 'golf trip' in filecontent
     assert 'sequences' in filecontent
+
+
+def test_transliterate():
+    testargs = [
+        'demeuk', '-i', 'testdata/input55', '-o', 'testdata/output55', '-l', 'testdata/log55',
+        '--transliterate', 'sr', '--non-ascii'
+    ]
+
+    with patch.object(sys, 'argv', testargs):
+        main()
+
+    with open('testdata/output55') as f:
+        filecontent = f.read()
+
+    assert 'zdravo prijatelju' in filecontent
+    assert 'zuta banana' in filecontent