From 9d80d2cd0e78c6026c34d409fda2b87a6a2e2dfa Mon Sep 17 00:00:00 2001
From: Peter Schneider-Kamp
Date: Tue, 9 May 2023 11:37:25 +0200
Subject: [PATCH] added danish, faroese, french, icelandic, italian, norwegian, scandinavian, spanish, swedish

---
 cleantext/clean.py    |  8 ++++----
 cleantext/specials.py | 44 +++++++++++++++++++++++++++++++++++++++----
 2 files changed, 44 insertions(+), 8 deletions(-)

diff --git a/cleantext/clean.py b/cleantext/clean.py
index 50ddcb8..bb192c3 100644
--- a/cleantext/clean.py
+++ b/cleantext/clean.py
@@ -11,7 +11,7 @@ from ftfy import fix_text
 
 from . import constants
-from .specials import save_replace
+from .specials import save_replace, specials_map
 from .utils import remove_substrings
 
 log = logging.getLogger()
 
@@ -78,13 +78,13 @@ def to_ascii_unicode(text, lang="en", no_emoji=False):
     lang = lang.lower()
 
     # special handling for German text to preserve umlauts
-    if lang == "de":
+    if lang in specials_map:
         text = save_replace(text, lang=lang)
 
     text = unidecode(text)
 
     # important to remove utility characters
-    if lang == "de":
+    if lang in specials_map:
         text = save_replace(text, lang=lang, back=True)
 
     if not no_emoji:
@@ -253,7 +253,7 @@ def clean(
         replace_with_digit (str): special DIGIT token, default "0",
         replace_with_currency_symbol (str): special CURRENCY token, default "",
         replace_with_punct (str): replace punctuations with this token, default "",
-        lang (str): special language-depended preprocessing. Besides the default English ('en'), only German ('de') is supported
+        lang (str): special language-dependent preprocessing. Besides the default English ('en'), Danish ('da'), Faroese ('fo'), French ('fr'), German ('de'), Icelandic ('is'), Italian ('it'), Norwegian ('no'), Scandinavian ('sv'), Spanish ('es'), and Swedish ('se') are supported
 
     Returns:
         str: input ``text`` processed according to function args
diff --git a/cleantext/specials.py b/cleantext/specials.py
index 7d47da8..2429835 100644
--- a/cleantext/specials.py
+++ b/cleantext/specials.py
@@ -5,10 +5,46 @@ import unicodedata
 
 # add new languages here
-specials = {
+specials_map = {
     "de": {
         "case_insensitive": [["ä", "ae"], ["ü", "ue"], ["ö", "oe"]],
         "case_sensitive": [["ß", "ss"]],
+    },
+    "da": {
+        "case_insensitive": [["é", "e"], ["æ", "ae"], ["ø", "oe"], ["å", "aa"]],
+        "case_sensitive": [],
+    },
+    "es": {
+        "case_insensitive": [["á", "a"], ["é", "e"], ["í", "i"], ["ó", "o"], ["ú", "u"], ["ñ", "n"]],
+        "case_sensitive": [],
+    },
+    "fo": {
+        "case_insensitive": [["á", "a"], ["ð", "d"], ["í", "i"], ["ó", "o"], ["ú", "u"], ["æ", "ae"], ["ø", "oe"]],
+        "case_sensitive": [],
+    },
+    "fr": {
+        "case_insensitive": [["é", "e"], ["à", "a"], ["è", "e"], ["ù", "u"], ["â", "a"], ["ê", "e"], ["î", "i"], ["ô", "o"], ["û", "u"], ["ë", "e"], ["ï", "i"], ["ü", "u"], ["ÿ", "y"], ["ç", "c"]],
+        "case_sensitive": [],
+    },
+    "is": {
+        "case_insensitive": [["á", "a"], ["ð", "d"], ["é", "e"], ["í", "i"], ["ó", "o"], ["ú", "u"], ["ý", "y"], ["þ", "th"], ["æ", "ae"], ["ö", "oe"]],
+        "case_sensitive": [],
+    },
+    "it": {
+        "case_insensitive": [["á", "a"], ["é", "e"], ["í", "i"], ["ó", "o"], ["ú", "u"], ["à", "a"], ["è", "e"], ["ì", "i"], ["ò", "o"], ["ù", "u"]],
+        "case_sensitive": [],
+    },
+    "no": {
+        "case_insensitive": [["é", "e"], ["ó", "o"], ["è", "e"], ["ò", "o"], ["ù", "u"], ["ê", "e"], ["ô", "o"], ["æ", "ae"], ["ø", "oe"], ["å", "aa"]],
+        "case_sensitive": [],
+    },
+    "sv": {
+        "case_insensitive": [["á", "a"], ["é", "e"], ["í", "i"], ["ó", "o"], ["ú", "u"], ["è", "e"], ["ý", "y"], ["ò", "o"], ["ù", "u"], ["ê", "e"], ["ô", "o"], ["ð", "d"], ["þ", "th"], ["æ", "ae"], ["ø", "oe"], ["å", "aa"], ["ä", "ae"], ["ö", "oe"]],
+        "case_sensitive": [],
+    },
+    "se": {
+        "case_insensitive": [["å", "aa"], ["ä", "ae"], ["ö", "oe"]],
+        "case_sensitive": [],
     }
 }
 escape_sequence = "xxxxx"
 
@@ -26,11 +62,11 @@ def save_replace(text, lang, back=False):
     text = norm(text)
 
     possibilities = (
-        specials[lang]["case_sensitive"]
-        + [[norm(x[0]), x[1]] for x in specials[lang]["case_insensitive"]]
+        specials_map[lang]["case_sensitive"]
+        + [[norm(x[0]), x[1]] for x in specials_map[lang]["case_insensitive"]]
         + [
             [norm(x[0].upper()), x[1].upper()]
-            for x in specials[lang]["case_insensitive"]
+            for x in specials_map[lang]["case_insensitive"]
        ]
     )
    for pattern, target in possibilities: