From 9d80d2cd0e78c6026c34d409fda2b87a6a2e2dfa Mon Sep 17 00:00:00 2001
From: Peter Schneider-Kamp
Date: Tue, 9 May 2023 11:37:25 +0200
Subject: [PATCH] added danish, faroese, french, icelandic, italian, norwegian, scandinavian, spanish, swedish

---
 cleantext/clean.py    |  8 ++++----
 cleantext/specials.py | 44 +++++++++++++++++++++++++++++++++++++++----
 2 files changed, 44 insertions(+), 8 deletions(-)

diff --git a/cleantext/clean.py b/cleantext/clean.py
index 50ddcb8..bb192c3 100644
--- a/cleantext/clean.py
+++ b/cleantext/clean.py
@@ -11,7 +11,7 @@ from ftfy import fix_text
 
 from . import constants
-from .specials import save_replace
+from .specials import save_replace, specials_map
 from .utils import remove_substrings
 
 log = logging.getLogger()
 
@@ -78,13 +78,13 @@ def to_ascii_unicode(text, lang="en", no_emoji=False):
     lang = lang.lower()
 
     # special handling for German text to preserve umlauts
-    if lang == "de":
+    if lang in specials_map:
         text = save_replace(text, lang=lang)
 
     text = unidecode(text)
 
     # important to remove utility characters
-    if lang == "de":
+    if lang in specials_map:
         text = save_replace(text, lang=lang, back=True)
 
     if not no_emoji:
@@ -253,7 +253,7 @@ def clean(
         replace_with_digit (str): special DIGIT token, default "0",
         replace_with_currency_symbol (str): special CURRENCY token, default "",
         replace_with_punct (str): replace punctuations with this token, default "",
-        lang (str): special language-depended preprocessing. Besides the default English ('en'), only German ('de') is supported
+        lang (str): special language-dependent preprocessing. Besides the default English ('en'), Danish ('da'), Faroese ('fo'), French ('fr'), German ('de'), Icelandic ('is'), Italian ('it'), Norwegian ('no'), Scandinavian ('sv'), Spanish ('es'), and Swedish ('se') are supported
 
     Returns:
         str: input ``text`` processed according to function args
diff --git a/cleantext/specials.py b/cleantext/specials.py
index 7d47da8..2429835 100644
--- a/cleantext/specials.py
+++ b/cleantext/specials.py
@@ -5,10 +5,46 @@ import unicodedata
 
 # add new languages here
-specials = {
+specials_map = {
     "de": {
         "case_insensitive": [["ä", "ae"], ["ü", "ue"], ["ö", "oe"]],
         "case_sensitive": [["ß", "ss"]],
+    },
+    "da": {
+        "case_insensitive": [["é", "e"], ["æ", "ae"], ["ø", "oe"], ["å", "aa"]],
+        "case_sensitive": [],
+    },
+    "es": {
+        "case_insensitive": [["á", "a"], ["é", "e"], ["í", "i"], ["ó", "o"], ["ú", "u"], ["ñ", "n"]],
+        "case_sensitive": [],
+    },
+    "fo": {
+        "case_insensitive": [["á", "a"], ["ð", "d"], ["í", "i"], ["ó", "o"], ["ú", "u"], ["æ", "ae"], ["ø", "oe"]],
+        "case_sensitive": [],
+    },
+    "fr": {
+        "case_insensitive": [["é", "e"], ["à", "a"], ["è", "e"], ["ù", "u"], ["â", "a"], ["ê", "e"], ["î", "i"], ["ô", "o"], ["û", "u"], ["ë", "e"], ["ï", "i"], ["ü", "u"], ["ÿ", "y"], ["ç", "c"]],
+        "case_sensitive": [],
+    },
+    "is": {
+        "case_insensitive": [["á", "a"], ["ð", "d"], ["é", "e"], ["í", "i"], ["ó", "o"], ["ú", "u"], ["ý", "y"], ["þ", "th"], ["æ", "ae"], ["ö", "oe"]],
+        "case_sensitive": [],
+    },
+    "it": {
+        "case_insensitive": [["á", "a"], ["é", "e"], ["í", "i"], ["ó", "o"], ["ú", "u"], ["à", "a"], ["è", "e"], ["ì", "i"], ["ò", "o"], ["ù", "u"]],
+        "case_sensitive": [],
+    },
+    "no": {
+        "case_insensitive": [["é", "e"], ["ó", "o"], ["è", "e"], ["ò", "o"], ["ù", "u"], ["ê", "e"], ["ô", "o"], ["æ", "ae"], ["ø", "oe"], ["å", "aa"]],
+        "case_sensitive": [],
+    },
+    "sv": {
+        "case_insensitive": [["á", "a"], ["é", "e"], ["í", "i"], ["ó", "o"], ["ú", "u"], ["è", "e"], ["ý", "y"], ["ò", "o"], ["ù", "u"], ["ê", "e"], ["ô", "o"], ["ð", "d"], ["þ", "th"], ["æ", "ae"], ["ø", "oe"], ["å", "aa"], ["ä", "ae"], ["ö", "oe"]],
+        "case_sensitive": [],
+    },
+    "se": {
+        "case_insensitive": [["å", "aa"], ["ä", "ae"], ["ö", "oe"]],
+        "case_sensitive": [],
     }
 }
 escape_sequence = "xxxxx"
 
@@ -26,11 +62,11 @@ def save_replace(text, lang, back=False):
     text = norm(text)
 
     possibilities = (
-        specials[lang]["case_sensitive"]
-        + [[norm(x[0]), x[1]] for x in specials[lang]["case_insensitive"]]
+        specials_map[lang]["case_sensitive"]
+        + [[norm(x[0]), x[1]] for x in specials_map[lang]["case_insensitive"]]
         + [
             [norm(x[0].upper()), x[1].upper()]
-            for x in specials[lang]["case_insensitive"]
+            for x in specials_map[lang]["case_insensitive"]
        ]
     )
    for pattern, target in possibilities: