diff --git a/TTS/tts/utils/text/cleaners.py b/TTS/tts/utils/text/cleaners.py
index 74d3910b51..dd93cadd8a 100644
--- a/TTS/tts/utils/text/cleaners.py
+++ b/TTS/tts/utils/text/cleaners.py
@@ -11,6 +11,9 @@
 from .english.number_norm import normalize_numbers as en_normalize_numbers
 from .english.time_norm import expand_time_english
 from .french.abbreviations import abbreviations_fr
+from .italian.abbreviations import abbreviations_it
+from .italian.number_norm import normalize_numbers as it_normalize_numbers
+from .italian.time_norm import expand_time_italian
 
 # Regular expression matching whitespace:
 _whitespace_re = re.compile(r"\s+")
@@ -21,6 +24,8 @@ def expand_abbreviations(text, lang="en"):
         _abbreviations = abbreviations_en
     elif lang == "fr":
         _abbreviations = abbreviations_fr
+    elif lang == "it":
+        _abbreviations = abbreviations_it
     for regex, replacement in _abbreviations:
         text = re.sub(regex, replacement, text)
     return text
@@ -70,6 +75,8 @@ def replace_symbols(text, lang="en"):
         text = text.replace("&", " et ")
     elif lang == "pt":
         text = text.replace("&", " e ")
+    elif lang == "it":
+        text = text.replace("&", " e ")
     elif lang == "ca":
         text = text.replace("&", " i ")
         text = text.replace("'", "")
@@ -130,6 +137,21 @@ def phoneme_cleaners(text):
     return text
 
 
+def italian_cleaners(text):
+    """Pipeline for Italian text: time + light number + abbreviations + symbol cleanup."""
+    # optional: normalize common smart punctuation
+    text = text.replace("’", "'").replace("“", '"').replace("”", '"').replace("–", "-").replace("—", "-")
+
+    text = lowercase(text)
+    text = expand_time_italian(text)
+    text = it_normalize_numbers(text)
+    text = expand_abbreviations(text, lang="it")
+    text = replace_symbols(text, lang="it")
+    text = remove_aux_symbols(text)
+    text = collapse_whitespace(text)
+    return text
+
+
 def french_cleaners(text):
     """Pipeline for French text. There is no need to expand numbers, phonemizer already does that"""
     text = expand_abbreviations(text, lang="fr")
diff --git a/TTS/tts/utils/text/italian/__init__.py b/TTS/tts/utils/text/italian/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/TTS/tts/utils/text/italian/abbreviations.py b/TTS/tts/utils/text/italian/abbreviations.py
new file mode 100644
index 0000000000..5af643749b
--- /dev/null
+++ b/TTS/tts/utils/text/italian/abbreviations.py
@@ -0,0 +1,134 @@
+"""Regex rules that expand common Italian abbreviations into full words.
+
+Every entry is a ``(compiled_pattern, replacement)`` pair. Rules are applied in
+list order, so a longer or more specific form must precede any shorter rule
+that could match part of it ("sig.ra" before "sig.", "s.n.c." before "n.").
+"""
+
+import re
+
+# What may follow an abbreviation: whitespace, end of text or punctuation.
+_END = r"(?=\s|$|[,\.;:\)\]\}\!?])"
+
+
+def _c(pattern: str) -> re.Pattern:
+    """Compile a case-insensitive regex."""
+    return re.compile(pattern, re.IGNORECASE)
+
+
+def _abbr(body: str) -> re.Pattern:
+    """Compile the rule for an abbreviation written with an optional final dot.
+
+    The final dot is consumed with the abbreviation ("sig. Rossi" -> "signor
+    Rossi"). No word-boundary anchor may follow the optional dot: a dot followed
+    by whitespace or punctuation is not a word boundary, so the anchor would
+    force the regex to give the dot back and leave it in the output.
+    """
+    return _c(r"\b" + body + r"\.?" + _END)
+
+
+abbreviations_it = [
+    # --- Company legal forms (common in names). Listed first because the
+    # single-letter rules below ("n.", "c.a.") would match inside "s.n.c.".
+    (_abbr(r"s\.?\s*p\.?\s*a"), "società per azioni"),
+    (_abbr(r"s\.?\s*r\.?\s*l"), "società a responsabilità limitata"),
+    (_abbr(r"s\.?\s*n\.?\s*c"), "società in nome collettivo"),
+    (_abbr(r"s\.?\s*a\.?\s*s"), "società in accomandita semplice"),
+    (_abbr(r"coop"), "cooperativa"),
+    (_c(r"\bonlus\b" + _END), "organizzazione non lucrativa di utilità sociale"),
+    # --- Courtesy / correspondence
+    (_abbr(r"spett\.?\s*(?:le)?"), "spettabile"),
+    (_abbr(r"egr"), "egregio"),
+    (_abbr(r"gent\.?\s*mo"), "gentilissimo"),
+    (_abbr(r"gent\.?\s*ma"), "gentilissima"),
+    (_abbr(r"ill\.?\s*mo"), "illustrissimo"),
+    (_abbr(r"ill\.?\s*ma"), "illustrissima"),
+    (_abbr(r"att"), "attenzione"),
+    (_abbr(r"c\.?\s*a"), "cortese attenzione"),  # often "alla c.a."
+    # --- People / titles / professions (compound forms before their prefix)
+    (_abbr(r"sig\.?\s*ra"), "signora"),
+    (_abbr(r"sig\.?\s*na"), "signorina"),
+    (_abbr(r"sigg"), "signori"),
+    (_abbr(r"sig"), "signor"),
+    (_abbr(r"dott\.?\s*ssa"), "dottoressa"),
+    (_abbr(r"dott\.?\s*comm"), "dottore commercialista"),
+    (_abbr(r"dott"), "dottore"),
+    (_abbr(r"prof\.?\s*ssa"), "professoressa"),
+    (_abbr(r"prof"), "professore"),
+    (_abbr(r"ing"), "ingegnere"),
+    (_abbr(r"arch"), "architetto"),
+    (_abbr(r"avv"), "avvocato"),
+    (_abbr(r"geom"), "geometra"),
+    (_abbr(r"rag"), "ragioniere"),
+    (_abbr(r"comm"), "commercialista"),
+    # --- Generic shorthand / Latin-ish (compound forms before plain "es.")
+    (_abbr(r"ecc"), "eccetera"),
+    (_abbr(r"etc"), "eccetera"),
+    (_abbr(r"p\.?\s*es"), "per esempio"),
+    (_abbr(r"ad\.?\s*es"), "ad esempio"),
+    (_abbr(r"es"), "per esempio"),
+    (_abbr(r"cioe"), "cioè"),
+    (_abbr(r"oss"), "ossia"),
+    (_abbr(r"cfr"), "confronta"),
+    (_abbr(r"vd"), "vedi"),
+    (_abbr(r"vv"), "vedi"),
+    # --- Document structure / references (useful for academic text)
+    (_abbr(r"fig"), "figura"),
+    (_abbr(r"tab"), "tabella"),
+    (_abbr(r"eq"), "equazione"),
+    (_abbr(r"sez"), "sezione"),
+    (_abbr(r"sec"), "sezione"),
+    (_abbr(r"cap"), "capitolo"),
+    (_abbr(r"par"), "paragrafo"),
+    (_abbr(r"app"), "appendice"),
+    (_abbr(r"pagg"), "pagine"),
+    (_abbr(r"pag"), "pagina"),
+    # --- Identifiers / contact / numbering
+    (_abbr(r"tel"), "telefono"),
+    (_abbr(r"cell"), "cellulare"),
+    (_abbr(r"int"), "interno"),
+    (_abbr(r"nn"), "numeri"),
+    (_abbr(r"nr"), "numero"),
+    (_abbr(r"n"), "numero"),
+    (_abbr(r"cod\.?\s*fisc"), "codice fiscale"),
+    (_abbr(r"c\.?\s*f"), "codice fiscale"),
+    (_abbr(r"p\.?\s*iva"), "partita iva"),
+    (_c(r"\bpiva\b" + _END), "partita iva"),
+    # --- Dates / formal ref (common in letters)
+    (_abbr(r"u\.?\s*s"), "ultimo scorso"),
+    (_abbr(r"p\.?\s*v"), "prossimo venturo"),
+    # --- Months (common in dates)
+    (_abbr(r"gen"), "gennaio"),
+    (_abbr(r"feb"), "febbraio"),
+    (_abbr(r"mar"), "marzo"),
+    (_abbr(r"apr"), "aprile"),
+    (_abbr(r"mag"), "maggio"),
+    (_abbr(r"giu"), "giugno"),
+    (_abbr(r"lug"), "luglio"),
+    (_abbr(r"ago"), "agosto"),
+    (_abbr(r"set"), "settembre"),
+    (_abbr(r"ott"), "ottobre"),
+    (_abbr(r"nov"), "novembre"),
+    (_abbr(r"dic"), "dicembre"),
+    # --- Web/contact tokens (optional but often helpful in prompts)
+    (_c(r"\be-?mail\b" + _END), "email"),
+    (_abbr(r"www"), "doppia vu doppia vu doppia vu"),
+]
+
+# More aggressive abbreviations
+abbreviations_it_aggressive = [
+    # Address components
+    (_abbr(r"v\.?\s*le"), "viale"),  # v.le
+    (_abbr(r"p\.?\s*za"), "piazza"),  # p.za
+    (_abbr(r"p\.?\s*zza"), "piazza"),  # p.zza
+    (_abbr(r"p\.?\s*le"), "piazzale"),  # p.le
+    (_abbr(r"c\.?\s*so"), "corso"),  # c.so
+    # Saints. "S. Maria" comes first; the generic rule would turn it into "san Maria".
+    (_c(r"\bs\.\s*maria\b"), "santa maria"),
+    # Expand "S." to "san" only when followed by a capitalized token (helps avoid
+    # matching "s." in arbitrary contexts). Compiled without IGNORECASE, which
+    # would void the capital-letter check; apply before lowercasing the text.
+    (re.compile(r"\b[sS]\.(?=\s+[A-ZÀ-ÖØ-Ý])"), "san"),
+]
+
+abbreviations_it_all = abbreviations_it + abbreviations_it_aggressive
diff --git a/TTS/tts/utils/text/italian/number_norm.py b/TTS/tts/utils/text/italian/number_norm.py
new file mode 100644
index 0000000000..e5572a460e
--- /dev/null
+++ b/TTS/tts/utils/text/italian/number_norm.py
@@ -0,0 +1,47 @@
+"""Light-weight normalisation of numeric expressions in Italian text."""
+
+import re
+
+# 1.234.567 -> 1234567 (Italian thousands separator)
+_THOUSANDS_DOT_RE = re.compile(r"\b(\d{1,3})(\.\d{3})+\b")
+
+# decimal comma: 3,14 -> 3 virgola 14
+_DECIMAL_COMMA_RE = re.compile(r"(?<=\d),(?=\d)")
+
+# decimal dot (only when it doesn't look like thousands groups): 3.14 -> 3 virgola 14
+_DECIMAL_DOT_RE = re.compile(r"(?<=\d)\.(?=\d)")
+
+# Integer or decimal amount, with either decimal separator.
+_AMOUNT = r"(\d+(?:[.,]\d+)?)"
+
+_PERCENT_RE = re.compile(r"(\d)\s*%")
+_EURO_BEFORE_RE = re.compile(r"€\s*" + _AMOUNT)
+_EURO_AFTER_RE = re.compile(_AMOUNT + r"\s*€")
+_CELSIUS_RE = re.compile(_AMOUNT + r"\s*°C\b", re.IGNORECASE)
+# "°" is not a word character, so a trailing \b would only match when a letter
+# or digit follows the sign and would miss "20°" before a space or at the end.
+_DEGREE_RE = re.compile(_AMOUNT + r"\s*°(?!\w)")
+
+
+def normalize_numbers(text: str) -> str:
+    """Rewrite Italian numeric notation so that it can be read aloud.
+
+    Thousands dots are dropped, "%", "€" and "°"/"°C" become words, and decimal
+    separators between digits become "virgola". Digits are left as digits.
+    """
+    # Remove thousands separators like 1.234.567
+    text = _THOUSANDS_DOT_RE.sub(lambda m: m.group(0).replace(".", ""), text)
+
+    # Percent / currency / degree signs (simple, robust)
+    text = _PERCENT_RE.sub(r"\1 percento", text)
+    text = _EURO_BEFORE_RE.sub(r"\1 euro", text)
+    text = _EURO_AFTER_RE.sub(r"\1 euro", text)
+    text = _CELSIUS_RE.sub(r"\1 gradi celsius", text)
+    text = _DEGREE_RE.sub(r"\1 gradi", text)
+
+    # Convert decimal separators between digits to spoken form.
+    # Order matters: after removing thousands dots, remaining dot between digits is likely decimal.
+    text = _DECIMAL_COMMA_RE.sub(" virgola ", text)
+    text = _DECIMAL_DOT_RE.sub(" virgola ", text)
+
+    return text
diff --git a/TTS/tts/utils/text/italian/time_norm.py b/TTS/tts/utils/text/italian/time_norm.py
new file mode 100644
index 0000000000..bca5fd7a66
--- /dev/null
+++ b/TTS/tts/utils/text/italian/time_norm.py
@@ -0,0 +1,68 @@
+"""Expansion of clock times (HH:MM or HH.MM) for Italian text."""
+
+import re
+
+# Hour 0-23, ":" or "." separator, two-digit minute 00-59.
+_TIME_RE = re.compile(r"\b([01]?\d|2[0-3])[:\.]([0-5]\d)\b")
+
+# 0..59 (sufficient for minutes)
+_IT_0_19 = [
+    "zero",
+    "uno",
+    "due",
+    "tre",
+    "quattro",
+    "cinque",
+    "sei",
+    "sette",
+    "otto",
+    "nove",
+    "dieci",
+    "undici",
+    "dodici",
+    "tredici",
+    "quattordici",
+    "quindici",
+    "sedici",
+    "diciassette",
+    "diciotto",
+    "diciannove",
+]
+_IT_TENS = {
+    20: "venti",
+    30: "trenta",
+    40: "quaranta",
+    50: "cinquanta",
+}
+
+
+def _it_number_0_59(n: int) -> str:
+    """Spell out ``n`` in Italian for 0..59; other values come back as digits."""
+    if 0 <= n < 20:
+        return _IT_0_19[n]
+    tens = (n // 10) * 10
+    unit = n % 10
+    if tens not in _IT_TENS:
+        return str(n)
+    if unit == 0:
+        return _IT_TENS[tens]
+    # Keep it simple (no elision rules like "ventuno/ventotto"); XTTS usually handles both.
+    return f"{_IT_TENS[tens]} {_IT_0_19[unit]}"
+
+
+def expand_time_italian(text: str) -> str:
+    """Replace clock times with the hour in digits and the minutes in words.
+
+    09:00 -> 9
+    09:05 -> 9 e cinque
+    14.30 -> 14 e trenta
+    """
+
+    def repl(m: re.Match) -> str:
+        hh = int(m.group(1))
+        mm = int(m.group(2))
+        if mm == 0:
+            return f"{hh}"
+        return f"{hh} e {_it_number_0_59(mm)}"
+
+    return _TIME_RE.sub(repl, text)
diff --git a/tests/text_tests/test_text_cleaners.py b/tests/text_tests/test_text_cleaners.py
index fcfa71e77d..df5f52df55 100644
--- a/tests/text_tests/test_text_cleaners.py
+++ b/tests/text_tests/test_text_cleaners.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-from TTS.tts.utils.text.cleaners import english_cleaners, phoneme_cleaners
+from TTS.tts.utils.text.cleaners import english_cleaners, italian_cleaners, phoneme_cleaners
 
 
 def test_time() -> None:
@@ -19,3 +19,21 @@ def test_currency() -> None:
 def test_expand_numbers() -> None:
     assert phoneme_cleaners("-1") == "minus one"
     assert phoneme_cleaners("1") == "one"
+
+
+def test_italian_cleaners_numbers_and_abbreviations() -> None:
+    text = "Alle 09:05 Sig. Bianchi ha pagato €1.234,50 per 50%"
+    expected = "alle 9 e cinque signor bianchi ha pagato 1234 virgola 50 euro per 50 percento"
+    assert italian_cleaners(text) == expected
+
+
+def test_italian_cleaners_temperature_and_time() -> None:
+    text = "Temperatura: 3.5\N{DEGREE SIGN}C alle 14.00"
+    expected = "temperatura, 3 virgola 5 gradi celsius alle 14"
+    assert italian_cleaners(text) == expected
+
+
+def test_italian_cleaners_compound_abbreviations_and_degrees() -> None:
+    text = "La Sig.ra Rossi e la Dott.ssa Verdi: 20\N{DEGREE SIGN} oggi"
+    expected = "la signora rossi e la dottoressa verdi, 20 gradi oggi"
+    assert italian_cleaners(text) == expected