Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 92 additions & 3 deletions tests/test_3_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
Mis à jour le"""

WORDS = {
"empty": Word.empty(),
"empty": Word([], [], [], [], []),
"foo": Word(["pron"], ["gender"], ["etyl"], ["def 1", ("sdef 1",)], []),
"foos": Word(["pron"], ["gender"], ["etyl"], ["def 1", ("sdef 1", ("ssdef 1",))], ["baz"]),
"baz": Word(["pron"], ["gender"], ["etyl"], ["def 1", ("sdef 1",)], ["foobar"]),
Expand Down Expand Up @@ -296,6 +296,95 @@ def test_word_rendering(
include_etymology=include_etymology,
)

kwargs = {"name": "mu", "words": WORDS} if isinstance(cls, convert.KoboFormat) else {}
content = next(cls.handle_word("Multiple Etymologies", WORDS["Multiple Etymologies"], **kwargs))
content = next(cls.handle_word("Multiple Etymologies", WORDS["Multiple Etymologies"], WORDS))
assert content == expected


# French fixture exercising variant redirection across different prefixes:
# "suis" points at "suivre", "être", and the unknown word "foo".
# NOTE: the stray `= words` chained assignment was removed — it leaked a
# module-level `words` name that shadowed the local `words` used in tests.
WORDS_VARIANTS_FR = {
    "être": Word(
        pronunciations=["\\ɛtʁ\\"],
        genders=["m"],
        etymology=["<i>(Date à préciser)</i> Du moyen français <i>estre</i> ..."],
        definitions=[
            "Définir un état, une caractéristique du sujet.",
            "Se situer, se trouver, rester, spécifiant une location, une situation.",
            "<i>(Absolument)</i> Exister.",
        ],
        variants=[],
    ),
    "suis": Word(
        pronunciations=["\\sɥi\\"],
        genders=[],
        etymology=["<i>(Forme de verbe 1)</i> De l’ancien français <i>suis</i>..."],
        definitions=[],
        variants=["suivre", "être", "foo"],
    ),
    "suivre": Word(
        pronunciations=["\\sɥivʁ\\"],
        genders=[],
        etymology=[
            "<i>(Date à préciser)</i> Du moyen français...",
            "Les parentés proches de ce mot incluent, ...",
        ],
        definitions=[
            "Aller ou venir après.",
            "Aller, continuer d’aller dans une même direction.",
            ("S’emploie figurément dans le même sens.",),
        ],
        variants=[],
    ),
}
# Spanish fixture: a two-hop variant chain where the intermediate word
# ("gastado") has no definitions of its own — gastada -> gastado -> gastar.
WORDS_VARIANTS_ES = {
    "gastada": Word(pronunciations=[], genders=[], etymology=[], definitions=[], variants=["gastado"]),
    "gastado": Word(pronunciations=[], genders=[], etymology=[], definitions=[], variants=["gastar"]),
    "gastar": Word(
        pronunciations=[],
        genders=[],
        etymology=['Del latín <i>vastāre</i> ("devastar").'],
        definitions=[
            "Provocar el consumo, deterioro o destrucción de algo por el uso.",
            "Digerir, asimilar los alimentos.",
        ],
        variants=[],
    ),
}


def test_make_variants() -> None:
    """Each variant form must map back to the word(s) that cite it."""
    expected_fr = {"foo": ["suis"], "suivre": ["suis"], "être": ["suis"]}
    expected_es = {"gastado": ["gastada"], "gastar": ["gastado"]}
    assert convert.make_variants(WORDS_VARIANTS_FR) == expected_fr
    assert convert.make_variants(WORDS_VARIANTS_ES) == expected_es


def test_kobo_format_variants_different_prefix(tmp_path: Path) -> None:
    """Words whose variants live under a different prefix still get <variant> tags."""
    words = WORDS_VARIANTS_FR
    kobo_formater = convert.KoboFormat("fr", tmp_path, words, convert.make_variants(words), "20250322")

    expected_groups = {
        "su": {"suis": words["suis"], "suivre": words["suivre"]},
        "êt": {"être": words["être"]},
    }
    assert kobo_formater.make_groups(words) == expected_groups

    def render(name: str) -> str:
        # Concatenate every chunk yielded for the given word.
        return "".join(kobo_formater.handle_word(name, words[name], words))

    assert "variant" not in render("suis")
    assert '<variant name="suis"/></var>' in render("être")
    assert '<variant name="suis"/></var>' in render("suivre")


def test_kobo_format_variants_empty_variant_level_1(tmp_path: Path) -> None:
    """A chain with one definition-less hop collapses onto the defined word."""
    words = WORDS_VARIANTS_ES
    kobo_formater = convert.KoboFormat("es", tmp_path, words, convert.make_variants(words), "20250322")

    expected_group = {name: words[name] for name in ("gastada", "gastado", "gastar")}
    assert kobo_formater.make_groups(words) == {"ga": expected_group}

    def render(name: str) -> str:
        # Concatenate every chunk yielded for the given word.
        return "".join(kobo_formater.handle_word(name, words[name], words))

    assert "variant" not in render("gastada")
    assert "variant" not in render("gastado")
    assert '<variant name="gastada"/><variant name="gastado"/></var>' in render("gastar")
142 changes: 44 additions & 98 deletions wikidict/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,13 +85,19 @@
{% endif %}
</p>
{% if variants %}
{{ variants }}
<var>
{%- for variant in variants -%}
<variant name="{{ variant }}"/>
{%- endfor -%}
</var>
{% endif %}
</w>
"""
)

# DictFile-related dictionaries
# Source: https://pgaskin.net/dictutil/dictgen/#dictfile-format
# Source: https://github.com/hunspell/hunspell/blob/ecc6dbb52025bdf3a766429988e64190d912765f/man/hunspell.1#L93-L139 (for later, in case of issues with other sub-formats)
WORD_TPL_DICTFILE = Template(
"""\
@ {{ word }}
Expand Down Expand Up @@ -149,14 +155,7 @@
class BaseFormat:
"""Base class for all dictionaries."""

__slots__ = {
"locale",
"output_dir",
"snapshot",
"words",
"variants",
"include_etymology",
}
template = Template("") # To be set by subclasses

def __init__(
self,
Expand Down Expand Up @@ -188,8 +187,37 @@ def dictionary_file(self, output_file: str) -> Path:
self.locale, "" if self.include_etymology else constants.NO_ETYMOLOGY_SUFFIX
)

def handle_word(self, word: str, details: Word, **kwargs: Any) -> Generator[str]: # pragma: nocover
raise NotImplementedError()
    def handle_word(self, word: str, details: Word, words: Words) -> Generator[str]:
        """Yield the rendered dictionary entry for *word*.

        Yields nothing when the word has no definitions. *words* is the full
        word mapping, used to resolve variants that point at definition-less
        words (a single level of redirection only — see the comment below).
        """
        if not details.definitions:
            return

        variants_final = []

        if variants := self.variants.get(word, []):
            # Add variants of empty* variant, only 1 redirection:
            # [ES] gastada* -> gastado* -> gastar --> (gastada, gastado) -> gastar
            # Note: the process works backward: from gastar up to gastado up to gastada.
            # Iterating over a copy is deliberate: `variants` is extended in place.
            for variant in variants.copy():
                if (wv := words.get(variant)) and not wv.definitions:
                    variants.extend(self.variants.get(variant, []))

            # NOTE(review): a base class checking for a specific subclass inverts
            # the dependency; a class-level flag (e.g. `normalize_variants`)
            # overridden by KoboFormat would be cleaner — TODO consider.
            if isinstance(self, KoboFormat):
                # Variant must be normalized by trimming whitespace and lowercasing it.
                variants = [variant.lower().strip() for variant in variants]

            # Remove potential duplicates, and sort by length then name
            variants_final = sorted(set(variants), key=lambda s: (len(s), s))

        yield self.render_word(
            self.template,
            word=word,
            current_word=word,
            definitions=details.definitions,
            pronunciation=convert_pronunciation(details.pronunciations),
            gender=convert_gender(details.genders),
            etymologies=details.etymology if self.include_etymology else [],
            variants=variants_final,
        )

def process(self) -> None: # pragma: nocover
raise NotImplementedError()
Expand All @@ -213,6 +241,7 @@ class KoboFormat(BaseFormat):
"""Save the data into Kobo-specific ZIP file."""

output_file = "dicthtml-{0}-{0}{1}.zip"
template = WORD_TPL_KOBO

def process(self) -> None:
self.groups = self.make_groups(self.words)
Expand Down Expand Up @@ -251,76 +280,6 @@ def make_groups(words: Words) -> Groups:
groups[guess_prefix(word)][word] = details
return groups

def handle_word(self, word: str, details: Word, **kwargs: Any) -> Generator[str]:
name: str = kwargs["name"]
words: Words = kwargs["words"]
current_words: Words = {word: details}

# use variant definitions for a word if one variant prefix is different
# "suis" listed with the definitions of "être" and "suivre"
if details.variants:
found_different_prefix = False
for variant in details.variants:
if guess_prefix(variant) != name:
if root_details := self.words.get(variant):
found_different_prefix = True
break
variants_words = {}
# if we found one variant, then list them all
if found_different_prefix:
for variant in details.variants:
if root_details := self.words.get(variant):
variants_words[variant] = root_details
if word.endswith("s"): # crude detection of plural
Copy link
Copy Markdown
Member Author

@BoboTiG BoboTiG Mar 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since our variants "detectors" are way better now, this hack is no longer useful. Given the complexity of the variants code, I prefer to delete that piece.

singular = word[:-1]
maybe_noun = self.words.get(singular) # do we have the singular?
# make sure we are not redirecting to a verb (je mange, tu manges)
# verb form is also a singular noun
if isinstance(maybe_noun, Word) and not maybe_noun.variants:
variants_words[singular] = maybe_noun
for variant in details.variants:
if maybe_verb := self.words.get(variant):
variants_words[variant] = maybe_verb
if variants_words:
current_words = variants_words

# write to file
for current_word, current_details in current_words.items():
if not current_details.definitions:
continue

variants = ""
if word_variants := self.variants.get(word, []):
# add variants of empty* variant, only 1 redirection...
# gastada* -> gastado* -> gastar --> (gastada, gastado) -> gastar
for v in word_variants.copy():
wv: Word = words.get(v, Word.empty())
if wv and not wv.definitions:
for vv in self.variants.get(v, []):
word_variants.append(vv)
word_variants.sort(key=lambda s: (len(s), s))
variants = "<var>"
for v in word_variants:
# no variant with different prefix
v = v.lower().strip()
if guess_prefix(v) == name:
variants += f'<variant name="{v}"/>'
variants += "</var>"
# no empty var tag
if len(variants) < 15:
variants = ""

yield self.render_word(
WORD_TPL_KOBO,
word=word,
current_word=current_word,
definitions=current_details.definitions,
pronunciation=convert_pronunciation(current_details.pronunciations),
gender=convert_gender(current_details.genders),
etymologies=current_details.etymology if self.include_etymology else [],
variants=variants,
)

def save(self) -> None: # sourcery skip: extract-method
"""
Format of resulting dicthtml-LOCALE-LOCALE.zip:
Expand Down Expand Up @@ -394,7 +353,7 @@ def save_html(
raw_output = output_dir / f"{name}.raw.html"
with raw_output.open(mode="w", encoding="utf-8") as fh:
for word, details in words.items():
fh.writelines(self.handle_word(word, details, name=name, words=words))
fh.writelines(self.handle_word(word, details, words))

# Compress the HTML with gzip
output = output_dir / f"{name}.html"
Expand All @@ -408,24 +367,13 @@ class DictFileFormat(BaseFormat):
"""Save the data into a *.df* DictFile."""

output_file = "dict-{0}-{0}{1}.df"

def handle_word(self, word: str, details: Word, **kwargs: Any) -> Generator[str]:
if details.definitions:
yield self.render_word(
WORD_TPL_DICTFILE,
word=word,
definitions=details.definitions,
pronunciation=convert_pronunciation(details.pronunciations),
gender=convert_gender(details.genders),
etymologies=details.etymology if self.include_etymology else [],
variants=self.variants.get(word, []),
)
template = WORD_TPL_DICTFILE

def process(self) -> None:
file = self.dictionary_file(self.output_file)
with file.open(mode="w", encoding="utf-8") as fh:
for word, details in self.words.items():
fh.writelines(self.handle_word(word, details))
fh.writelines(self.handle_word(word, details, self.words))

self.summary(file)

def make_variants(words: Words) -> Variants:
    """Group word by variant: map each variant form to the words citing it.

    Diff residue resolved: keep the simplified post-change body — the empty
    check and normalization were removed here (normalization now happens in
    BaseFormat.handle_word for the Kobo format).
    """
    variants: Variants = defaultdict(list)
    for word, details in words.items():
        for variant in details.variants:
            variants[variant].append(word)
    return variants


Expand Down
4 changes: 0 additions & 4 deletions wikidict/stubs.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,6 @@ class Word(NamedTuple):
definitions: list[Definitions]
variants: list[str]

@classmethod
def empty(cls) -> "Word":
return cls([], [], [], [], [])


# Mapping of headword -> Word details.
Words = dict[str, Word]
# Words grouped by prefix key (the dict key is the prefix string).
Groups = dict[str, Words]