Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 97 additions & 3 deletions tests/test_3_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
Mis à jour le"""

WORDS = {
"empty": Word.empty(),
"empty": Word([], [], [], [], []),
"foo": Word(["pron"], ["gender"], ["etyl"], ["def 1", ("sdef 1",)], []),
"foos": Word(["pron"], ["gender"], ["etyl"], ["def 1", ("sdef 1", ("ssdef 1",))], ["baz"]),
"baz": Word(["pron"], ["gender"], ["etyl"], ["def 1", ("sdef 1",)], ["foobar"]),
Expand Down Expand Up @@ -296,6 +296,100 @@ def test_word_rendering(
include_etymology=include_etymology,
)

kwargs = {"name": "mu", "words": WORDS} if isinstance(cls, convert.KoboFormat) else {}
content = next(cls.handle_word("Multiple Etymologies", WORDS["Multiple Etymologies"], **kwargs))
content = next(cls.handle_word("Multiple Etymologies", WORDS))
assert content == expected


# French fixture: "suis" is a verb form declared as a variant of both "être"
# and "suivre", whose guessed prefixes differ ("êt" vs "su").
# Fix: dropped the accidental chained assignment (`= words =`) that leaked an
# unused module-level `words` name — every test rebinds `words` locally.
WORDS_VARIANTS_FR = {
    "être": Word(
        pronunciations=["\\ɛtʁ\\"],
        genders=["m"],
        etymology=["<i>(Date à préciser)</i> Du moyen français <i>estre</i> ..."],
        definitions=[
            "Définir un état, une caractéristique du sujet.",
            "Se situer, se trouver, rester, spécifiant une location, une situation.",
            "<i>(Absolument)</i> Exister.",
        ],
        variants=[],
    ),
    "suis": Word(
        pronunciations=["\\sɥi\\"],
        genders=[],
        etymology=["<i>(Forme de verbe 1)</i> De l’ancien français <i>suis</i>..."],
        definitions=[],
        variants=["suivre", "être"],
    ),
    "suivre": Word(
        pronunciations=["\\sɥivʁ\\"],
        genders=[],
        etymology=[
            "<i>(Date à préciser)</i> Du moyen français...",
            "Les parentés proches de ce mot incluent, ...",
        ],
        definitions=[
            "Aller ou venir après.",
            "Aller, continuer d’aller dans une même direction.",
            ("S’emploie figurément dans le même sens.",),
        ],
        variants=[],
    ),
}
# Spanish fixture: a two-level variant chain where the intermediate entries
# have no definitions ("gastada" -> "gastado" -> "gastar"), used to check
# that variant redirections are followed one level up.
WORDS_VARIANTS_ES = {
    "gastada": Word(pronunciations=[], genders=[], etymology=[], definitions=[], variants=["gastado"]),
    "gastado": Word(pronunciations=[], genders=[], etymology=[], definitions=[], variants=["gastar"]),
    "gastar": Word(
        pronunciations=[],
        genders=[],
        etymology=['Del latín <i>vastāre</i> ("devastar").'],
        definitions=[
            "Provocar el consumo, deterioro o destrucción de algo por el uso.",
            "Digerir, asimilar los alimentos.",
        ],
        variants=[],
    ),
}


def test_make_variants() -> None:
    """make_variants() maps each root word to the list of words declaring it as a variant."""
    expected_fr = {"suivre": ["suis"], "être": ["suis"]}
    expected_es = {"gastado": ["gastada"], "gastar": ["gastado"]}
    assert convert.make_variants(WORDS_VARIANTS_FR) == expected_fr
    assert convert.make_variants(WORDS_VARIANTS_ES) == expected_es


def test_kobo_format_variants_different_prefix(tmp_path: Path) -> None:
    """A variant ("suis") whose roots ("être", "suivre") live under different prefixes."""
    fr_words = WORDS_VARIANTS_FR
    formatter = convert.KoboFormat("fr", tmp_path, fr_words, convert.make_variants(fr_words), "20250322")

    expected_groups = {
        "su": {"suis": fr_words["suis"], "suivre": fr_words["suivre"]},
        "êt": {"être": fr_words["être"]},
    }
    assert formatter.make_groups(fr_words) == expected_groups

    rendered = {name: "".join(formatter.handle_word(name, fr_words)) for name in ("suis", "être", "suivre")}
    assert rendered["suis"]
    assert "variant" not in rendered["suis"]
    # The first 22 characters are the word metadata ('<w><p><a name="être"/>' vs '<w><p><a name="suis"/>'), skip them.
    assert rendered["être"][22:] == rendered["suis"][22:]
    assert '<var><variant name="suis"/></var>' in rendered["suivre"]


def test_kobo_format_variants_empty_variant_level_1(tmp_path: Path) -> None:
    """Variants pointing at definition-less entries are followed one redirection up."""
    es_words = WORDS_VARIANTS_ES
    formatter = convert.KoboFormat("es", tmp_path, es_words, convert.make_variants(es_words), "20250322")

    expected_groups = {"ga": {name: es_words[name] for name in ("gastada", "gastado", "gastar")}}
    assert formatter.make_groups(es_words) == expected_groups

    rendered = {name: "".join(formatter.handle_word(name, es_words)) for name in es_words}
    assert "variant" not in rendered["gastada"]
    assert "variant" not in rendered["gastado"]
    assert '<var><variant name="gastada"/><variant name="gastado"/></var>' in rendered["gastar"]
162 changes: 61 additions & 101 deletions wikidict/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,13 +85,19 @@
{% endif %}
</p>
{% if variants %}
{{ variants }}
<var>
{%- for variant in variants -%}
<variant name="{{ variant }}"/>
{%- endfor -%}
</var>
{% endif %}
</w>
"""
)

# DictFile-related dictionaries
# Source: https://pgaskin.net/dictutil/dictgen/#dictfile-format
# Source: https://github.com/hunspell/hunspell/blob/ecc6dbb52025bdf3a766429988e64190d912765f/man/hunspell.1#L93-L139 (for later, in case of issues with other sub-formats)
WORD_TPL_DICTFILE = Template(
"""\
@ {{ word }}
Expand Down Expand Up @@ -149,14 +155,7 @@
class BaseFormat:
"""Base class for all dictionaries."""

__slots__ = {
"locale",
"output_dir",
"snapshot",
"words",
"variants",
"include_etymology",
}
template = Template("") # To be set by subclasses

def __init__(
self,
Expand Down Expand Up @@ -188,8 +187,51 @@ def dictionary_file(self, output_file: str) -> Path:
self.locale, "" if self.include_etymology else constants.NO_ETYMOLOGY_SUFFIX
)

def handle_word(self, word: str, details: Word, **kwargs: Any) -> Generator[str]: # pragma: nocover
raise NotImplementedError()
def handle_word(self, word: str, words: Words) -> Generator[str]:
    """Render *word* as one or more dictionary entries.

    Yields one rendered entry per word in the resolved set that has definitions:
    the word itself, plus (at most) one variant root living under a different
    prefix, whose definitions are reused for the current word.
    """
    details = words[word]
    current_words = {word: details}

    if details.variants:
        # Variants are more like typos, or misses, and so devices expect word & variants
        # to start with same letters, at least.
        # An example in FR, where "suis" (verb flexion) is a variant of both "être" & "suivre":
        # "suis" & "être" are quite different.
        # As a workaround, we replace etymology + definitions of "suis" with ones from "être",
        # while keeping other "suis" variants as well.
        # Note: it works for 1 different variant only, the first one with a different prefix.
        current_group_prefix = guess_prefix(word)
        for variant in details.variants:
            if guess_prefix(variant) != current_group_prefix and (root := self.words.get(variant)):
                current_words[variant] = root
                break

    for current_word, current_details in current_words.items():
        if not current_details.definitions:
            continue

        # Copy the stored list: `self.variants` must not be mutated by the
        # `extend()` below, or repeated calls for the same word would
        # accumulate duplicate variants across renders.
        if variants := list(self.variants.get(current_word, [])):
            # Add variants of empty* variant, only 1 redirection:
            # [ES] gastada* -> gastado* -> gastar --> (gastada, gastado) -> gastar
            # Note: the process works backward: from gastar up to gastado up to gastada.
            for variant in variants.copy():
                if (wv := words.get(variant)) and not wv.definitions:
                    variants.extend(self.variants.get(variant, []))

            # Filter out variants with a different prefix than their word
            current_group_prefix = guess_prefix(current_word)
            variants = [variant for variant in variants if guess_prefix(variant) == current_group_prefix]

        if isinstance(self, KoboFormat):
            # Variants must be normalized by trimming whitespace and lowercasing them
            variants = [variant.lower().strip() for variant in variants]

        yield self.render_word(
            self.template,
            word=word,
            current_word=current_word,
            definitions=current_details.definitions,
            pronunciation=convert_pronunciation(current_details.pronunciations),
            gender=convert_gender(current_details.genders),
            etymologies=current_details.etymology if self.include_etymology else [],
            variants=sorted(variants, key=lambda s: (len(s), s)),
        )

def process(self) -> None:  # pragma: nocover
    """Build and save the dictionary; must be implemented by each concrete format."""
    raise NotImplementedError()
Expand All @@ -213,6 +255,7 @@ class KoboFormat(BaseFormat):
"""Save the data into Kobo-specific ZIP file."""

output_file = "dicthtml-{0}-{0}{1}.zip"
template = WORD_TPL_KOBO

def process(self) -> None:
self.groups = self.make_groups(self.words)
Expand Down Expand Up @@ -251,76 +294,6 @@ def make_groups(words: Words) -> Groups:
groups[guess_prefix(word)][word] = details
return groups

def handle_word(self, word: str, details: Word, **kwargs: Any) -> Generator[str]:
name: str = kwargs["name"]
words: Words = kwargs["words"]
current_words: Words = {word: details}

# use variant definitions for a word if one variant prefix is different
# "suis" listed with the definitions of "être" and "suivre"
if details.variants:
found_different_prefix = False
for variant in details.variants:
if guess_prefix(variant) != name:
if root_details := self.words.get(variant):
found_different_prefix = True
break
variants_words = {}
# if we found one variant, then list them all
if found_different_prefix:
for variant in details.variants:
if root_details := self.words.get(variant):
variants_words[variant] = root_details
if word.endswith("s"): # crude detection of plural
Copy link
Copy Markdown
Member Author

@BoboTiG BoboTiG Mar 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since our variants "detectors" are way better now, this hack is no longer useful. Given the complexity of the variants code, I prefer to delete that piece.

singular = word[:-1]
maybe_noun = self.words.get(singular) # do we have the singular?
# make sure we are not redirecting to a verb (je mange, tu manges)
# verb form is also a singular noun
if isinstance(maybe_noun, Word) and not maybe_noun.variants:
variants_words[singular] = maybe_noun
for variant in details.variants:
if maybe_verb := self.words.get(variant):
variants_words[variant] = maybe_verb
if variants_words:
current_words = variants_words

# write to file
for current_word, current_details in current_words.items():
if not current_details.definitions:
continue

variants = ""
if word_variants := self.variants.get(word, []):
# add variants of empty* variant, only 1 redirection...
# gastada* -> gastado* -> gastar --> (gastada, gastado) -> gastar
for v in word_variants.copy():
wv: Word = words.get(v, Word.empty())
if wv and not wv.definitions:
for vv in self.variants.get(v, []):
word_variants.append(vv)
word_variants.sort(key=lambda s: (len(s), s))
variants = "<var>"
for v in word_variants:
# no variant with different prefix
v = v.lower().strip()
if guess_prefix(v) == name:
variants += f'<variant name="{v}"/>'
variants += "</var>"
# no empty var tag
if len(variants) < 15:
variants = ""

yield self.render_word(
WORD_TPL_KOBO,
word=word,
current_word=current_word,
definitions=current_details.definitions,
pronunciation=convert_pronunciation(current_details.pronunciations),
gender=convert_gender(current_details.genders),
etymologies=current_details.etymology if self.include_etymology else [],
variants=variants,
)

def save(self) -> None: # sourcery skip: extract-method
"""
Format of resulting dicthtml-LOCALE-LOCALE.zip:
Expand Down Expand Up @@ -393,8 +366,8 @@ def save_html(
# Save to uncompressed HTML
raw_output = output_dir / f"{name}.raw.html"
with raw_output.open(mode="w", encoding="utf-8") as fh:
for word, details in words.items():
fh.writelines(self.handle_word(word, details, name=name, words=words))
for word in words:
fh.writelines(self.handle_word(word, words))

# Compress the HTML with gzip
output = output_dir / f"{name}.html"
Expand All @@ -408,24 +381,13 @@ class DictFileFormat(BaseFormat):
"""Save the data into a *.df* DictFile."""

output_file = "dict-{0}-{0}{1}.df"

def handle_word(self, word: str, details: Word, **kwargs: Any) -> Generator[str]:
if details.definitions:
yield self.render_word(
WORD_TPL_DICTFILE,
word=word,
definitions=details.definitions,
pronunciation=convert_pronunciation(details.pronunciations),
gender=convert_gender(details.genders),
etymologies=details.etymology if self.include_etymology else [],
variants=self.variants.get(word, []),
)
template = WORD_TPL_DICTFILE

def process(self) -> None:
file = self.dictionary_file(self.output_file)
with file.open(mode="w", encoding="utf-8") as fh:
for word, details in self.words.items():
fh.writelines(self.handle_word(word, details))
for word in self.words:
fh.writelines(self.handle_word(word, self.words))

self.summary(file)

Expand Down Expand Up @@ -636,7 +598,7 @@ def all_chars(word: str, details: Word) -> set[str]:
for w in related_words:
new_words.pop(w, None)
stats.pop(char)
if len(stats) < 256:
if len(stats) <= 256:
break
threshold += 1

Expand Down Expand Up @@ -688,10 +650,8 @@ def make_variants(words: Words) -> Variants:
"""Group word by variant."""
variants: Variants = defaultdict(list)
for word, details in words.items():
# Variant must be normalized by trimming whitespace and lowercasing it.
for variant in details.variants:
if variant:
variants[variant].append(word)
variants[variant].append(word)
return variants


Expand Down
4 changes: 0 additions & 4 deletions wikidict/stubs.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,6 @@ class Word(NamedTuple):
definitions: list[Definitions]
variants: list[str]

@classmethod
def empty(cls) -> "Word":
return cls([], [], [], [], [])


Words = dict[str, Word]
Groups = dict[str, Words]