Skip to content

Commit 33a98f2

Browse files
committed
Fix lolcat translator
1 parent 471ab0b commit 33a98f2

2 files changed

Lines changed: 88 additions & 18 deletions

File tree

src/localization_translation/extricate.py

Lines changed: 9 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -52,17 +52,17 @@ def combine_end(data: Iterable[str], end: str = "and") -> str:
5252

5353

5454
TYPE_CHAR: Final = {
55-
"str": "!",
56-
"int": "@",
57-
"float": "#",
58-
"bool": "$",
59-
"dict": "%",
60-
"list": "^",
61-
"NoneType": "&",
55+
"str": "\x01",
56+
"int": "\x02",
57+
"float": "\x03",
58+
"bool": "\x04",
59+
"dict": "\x05",
60+
"list": "\x06",
61+
"NoneType": "\x07",
6262
}
6363
CHAR_TYPE: Final = {v: k for k, v in TYPE_CHAR.items()}
6464

65-
SEP: Final = "*"
65+
SEP: Final = "\x00"
6666

6767

6868
def dict_to_list(data: Any) -> tuple[list[str], list[str]]:
@@ -78,7 +78,7 @@ def read_block(data: Any) -> tuple[list[str], list[str]]:
7878
assert isinstance(data, dict) # Mypy doesn't understand
7979
# If empty dict
8080
if not data:
81-
keys.append(wrap_quotes(f"{SEP}", TYPE_CHAR[dtype]))
81+
keys.append(wrap_quotes(SEP, TYPE_CHAR[dtype]))
8282
values.append("")
8383
# Will not run if no data to enumerate
8484
for key, value in data.items():

src/localization_translation/lolcat.py

Lines changed: 79 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,9 @@ def handle_endtag(self, tag: str) -> None:
8888

8989
def translate_sentence(sentence: str) -> str:
9090
"""Translate sentence."""
91+
if not sentence.strip():
92+
return sentence
93+
9194
browser = mechanicalsoup.StatefulBrowser(soup_config={"features": "html.parser"})
9295
browser.open("https://funtranslations.com/lolcat")
9396
browser.select_form("form#textform")
@@ -100,7 +103,10 @@ def translate_sentence(sentence: str) -> str:
100103

101104
value = parser.value
102105
if value is None:
103-
raise LookupError("Failed to find lolcat span in HTML response")
106+
exc = LookupError("Failed to find lolcat span in HTML response")
107+
exc.add_note(f"{sentence = }")
108+
exc.add_note(response.text)
109+
raise exc
104110
if value.endswith(" ") and not sentence.endswith(" "):
105111
value = value[:-1]
106112

@@ -121,17 +127,39 @@ def translate_block(sentences: list[str], threshold: int = 2048) -> list[str]:
121127
"""Translate sentences in bulk in batches if very big."""
122128
sep = "^&^"
123129
result = []
124-
to_translate = list(reversed(sentences))
130+
to_translate = tuple(sentences)
125131
while to_translate:
126132
block = ""
127-
while to_translate and len(block) < threshold:
128-
block += to_translate.pop() + sep
133+
block_count = 0
134+
while block_count < len(to_translate) and len(block) < threshold:
135+
block += to_translate[block_count] + sep
136+
block_count += 1
129137
if not block:
130138
break
131139
block = block.removesuffix(sep)
132140
response = translate_sentence(block)
133-
result.extend(response.split(sep))
141+
block_result = response.split(sep)
142+
## assert len(block_result) == block_count, f"{len(block_result)} != {block_count}"
143+
if len(block_result) != block_count:
144+
print("Something broke, manually going through one by one")
145+
for sentence in to_translate[:block_count]:
146+
translated = translate_sentence(sentence)
147+
print(f"{sentence!r} -> {translated!r}")
148+
result.append(translated)
149+
else:
150+
result.extend(block_result)
151+
## print(f'{len(to_translate) - block_count = }')
152+
to_translate = to_translate[block_count:]
153+
## print(f'{len(to_translate) = }')
134154
## result.append(translate_sentence(to_translate.pop()))
155+
if len(result) != len(sentences):
156+
for idx in range(max(len(result), len(sentences))):
157+
if idx < len(sentences):
158+
print(repr(sentences[idx]), end="")
159+
print(" --> ", end="")
160+
if idx < len(result):
161+
print(repr(result[idx]), end="")
162+
print()
135163
assert len(result) == len(sentences), f"{len(result)} != {len(sentences)}"
136164
return result
137165

@@ -140,15 +168,57 @@ def translate_file(english: dict[str, T], block_threshold: int = 2048) -> dict[s
140168
"""Translate an entire file."""
141169
keys, sentences = extricate.dict_to_list(english)
142170

143-
results = translate_block(sentences, block_threshold)
144-
145-
for orig, new in zip(enumerate(sentences), results, strict=True):
146-
idx, old = orig
171+
# There apparently exist some bad values somewhere, strip those out
172+
bad_values = set()
173+
good_values = []
174+
for idx, sentence in enumerate(sentences):
175+
if isinstance(sentence, str) and sentence.strip():
176+
good_values.append(idx)
177+
else:
178+
bad_values.add(idx)
179+
180+
# Only translate non-blanks
181+
translate_sentences = [sentences[idx] for idx in good_values]
182+
183+
## # Handle duplicate values
184+
## duplicate_values = {}
185+
## translate_sentences = []
186+
## for idx in good_values:
187+
## sentence = sentences[idx]
188+
## if sentence in translate_sentences:
189+
## duplicate_values[idx] = translate_sentences.index(sentence)
190+
## else:
191+
## translate_sentences.append(sentence)
192+
193+
translate_results = translate_block(translate_sentences, block_threshold)
194+
195+
## # Rebuild with duplicate values
196+
## translate_results = []
197+
## for deduped_index, idx in enumerate(good_values):
198+
## duplicate_entry = duplicate_values.get(idx)
199+
## if duplicate_entry is not None:
200+
## translate_results.append(translate_results[duplicate_entry])
201+
## else:
202+
## translate_results.append(translate_deduplicated_results[deduped_index])
203+
204+
# Rebuild full results with blanks
205+
results = []
206+
index = 0
207+
for idx, original in enumerate(sentences):
208+
if idx in bad_values:
209+
results.append(original)
210+
else:
211+
results.append(translate_results[index])
212+
index += 1
213+
214+
for (idx, old), new in zip(enumerate(sentences), results, strict=True):
147215
if new is None or not isinstance(old, str):
148216
results[idx] = old
149217
continue
150218
if old.endswith(" ") and not new.endswith(" "):
151219
results[idx] = new + " "
220+
assert len(results) == len(sentences)
221+
## print("\n".join(" --> ".join(x) for x in zip(sentences, results)))
152222
return extricate.list_to_dict(keys, results) # type: ignore
153223

154224

0 commit comments

Comments
 (0)