@@ -88,6 +88,9 @@ def handle_endtag(self, tag: str) -> None:
8888
8989def translate_sentence (sentence : str ) -> str :
9090 """Translate sentence."""
91+ if not sentence .strip ():
92+ return sentence
93+
9194 browser = mechanicalsoup .StatefulBrowser (soup_config = {"features" : "html.parser" })
9295 browser .open ("https://funtranslations.com/lolcat" )
9396 browser .select_form ("form#textform" )
@@ -100,7 +103,10 @@ def translate_sentence(sentence: str) -> str:
100103
101104 value = parser .value
102105 if value is None :
103- raise LookupError ("Failed to find lolcat span in HTML response" )
106+ exc = LookupError ("Failed to find lolcat span in HTML response" )
107+ exc .add_note (f"{ sentence = } " )
108+ exc .add_note (response .text )
109+ raise exc
104110 if value .endswith (" " ) and not sentence .endswith (" " ):
105111 value = value [:- 1 ]
106112
def translate_block(sentences: list[str], threshold: int = 2048) -> list[str]:
    """Translate sentences in bulk, in batches if the input is very big.

    Consecutive sentences are joined with a separator token and sent to
    ``translate_sentence`` as one request. The response is split on the
    same token to recover the individual translations.

    Args:
        sentences: Sentences to translate, in order.
        threshold: Soft size limit, in characters, for a batch. A batch
            stops growing once its accumulated length (each sentence plus
            one separator) reaches this value, so a batch may exceed it by
            at most one sentence. Every batch holds at least one sentence,
            so a non-positive threshold translates one sentence at a time
            instead of producing no output.

    Returns:
        The translated sentences, one per input sentence, in input order.
    """
    sep = "^&^"
    result: list[str] = []
    total = len(sentences)
    start = 0
    while start < total:
        # Grow the batch until the size budget is used up. The first
        # sentence is always taken so that each pass makes progress.
        end = start
        size = 0
        while end < total and (end == start or size < threshold):
            size += len(sentences[end]) + len(sep)
            end += 1
        batch = sentences[start:end]

        block_result = translate_sentence(sep.join(batch)).split(sep)
        if len(block_result) != len(batch):
            # The separator did not survive translation (or a sentence
            # contained it), so the pieces cannot be matched back up.
            # Fall back to translating this batch one sentence at a time.
            print("Something broke, manually going through one by one")
            block_result = []
            for sentence in batch:
                translated = translate_sentence(sentence)
                print(f"{sentence!r} -> {translated!r}")
                block_result.append(translated)
        result.extend(block_result)
        start = end

    # Internal invariant: every batch contributes exactly one entry per
    # input sentence, so the lengths must agree here.
    assert len(result) == len(sentences), f"{len(result)} != {len(sentences)}"
    return result
137165
@@ -140,15 +168,57 @@ def translate_file(english: dict[str, T], block_threshold: int = 2048) -> dict[s
140168 """Translate an entire file."""
141169 keys , sentences = extricate .dict_to_list (english )
142170
143- results = translate_block (sentences , block_threshold )
144-
145- for orig , new in zip (enumerate (sentences ), results , strict = True ):
146- idx , old = orig
171+ # There apparently exist some bad values somewhere, strip those out
172+ bad_values = set ()
173+ good_values = []
174+ for idx , sentence in enumerate (sentences ):
175+ if isinstance (sentence , str ) and sentence .strip ():
176+ good_values .append (idx )
177+ else :
178+ bad_values .add (idx )
179+
180+ # Only translate non-blanks
181+ translate_sentences = [sentences [idx ] for idx in good_values ]
182+
183+ ## # Handle duplicate values
184+ ## duplicate_values = {}
185+ ## translate_sentences = []
186+ ## for idx in good_values:
187+ ## sentence = sentences[idx]
188+ ## if sentence in translate_sentences:
189+ ## duplicate_values[idx] = translate_sentences.index(sentence)
190+ ## else:
191+ ## translate_sentences.append(sentence)
192+
193+ translate_results = translate_block (translate_sentences , block_threshold )
194+
195+ ## # Rebuild with duplicate values
196+ ## translate_results = []
197+ ## for deduped_index, idx in enumerate(good_values):
198+ ## duplicate_entry = duplicate_values.get(idx)
199+ ## if duplicate_entry is not None:
200+ ## translate_results.append(translate_results[duplicate_entry])
201+ ## else:
202+ ## translate_results.append(translate_deduplicated_results[deduped_index])
203+
204+ # Rebuild full results with blanks
205+ results = []
206+ index = 0
207+ for idx , original in enumerate (sentences ):
208+ if idx in bad_values :
209+ results .append (original )
210+ else :
211+ results .append (translate_results [index ])
212+ index += 1
213+
214+ for (idx , old ), new in zip (enumerate (sentences ), results , strict = True ):
147215 if new is None or not isinstance (old , str ):
148216 results [idx ] = old
149217 continue
150218 if old .endswith (" " ) and not new .endswith (" " ):
151219 results [idx ] = new + " "
220+ assert len (results ) == len (sentences )
221+ ## print("\n".join(" --> ".join(x) for x in zip(sentences, results)))
152222 return extricate .list_to_dict (keys , results ) # type: ignore
153223
154224
0 commit comments