Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion babeldoc/format/pdf/document_il/backend/pdf_creater.py
Original file line number Diff line number Diff line change
Expand Up @@ -1402,7 +1402,7 @@ def write(
auto_extracted_glossary_path = self.translation_config.get_output_file_path(
f"{basename}{debug_suffix}.{translation_config.lang_out}.glossary.csv"
)
with auto_extracted_glossary_path.open("w", encoding="utf-8") as f:
with auto_extracted_glossary_path.open("w", encoding="utf-8-sig") as f:
logger.info(
f"save auto extracted glossary to {auto_extracted_glossary_path}"
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -394,14 +394,14 @@ def procress(self, doc_il: ILDocument):
"term_extractor_tracking.json"
)
logger.debug(f"save translate tracking to {path}")
with Path(path).open("w", encoding="utf-8") as f:
with Path(path).open("w", encoding="utf-8-sig") as f:
Copy link

@cubic-dev-ai cubic-dev-ai bot Mar 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2: JSON debug artifacts are now written with a UTF-8 BOM (`utf-8-sig`), which reduces interoperability and may break downstream parsers that expect standard BOM-less UTF-8 JSON.

Prompt for AI agents
Check if this issue is valid — if so, understand the root cause and fix it. At babeldoc/format/pdf/document_il/midend/automatic_term_extractor.py, line 397:

<comment>JSON debug artifacts are now written with UTF-8 BOM (`utf-8-sig`), reducing interoperability and potentially breaking downstream parsers expecting standard BOM-less UTF-8 JSON.</comment>

<file context>
@@ -394,14 +394,14 @@ def procress(self, doc_il: ILDocument):
             )
             logger.debug(f"save translate tracking to {path}")
-            with Path(path).open("w", encoding="utf-8") as f:
+            with Path(path).open("w", encoding="utf-8-sig") as f:
                 f.write(tracker.to_json())
 
</file context>
Suggested change
with Path(path).open("w", encoding="utf-8-sig") as f:
with Path(path).open("w", encoding="utf-8") as f:
Fix with Cubic

f.write(tracker.to_json())

path = self.translation_config.get_working_file_path(
"term_extractor_freq.json"
)
logger.debug(f"save term frequency to {path}")
with Path(path).open("w", encoding="utf-8") as f:
with Path(path).open("w", encoding="utf-8-sig") as f:
json.dump(
self.shared_context.raw_extracted_terms,
f,
Expand All @@ -413,7 +413,7 @@ def procress(self, doc_il: ILDocument):
"auto_extractor_glossary.csv"
)
logger.debug(f"save auto extracted glossary to {path}")
with Path(path).open("w", encoding="utf-8") as f:
with Path(path).open("w", encoding="utf-8-sig") as f:
auto_extracted_glossary = self.shared_context.auto_extracted_glossary
if auto_extracted_glossary:
f.write(auto_extracted_glossary.to_csv())
2 changes: 1 addition & 1 deletion babeldoc/format/pdf/result_merger.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ def merge_results(
auto_extracted_glossary_path = self.config.get_output_file_path(
f"{basename}{debug_suffix}.{self.config.lang_out}.glossary.csv"
)
with auto_extracted_glossary_path.open("w", encoding="utf-8") as f:
with auto_extracted_glossary_path.open("w", encoding="utf-8-sig") as f:
logger.info(
f"save auto extracted glossary to {auto_extracted_glossary_path}"
)
Expand Down