Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 25 additions & 4 deletions scripts/manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@
import hashlib


SUSPICIOUSLY_SHORT_RATIO = 0.1
SUSPICIOUSLY_SHORT_SOURCE_MIN_BYTES = 200


def file_hash(filepath):
"""Compute SHA-256 hash of a file."""
h = hashlib.sha256()
Expand Down Expand Up @@ -112,20 +116,37 @@ def validate_for_merge(temp_dir):
errors.append(f"Missing output: {chunk['output_file']} (chunk {chunk['id']})")
continue

# Check non-empty
# Check readable and non-empty after trimming. Whitespace-only output
# would be silently skipped by merge, producing a partial book.
output_size = os.path.getsize(output_path)
if output_size == 0:
errors.append(f"Empty output: {chunk['output_file']} (chunk {chunk['id']})")
continue
try:
with open(output_path, 'r', encoding='utf-8') as f:
output_text = f.read()
except Exception as e:
errors.append(
f"Unreadable output: {chunk['output_file']} (chunk {chunk['id']}): {e}"
)
continue
if not output_text.strip():
errors.append(f"Blank output: {chunk['output_file']} (chunk {chunk['id']})")
continue

# Check abnormally short
# Check abnormally short. Very small source chunks can legitimately
# shrink below this ratio, so only hard-fail substantive chunks.
if os.path.exists(source_path):
source_size = os.path.getsize(source_path)
if source_size > 0 and output_size < source_size * 0.1:
warnings.append(
if source_size > 0 and output_size < source_size * SUSPICIOUSLY_SHORT_RATIO:
message = (
f"Suspiciously short: {chunk['output_file']} "
f"({output_size} bytes vs source {source_size} bytes)"
)
if source_size >= SUSPICIOUSLY_SHORT_SOURCE_MIN_BYTES:
errors.append(message)
continue
warnings.append(message)

ordered_output_files.append(output_path)

Expand Down
32 changes: 22 additions & 10 deletions scripts/merge_and_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,15 @@ def natural_sort_key(text):
return [int(t) if t.isdigit() else t.lower() for t in re.split(r'(\d+)', text)]


def _read_nonblank_translated_chunk(file_path):
"""Read a translated chunk and reject whitespace-only content."""
with open(file_path, 'r', encoding='utf-8') as f:
content = f.read().strip()
if not content:
raise ValueError("translated chunk is blank after trimming whitespace")
return content


# =============================================================================
# Step 4: Merge translated markdown files
# =============================================================================
Expand Down Expand Up @@ -326,12 +335,11 @@ def merge_markdown_files(temp_dir):
merged_content = ""
for file_path in ordered_files:
try:
with open(file_path, 'r', encoding='utf-8') as f:
content = f.read().strip()
if content:
merged_content += content + "\n\n"
content = _read_nonblank_translated_chunk(file_path)
merged_content += content + "\n\n"
except Exception as e:
print(f"Error reading {os.path.basename(file_path)}: {e}")
print(f"ERROR: Cannot merge {os.path.basename(file_path)}: {e}")
return False
else:
# Legacy fallback: glob-based merge (no manifest)
print("WARNING: No manifest.json found — using legacy glob-based merge.")
Expand Down Expand Up @@ -371,6 +379,11 @@ def merge_markdown_files(temp_dir):
if os.path.getsize(fp) == 0:
print(f"ERROR: Empty output file: {os.path.basename(fp)}")
return False
try:
_read_nonblank_translated_chunk(fp)
except Exception as e:
print(f"ERROR: Cannot merge {os.path.basename(fp)}: {e}")
return False

# Use source order to determine merge order (via expected output names)
output_files = [
Expand All @@ -382,12 +395,11 @@ def merge_markdown_files(temp_dir):
merged_content = ""
for file_path in output_files:
try:
with open(file_path, 'r', encoding='utf-8') as f:
content = f.read().strip()
if content:
merged_content += content + "\n\n"
content = _read_nonblank_translated_chunk(file_path)
merged_content += content + "\n\n"
except Exception as e:
print(f"Error reading {os.path.basename(file_path)}: {e}")
print(f"ERROR: Cannot merge {os.path.basename(file_path)}: {e}")
return False

try:
with open(output_md, 'w', encoding='utf-8') as f:
Expand Down
14 changes: 14 additions & 0 deletions scripts/run_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,13 @@ def _selected_terms_for_chunk(glossary, source_path):
return glossary_mod.select_terms_for_chunk(glossary, text)


def _read_nonblank_output(output_path):
text = Path(output_path).read_text(encoding='utf-8')
if not text.strip():
raise ValueError(f"Output chunk is blank: {output_path}")
return text


def _term_ids_and_hashes(terms):
ids = []
hashes = {}
Expand Down Expand Up @@ -143,6 +150,7 @@ def build_chunk_record(temp_dir, chunk_id):
raise FileNotFoundError(f"Output chunk not found: {output_path}")
if os.path.getsize(output_path) == 0:
raise ValueError(f"Output chunk is empty: {output_path}")
_read_nonblank_output(output_path)

glossary, glossary_hash, _, _ = _load_glossary(temp_dir)
selected_terms = _selected_terms_for_chunk(glossary, source_path)
Expand Down Expand Up @@ -221,6 +229,12 @@ def plan(temp_dir, retranslate_untracked=False):
elif os.path.getsize(output_path) == 0:
item["action"] = "translate"
_reason(item, "empty_output")
else:
try:
_read_nonblank_output(output_path)
except Exception as e:
item["action"] = "translate"
_reason(item, "invalid_output", str(e))

current_source_hash = file_hash(source_path) if os.path.exists(source_path) else ""
manifest_source_hash = entry.get("manifest_source_hash", "")
Expand Down
55 changes: 55 additions & 0 deletions tests/test_merge_and_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
sys.path.insert(0, str(SCRIPT_DIR))

import merge_and_build # noqa: E402
from manifest import create_manifest # noqa: E402


class GenerateFormatTests(unittest.TestCase):
Expand Down Expand Up @@ -196,6 +197,60 @@ def test_export_name_rejects_paths(self):
merge_and_build.export_named_aliases("/tmp", "../bad")


class MergeMarkdownValidationTests(unittest.TestCase):
def _write(self, path, content):
Path(path).write_text(content, encoding="utf-8")

def _run_merge(self, temp_dir):
buf = io.StringIO()
with contextlib.redirect_stdout(buf):
ok = merge_and_build.merge_markdown_files(temp_dir)
return ok, buf.getvalue()

def test_blank_manifest_output_fails_and_removes_stale_merge(self):
with tempfile.TemporaryDirectory() as temp_dir:
temp_path = Path(temp_dir)
self._write(temp_path / "input.md", "input\n")
self._write(temp_path / "chunk0001.md", "Translatable source text.\n")
self._write(temp_path / "output_chunk0001.md", " \n\t\n")
self._write(temp_path / "output.md", "stale merged content\n")
create_manifest(temp_dir, ["chunk0001.md"], str(temp_path / "input.md"))

ok, out = self._run_merge(temp_dir)

self.assertFalse(ok)
self.assertFalse(temp_path.joinpath("output.md").exists())
self.assertIn("Blank output", out)
self.assertIn("output_chunk0001.md", out)

def test_suspiciously_short_manifest_output_fails_without_writing_merge(self):
with tempfile.TemporaryDirectory() as temp_dir:
temp_path = Path(temp_dir)
self._write(temp_path / "input.md", "input\n")
self._write(temp_path / "chunk0001.md", "A" * 500 + "\n")
self._write(temp_path / "output_chunk0001.md", "ok\n")
create_manifest(temp_dir, ["chunk0001.md"], str(temp_path / "input.md"))

ok, out = self._run_merge(temp_dir)

self.assertFalse(ok)
self.assertFalse(temp_path.joinpath("output.md").exists())
self.assertIn("Suspiciously short", out)
self.assertIn("output_chunk0001.md", out)

def test_legacy_blank_output_fails_without_writing_merge(self):
with tempfile.TemporaryDirectory() as temp_dir:
temp_path = Path(temp_dir)
self._write(temp_path / "chunk0001.md", "Translatable source text.\n")
self._write(temp_path / "output_chunk0001.md", " \n\t\n")

ok, out = self._run_merge(temp_dir)

self.assertFalse(ok)
self.assertFalse(temp_path.joinpath("output.md").exists())
self.assertIn("Cannot merge output_chunk0001.md", out)


class ImageValidationTests(unittest.TestCase):
"""Validates _validate_chunk_images, _check_generated_html_sanity, and the
basic-regex alt-escape fix. Together these guard against subagent-produced
Expand Down
17 changes: 17 additions & 0 deletions tests/test_run_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,23 @@ def test_missing_output_needs_translation(self):

self.assertIn("chunk0002", plan["translation_chunk_ids"])

def test_blank_output_needs_translation(self):
tmp, temp_dir = self._workspace()
with tmp:
self._write(temp_dir / "output_chunk0002.md", " \n\t\n")
plan = run_state.plan(str(temp_dir))

self.assertIn("chunk0002", plan["translation_chunk_ids"])
self.assertNotIn("chunk0002", plan["record_only_chunk_ids"])

def test_record_rejects_blank_output(self):
tmp, temp_dir = self._workspace()
with tmp:
self._write(temp_dir / "output_chunk0001.md", " \n\t\n")

with self.assertRaisesRegex(ValueError, "blank"):
run_state.record_chunks(str(temp_dir), ["chunk0001"])

def test_source_hash_change_needs_translation_after_record(self):
tmp, temp_dir = self._workspace()
with tmp:
Expand Down
Loading