Skip to content

Commit ac4225c

Browse files
authored
Merge pull request #52 from myk-org/fix/issue-51-doc-quality
fix: Doc generation quality — AI artifacts, link rot, internal leaks (#51)
2 parents 90e46ab + 5f9b349 commit ac4225c

File tree

17 files changed

+1190
-303
lines changed

17 files changed

+1190
-303
lines changed

CLAUDE.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,11 @@ When adding new code:
2828

2929
| Resource Type | Location | Examples |
3030
|---|---|---|
31-
| Python constants | `src/docsfy/models.py` | `VALID_PROVIDERS`, `DEFAULT_BRANCH`, `DOCSFY_DOCS_URL`, `DOCSFY_REPO_URL` |
31+
| Python constants | `src/docsfy/models.py` | `VALID_PROVIDERS`, `DEFAULT_BRANCH`, `PAGE_TYPES`, `DOCSFY_DOCS_URL`, `DOCSFY_REPO_URL` |
3232
| Data models | `src/docsfy/models.py` | `GenerateRequest`, `DocPlan`, `DocPage`, `NavGroup` |
3333
| DB constants & validators | `src/docsfy/storage.py` | `VALID_STATUSES`, `VALID_ROLES`, `_validate_name()`, `_validate_owner()` |
3434
| Git timeouts | `src/docsfy/repository.py` | `_CLONE_TIMEOUT`, `_FETCH_TIMEOUT`, `_DIFF_TIMEOUT` |
35-
| Prompt constants | `src/docsfy/prompts.py` | `_MAX_DIFF_LENGTH`, `_PAGE_WRITING_RULES` |
35+
| Prompt constants | `src/docsfy/prompts.py` | `_MAX_DIFF_LENGTH`, `_GUIDE_WRITING_RULES`, `_REFERENCE_WRITING_RULES`, `_RECIPE_WRITING_RULES`, `_CONCEPT_WRITING_RULES`, `_INCREMENTAL_WRITING_RULES`, `truncate_diff_content()` |
3636
| Frontend constants | `frontend/src/lib/constants.ts` | API base URL, poll intervals, toast durations |
3737
| Frontend types | `frontend/src/types/index.ts` | `Project`, `User`, `Variant`, `AuthState` |
3838
| Frontend API client | `frontend/src/lib/api.ts` | `fetchProjects()`, `login()`, `generateDocs()` |

src/docsfy/api/projects.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,13 @@
3030
VALID_PROVIDERS,
3131
GenerateRequest,
3232
)
33-
from docsfy.postprocess import add_cross_links, detect_version, validate_pages
33+
from docsfy.postprocess import (
34+
add_cross_links,
35+
detect_version,
36+
fix_broken_internal_links,
37+
linkify_plain_references,
38+
validate_pages,
39+
)
3440
from docsfy.renderer import render_site
3541
from docsfy.repository import (
3642
clone_repo,
@@ -1016,6 +1022,11 @@ async def _on_page_generated(page_count: int) -> None:
10161022
current_stage="cross_linking",
10171023
page_count=len(pages),
10181024
)
1025+
pages = fix_broken_internal_links(pages, plan, project_name=project_name)
1026+
try:
1027+
pages = linkify_plain_references(pages, plan, project_name=project_name)
1028+
except Exception as exc:
1029+
logger.warning(f"[{project_name}] linkify_plain_references failed: {exc}")
10191030
pages = await add_cross_links(
10201031
pages=pages,
10211032
plan=plan,

src/docsfy/config.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,11 @@ class Settings(BaseSettings):
2020
log_level: str = "INFO"
2121
data_dir: str = "/data"
2222
secure_cookies: bool = True # Set to False for local HTTP dev
23+
max_concurrent_pages: int = Field(
24+
default=10,
25+
gt=0,
26+
description="Maximum number of AI CLI calls to run in parallel during page generation and validation",
27+
)
2328

2429

2530
@lru_cache

src/docsfy/generator.py

Lines changed: 162 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
from __future__ import annotations
22

3+
import json
4+
import re
5+
import shutil
6+
import tempfile
37
from collections.abc import Awaitable, Callable
48
from pathlib import Path
59
from typing import Any
@@ -10,12 +14,13 @@
1014
from docsfy.json_parser import parse_json_array_response, parse_json_response
1115
from pydantic import ValidationError
1216

13-
from docsfy.models import DEFAULT_BRANCH, MAX_CONCURRENT_PAGES, DocPlan
17+
from docsfy.models import DEFAULT_BRANCH, PAGE_TYPES, DocPlan
1418
from docsfy.prompts import (
1519
build_incremental_page_prompt,
1620
build_incremental_planner_prompt,
1721
build_page_prompt,
1822
build_planner_prompt,
23+
truncate_diff_content,
1924
)
2025

2126
logger = get_logger(name=__name__)
@@ -37,6 +42,52 @@ def _strip_ai_preamble(text: str) -> str:
3742
return text
3843

3944

45+
_AI_COMMENTARY_END_MARKERS = (
46+
"\nWait -",
47+
"\nWait,",
48+
"\nLet me refine",
49+
"\nLet me remove",
50+
"\nI should ",
51+
"\nI'll also ",
52+
"\nI'll remove",
53+
"\nSo I should",
54+
"\n`</think>`",
55+
)
56+
57+
58+
def _strip_ai_artifacts(text: str) -> str:
59+
"""Strip AI thinking/reasoning artifacts from generated content.
60+
61+
Removes:
62+
- <think>...</think> blocks anywhere in the text
63+
- </think> orphan closing tags
64+
- Self-referential AI commentary at the end (e.g., "Wait - the user said...",
65+
"Let me refine:", "I should NOT include...")
66+
"""
67+
# Remove <think>...</think> blocks (including multiline)
68+
while "<think>" in text and "</think>" in text:
69+
text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
70+
71+
# Remove orphan </think> tags
72+
text = re.sub(r"</think>", "", text)
73+
74+
# Remove orphan <think> tags
75+
text = re.sub(r"<think>", "", text)
76+
77+
# Only scan the tail of the output for self-referential AI commentary.
78+
# These markers only appear at the very end when the AI "thinks out loud"
79+
# after finishing. Scanning the full text risks truncating legitimate prose.
80+
if len(text) > 500:
81+
tail_offset = len(text) - 500
82+
for marker in _AI_COMMENTARY_END_MARKERS:
83+
idx = text.find(marker, tail_offset)
84+
if idx >= 0:
85+
text = text[:idx]
86+
break # Only apply the first match
87+
88+
return text.strip()
89+
90+
4091
async def _call_ai_or_raise(
4192
prompt: str,
4293
repo_path: Path,
@@ -156,12 +207,16 @@ async def generate_full_page_content(
156207
ai_model: str,
157208
ai_cli_timeout: int | None = None,
158209
exclusions_path: str | None = None,
210+
page_type: str = "guide",
211+
other_pages_path: str | None = None,
159212
) -> str:
160213
prompt = build_page_prompt(
161214
project_name=project_name,
162215
page_title=page_title,
163216
page_description=page_description,
217+
page_type=page_type,
164218
exclusions_path=exclusions_path,
219+
other_pages_path=other_pages_path,
165220
)
166221
output = await _call_ai_or_raise(
167222
prompt=prompt,
@@ -170,7 +225,7 @@ async def generate_full_page_content(
170225
ai_model=ai_model,
171226
ai_cli_timeout=ai_cli_timeout,
172227
)
173-
return _strip_ai_preamble(output)
228+
return _strip_ai_artifacts(_strip_ai_preamble(output))
174229

175230

176231
async def _generate_incremental_page_content(
@@ -184,22 +239,35 @@ async def _generate_incremental_page_content(
184239
ai_provider: str,
185240
ai_model: str,
186241
ai_cli_timeout: int | None = None,
242+
page_type: str = "guide",
187243
) -> str:
188-
prompt = build_incremental_page_prompt(
189-
project_name=project_name,
190-
page_title=page_title,
191-
page_description=page_description,
192-
existing_content=existing_content,
193-
changed_files=changed_files,
194-
diff_content=diff_content,
195-
)
196-
output = await _call_ai_or_raise(
197-
prompt=prompt,
198-
repo_path=repo_path,
199-
ai_provider=ai_provider,
200-
ai_model=ai_model,
201-
ai_cli_timeout=ai_cli_timeout,
202-
)
244+
job_dir = Path(tempfile.mkdtemp(prefix="docsfy-incremental-page-"))
245+
try:
246+
existing_page_file = job_dir / "existing_page.md"
247+
existing_page_file.write_text(existing_content, encoding="utf-8")
248+
249+
truncated_diff = truncate_diff_content(diff_content)
250+
diff_file = job_dir / "diff.patch"
251+
diff_file.write_text(truncated_diff, encoding="utf-8")
252+
253+
prompt = build_incremental_page_prompt(
254+
project_name=project_name,
255+
page_title=page_title,
256+
page_description=page_description,
257+
existing_page_path=str(existing_page_file),
258+
changed_files=changed_files,
259+
diff_path=str(diff_file),
260+
page_type=page_type,
261+
)
262+
output = await _call_ai_or_raise(
263+
prompt=prompt,
264+
repo_path=repo_path,
265+
ai_provider=ai_provider,
266+
ai_model=ai_model,
267+
ai_cli_timeout=ai_cli_timeout,
268+
)
269+
finally:
270+
shutil.rmtree(job_dir, ignore_errors=True)
203271
return _apply_incremental_page_updates(existing_content, output)
204272

205273

@@ -260,6 +328,8 @@ async def generate_page(
260328
diff_content: str | None = None,
261329
branch: str = DEFAULT_BRANCH,
262330
on_page_generated: Callable[[int], Awaitable[None]] | None = None,
331+
page_type: str = "guide",
332+
other_pages_path: str | None = None,
263333
) -> str:
264334
_label = project_name or repo_path.name
265335
prompt_project_name = project_name or repo_path.name
@@ -291,6 +361,7 @@ async def generate_page(
291361
ai_provider=ai_provider,
292362
ai_model=ai_model,
293363
ai_cli_timeout=ai_cli_timeout,
364+
page_type=page_type,
294365
)
295366
except (RuntimeError, ValueError) as exc:
296367
logger.warning(
@@ -305,6 +376,8 @@ async def generate_page(
305376
ai_provider=ai_provider,
306377
ai_model=ai_model,
307378
ai_cli_timeout=ai_cli_timeout,
379+
page_type=page_type,
380+
other_pages_path=other_pages_path,
308381
)
309382
else:
310383
output = await generate_full_page_content(
@@ -315,6 +388,8 @@ async def generate_page(
315388
ai_provider=ai_provider,
316389
ai_model=ai_model,
317390
ai_cli_timeout=ai_cli_timeout,
391+
page_type=page_type,
392+
other_pages_path=other_pages_path,
318393
)
319394
except RuntimeError as exc:
320395
logger.warning(f"[{_label}] Failed to generate page '{slug}': {exc}")
@@ -380,40 +455,64 @@ async def generate_all_pages(
380455
if is_unsafe_slug(slug):
381456
logger.warning(f"[{_label}] Skipping path-unsafe slug: '{slug}'")
382457
continue
458+
_page_type = page.get("type", "guide")
459+
if _page_type not in PAGE_TYPES:
460+
logger.warning(
461+
f"[{_label}] Unknown page type '{_page_type}' for slug '{slug}', "
462+
f"falling back to 'guide'"
463+
)
464+
_page_type = "guide"
383465
all_pages.append(
384466
{
385467
"slug": slug,
386468
"title": title,
387469
"description": page.get("description", ""),
470+
"type": _page_type,
388471
}
389472
)
390473

391-
_existing_pages = existing_pages or {}
392-
coroutines = [
393-
generate_page(
394-
repo_path=repo_path,
395-
slug=p["slug"],
396-
title=p["title"],
397-
description=p["description"],
398-
cache_dir=cache_dir,
399-
ai_provider=ai_provider,
400-
ai_model=ai_model,
401-
ai_cli_timeout=ai_cli_timeout,
402-
use_cache=use_cache,
403-
project_name=project_name,
404-
owner=owner,
405-
existing_content=_existing_pages.get(p["slug"]),
406-
changed_files=changed_files,
407-
diff_content=diff_content,
408-
branch=branch,
409-
on_page_generated=on_page_generated,
410-
)
411-
for p in all_pages
412-
]
474+
# Write page manifest once for cross-referencing (GOLDEN RULE: don't inline in prompts)
475+
pages_manifest_dir = Path(tempfile.mkdtemp(prefix="docsfy-pages-manifest-"))
476+
try:
477+
pages_manifest_path = pages_manifest_dir / "pages.txt"
478+
manifest_lines = [
479+
f"- [{p['title']}]({p['slug']}.html) \u2014 {p['description']}"
480+
for p in all_pages
481+
]
482+
pages_manifest_path.write_text("\n".join(manifest_lines), encoding="utf-8")
483+
484+
_existing_pages = existing_pages or {}
485+
coroutines = [
486+
generate_page(
487+
repo_path=repo_path,
488+
slug=p["slug"],
489+
title=p["title"],
490+
description=p["description"],
491+
cache_dir=cache_dir,
492+
page_type=p["type"],
493+
ai_provider=ai_provider,
494+
ai_model=ai_model,
495+
ai_cli_timeout=ai_cli_timeout,
496+
use_cache=use_cache,
497+
project_name=project_name,
498+
owner=owner,
499+
existing_content=_existing_pages.get(p["slug"]),
500+
changed_files=changed_files,
501+
diff_content=diff_content,
502+
branch=branch,
503+
on_page_generated=on_page_generated,
504+
other_pages_path=str(pages_manifest_path),
505+
)
506+
for p in all_pages
507+
]
413508

414-
results = await run_parallel_with_limit(
415-
coroutines, max_concurrency=MAX_CONCURRENT_PAGES
416-
)
509+
from docsfy.config import get_settings
510+
511+
results = await run_parallel_with_limit(
512+
coroutines, max_concurrency=get_settings().max_concurrent_pages
513+
)
514+
finally:
515+
shutil.rmtree(pages_manifest_dir, ignore_errors=True)
417516
pages: dict[str, str] = {}
418517
for page_info, result in zip(all_pages, results):
419518
if isinstance(result, Exception):
@@ -443,20 +542,29 @@ async def run_incremental_planner(
443542
logger.info(
444543
f"[{project_name}] Running incremental planner for {len(changed_files)} changed files"
445544
)
446-
prompt = build_incremental_planner_prompt(
447-
project_name, changed_files, existing_plan
448-
)
545+
job_dir = Path(tempfile.mkdtemp(prefix="docsfy-incremental-plan-"))
449546
try:
450-
output = await _call_ai_or_raise(
451-
prompt=prompt,
452-
repo_path=repo_path,
453-
ai_provider=ai_provider,
454-
ai_model=ai_model,
455-
ai_cli_timeout=ai_cli_timeout,
547+
plan_file = job_dir / "existing_plan.json"
548+
plan_file.write_text(json.dumps(existing_plan, indent=2), encoding="utf-8")
549+
550+
prompt = build_incremental_planner_prompt(
551+
project_name, changed_files, str(plan_file)
456552
)
457-
except RuntimeError:
458-
logger.warning(f"[{project_name}] Incremental planner failed, regenerating all")
459-
return ["all"]
553+
try:
554+
output = await _call_ai_or_raise(
555+
prompt=prompt,
556+
repo_path=repo_path,
557+
ai_provider=ai_provider,
558+
ai_model=ai_model,
559+
ai_cli_timeout=ai_cli_timeout,
560+
)
561+
except RuntimeError:
562+
logger.warning(
563+
f"[{project_name}] Incremental planner failed, regenerating all"
564+
)
565+
return ["all"]
566+
finally:
567+
shutil.rmtree(job_dir, ignore_errors=True)
460568

461569
raw_result = parse_json_array_response(output)
462570
if raw_result is None or not isinstance(raw_result, list):

0 commit comments

Comments
 (0)