11from __future__ import annotations
22
3+ import json
4+ import re
5+ import shutil
6+ import tempfile
37from collections .abc import Awaitable , Callable
48from pathlib import Path
59from typing import Any
1014from docsfy .json_parser import parse_json_array_response , parse_json_response
1115from pydantic import ValidationError
1216
13- from docsfy .models import DEFAULT_BRANCH , MAX_CONCURRENT_PAGES , DocPlan
17+ from docsfy .models import DEFAULT_BRANCH , PAGE_TYPES , DocPlan
1418from docsfy .prompts import (
1519 build_incremental_page_prompt ,
1620 build_incremental_planner_prompt ,
1721 build_page_prompt ,
1822 build_planner_prompt ,
23+ truncate_diff_content ,
1924)
2025
2126logger = get_logger (name = __name__ )
@@ -37,6 +42,52 @@ def _strip_ai_preamble(text: str) -> str:
3742 return text
3843
3944
45+ _AI_COMMENTARY_END_MARKERS = (
46+ "\n Wait -" ,
47+ "\n Wait," ,
48+ "\n Let me refine" ,
49+ "\n Let me remove" ,
50+ "\n I should " ,
51+ "\n I'll also " ,
52+ "\n I'll remove" ,
53+ "\n So I should" ,
54+ "\n `</think>`" ,
55+ )
56+
57+
58+ def _strip_ai_artifacts (text : str ) -> str :
59+ """Strip AI thinking/reasoning artifacts from generated content.
60+
61+ Removes:
62+ - <think>...</think> blocks anywhere in the text
63+ - </think> orphan closing tags
64+ - Self-referential AI commentary at the end (e.g., "Wait - the user said...",
65+ "Let me refine:", "I should NOT include...")
66+ """
67+ # Remove <think>...</think> blocks (including multiline)
68+ while "<think>" in text and "</think>" in text :
69+ text = re .sub (r"<think>.*?</think>" , "" , text , flags = re .DOTALL )
70+
71+ # Remove orphan </think> tags
72+ text = re .sub (r"</think>" , "" , text )
73+
74+ # Remove orphan <think> tags
75+ text = re .sub (r"<think>" , "" , text )
76+
77+ # Only scan the tail of the output for self-referential AI commentary.
78+ # These markers only appear at the very end when the AI "thinks out loud"
79+ # after finishing. Scanning the full text risks truncating legitimate prose.
80+ if len (text ) > 500 :
81+ tail_offset = len (text ) - 500
82+ for marker in _AI_COMMENTARY_END_MARKERS :
83+ idx = text .find (marker , tail_offset )
84+ if idx >= 0 :
85+ text = text [:idx ]
86+ break # Only apply the first match
87+
88+ return text .strip ()
89+
90+
4091async def _call_ai_or_raise (
4192 prompt : str ,
4293 repo_path : Path ,
@@ -156,12 +207,16 @@ async def generate_full_page_content(
156207 ai_model : str ,
157208 ai_cli_timeout : int | None = None ,
158209 exclusions_path : str | None = None ,
210+ page_type : str = "guide" ,
211+ other_pages_path : str | None = None ,
159212) -> str :
160213 prompt = build_page_prompt (
161214 project_name = project_name ,
162215 page_title = page_title ,
163216 page_description = page_description ,
217+ page_type = page_type ,
164218 exclusions_path = exclusions_path ,
219+ other_pages_path = other_pages_path ,
165220 )
166221 output = await _call_ai_or_raise (
167222 prompt = prompt ,
@@ -170,7 +225,7 @@ async def generate_full_page_content(
170225 ai_model = ai_model ,
171226 ai_cli_timeout = ai_cli_timeout ,
172227 )
173- return _strip_ai_preamble (output )
228+ return _strip_ai_artifacts ( _strip_ai_preamble (output ) )
174229
175230
176231async def _generate_incremental_page_content (
@@ -184,22 +239,35 @@ async def _generate_incremental_page_content(
184239 ai_provider : str ,
185240 ai_model : str ,
186241 ai_cli_timeout : int | None = None ,
242+ page_type : str = "guide" ,
187243) -> str :
188- prompt = build_incremental_page_prompt (
189- project_name = project_name ,
190- page_title = page_title ,
191- page_description = page_description ,
192- existing_content = existing_content ,
193- changed_files = changed_files ,
194- diff_content = diff_content ,
195- )
196- output = await _call_ai_or_raise (
197- prompt = prompt ,
198- repo_path = repo_path ,
199- ai_provider = ai_provider ,
200- ai_model = ai_model ,
201- ai_cli_timeout = ai_cli_timeout ,
202- )
244+ job_dir = Path (tempfile .mkdtemp (prefix = "docsfy-incremental-page-" ))
245+ try :
246+ existing_page_file = job_dir / "existing_page.md"
247+ existing_page_file .write_text (existing_content , encoding = "utf-8" )
248+
249+ truncated_diff = truncate_diff_content (diff_content )
250+ diff_file = job_dir / "diff.patch"
251+ diff_file .write_text (truncated_diff , encoding = "utf-8" )
252+
253+ prompt = build_incremental_page_prompt (
254+ project_name = project_name ,
255+ page_title = page_title ,
256+ page_description = page_description ,
257+ existing_page_path = str (existing_page_file ),
258+ changed_files = changed_files ,
259+ diff_path = str (diff_file ),
260+ page_type = page_type ,
261+ )
262+ output = await _call_ai_or_raise (
263+ prompt = prompt ,
264+ repo_path = repo_path ,
265+ ai_provider = ai_provider ,
266+ ai_model = ai_model ,
267+ ai_cli_timeout = ai_cli_timeout ,
268+ )
269+ finally :
270+ shutil .rmtree (job_dir , ignore_errors = True )
203271 return _apply_incremental_page_updates (existing_content , output )
204272
205273
@@ -260,6 +328,8 @@ async def generate_page(
260328 diff_content : str | None = None ,
261329 branch : str = DEFAULT_BRANCH ,
262330 on_page_generated : Callable [[int ], Awaitable [None ]] | None = None ,
331+ page_type : str = "guide" ,
332+ other_pages_path : str | None = None ,
263333) -> str :
264334 _label = project_name or repo_path .name
265335 prompt_project_name = project_name or repo_path .name
@@ -291,6 +361,7 @@ async def generate_page(
291361 ai_provider = ai_provider ,
292362 ai_model = ai_model ,
293363 ai_cli_timeout = ai_cli_timeout ,
364+ page_type = page_type ,
294365 )
295366 except (RuntimeError , ValueError ) as exc :
296367 logger .warning (
@@ -305,6 +376,8 @@ async def generate_page(
305376 ai_provider = ai_provider ,
306377 ai_model = ai_model ,
307378 ai_cli_timeout = ai_cli_timeout ,
379+ page_type = page_type ,
380+ other_pages_path = other_pages_path ,
308381 )
309382 else :
310383 output = await generate_full_page_content (
@@ -315,6 +388,8 @@ async def generate_page(
315388 ai_provider = ai_provider ,
316389 ai_model = ai_model ,
317390 ai_cli_timeout = ai_cli_timeout ,
391+ page_type = page_type ,
392+ other_pages_path = other_pages_path ,
318393 )
319394 except RuntimeError as exc :
320395 logger .warning (f"[{ _label } ] Failed to generate page '{ slug } ': { exc } " )
@@ -380,40 +455,64 @@ async def generate_all_pages(
380455 if is_unsafe_slug (slug ):
381456 logger .warning (f"[{ _label } ] Skipping path-unsafe slug: '{ slug } '" )
382457 continue
458+ _page_type = page .get ("type" , "guide" )
459+ if _page_type not in PAGE_TYPES :
460+ logger .warning (
461+ f"[{ _label } ] Unknown page type '{ _page_type } ' for slug '{ slug } ', "
462+ f"falling back to 'guide'"
463+ )
464+ _page_type = "guide"
383465 all_pages .append (
384466 {
385467 "slug" : slug ,
386468 "title" : title ,
387469 "description" : page .get ("description" , "" ),
470+ "type" : _page_type ,
388471 }
389472 )
390473
391- _existing_pages = existing_pages or {}
392- coroutines = [
393- generate_page (
394- repo_path = repo_path ,
395- slug = p ["slug" ],
396- title = p ["title" ],
397- description = p ["description" ],
398- cache_dir = cache_dir ,
399- ai_provider = ai_provider ,
400- ai_model = ai_model ,
401- ai_cli_timeout = ai_cli_timeout ,
402- use_cache = use_cache ,
403- project_name = project_name ,
404- owner = owner ,
405- existing_content = _existing_pages .get (p ["slug" ]),
406- changed_files = changed_files ,
407- diff_content = diff_content ,
408- branch = branch ,
409- on_page_generated = on_page_generated ,
410- )
411- for p in all_pages
412- ]
474+ # Write page manifest once for cross-referencing (GOLDEN RULE: don't inline in prompts)
475+ pages_manifest_dir = Path (tempfile .mkdtemp (prefix = "docsfy-pages-manifest-" ))
476+ try :
477+ pages_manifest_path = pages_manifest_dir / "pages.txt"
478+ manifest_lines = [
479+ f"- [{ p ['title' ]} ]({ p ['slug' ]} .html) \u2014 { p ['description' ]} "
480+ for p in all_pages
481+ ]
482+ pages_manifest_path .write_text ("\n " .join (manifest_lines ), encoding = "utf-8" )
483+
484+ _existing_pages = existing_pages or {}
485+ coroutines = [
486+ generate_page (
487+ repo_path = repo_path ,
488+ slug = p ["slug" ],
489+ title = p ["title" ],
490+ description = p ["description" ],
491+ cache_dir = cache_dir ,
492+ page_type = p ["type" ],
493+ ai_provider = ai_provider ,
494+ ai_model = ai_model ,
495+ ai_cli_timeout = ai_cli_timeout ,
496+ use_cache = use_cache ,
497+ project_name = project_name ,
498+ owner = owner ,
499+ existing_content = _existing_pages .get (p ["slug" ]),
500+ changed_files = changed_files ,
501+ diff_content = diff_content ,
502+ branch = branch ,
503+ on_page_generated = on_page_generated ,
504+ other_pages_path = str (pages_manifest_path ),
505+ )
506+ for p in all_pages
507+ ]
413508
414- results = await run_parallel_with_limit (
415- coroutines , max_concurrency = MAX_CONCURRENT_PAGES
416- )
509+ from docsfy .config import get_settings
510+
511+ results = await run_parallel_with_limit (
512+ coroutines , max_concurrency = get_settings ().max_concurrent_pages
513+ )
514+ finally :
515+ shutil .rmtree (pages_manifest_dir , ignore_errors = True )
417516 pages : dict [str , str ] = {}
418517 for page_info , result in zip (all_pages , results ):
419518 if isinstance (result , Exception ):
@@ -443,20 +542,29 @@ async def run_incremental_planner(
443542 logger .info (
444543 f"[{ project_name } ] Running incremental planner for { len (changed_files )} changed files"
445544 )
446- prompt = build_incremental_planner_prompt (
447- project_name , changed_files , existing_plan
448- )
545+ job_dir = Path (tempfile .mkdtemp (prefix = "docsfy-incremental-plan-" ))
449546 try :
450- output = await _call_ai_or_raise (
451- prompt = prompt ,
452- repo_path = repo_path ,
453- ai_provider = ai_provider ,
454- ai_model = ai_model ,
455- ai_cli_timeout = ai_cli_timeout ,
547+ plan_file = job_dir / "existing_plan.json"
548+ plan_file .write_text (json .dumps (existing_plan , indent = 2 ), encoding = "utf-8" )
549+
550+ prompt = build_incremental_planner_prompt (
551+ project_name , changed_files , str (plan_file )
456552 )
457- except RuntimeError :
458- logger .warning (f"[{ project_name } ] Incremental planner failed, regenerating all" )
459- return ["all" ]
553+ try :
554+ output = await _call_ai_or_raise (
555+ prompt = prompt ,
556+ repo_path = repo_path ,
557+ ai_provider = ai_provider ,
558+ ai_model = ai_model ,
559+ ai_cli_timeout = ai_cli_timeout ,
560+ )
561+ except RuntimeError :
562+ logger .warning (
563+ f"[{ project_name } ] Incremental planner failed, regenerating all"
564+ )
565+ return ["all" ]
566+ finally :
567+ shutil .rmtree (job_dir , ignore_errors = True )
460568
461569 raw_result = parse_json_array_response (output )
462570 if raw_result is None or not isinstance (raw_result , list ):
0 commit comments