refactor(prompt_templates): improve question generation guidelines and context handling

lyingbug · lyingbug · commit 8e1cfaccb7f1 · 2026-04-01T21:59:22.000+08:00
- Updated the question generation template to clarify the role of surrounding context and main content.
- Enhanced quality rules for generated questions to better align with user search intent.
- Revised output format and added explicit instructions on what not to generate.
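- Improved logging and output in the web parser for better visibility of parsed content and metadata.
- For URL imports, resolved relative image paths in the converted markdown to absolute URLs before image storage, so remote images can be downloaded.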
diff --git a/config/prompt_templates/generate_questions.yaml b/config/prompt_templates/generate_questions.yaml
@@ -6,32 +6,37 @@ templates:
     description: "Generate related questions from document chunks to improve retrieval recall"
     default: true
     content: |
-      You are a professional question generation assistant. Your task is to generate related questions that users might ask based on the given [Main Content].
-
+      You are a question generation assistant optimizing for search retrieval. Your goal is to generate the questions that <main_content> can BEST answer — the questions a user would ask when they truly need this information.
+      Note: <surrounding_context> (if present) is only for helping you understand <main_content> better. Generate questions ONLY about <main_content>.
+      
       {{context}}
-      ## Main Content (generate questions based on this content)
+      <main_content>
       Document name: {{doc_name}}
-      Document content:
+
       {{content}}
+      </main_content>
+
+      ## Think Before Generating
+      First, silently identify:
+      1. What is the CORE TOPIC of this content?
+      2. What problem or need does this content address?
+      3. If a user needed this information, what would they search for?
 
-      ## Core Requirements
-      - Generated questions must be directly related to the [Main Content]
-      - Questions must NOT use any pronouns or referential words (such as "it", "this", "that document", "this article", "the text", "its", etc.); use specific names instead
-      - Questions must be complete and self-contained, understandable without additional context
-      - Questions should be natural questions that users would likely ask in real scenarios
-      - Questions should be diverse, covering different aspects of the content
+      ## Question Quality Rules
+      - Focus on questions where this content provides a COMPLETE or SUBSTANTIAL answer, not just a passing mention
+      - Prioritize questions about the main theme, key concepts, how-tos, and conclusions — NOT trivial details or isolated facts
+      - Questions should reflect real user search intent: "How to...", "What is...", "Why does...", "What are the best practices for..."
+      - Each question must be self-contained: NO pronouns or references (e.g., "it", "this", "that document"); use specific names
+      - Questions should be at a level where someone would genuinely search for the answer, not quiz-style trivia
       - Each question should be concise and clear, within 30 words
-      - Generate {{question_count}} questions
 
-      ## Suggested Question Types
-      - Definition: What is...? What does... mean?
-      - Reason: Why...? What is the reason for...?
-      - Method: How to...? What is the way to...?
-      - Comparison: What is the difference between... and...?
-      - Application: What scenarios can... be used for?
+      ## What NOT to Generate
+      - Do NOT generate questions about minor details that are only briefly mentioned
+      - Do NOT generate questions that can be answered with a single word or number extracted from the text
+      - Do NOT generate questions that are too broad to be meaningfully answered by this specific content
 
-      ## Output Format
-      Output the question list directly, one question per line, without numbering or other prefixes.
+      ## Output
+      Generate {{question_count}} questions, one per line, no numbering or prefixes.
 
       ## CRITICAL: Language Rule
       - Generate questions in {{language}}
diff --git a/docreader/parser/web_parser.py b/docreader/parser/web_parser.py
@@ -138,16 +138,34 @@ class WebParser(PipelineParser):
 
 
 if __name__ == "__main__":
-    # Configure logging for debugging
-    logging.basicConfig(level=logging.DEBUG)
-    logger.setLevel(logging.DEBUG)
+    import sys
 
-    # Example URL to scrape
-    url = "https://cloud.tencent.com/document/product/457/6759"
+    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(name)s: %(message)s")
+
+    url = sys.argv[1] if len(sys.argv) > 1 else "https://cloud.tencent.com/document/product/457/6759"
+    print(f"\n{'='*60}")
+    print(f"URL: {url}")
+    print(f"{'='*60}\n")
 
-    # Create parser instance and parse the web page
     parser = WebParser(title="")
-    cc = parser.parse_into_text(url.encode())
-    # Save the parsed markdown content to file
-    with open("./tencent.md", "w") as f:
-        f.write(cc.content)
+    doc = parser.parse_into_text(url.encode())
+
+    print(f"--- metadata ---")
+    for k, v in doc.metadata.items():
+        print(f"  {k}: {v}")
+
+    print(f"\n--- images ({len(doc.images)}) ---")
+    for path in list(doc.images.keys())[:10]:
+        print(f"  {path}  ({len(doc.images[path])} chars base64)")
+
+    print(f"\n--- content ({len(doc.content)} chars) ---")
+    print(doc.content[:300000])
+    if len(doc.content) > 300000:
+        print(f"\n... (truncated, total {len(doc.content)} chars)")
+
+    print(f"\n--- chunks ({len(doc.chunks)}) ---")
+    for i, chunk in enumerate(doc.chunks[:5]):
+        print(f"  [{i}] seq={chunk.seq} range=[{chunk.start}:{chunk.end}] len={len(chunk.content)}")
+        print(f"      {chunk.content[:120]}{'...' if len(chunk.content) > 120 else ''}")
+    if len(doc.chunks) > 5:
+        print(f"  ... ({len(doc.chunks) - 5} more chunks)")
diff --git a/internal/application/service/knowledge.go b/internal/application/service/knowledge.go
@@ -2369,14 +2369,14 @@ func (s *knowledgeService) generateQuestionsWithContext(ctx context.Context,
 	// Build context section
 	var contextSection string
 	if prevContent != "" || nextContent != "" {
-		contextSection = "## Context Information (for reference only, to help understand the main content)\n"
+		contextSection = "<surrounding_context>\n"
 		if prevContent != "" {
-			contextSection += fmt.Sprintf("[Preceding Context] %s\n", prevContent)
+			contextSection += fmt.Sprintf("[Preceding Content]\n%s\n\n", prevContent)
 		}
 		if nextContent != "" {
-			contextSection += fmt.Sprintf("[Following Context] %s\n", nextContent)
+			contextSection += fmt.Sprintf("[Following Content]\n%s\n\n", nextContent)
 		}
-		contextSection += "\n"
+		contextSection += "</surrounding_context>\n\n"
 	}
 
 	langName := types.LanguageNameFromContext(ctx)
@@ -7577,6 +7577,13 @@ func (s *knowledgeService) ProcessDocument(ctx context.Context, t *asynq.Task) e
 
 	// Step 2: Store images and update markdown references
 	var storedImages []docparser.StoredImage
+
+	// For URL imports, resolve relative image paths to absolute URLs
+	// so that ResolveRemoteImages can download them.
+	if payload.URL != "" && convertResult != nil {
+		convertResult.MarkdownContent = docparser.ResolveRelativeImageURLs(convertResult.MarkdownContent, payload.URL)
+	}
+
 	if s.imageResolver != nil && convertResult != nil {
 		fileSvc := s.resolveFileService(ctx, kb)
 		tenantID, _ := ctx.Value(types.TenantIDContextKey).(uint64)