Skip to content

Commit 8e1cfac

Browse files
committed
refactor(prompt_templates): improve question generation guidelines and context handling
- Updated the question generation template to clarify the role of surrounding context and main content.
- Enhanced quality rules for generated questions to better align with user search intent.
- Revised output format and added explicit instructions on what not to generate.
- Improved logging and output in the web parser for better visibility of parsed content and metadata.
1 parent 1b7070c commit 8e1cfac

File tree

3 files changed

+63
-33
lines changed

3 files changed

+63
-33
lines changed

config/prompt_templates/generate_questions.yaml

Lines changed: 24 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -6,32 +6,37 @@ templates:
66
description: "Generate related questions from document chunks to improve retrieval recall"
77
default: true
88
content: |
9-
You are a professional question generation assistant. Your task is to generate related questions that users might ask based on the given [Main Content].
10-
9+
You are a question generation assistant optimizing for search retrieval. Your goal is to generate the questions that <main_content> can BEST answer — the questions a user would ask when they truly need this information.
10+
Note: <surrounding_context> (if present) is only for helping you understand <main_content> better. Generate questions ONLY about <main_content>.
11+
1112
{{context}}
12-
## Main Content (generate questions based on this content)
13+
<main_content>
1314
Document name: {{doc_name}}
14-
Document content:
15+
1516
{{content}}
17+
</main_content>
18+
19+
## Think Before Generating
20+
First, silently identify:
21+
1. What is the CORE TOPIC of this content?
22+
2. What problem or need does this content address?
23+
3. If a user needed this information, what would they search for?
1624
17-
## Core Requirements
18-
- Generated questions must be directly related to the [Main Content]
19-
- Questions must NOT use any pronouns or referential words (such as "it", "this", "that document", "this article", "the text", "its", etc.); use specific names instead
20-
- Questions must be complete and self-contained, understandable without additional context
21-
- Questions should be natural questions that users would likely ask in real scenarios
22-
- Questions should be diverse, covering different aspects of the content
25+
## Question Quality Rules
26+
- Focus on questions where this content provides a COMPLETE or SUBSTANTIAL answer, not just a passing mention
27+
- Prioritize questions about the main theme, key concepts, how-tos, and conclusions — NOT trivial details or isolated facts
28+
- Questions should reflect real user search intent: "How to...", "What is...", "Why does...", "What are the best practices for..."
29+
- Each question must be self-contained: NO pronouns or references (e.g., "it", "this", "that document"); use specific names
30+
- Questions should be at a level where someone would genuinely search for the answer, not quiz-style trivia
2331
- Each question should be concise and clear, within 30 words
24-
- Generate {{question_count}} questions
2532
26-
## Suggested Question Types
27-
- Definition: What is...? What does... mean?
28-
- Reason: Why...? What is the reason for...?
29-
- Method: How to...? What is the way to...?
30-
- Comparison: What is the difference between... and...?
31-
- Application: What scenarios can... be used for?
33+
## What NOT to Generate
34+
- Do NOT generate questions about minor details that are only briefly mentioned
35+
- Do NOT generate questions that can be answered with a single word or number extracted from the text
36+
- Do NOT generate questions that are too broad to be meaningfully answered by this specific content
3237
33-
## Output Format
34-
Output the question list directly, one question per line, without numbering or other prefixes.
38+
## Output
39+
Generate {{question_count}} questions, one per line, no numbering or prefixes.
3540
3641
## CRITICAL: Language Rule
3742
- Generate questions in {{language}}

docreader/parser/web_parser.py

Lines changed: 28 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -138,16 +138,34 @@ class WebParser(PipelineParser):
138138

139139

140140
if __name__ == "__main__":
141-
# Configure logging for debugging
142-
logging.basicConfig(level=logging.DEBUG)
143-
logger.setLevel(logging.DEBUG)
141+
import sys
144142

145-
# Example URL to scrape
146-
url = "https://cloud.tencent.com/document/product/457/6759"
143+
logging.basicConfig(level=logging.INFO, format="%(levelname)s %(name)s: %(message)s")
144+
145+
url = sys.argv[1] if len(sys.argv) > 1 else "https://cloud.tencent.com/document/product/457/6759"
146+
print(f"\n{'='*60}")
147+
print(f"URL: {url}")
148+
print(f"{'='*60}\n")
147149

148-
# Create parser instance and parse the web page
149150
parser = WebParser(title="")
150-
cc = parser.parse_into_text(url.encode())
151-
# Save the parsed markdown content to file
152-
with open("./tencent.md", "w") as f:
153-
f.write(cc.content)
151+
doc = parser.parse_into_text(url.encode())
152+
153+
print(f"--- metadata ---")
154+
for k, v in doc.metadata.items():
155+
print(f" {k}: {v}")
156+
157+
print(f"\n--- images ({len(doc.images)}) ---")
158+
for path in list(doc.images.keys())[:10]:
159+
print(f" {path} ({len(doc.images[path])} chars base64)")
160+
161+
print(f"\n--- content ({len(doc.content)} chars) ---")
162+
print(doc.content[:300000])
163+
if len(doc.content) > 300000:
164+
print(f"\n... (truncated, total {len(doc.content)} chars)")
165+
166+
print(f"\n--- chunks ({len(doc.chunks)}) ---")
167+
for i, chunk in enumerate(doc.chunks[:5]):
168+
print(f" [{i}] seq={chunk.seq} range=[{chunk.start}:{chunk.end}] len={len(chunk.content)}")
169+
print(f" {chunk.content[:120]}{'...' if len(chunk.content) > 120 else ''}")
170+
if len(doc.chunks) > 5:
171+
print(f" ... ({len(doc.chunks) - 5} more chunks)")

internal/application/service/knowledge.go

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2369,14 +2369,14 @@ func (s *knowledgeService) generateQuestionsWithContext(ctx context.Context,
23692369
// Build context section
23702370
var contextSection string
23712371
if prevContent != "" || nextContent != "" {
2372-
contextSection = "## Context Information (for reference only, to help understand the main content)\n"
2372+
contextSection = "<surrounding_context>\n"
23732373
if prevContent != "" {
2374-
contextSection += fmt.Sprintf("[Preceding Context] %s\n", prevContent)
2374+
contextSection += fmt.Sprintf("[Preceding Content]\n%s\n\n", prevContent)
23752375
}
23762376
if nextContent != "" {
2377-
contextSection += fmt.Sprintf("[Following Context] %s\n", nextContent)
2377+
contextSection += fmt.Sprintf("[Following Content]\n%s\n\n", nextContent)
23782378
}
2379-
contextSection += "\n"
2379+
contextSection += "</surrounding_context>\n\n"
23802380
}
23812381

23822382
langName := types.LanguageNameFromContext(ctx)
@@ -7577,6 +7577,13 @@ func (s *knowledgeService) ProcessDocument(ctx context.Context, t *asynq.Task) e
75777577

75787578
// Step 2: Store images and update markdown references
75797579
var storedImages []docparser.StoredImage
7580+
7581+
// For URL imports, resolve relative image paths to absolute URLs
7582+
// so that ResolveRemoteImages can download them.
7583+
if payload.URL != "" && convertResult != nil {
7584+
convertResult.MarkdownContent = docparser.ResolveRelativeImageURLs(convertResult.MarkdownContent, payload.URL)
7585+
}
7586+
75807587
if s.imageResolver != nil && convertResult != nil {
75817588
fileSvc := s.resolveFileService(ctx, kb)
75827589
tenantID, _ := ctx.Value(types.TenantIDContextKey).(uint64)

0 commit comments

Comments
 (0)