Skip to content

Commit 34b76bf

Browse files
committed
feat(scoring): add LLM metadata reranker and rich metadata #463
Introduce LLMMetadataReranker and DocumentRichMetadata for improved document scoring. Update related tools and add tests for reranking functionality.
1 parent 50e6cba commit 34b76bf

File tree

7 files changed

+1424
-24
lines changed

7 files changed

+1424
-24
lines changed

mpp-core/src/commonMain/kotlin/cc/unitmesh/agent/document/DocumentAgent.kt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,8 +75,8 @@ class DocumentAgent(
7575
subAgentManager = subAgentManager,
7676
llmService = llmService
7777
).apply {
78-
// Register DocQLTool
79-
registerTool(DocQLTool())
78+
// Register DocQLTool with LLM service for LLM-based reranking
79+
registerTool(DocQLTool(llmService))
8080
}
8181
}
8282

Lines changed: 332 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,332 @@
1+
package cc.unitmesh.agent.scoring
2+
3+
import kotlinx.serialization.Serializable
4+
import kotlinx.serialization.json.Json
5+
6+
/**
 * Rich metadata for a document, extracted during indexing.
 * This metadata is used by LLMMetadataReranker for intelligent relevance scoring.
 *
 * ## Storage
 *
 * This metadata can be serialized to JSON and stored alongside the document content
 * in the document index, or computed on-demand during query time.
 * See [toJson] and [fromJson].
 *
 * ## Extraction
 *
 * Use [DocumentRichMetadataExtractor] to extract this metadata from parsed documents,
 * or [fromPath] to build a minimal instance from a path alone.
 */
@Serializable
data class DocumentRichMetadata(
    /** Document path (unique identifier) */
    val path: String,
    /** File name without path */
    val fileName: String,
    /** File extension */
    val extension: String,
    /** Parent directory */
    val directory: String,
    /** Document format type ("markdown", "kotlin", "java", etc.) */
    val formatType: String,

    // === Document Structure ===

    /** Primary heading (H1) of the document */
    val h1Heading: String? = null,
    /** All headings in document (for TOC) */
    val headings: List<HeadingInfo> = emptyList(),
    /** Total number of headings */
    val headingCount: Int = 0,

    // === Content Statistics ===

    /** Total content length in characters */
    val contentLength: Int = 0,
    /** Number of lines */
    val lineCount: Int = 0,
    /** Number of code blocks (for markdown) */
    val codeBlockCount: Int = 0,
    /** Number of links/references */
    val linkCount: Int = 0,

    // === Entities ===

    /** Classes defined in document */
    val classes: List<String> = emptyList(),
    /** Functions defined in document */
    val functions: List<String> = emptyList(),
    /** Terms/definitions in document */
    val terms: List<String> = emptyList(),

    // === Timestamps ===

    /** Last modification timestamp */
    val lastModified: Long = 0,
    /** File size in bytes */
    val fileSize: Long = 0,
    /** When this metadata was extracted */
    val extractedAt: Long = 0,

    // === Relationships ===

    /** Documents this document references */
    val outgoingRefs: List<String> = emptyList(),
    /** Documents that reference this document */
    val incomingRefs: List<String> = emptyList(),

    // === Keywords ===

    /** Extracted keywords/tags */
    val keywords: List<String> = emptyList(),
    /** Language of content (e.g., "en", "zh") */
    val language: String? = null
) {
    /**
     * Convert to DocumentMetadataItem for reranking.
     *
     * The item's `id` and `filePath` are both set to [path], `references` to [outgoingRefs]
     * and `tags` to [keywords]. `parentHeading` is taken from the first entry of [headings],
     * or is `null` when there are no headings.
     *
     * @param contentType content type label copied onto the item; defaults to "document"
     * @param name display name copied onto the item; defaults to [fileName]
     * @param preview preview text copied onto the item; defaults to an empty string
     * @param heuristicScore score copied onto the item; defaults to 0.0
     * @return a new [DocumentMetadataItem] populated from this metadata
     */
    fun toMetadataItem(
        contentType: String = "document",
        name: String = fileName,
        preview: String = "",
        heuristicScore: Double = 0.0
    ): DocumentMetadataItem = DocumentMetadataItem(
        id = path,
        filePath = path,
        fileName = fileName,
        extension = extension,
        directory = directory,
        contentType = contentType,
        name = name,
        preview = preview,
        h1Heading = h1Heading,
        parentHeading = headings.firstOrNull()?.title,
        lastModified = lastModified,
        fileSize = fileSize,
        formatType = formatType,
        references = outgoingRefs,
        tags = keywords,
        heuristicScore = heuristicScore
    )

    companion object {
        // Unknown keys are tolerated on decode; defaults are written out on encode.
        private val json = Json {
            ignoreUnknownKeys = true
            encodeDefaults = true
        }

        /**
         * Serialize metadata to JSON string.
         *
         * @param metadata the instance to encode
         * @return the JSON representation, including fields that still hold their default value
         */
        fun toJson(metadata: DocumentRichMetadata): String =
            json.encodeToString(serializer(), metadata)

        /**
         * Deserialize metadata from JSON string.
         *
         * This is best-effort: any exception raised while decoding is swallowed.
         *
         * @param jsonStr JSON text, expected to be in the shape produced by [toJson]
         * @return the decoded metadata, or `null` when decoding fails
         */
        fun fromJson(jsonStr: String): DocumentRichMetadata? =
            try {
                json.decodeFromString(serializer(), jsonStr)
            } catch (e: Exception) {
                null
            }

        /**
         * Create from file path with minimal info.
         *
         * Only [path], [fileName], [extension], [directory] and [formatType] are filled in;
         * every other property keeps its default. Both `/` and `\` are recognised as path
         * separators, so Windows-style paths are split correctly as well.
         *
         * @param path document path; stored unchanged in [DocumentRichMetadata.path]
         * @return metadata derived purely from the path string
         */
        fun fromPath(path: String): DocumentRichMetadata {
            // Whichever separator occurs last marks the end of the directory part;
            // -1 means the path is a bare file name.
            val separatorIndex = maxOf(path.lastIndexOf('/'), path.lastIndexOf('\\'))
            val fileName = path.substring(separatorIndex + 1)
            val directory = if (separatorIndex >= 0) path.substring(0, separatorIndex) else ""
            val extension = fileName.substringAfterLast('.', "")

            return DocumentRichMetadata(
                path = path,
                fileName = fileName,
                extension = extension,
                directory = directory,
                formatType = formatTypeFor(extension)
            )
        }

        /** Maps a file extension (case-insensitive) to a format type; unknown extensions map to "text". */
        private fun formatTypeFor(extension: String): String = when (extension.lowercase()) {
            "md", "markdown" -> "markdown"
            "kt" -> "kotlin"
            "java" -> "java"
            "py" -> "python"
            "ts" -> "typescript"
            "js" -> "javascript"
            "go" -> "go"
            "rs" -> "rust"
            "cs" -> "csharp"
            "pdf" -> "pdf"
            "docx" -> "docx"
            else -> "text"
        }
    }
}
169+
170+
/**
 * A single heading captured from a document.
 *
 * @property level Heading depth (1 = H1, 2 = H2, etc.).
 * @property title Text of the heading.
 * @property anchor Anchor/ID for navigation, or `null` when none is available.
 * @property lineNumber Line number of the heading in the document, or `null` when unknown.
 */
@Serializable
data class HeadingInfo(
    val level: Int,
    val title: String,
    val anchor: String? = null,
    val lineNumber: Int? = null,
)
184+
185+
/**
 * Extracts rich metadata from documents.
 *
 * Two entry points are provided: [extractFromMarkdown] scans Markdown text for headings,
 * fenced code blocks, links and keywords, while [extractFromCode] records caller-supplied
 * class/function names plus the package or module declaration of a source file.
 * Both start from [DocumentRichMetadata.fromPath] and fill in the remaining fields.
 */
object DocumentRichMetadataExtractor {

    /** Words ignored when turning heading text into keywords. */
    private val STOP_WORDS =
        setOf("the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for", "of", "with", "by")

    /** Inline Markdown link `[text](target)`; group 1 is the target. */
    private val MARKDOWN_LINK = Regex("\\[.*?\\]\\((.*?)\\)")

    /** Characters removed from a lower-cased heading title when building its anchor. */
    private val ANCHOR_DISALLOWED = Regex("[^a-z0-9\\s-]")

    /** Runs of whitespace, collapsed to a single `-` in anchors. */
    private val WHITESPACE_RUN = Regex("\\s+")

    /** Separators used to split heading text into candidate keywords. */
    private val HEADING_WORD_SEPARATOR = Regex("[\\s,.-]+")

    /** Code-like identifiers: CamelCase with at least two humps, or lower snake_case. */
    private val IDENTIFIER = Regex("\\b([A-Z][a-z]+(?:[A-Z][a-z]+)+|[a-z]+_[a-z]+(?:_[a-z]+)*)\\b")

    /** Link targets that point outside the document set: `scheme://...`, `mailto:` or protocol-relative `//`. */
    private val EXTERNAL_TARGET = Regex("^(?:[a-z][a-z0-9+.-]*://|mailto:|//)", RegexOption.IGNORE_CASE)

    /**
     * `import <target>` with an optional opening quote; group 1 is the target.
     * The word boundary keeps words that merely end in "import" from matching, and the
     * target may run up to the end of input (no trailing `;` or whitespace required).
     */
    private val IMPORT_STATEMENT = Regex("\\bimport\\s+[\"']?([^\\s\"';]+)")

    /**
     * Extract metadata from markdown content.
     *
     * Lines are scanned once. A line whose trimmed text starts with a triple backtick toggles
     * the "inside code block" state, and each opening fence increments the code block count.
     * Lines starting with `#` outside a code block are recorded as headings. Inline links are
     * counted on every line, including lines inside code blocks.
     *
     * @param path document path, passed to [DocumentRichMetadata.fromPath]
     * @param content full Markdown text of the document
     * @param lastModified last-modified timestamp stored as given; defaults to 0
     * @return metadata with structure, statistics, keywords and outgoing references filled in;
     *         `fileSize` is the UTF-8 encoded size of [content] in bytes
     */
    fun extractFromMarkdown(
        path: String,
        content: String,
        lastModified: Long = 0
    ): DocumentRichMetadata {
        val base = DocumentRichMetadata.fromPath(path)

        val lines = content.lines()
        val headings = mutableListOf<HeadingInfo>()
        var codeBlockCount = 0
        var linkCount = 0
        var inCodeBlock = false

        lines.forEachIndexed { index, line ->
            // A fence line flips the state; only opening fences are counted.
            if (line.trim().startsWith("```")) {
                if (!inCodeBlock) {
                    codeBlockCount++
                }
                inCodeBlock = !inCodeBlock
            }

            // Headings are only recognised outside code blocks; line numbers are 1-based.
            if (!inCodeBlock && line.startsWith("#")) {
                parseHeading(line, index + 1)?.let { headings.add(it) }
            }

            linkCount += MARKDOWN_LINK.findAll(line).count()
        }

        return base.copy(
            h1Heading = headings.firstOrNull { it.level == 1 }?.title,
            headings = headings,
            headingCount = headings.size,
            contentLength = content.length,
            lineCount = lines.size,
            codeBlockCount = codeBlockCount,
            linkCount = linkCount,
            lastModified = lastModified,
            fileSize = utf8Size(content),
            extractedAt = kotlinx.datetime.Clock.System.now().toEpochMilliseconds(),
            keywords = extractKeywords(headings.map { it.title }, content),
            outgoingRefs = extractOutgoingRefs(content)
        )
    }

    /**
     * Extract metadata from source code.
     *
     * The first line whose trimmed text starts with `package ` or `module ` supplies the
     * primary heading: the matching keyword and one trailing `;` are removed. Class and
     * function names are not parsed here; they are taken from the arguments.
     *
     * @param path document path, passed to [DocumentRichMetadata.fromPath]
     * @param content full source text
     * @param classes class names defined in the file, supplied by the caller
     * @param functions function names defined in the file, supplied by the caller
     * @param lastModified last-modified timestamp stored as given; defaults to 0
     * @return metadata whose keywords are the first 20 entries of `classes + functions`;
     *         `fileSize` is the UTF-8 encoded size of [content] in bytes
     */
    fun extractFromCode(
        path: String,
        content: String,
        classes: List<String> = emptyList(),
        functions: List<String> = emptyList(),
        lastModified: Long = 0
    ): DocumentRichMetadata {
        val base = DocumentRichMetadata.fromPath(path)
        val lines = content.lines()

        return base.copy(
            h1Heading = extractPackageName(lines),
            contentLength = content.length,
            lineCount = lines.size,
            classes = classes,
            functions = functions,
            lastModified = lastModified,
            fileSize = utf8Size(content),
            extractedAt = kotlinx.datetime.Clock.System.now().toEpochMilliseconds(),
            keywords = (classes + functions).take(20)
        )
    }

    /** Size of [content] in bytes once encoded as UTF-8 (differs from `length` for non-ASCII text). */
    private fun utf8Size(content: String): Long = content.encodeToByteArray().size.toLong()

    /**
     * Builds a [HeadingInfo] from a line that starts with `#`.
     * Returns `null` when nothing but whitespace follows the leading `#` characters.
     */
    private fun parseHeading(line: String, lineNumber: Int): HeadingInfo? {
        val level = line.takeWhile { it == '#' }.length
        val title = line.drop(level).trim()
        if (title.isEmpty()) return null

        // Anchor: lower-case, drop everything except a-z, 0-9, whitespace and '-', then hyphenate.
        val anchor = title.lowercase()
            .replace(ANCHOR_DISALLOWED, "")
            .replace(WHITESPACE_RUN, "-")
        return HeadingInfo(level, title, anchor, lineNumber)
    }

    /**
     * Returns the name declared by the first `package ` / `module ` line, or `null` if there is none.
     * Only the keyword that actually starts the line is stripped, so text later on the same
     * line that happens to contain the other keyword is left untouched.
     */
    private fun extractPackageName(lines: List<String>): String? {
        val declaration = lines.asSequence()
            .map { it.trim() }
            .firstOrNull { it.startsWith("package ") || it.startsWith("module ") }
            ?: return null

        val keyword = if (declaration.startsWith("package ")) "package " else "module "
        return declaration.removePrefix(keyword).trim().removeSuffix(";").trimEnd()
    }

    /**
     * Extract keywords from document.
     *
     * Heading words longer than two characters that are not stop words come first, followed by
     * up to 50 code-like identifiers found in [content]. Duplicates are dropped (case-sensitively)
     * and at most 30 keywords are returned, in first-seen order.
     */
    private fun extractKeywords(headings: List<String>, content: String): List<String> {
        val keywords = mutableSetOf<String>()

        headings.forEach { heading ->
            heading.split(HEADING_WORD_SEPARATOR)
                .filter { it.length > 2 && it.lowercase() !in STOP_WORDS }
                .forEach { keywords.add(it) }
        }

        IDENTIFIER.findAll(content).take(50).forEach { match ->
            keywords.add(match.value)
        }

        return keywords.take(30)
    }

    /**
     * Extract outgoing references from content.
     *
     * Collects Markdown link targets first, then `import` targets. Link targets that are blank,
     * are pure in-page anchors (`#...`) or point to an external location are skipped; the
     * fragment and query parts are removed from the rest, and targets left empty by that
     * removal are dropped. The result is de-duplicated and capped at 20 entries.
     */
    private fun extractOutgoingRefs(content: String): List<String> {
        val linkTargets = MARKDOWN_LINK.findAll(content)
            .map { it.groupValues[1] }
            .filter { href ->
                // Test for a real URL scheme rather than an "http" prefix, so relative
                // paths such as "http-client.md" are kept.
                href.isNotBlank() && !href.startsWith("#") && !EXTERNAL_TARGET.containsMatchIn(href)
            }
            .map { it.substringBefore("#").substringBefore("?") }
            .filter { it.isNotBlank() }

        val importTargets = IMPORT_STATEMENT.findAll(content).map { it.groupValues[1] }

        return (linkTargets + importTargets).distinct().take(20).toList()
    }
}
332+

0 commit comments

Comments
 (0)