|
| 1 | +package cc.unitmesh.agent.scoring |
| 2 | + |
| 3 | +import kotlinx.serialization.Serializable |
| 4 | +import kotlinx.serialization.json.Json |
| 5 | + |
/**
 * Rich metadata for a document, extracted during indexing.
 * This metadata is used by LLMMetadataReranker for intelligent relevance scoring.
 *
 * ## Storage
 *
 * This metadata can be serialized to JSON and stored alongside the document content
 * in the document index, or computed on-demand during query time.
 * See [toJson] and [fromJson].
 *
 * ## Extraction
 *
 * Use [DocumentRichMetadataExtractor] to extract this metadata from parsed documents,
 * or [fromPath] to build a minimal instance from a path alone.
 */
@Serializable
data class DocumentRichMetadata(
    /** Document path (unique identifier) */
    val path: String,
    /** File name without path */
    val fileName: String,
    /** File extension */
    val extension: String,
    /** Parent directory */
    val directory: String,
    /** Document format type ("markdown", "kotlin", "java", etc.) */
    val formatType: String,

    // === Document Structure ===

    /** Primary heading (H1) of the document */
    val h1Heading: String? = null,
    /** All headings in document (for TOC) */
    val headings: List<HeadingInfo> = emptyList(),
    /** Total number of headings */
    val headingCount: Int = 0,

    // === Content Statistics ===

    /** Total content length in characters */
    val contentLength: Int = 0,
    /** Number of lines */
    val lineCount: Int = 0,
    /** Number of code blocks (for markdown) */
    val codeBlockCount: Int = 0,
    /** Number of links/references */
    val linkCount: Int = 0,

    // === Entities ===

    /** Classes defined in document */
    val classes: List<String> = emptyList(),
    /** Functions defined in document */
    val functions: List<String> = emptyList(),
    /** Terms/definitions in document */
    val terms: List<String> = emptyList(),

    // === Timestamps ===

    /** Last modification timestamp */
    val lastModified: Long = 0,
    /** File size in bytes */
    val fileSize: Long = 0,
    /** When this metadata was extracted */
    val extractedAt: Long = 0,

    // === Relationships ===

    /** Documents this document references */
    val outgoingRefs: List<String> = emptyList(),
    /** Documents that reference this document */
    val incomingRefs: List<String> = emptyList(),

    // === Keywords ===

    /** Extracted keywords/tags */
    val keywords: List<String> = emptyList(),
    /** Language of content (e.g., "en", "zh") */
    val language: String? = null
) {
    /**
     * Convert to DocumentMetadataItem for reranking.
     *
     * Both `id` and `filePath` of the resulting item are set to [path]. The item's
     * `parentHeading` is the title of the first entry in [headings] (or `null` when
     * there are none), `references` is [outgoingRefs] and `tags` is [keywords].
     *
     * @param contentType content type label passed through to the item.
     * @param name display name passed through to the item; defaults to [fileName].
     * @param preview preview text passed through to the item.
     * @param heuristicScore heuristic score passed through to the item.
     */
    fun toMetadataItem(
        contentType: String = "document",
        name: String = fileName,
        preview: String = "",
        heuristicScore: Double = 0.0
    ): DocumentMetadataItem {
        return DocumentMetadataItem(
            id = path,
            filePath = path,
            fileName = fileName,
            extension = extension,
            directory = directory,
            contentType = contentType,
            name = name,
            preview = preview,
            h1Heading = h1Heading,
            parentHeading = headings.firstOrNull()?.title,
            lastModified = lastModified,
            fileSize = fileSize,
            formatType = formatType,
            references = outgoingRefs,
            tags = keywords,
            heuristicScore = heuristicScore
        )
    }

    companion object {
        private val json = Json {
            ignoreUnknownKeys = true
            encodeDefaults = true
        }

        /**
         * Serialize metadata to JSON string.
         *
         * Fields that still hold their default value are written as well
         * (`encodeDefaults = true`).
         */
        fun toJson(metadata: DocumentRichMetadata): String {
            return json.encodeToString(serializer(), metadata)
        }

        /**
         * Deserialize metadata from JSON string.
         *
         * Unknown JSON keys are ignored. This is a best-effort parse: any exception
         * raised while decoding results in `null` instead of propagating.
         *
         * @return the decoded metadata, or `null` when [jsonStr] cannot be decoded.
         */
        fun fromJson(jsonStr: String): DocumentRichMetadata? {
            return try {
                json.decodeFromString(serializer(), jsonStr)
            } catch (e: Exception) {
                null
            }
        }

        /**
         * Create from file path with minimal info.
         *
         * The path is split at its last separator; both `/` and `\` are accepted so
         * that Windows-style paths are decomposed as well. [path] itself is stored
         * unchanged. The extension is the text after the last `.` of the file name
         * (empty when the name has no dot), and the format type is derived from the
         * lower-cased extension, falling back to `"text"`. All remaining fields keep
         * their defaults.
         */
        fun fromPath(path: String): DocumentRichMetadata {
            // -1 when the path has no separator; then the whole path is the file name.
            val separatorIndex = path.lastIndexOfAny(charArrayOf('/', '\\'))
            val fileName = path.substring(separatorIndex + 1)
            val directory = if (separatorIndex >= 0) path.substring(0, separatorIndex) else ""
            val extension = fileName.substringAfterLast('.', "")
            val formatType = when (extension.lowercase()) {
                "md", "markdown" -> "markdown"
                "kt" -> "kotlin"
                "java" -> "java"
                "py" -> "python"
                "ts" -> "typescript"
                "js" -> "javascript"
                "go" -> "go"
                "rs" -> "rust"
                "cs" -> "csharp"
                "pdf" -> "pdf"
                "docx" -> "docx"
                else -> "text"
            }

            return DocumentRichMetadata(
                path = path,
                fileName = fileName,
                extension = extension,
                directory = directory,
                formatType = formatType
            )
        }
    }
}
| 169 | + |
/**
 * A single heading extracted from a document.
 *
 * @property level Heading level (1 = H1, 2 = H2, etc.).
 * @property title Heading text.
 * @property anchor Anchor/ID for navigation, when one is available.
 * @property lineNumber Line number of the heading in the document, when known.
 */
@Serializable
data class HeadingInfo(
    val level: Int,
    val title: String,
    val anchor: String? = null,
    val lineNumber: Int? = null,
)
| 184 | + |
/**
 * Extracts rich metadata from documents.
 *
 * Two entry points are provided: [extractFromMarkdown] scans Markdown text for
 * headings, fenced code blocks, links and keywords, while [extractFromCode] records
 * caller-supplied class/function names together with basic statistics.
 */
object DocumentRichMetadataExtractor {

    /** Maximum number of keywords returned for a Markdown document. */
    private const val MAX_KEYWORDS = 30

    /** Maximum number of identifier matches taken from the content when collecting keywords. */
    private const val MAX_IDENTIFIER_MATCHES = 50

    /** Maximum number of keywords derived from class/function names of a code file. */
    private const val MAX_CODE_KEYWORDS = 20

    /** Maximum number of outgoing references kept per document. */
    private const val MAX_OUTGOING_REFS = 20

    /** Marker that opens or closes a fenced code block. */
    private const val CODE_FENCE = "```"

    // Regexes are compiled once here instead of once per scanned line/heading.

    /** Markdown inline link `[text](href)`; group 1 is the href. */
    private val MARKDOWN_LINK = Regex("\\[.*?\\]\\((.*?)\\)")

    /** Characters removed from a lower-cased heading title when building its anchor. */
    private val ANCHOR_INVALID_CHARS = Regex("[^a-z0-9\\s-]")

    /** Runs of whitespace, collapsed to a single `-` in anchors. */
    private val WHITESPACE_RUN = Regex("\\s+")

    /** Separators used to split a heading into candidate keywords. */
    private val HEADING_WORD_SEPARATORS = Regex("[\\s,.-]+")

    /** Code-like identifiers: CamelCase with at least two humps, or lower snake_case. */
    private val IDENTIFIER = Regex("\\b([A-Z][a-z]+(?:[A-Z][a-z]+)+|[a-z]+_[a-z]+(?:_[a-z]+)*)\\b")

    /**
     * Import statement; group 1 is the imported target with optional quotes stripped.
     * The leading `\b` prevents matches inside longer words, and the trailing
     * alternative also accepts an import that ends the content without a newline.
     */
    private val IMPORT_STATEMENT = Regex("\\bimport\\s+[\"']?(.*?)[\"']?(?:[;\\s]|\$)")

    /** Common words that are never used as keywords. */
    private val STOP_WORDS = setOf(
        "the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for", "of", "with", "by"
    )

    /** Link prefixes that point outside the document set and are not recorded as references. */
    private val EXTERNAL_LINK_PREFIXES = listOf("http://", "https://", "mailto:")

    /**
     * Extract metadata from markdown content.
     *
     * Headings are lines starting with `#` outside fenced code blocks; links are
     * counted on every line. The reported file size is the UTF-8 encoded size of
     * [content].
     *
     * @param path document path, used to derive file name, extension, directory and format.
     * @param content full Markdown text of the document.
     * @param lastModified last modification timestamp, stored as given.
     */
    fun extractFromMarkdown(
        path: String,
        content: String,
        lastModified: Long = 0
    ): DocumentRichMetadata {
        val base = DocumentRichMetadata.fromPath(path)

        val lines = content.lines()
        val headings = mutableListOf<HeadingInfo>()
        var codeBlockCount = 0
        var linkCount = 0
        var inCodeBlock = false

        lines.forEachIndexed { index, line ->
            // A fence line toggles the state; only opening fences are counted.
            if (line.trim().startsWith(CODE_FENCE)) {
                if (!inCodeBlock) {
                    codeBlockCount++
                }
                inCodeBlock = !inCodeBlock
            }

            // Extract headings (outside code blocks); line numbers are 1-based.
            if (!inCodeBlock) {
                parseHeading(line, index + 1)?.let { headings.add(it) }
            }

            linkCount += MARKDOWN_LINK.findAll(line).count()
        }

        return base.copy(
            h1Heading = headings.firstOrNull { it.level == 1 }?.title,
            headings = headings,
            headingCount = headings.size,
            contentLength = content.length,
            lineCount = lines.size,
            codeBlockCount = codeBlockCount,
            linkCount = linkCount,
            lastModified = lastModified,
            fileSize = utf8Size(content),
            extractedAt = kotlinx.datetime.Clock.System.now().toEpochMilliseconds(),
            keywords = extractKeywords(headings.map { it.title }, content),
            outgoingRefs = extractOutgoingRefs(content)
        )
    }

    /**
     * Extract metadata from source code.
     *
     * The first `package ...` or `module ...` declaration found in [content] becomes
     * the primary heading. Keywords are the first entries of [classes] followed by
     * [functions]. The reported file size is the UTF-8 encoded size of [content].
     *
     * @param path source file path, used to derive file name, extension, directory and format.
     * @param content full source text.
     * @param classes names of classes defined in the file, supplied by the caller.
     * @param functions names of functions defined in the file, supplied by the caller.
     * @param lastModified last modification timestamp, stored as given.
     */
    fun extractFromCode(
        path: String,
        content: String,
        classes: List<String> = emptyList(),
        functions: List<String> = emptyList(),
        lastModified: Long = 0
    ): DocumentRichMetadata {
        val base = DocumentRichMetadata.fromPath(path)
        val lines = content.lines()

        // Extract package/module as primary heading for code files.
        // The trimmed declaration starts with the keyword plus a space, so the
        // name is everything after that first space.
        val packageName = lines.asSequence()
            .map { it.trim() }
            .firstOrNull { it.startsWith("package ") || it.startsWith("module ") }
            ?.substringAfter(' ')
            ?.trim()
            ?.removeSuffix(";")

        return base.copy(
            h1Heading = packageName,
            contentLength = content.length,
            lineCount = lines.size,
            classes = classes,
            functions = functions,
            lastModified = lastModified,
            fileSize = utf8Size(content),
            extractedAt = kotlinx.datetime.Clock.System.now().toEpochMilliseconds(),
            keywords = (classes + functions).take(MAX_CODE_KEYWORDS)
        )
    }

    /**
     * Parse [line] as a heading.
     *
     * @return the heading, or `null` when the line does not start with `#` or has
     * no text after the leading `#` characters.
     */
    private fun parseHeading(line: String, lineNumber: Int): HeadingInfo? {
        if (!line.startsWith("#")) return null

        val level = line.takeWhile { it == '#' }.length
        val title = line.drop(level).trim()
        if (title.isEmpty()) return null

        val anchor = title.lowercase()
            .replace(ANCHOR_INVALID_CHARS, "")
            .replace(WHITESPACE_RUN, "-")
        return HeadingInfo(level, title, anchor, lineNumber)
    }

    /** Size of [content] in bytes when encoded as UTF-8. */
    private fun utf8Size(content: String): Long = content.encodeToByteArray().size.toLong()

    /**
     * Extract keywords from document.
     *
     * Heading words longer than two characters that are not stop words come first,
     * followed by code-like identifiers found in [content]. Duplicates are dropped
     * while preserving first-seen order.
     */
    private fun extractKeywords(headings: List<String>, content: String): List<String> {
        val keywords = mutableSetOf<String>()

        // Add heading words (excluding common words)
        headings.asSequence()
            .flatMap { it.split(HEADING_WORD_SEPARATORS) }
            .filter { it.length > 2 && it.lowercase() !in STOP_WORDS }
            .forEach { keywords.add(it) }

        // Look for code-like identifiers (CamelCase, snake_case)
        IDENTIFIER.findAll(content)
            .take(MAX_IDENTIFIER_MATCHES)
            .forEach { keywords.add(it.value) }

        return keywords.take(MAX_KEYWORDS)
    }

    /**
     * Extract outgoing references from content.
     *
     * Collects targets of Markdown links that are neither external
     * (`http://`, `https://`, `mailto:`) nor pure in-page anchors, with any
     * `#fragment` or `?query` removed, followed by targets of import statements.
     * Blank results are discarded and duplicates removed.
     */
    private fun extractOutgoingRefs(content: String): List<String> {
        // Markdown links: [text](path)
        val linkTargets = MARKDOWN_LINK.findAll(content)
            .map { it.groupValues[1] }
            .filter { href -> !href.startsWith("#") && !isExternalLink(href) }
            .map { href -> href.substringBefore("#").substringBefore("?") }

        // Import statements
        val importTargets = IMPORT_STATEMENT.findAll(content)
            .map { it.groupValues[1] }

        return (linkTargets + importTargets)
            // Stripping a fragment/query or an empty import capture can leave nothing behind.
            .filter { it.isNotBlank() }
            .distinct()
            .take(MAX_OUTGOING_REFS)
            .toList()
    }

    /** Whether [href] starts with one of the external link prefixes (case-insensitive). */
    private fun isExternalLink(href: String): Boolean =
        EXTERNAL_LINK_PREFIXES.any { href.startsWith(it, ignoreCase = true) }
}
| 332 | + |
0 commit comments