Skip to content

Commit 5423034

Browse files
committed
feat(document): add DocumentRegistry for multi-format parsing #463
Introduce DocumentRegistry to manage document parsers and caching across platforms. Enables centralized registration, retrieval, and DocQL querying for PDF, DOCX, Markdown, and more. Refactor Tika parser registration to use platform-specific initialization. Add JVM tests for multi-format DocQL queries.
1 parent ae4fe34 commit 5423034

File tree

9 files changed

+496
-14
lines changed

9 files changed

+496
-14
lines changed
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
package cc.unitmesh.devins.document
2+
3+
import io.github.oshai.kotlinlogging.KotlinLogging
4+
5+
private val logger = KotlinLogging.logger {}
6+
7+
/**
 * Android `actual` for the platform initialization hook.
 *
 * No parsers are registered here: the function only emits an info log entry.
 * Markdown, the only format supported on Android for now, is registered by the
 * `DocumentRegistry` init block.
 *
 * TODO: Consider adding Android-specific document parsers
 */
actual fun platformInitialize() {
    // Nothing to register on Android yet; the log line only records that the hook ran.
    logger.info { "Initializing Android document parsers (Markdown only)" }
}
17+
Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
package cc.unitmesh.devins.document
2+
3+
import cc.unitmesh.devins.document.docql.DocQLExecutor
4+
import cc.unitmesh.devins.document.docql.DocQLResult
5+
import cc.unitmesh.devins.document.docql.parseDocQL
6+
import io.github.oshai.kotlinlogging.KotlinLogging
7+
8+
private val logger = KotlinLogging.logger {}
9+
10+
/**
 * Document Registry - manages document parsers and parsed documents
 *
 * This registry provides a centralized way to:
 * 1. Retrieve document parsers for different formats (parsers themselves are
 *    registered through [DocumentParserFactory])
 * 2. Cache parsed documents for efficient DocQL queries
 * 3. Support platform-specific parser initialization
 *
 * The cache and the initialization flag are plain, unsynchronized state; callers
 * that touch the registry from several threads must coordinate externally.
 *
 * Usage:
 * ```kotlin
 * // Register a parser (done automatically by the init block and by platformInitialize())
 * DocumentParserFactory.registerParser(DocumentFormatType.PDF) { TikaDocumentParser() }
 *
 * // Parse and register a document
 * val parser = DocumentRegistry.getParser(DocumentFormatType.PDF)
 * val parsedDoc = parser?.parse(file, content)
 * if (parser != null && parsedDoc != null) {
 *     DocumentRegistry.registerDocument(file.path, parsedDoc, parser)
 * }
 *
 * // Query via DocQL
 * val result = DocumentRegistry.queryDocument(filePath, "$.content.heading('Introduction')")
 * ```
 */
object DocumentRegistry {

    /**
     * Cache of registered documents.
     * Key: document path. Value: the parsed [DocumentTreeNode] paired with the
     * [DocumentParserService] that was registered alongside it.
     */
    private val documentCache = mutableMapOf<String, Pair<DocumentTreeNode, DocumentParserService>>()

    /**
     * Flag to track if platform-specific parsers have been initialized
     */
    private var initialized = false

    init {
        // Register Markdown parser (available on all platforms)
        DocumentParserFactory.registerParser(DocumentFormatType.MARKDOWN) { MarkdownDocumentParser() }
    }

    /**
     * Initialize platform-specific document parsers.
     *
     * Runs [platformInitialize] at most once. It is invoked lazily by [getParser]
     * and [getParserForFile], and may also be called directly.
     */
    fun initializePlatformParsers() {
        if (initialized) return

        // The flag is raised before delegating, so a nested call made while
        // platformInitialize() is running is a no-op.
        initialized = true
        logger.info { "Initializing platform-specific document parsers" }
        // Platform-specific initialization happens via expect/actual
        platformInitialize()
    }

    /**
     * Register a document with its parser for future queries.
     * An existing entry for the same [path] is replaced.
     *
     * @param path Document path (unique identifier)
     * @param document Parsed document tree node
     * @param parser Parser service used for queries
     */
    fun registerDocument(
        path: String,
        document: DocumentTreeNode,
        parser: DocumentParserService
    ) {
        documentCache[path] = document to parser
        logger.debug { "Registered document: $path" }
    }

    /**
     * Get a registered document and its parser
     *
     * @param path Document path
     * @return Pair of (DocumentTreeNode, Parser) or null if not found
     */
    fun getDocument(path: String): Pair<DocumentTreeNode, DocumentParserService>? {
        return documentCache[path]
    }

    /**
     * Get parser for a specific format
     * Ensures platform parsers are initialized
     *
     * @param formatType Document format type
     * @return Parser service or null if not supported
     */
    fun getParser(formatType: DocumentFormatType): DocumentParserService? {
        initializePlatformParsers()
        return DocumentParserFactory.createParser(formatType)
    }

    /**
     * Get parser for a file path (auto-detect format)
     * Ensures platform parsers are initialized
     *
     * @param filePath File path
     * @return Parser service or null if format not supported
     */
    fun getParserForFile(filePath: String): DocumentParserService? {
        initializePlatformParsers()
        return DocumentParserFactory.createParserForFile(filePath)
    }

    /**
     * Query a registered document using DocQL
     *
     * Failures while parsing or executing the query are logged and reported as
     * [DocQLResult.Error]; coroutine cancellation is propagated to the caller.
     *
     * @param documentPath Document path
     * @param docqlQuery DocQL query string (e.g., "$.content.heading('title')")
     * @return Query result, or null if the document is not registered or the
     *   registered node is not a [DocumentFile]
     */
    suspend fun queryDocument(documentPath: String, docqlQuery: String): DocQLResult? {
        val (document, parser) = getDocument(documentPath) ?: return null

        if (document !is DocumentFile) {
            logger.warn { "Document at $documentPath is not a file" }
            return null
        }

        return try {
            val query = parseDocQL(docqlQuery)
            val executor = DocQLExecutor(document, parser)
            executor.execute(query)
        } catch (e: kotlin.coroutines.cancellation.CancellationException) {
            // Cancellation is not a query failure: rethrow so the calling
            // coroutine is cancelled instead of receiving an Error result.
            throw e
        } catch (e: Exception) {
            logger.error(e) { "Failed to execute DocQL query: $docqlQuery" }
            DocQLResult.Error(e.message ?: "Query execution failed")
        }
    }

    /**
     * Clear document cache
     */
    fun clearCache() {
        documentCache.clear()
        logger.info { "Document cache cleared" }
    }

    /**
     * Get all registered document paths (a snapshot copy of the cache keys)
     */
    fun getRegisteredPaths(): List<String> {
        return documentCache.keys.toList()
    }

    /**
     * Check if a document is registered
     */
    fun isDocumentRegistered(path: String): Boolean {
        return documentCache.containsKey(path)
    }
}
159+
160+
/**
 * Hook for platform-specific parser setup.
 *
 * Declared with `expect`; each target supplies its own `actual` implementation.
 * `DocumentRegistry.initializePlatformParsers()` invokes it.
 */
expect fun platformInitialize()
165+
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
package cc.unitmesh.devins.document
2+
3+
import io.github.oshai.kotlinlogging.KotlinLogging
4+
5+
private val logger = KotlinLogging.logger {}
6+
7+
/**
 * iOS `actual` for the platform initialization hook.
 *
 * No parsers are registered here: the function only emits an info log entry.
 * Markdown, the only format supported on iOS for now, is registered by the
 * `DocumentRegistry` init block.
 *
 * TODO: Consider adding iOS-specific document parsers
 */
actual fun platformInitialize() {
    // Nothing to register on iOS yet; the log line only records that the hook ran.
    logger.info { "Initializing iOS document parsers (Markdown only)" }
}
17+
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
package cc.unitmesh.devins.document
2+
3+
import io.github.oshai.kotlinlogging.KotlinLogging
4+
5+
private val logger = KotlinLogging.logger {}
6+
7+
/**
 * JS `actual` for the platform initialization hook.
 *
 * No parsers are registered here: the function only emits an info log entry.
 * Markdown, the only format supported on JS for now, is registered by the
 * `DocumentRegistry` init block.
 */
actual fun platformInitialize() {
    // Nothing to register on JS yet; the log line only records that the hook ran.
    logger.info { "Initializing JS document parsers (Markdown only)" }
}
16+
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
package cc.unitmesh.devins.document
2+
3+
import io.github.oshai.kotlinlogging.KotlinLogging
4+
5+
private val logger = KotlinLogging.logger {}
6+
7+
/**
 * JVM `actual` for the platform initialization hook.
 *
 * Registers a [TikaDocumentParser] factory with [DocumentParserFactory] for
 * PDF, DOCX and PLAIN_TEXT, logging each registration and a final summary.
 */
actual fun platformInitialize() {
    logger.info { "Initializing JVM document parsers (Tika)" }

    // Formats handled by Tika on the JVM
    val tikaFormats = listOf(
        DocumentFormatType.PDF,
        DocumentFormatType.DOCX,
        DocumentFormatType.PLAIN_TEXT
    )

    // One factory registration per format, in list order
    for (format in tikaFormats) {
        DocumentParserFactory.registerParser(format) { TikaDocumentParser() }
        logger.debug { "Registered TikaDocumentParser for $format" }
    }

    logger.info { "JVM parsers initialized: ${tikaFormats.size} formats supported via Tika" }
}
28+

mpp-core/src/jvmMain/kotlin/cc/unitmesh/devins/document/TikaDocumentParser.kt

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -216,16 +216,4 @@ class TikaDocumentParser : DocumentParserService {
216216
}
217217
}
218218

219-
/**
 * Registers a [TikaDocumentParser] factory with [DocumentParserFactory] for the
 * PDF, DOCX and PLAIN_TEXT formats, then logs a single summary line.
 */
fun initializeTikaParser() {
    // Same three formats, same order, expressed as one loop
    val supportedFormats = listOf(
        DocumentFormatType.PDF,
        DocumentFormatType.DOCX,
        DocumentFormatType.PLAIN_TEXT
    )
    for (formatType in supportedFormats) {
        DocumentParserFactory.registerParser(formatType) { TikaDocumentParser() }
    }

    logger.info { "Tika parser registered for formats: PDF, DOCX, PLAIN_TEXT" }
}
231219

0 commit comments

Comments
 (0)