Skip to content

Commit ae4fe34

Browse files
committed
feat(document): add Tika-based parser for PDF, DOCX, PPT #463
Introduce DocumentParserFactory for pluggable parsers and implement TikaDocumentParser on JVM to support PDF, DOC, DOCX, PPT, and TXT formats. Add tests and update dependencies for Apache Tika integration.
1 parent d14eea4 commit ae4fe34

File tree

10 files changed

+572
-1
lines changed

10 files changed

+572
-1
lines changed

mpp-core/build.gradle.kts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,10 @@ kotlin {
178178

179179
// JediTerm for terminal emulation (uses pty4j under the hood)
180180
implementation("org.jetbrains.pty4j:pty4j:0.13.10")
181+
182+
// Apache Tika for document parsing (PDF, DOC, DOCX, PPT, etc.)
183+
implementation("org.apache.tika:tika-core:3.2.3")
184+
implementation("org.apache.tika:tika-parsers-standard-package:3.2.3")
181185
}
182186
}
183187

mpp-core/src/commonMain/kotlin/cc/unitmesh/devins/document/DocumentModels.kt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,8 @@ data class DocumentMetadata(
4040
val lastModified: Long, // 最后修改时间戳
4141
val fileSize: Long, // 文件大小(字节)
4242
val language: String? = null, // 文档语言(如 "markdown", "kotlin")
43-
val mimeType: String? = null // MIME 类型
43+
val mimeType: String? = null, // MIME 类型
44+
val formatType: DocumentFormatType = DocumentFormatType.PLAIN_TEXT // 文档格式类型
4445
)
4546

4647
/**
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
package cc.unitmesh.devins.document
2+
3+
/**
 * Registry-backed factory that hands out [DocumentParserService] instances per [DocumentFormatType].
 *
 * Each format is associated with a provider lambda; a fresh parser is produced by invoking that
 * lambda on every request. A Markdown provider is registered when the object is initialised;
 * further providers (for example a Tika-based one on the JVM) can be added through [registerParser].
 *
 * Usage:
 * ```kotlin
 * val parser = DocumentParserFactory.createParser(DocumentFormatType.MARKDOWN)
 * val result = parser?.parse(file, content)
 * ```
 */
object DocumentParserFactory {

    /** Provider lambdas keyed by the format they serve; a later registration replaces an earlier one. */
    private val registry = mutableMapOf<DocumentFormatType, () -> DocumentParserService>()

    init {
        // Markdown is the only format registered by the factory itself.
        registerParser(DocumentFormatType.MARKDOWN) { MarkdownDocumentParser() }
    }

    /**
     * Associates [provider] with [formatType], replacing any provider registered before.
     *
     * @param formatType The document format the provider serves
     * @param provider Lambda invoked each time a parser for [formatType] is requested
     */
    fun registerParser(formatType: DocumentFormatType, provider: () -> DocumentParserService) {
        registry[formatType] = provider
    }

    /**
     * Produces a parser for [formatType] by invoking its registered provider.
     *
     * @param formatType The document format type
     * @return A parser instance, or null when no provider is registered for the format
     */
    fun createParser(formatType: DocumentFormatType): DocumentParserService? =
        registry[formatType]?.invoke()

    /**
     * Maps the extension of [filePath] (text after the last '.', case-insensitive) to a format.
     *
     * @param filePath The file path or name
     * @return The matching format type, or null when the extension is missing or not recognised
     */
    fun detectFormat(filePath: String): DocumentFormatType? =
        when (filePath.substringAfterLast('.', "").lowercase()) {
            "md", "markdown" -> DocumentFormatType.MARKDOWN
            "pdf" -> DocumentFormatType.PDF
            "doc", "docx" -> DocumentFormatType.DOCX
            "txt" -> DocumentFormatType.PLAIN_TEXT
            else -> null
        }

    /**
     * Detects the format of [filePath] via [detectFormat] and creates a parser for it.
     *
     * @param filePath The file path
     * @return A parser instance, or null when the format is unknown or has no registered provider
     */
    fun createParserForFile(filePath: String): DocumentParserService? =
        detectFormat(filePath)?.let { detected -> createParser(detected) }

    /**
     * Tells whether a provider is currently registered for [formatType].
     *
     * @param formatType The document format type
     * @return true if a provider is registered for the format
     */
    fun isSupported(formatType: DocumentFormatType): Boolean = formatType in registry

    /**
     * Lists the formats that currently have a registered provider.
     *
     * @return A snapshot list of the registered format types
     */
    fun getSupportedFormats(): List<DocumentFormatType> = registry.keys.toList()
}
97+
Lines changed: 231 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,231 @@
1+
package cc.unitmesh.devins.document
2+
3+
import io.github.oshai.kotlinlogging.KotlinLogging
4+
import org.apache.tika.metadata.Metadata
5+
import org.apache.tika.parser.AutoDetectParser
6+
import org.apache.tika.parser.ParseContext
7+
import org.apache.tika.sax.BodyContentHandler
8+
import java.io.ByteArrayInputStream
9+
10+
private val logger = KotlinLogging.logger {}
11+
12+
/**
 * Apache Tika-based document parser for the JVM platform.
 *
 * The document bytes are handed to Tika's [AutoDetectParser], so any format Tika can detect
 * is accepted (PDF, Word, PowerPoint, plain text, HTML, ...). The parser keeps the extracted
 * plain text, a list of paragraph-level [DocumentChunk]s and the Tika [Metadata] of the most
 * recent [parse] call; the query functions operate on that state.
 *
 * Line numbers reported in chunks and TOC items are 0-based indices into the extracted text
 * (as returned by [getDocumentContent]), not into the original binary document.
 */
class TikaDocumentParser : DocumentParserService {
    private var currentContent: String? = null
    private var currentChunks: List<DocumentChunk> = emptyList()
    private var currentMetadata: Metadata? = null

    /** Returns the text extracted by the last successful [parse], or null if there is none. */
    override fun getDocumentContent(): String? = currentContent

    /**
     * Extracts text, chunks and a heuristic TOC from [content] using Tika.
     *
     * @param file Descriptor of the document; its name is passed to Tika as a detection hint
     * @param content The document payload. The original code comment states that binary data is
     *   carried in this string as ISO-8859-1 (one char per byte); see [toParserBytes]
     * @return A copy of [file] with the TOC and parse metadata filled in, or a copy marked
     *   [ParseStatus.PARSE_FAILED] when Tika fails
     */
    override suspend fun parse(file: DocumentFile, content: String): DocumentTreeNode {
        logger.info { "=== Starting Tika Parse ===" }
        logger.info { "File: ${file.path}, Size: ${content.length} bytes" }

        // Drop the state of any previous parse so a failure cannot leave stale results queryable.
        currentContent = null
        currentChunks = emptyList()
        currentMetadata = null

        try {
            val parser = AutoDetectParser()
            val handler = BodyContentHandler(-1) // -1 disables the write limit
            val metadata = Metadata()
            val context = ParseContext()

            // The resource name lets Tika use the file extension during format detection.
            metadata.set("resourceName", file.name)

            ByteArrayInputStream(toParserBytes(content)).use { input ->
                parser.parse(input, handler, metadata, context)
            }

            val extractedText = handler.toString().trim()
            currentContent = extractedText
            currentMetadata = metadata

            logger.info { "Extracted ${extractedText.length} characters" }
            logger.debug { "Metadata: ${metadata.names().joinToString { "$it=${metadata.get(it)}" }}" }

            currentChunks = buildSimpleChunks(extractedText, file.path, file.metadata.formatType)
            logger.info { "Created ${currentChunks.size} document chunks" }

            // Heuristic TOC: lines that look like headings (see extractSimpleTOC).
            // TODO: Enhance with more sophisticated TOC extraction based on document structure
            val toc = extractSimpleTOC(extractedText)

            logger.info { "=== Parse Complete ===" }

            return file.copy(
                toc = toc,
                metadata = file.metadata.copy(
                    parseStatus = ParseStatus.PARSED,
                    chapterCount = toc.size,
                    mimeType = metadata.get(Metadata.CONTENT_TYPE)
                )
            )
        } catch (e: kotlin.coroutines.cancellation.CancellationException) {
            // Never convert coroutine cancellation into a "parse failed" result.
            throw e
        } catch (e: Exception) {
            logger.error(e) { "Failed to parse document: ${e.message}" }
            return file.copy(
                metadata = file.metadata.copy(
                    parseStatus = ParseStatus.PARSE_FAILED
                )
            )
        }
    }

    /**
     * Finds chunks whose title or content contains [keyword] (case-insensitive).
     *
     * @return Matching chunks ordered by relevance: exact title match, then partial title
     *   match, then content-only match
     */
    override suspend fun queryHeading(keyword: String): List<DocumentChunk> {
        return currentChunks.filter {
            it.chapterTitle?.contains(keyword, ignoreCase = true) == true ||
                it.content.contains(keyword, ignoreCase = true)
        }.sortedByDescending {
            when {
                it.chapterTitle?.equals(keyword, ignoreCase = true) == true -> 10
                it.chapterTitle?.contains(keyword, ignoreCase = true) == true -> 5
                else -> 1
            }
        }
    }

    /**
     * Looks up a chunk by anchor, with or without the leading '#'.
     *
     * Chunk anchors (`#chunk-N`) are tried first. If none matches, the id is compared with the
     * slug of each chunk title, which is the form of anchor produced for TOC items, so that an
     * anchor taken from the TOC can resolve to the chunk that starts with that heading.
     *
     * @return The first matching chunk, or null
     */
    override suspend fun queryChapter(chapterId: String): DocumentChunk? {
        val byChunkAnchor = currentChunks.find {
            it.anchor == chapterId || it.anchor == "#$chapterId"
        }
        if (byChunkAnchor != null) return byChunkAnchor

        return currentChunks.find { chunk ->
            val titleAnchor = chunk.chapterTitle?.let { "#${slugify(it)}" }
            titleAnchor != null && (titleAnchor == chapterId || titleAnchor == "#$chapterId")
        }
    }

    /**
     * Converts the incoming string to the bytes handed to Tika.
     *
     * When every char fits in one byte the string is treated as an ISO-8859-1 byte carrier and
     * converted back losslessly. A char above U+00FF cannot come from such a carrier and would be
     * replaced by '?' by the ISO-8859-1 encoder, so that input is treated as decoded text and
     * encoded as UTF-8 instead.
     */
    private fun toParserBytes(content: String): ByteArray {
        val isByteCarrier = content.all { it <= '\u00FF' }
        return content.toByteArray(if (isByteCarrier) Charsets.ISO_8859_1 else Charsets.UTF_8)
    }

    /**
     * Splits [content] into paragraph chunks.
     *
     * A paragraph ends after two or more consecutive blank lines; a single blank line inside a
     * paragraph is dropped from the chunk text but does not split it. Each chunk records the
     * 0-based index of its first and last non-blank line in [content].
     */
    private fun buildSimpleChunks(
        content: String,
        documentPath: String,
        formatType: DocumentFormatType
    ): List<DocumentChunk> {
        if (content.isBlank()) return emptyList()

        val chunks = mutableListOf<DocumentChunk>()
        val pending = mutableListOf<String>()
        var firstLineIndex = 0
        var lastLineIndex = 0
        var blankRun = 0

        content.lines().forEachIndexed { lineIndex, line ->
            if (line.isBlank()) {
                blankRun++
                if (blankRun >= 2 && pending.isNotEmpty()) {
                    chunks.add(createChunk(pending, firstLineIndex, lastLineIndex, chunks.size, documentPath, formatType))
                    pending.clear()
                }
            } else {
                blankRun = 0
                if (pending.isEmpty()) firstLineIndex = lineIndex
                lastLineIndex = lineIndex
                pending.add(line)
            }
        }

        // Flush the trailing paragraph.
        if (pending.isNotEmpty()) {
            chunks.add(createChunk(pending, firstLineIndex, lastLineIndex, chunks.size, documentPath, formatType))
        }

        return chunks
    }

    /** Builds one chunk from the non-blank [paragraphLines] spanning [startLine]..[endLine]. */
    private fun createChunk(
        paragraphLines: List<String>,
        startLine: Int,
        endLine: Int,
        index: Int,
        documentPath: String,
        formatType: DocumentFormatType
    ): DocumentChunk {
        val paragraph = paragraphLines.joinToString("\n").trim()

        // Use the first line as title when it is short and ends with ':' or is all upper case.
        val firstLine = paragraphLines.first().trim()
        val title = firstLine.takeIf {
            it.length < MAX_HEADING_LENGTH &&
                (it.endsWith(":") || it.all { ch -> ch.isUpperCase() || ch.isWhitespace() })
        }

        return DocumentChunk(
            documentPath = documentPath,
            chapterTitle = title,
            content = paragraph,
            anchor = "#chunk-$index",
            startLine = startLine,
            endLine = endLine,
            position = PositionMetadata(
                documentPath = documentPath,
                formatType = formatType,
                position = DocumentPosition.LineRange(
                    startLine = startLine,
                    endLine = endLine
                )
            )
        )
    }

    /**
     * Builds a flat, level-1 TOC from lines that look like headings.
     *
     * A trimmed line qualifies when it is non-empty, shorter than [MAX_HEADING_LENGTH] and either
     * ends with ':' or consists only of upper-case letters, whitespace, digits and `.,()[]` with
     * at least one upper-case letter (so bare page numbers or dot leaders are not listed).
     */
    private fun extractSimpleTOC(content: String): List<TOCItem> {
        val toc = mutableListOf<TOCItem>()

        content.lines().forEachIndexed { index, line ->
            val trimmed = line.trim()
            if (looksLikeTocHeading(trimmed)) {
                toc.add(
                    TOCItem(
                        level = 1,
                        title = trimmed,
                        anchor = "#${slugify(trimmed)}",
                        lineNumber = index
                    )
                )
            }
        }

        return toc
    }

    /** Heading heuristic used for TOC extraction; expects an already trimmed line. */
    private fun looksLikeTocHeading(trimmed: String): Boolean {
        if (trimmed.isEmpty() || trimmed.length >= MAX_HEADING_LENGTH) return false
        if (trimmed.endsWith(":")) return true
        return trimmed.any { it.isUpperCase() } &&
            trimmed.all { it.isUpperCase() || it.isWhitespace() || it.isDigit() || it in TOC_PUNCTUATION }
    }

    /** Lower-cases [title] and collapses every run of characters outside `a-z0-9` into '-'. */
    private fun slugify(title: String): String = title.lowercase().replace(NON_SLUG_CHARS, "-")

    private companion object {
        /** Headings must be strictly shorter than this many characters. */
        private const val MAX_HEADING_LENGTH = 100

        /** Punctuation tolerated inside an all-caps TOC heading. */
        private const val TOC_PUNCTUATION = ".,()[]"

        /** Compiled once instead of once per line. */
        private val NON_SLUG_CHARS = Regex("[^a-z0-9]+")
    }
}
218+
219+
/**
 * Registers [TikaDocumentParser] with [DocumentParserFactory] for the PDF, DOCX and
 * PLAIN_TEXT formats, then logs the registration.
 *
 * Every registration supplies a provider that creates a new parser instance per request.
 * NOTE(review): the call site is not visible here — confirm this runs during JVM start-up.
 */
fun initializeTikaParser() {
    val tikaFormats = listOf(
        DocumentFormatType.PDF,
        DocumentFormatType.DOCX,
        DocumentFormatType.PLAIN_TEXT
    )

    for (format in tikaFormats) {
        DocumentParserFactory.registerParser(format) { TikaDocumentParser() }
    }

    logger.info { "Tika parser registered for formats: PDF, DOCX, PLAIN_TEXT" }
}
231+

0 commit comments

Comments
 (0)