phodal
diff --git a/‎mpp-core/build.gradle.kts‎
Lines changed: 4 additions & 0 deletions b/‎mpp-core/build.gradle.kts‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎mpp-core/src/commonMain/kotlin/cc/unitmesh/devins/document/DocumentModels.kt‎
Lines changed: 2 additions & 1 deletion b/‎mpp-core/src/commonMain/kotlin/cc/unitmesh/devins/document/DocumentModels.kt‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎mpp-core/src/commonMain/kotlin/cc/unitmesh/devins/document/DocumentParserFactory.kt‎
Lines changed: 97 additions & 0 deletions b/‎mpp-core/src/commonMain/kotlin/cc/unitmesh/devins/document/DocumentParserFactory.kt‎
Lines changed: 97 additions & 0 deletions
diff --git a/‎mpp-core/src/jvmMain/kotlin/cc/unitmesh/devins/document/TikaDocumentParser.kt‎
Lines changed: 231 additions & 0 deletions b/‎mpp-core/src/jvmMain/kotlin/cc/unitmesh/devins/document/TikaDocumentParser.kt‎
Lines changed: 231 additions & 0 deletions
@@ -178,6 +178,10 @@ kotlin {
 
                 // JediTerm for terminal emulation (uses pty4j under the hood)
                 implementation("org.jetbrains.pty4j:pty4j:0.13.10")
+
+                // Apache Tika for document parsing (PDF, DOC, DOCX, PPT, etc.)
+                implementation("org.apache.tika:tika-core:3.2.3")
+                implementation("org.apache.tika:tika-parsers-standard-package:3.2.3")
             }
         }
 
 
@@ -40,7 +40,8 @@ data class DocumentMetadata(
     val lastModified: Long,             // 最后修改时间戳
     val fileSize: Long,                 // 文件大小（字节）
     val language: String? = null,       // 文档语言（如 "markdown", "kotlin"）
-    val mimeType: String? = null        // MIME 类型
+    val mimeType: String? = null,       // MIME 类型
+    val formatType: DocumentFormatType = DocumentFormatType.PLAIN_TEXT  // 文档格式类型
 )
 
 /**
 
@@ -0,0 +1,97 @@
+package cc.unitmesh.devins.document
+
+/**
+ * Central registry that hands out [DocumentParserService] instances per [DocumentFormatType].
+ *
+ * Every format maps to a provider lambda, and that lambda is invoked each time a parser
+ * is requested. Only the Markdown provider is registered by this object itself; further
+ * providers can be added at runtime through [registerParser].
+ *
+ * Usage (`createParser` returns `null` for a format without a registered provider):
+ * ```kotlin
+ * val parser = DocumentParserFactory.createParser(DocumentFormatType.MARKDOWN)
+ * val result = parser?.parse(file, content)
+ * ```
+ */
+object DocumentParserFactory {
+
+    /** Provider lambdas keyed by the format they serve. */
+    private val providersByFormat = mutableMapOf<DocumentFormatType, () -> DocumentParserService>()
+
+    init {
+        // Markdown is the only parser this object registers on its own.
+        registerParser(DocumentFormatType.MARKDOWN) { MarkdownDocumentParser() }
+    }
+
+    /**
+     * Associates [provider] with [formatType], replacing any provider registered earlier
+     * for the same format.
+     *
+     * @param formatType The document format type
+     * @param provider A function that produces a parser instance
+     */
+    fun registerParser(formatType: DocumentFormatType, provider: () -> DocumentParserService) {
+        providersByFormat[formatType] = provider
+    }
+
+    /**
+     * Invokes the provider registered for [formatType].
+     *
+     * @param formatType The document format type
+     * @return The parser produced by the provider, or null when no provider is registered
+     */
+    fun createParser(formatType: DocumentFormatType): DocumentParserService? =
+        providersByFormat[formatType]?.let { create -> create() }
+
+    /**
+     * Maps the text after the last `.` of [filePath] (compared case-insensitively) to a format.
+     *
+     * @param filePath The file path or name
+     * @return The matching format type, or null when the extension is missing or unknown
+     */
+    fun detectFormat(filePath: String): DocumentFormatType? =
+        when (filePath.substringAfterLast('.', "").lowercase()) {
+            "md", "markdown" -> DocumentFormatType.MARKDOWN
+            "pdf" -> DocumentFormatType.PDF
+            "doc", "docx" -> DocumentFormatType.DOCX
+            "txt" -> DocumentFormatType.PLAIN_TEXT
+            else -> null
+        }
+
+    /**
+     * Combines [detectFormat] and [createParser] for a file path.
+     *
+     * @param filePath The file path
+     * @return A parser instance, or null when the format is unknown or has no provider
+     */
+    fun createParserForFile(filePath: String): DocumentParserService? =
+        detectFormat(filePath)?.let { detected -> createParser(detected) }
+
+    /**
+     * Tells whether a provider is registered for [formatType].
+     *
+     * @param formatType The document format type
+     * @return true if a provider is registered for the format
+     */
+    fun isSupported(formatType: DocumentFormatType): Boolean = formatType in providersByFormat
+
+    /**
+     * Lists the formats that currently have a registered provider.
+     *
+     * @return Snapshot list of the registered format types
+     */
+    fun getSupportedFormats(): List<DocumentFormatType> = providersByFormat.keys.toList()
+}
+
@@ -0,0 +1,231 @@
+package cc.unitmesh.devins.document
+
+import io.github.oshai.kotlinlogging.KotlinLogging
+import org.apache.tika.metadata.Metadata
+import org.apache.tika.parser.AutoDetectParser
+import org.apache.tika.parser.ParseContext
+import org.apache.tika.sax.BodyContentHandler
+import java.io.ByteArrayInputStream
+
+private val logger = KotlinLogging.logger {}
+
+/**
+ * Apache Tika-based document parser for JVM platform
+ *
+ * Supports multiple document formats:
+ * - PDF (.pdf)
+ * - Microsoft Word (.doc, .docx)
+ * - Microsoft PowerPoint (.ppt, .pptx)
+ * - Plain text (.txt)
+ * - HTML (.html)
+ * - And many more formats supported by Tika
+ *
+ * The parser extracts plain text through Tika's [AutoDetectParser], splits that text into
+ * chunks, and derives a heuristic table of contents. All line numbers it reports
+ * (chunk ranges and TOC entries) are 0-based indices into the extracted text.
+ *
+ * The instance keeps the result of the latest [parse] call; each call replaces that state.
+ */
+class TikaDocumentParser : DocumentParserService {
+    private var currentContent: String? = null
+    private var currentChunks: List<DocumentChunk> = emptyList()
+    private var currentMetadata: Metadata? = null
+
+    /** Returns the text extracted by the latest successful [parse], or null when there is none. */
+    override fun getDocumentContent(): String? = currentContent
+
+    /**
+     * Parses [content] with Tika and stores the extracted text and chunks on this instance.
+     *
+     * @param file Descriptor of the document; its name is passed to Tika as a detection hint
+     * @param content Document bytes carried in a String, one char per byte (decoded back
+     *   with ISO_8859_1)
+     * @return A copy of [file] with the TOC, parse status, chapter count and detected MIME type
+     *   filled in; on failure a copy whose status is [ParseStatus.PARSE_FAILED]
+     */
+    override suspend fun parse(file: DocumentFile, content: String): DocumentTreeNode {
+        logger.info { "=== Starting Tika Parse ===" }
+        logger.info { "File: ${file.path}, Size: ${content.length} bytes" }
+
+        // Forget the previous document first, so a failed parse cannot leave stale
+        // content or chunks behind for getDocumentContent() and the query methods.
+        currentContent = null
+        currentChunks = emptyList()
+        currentMetadata = null
+
+        try {
+            // Create Tika parser components
+            val parser = AutoDetectParser()
+            val handler = BodyContentHandler(-1) // No limit on content size
+            val metadata = Metadata()
+            val context = ParseContext()
+
+            // Set file name in metadata for better format detection
+            metadata.set("resourceName", file.name)
+
+            // Convert the String back to bytes using ISO_8859_1 to preserve binary data
+            ByteArrayInputStream(content.toByteArray(Charsets.ISO_8859_1)).use { input ->
+                parser.parse(input, handler, metadata, context)
+            }
+
+            // Extract parsed content
+            val extractedText = handler.toString().trim()
+            currentContent = extractedText
+            currentMetadata = metadata
+
+            logger.info { "Extracted ${extractedText.length} characters" }
+            logger.debug { "Metadata: ${metadata.names().joinToString { "$it=${metadata.get(it)}" }}" }
+
+            // Build simple chunks (blocks separated by two or more blank lines)
+            currentChunks = buildSimpleChunks(extractedText, file.path, file.metadata.formatType)
+            logger.info { "Created ${currentChunks.size} document chunks" }
+
+            // Heuristic TOC: one level-1 entry per line that looks like a heading
+            // TODO: Enhance with more sophisticated TOC extraction based on document structure
+            val toc = extractSimpleTOC(extractedText)
+
+            logger.info { "=== Parse Complete ===" }
+
+            return file.copy(
+                toc = toc,
+                metadata = file.metadata.copy(
+                    parseStatus = ParseStatus.PARSED,
+                    chapterCount = toc.size,
+                    mimeType = metadata.get(Metadata.CONTENT_TYPE)
+                )
+            )
+        } catch (e: Exception) {
+            logger.error(e) { "Failed to parse document: ${e.message}" }
+            return file.copy(
+                metadata = file.metadata.copy(
+                    parseStatus = ParseStatus.PARSE_FAILED
+                )
+            )
+        }
+    }
+
+    /**
+     * Finds chunks whose title or content contains [keyword] (case-insensitive).
+     *
+     * @return Matches ordered by relevance: exact title match, then partial title match,
+     *   then content-only match
+     */
+    override suspend fun queryHeading(keyword: String): List<DocumentChunk> {
+        return currentChunks.filter {
+            it.chapterTitle?.contains(keyword, ignoreCase = true) == true ||
+            it.content.contains(keyword, ignoreCase = true)
+        }.sortedByDescending {
+            // Relevance scoring: title match > content match
+            when {
+                it.chapterTitle?.equals(keyword, ignoreCase = true) == true -> 10
+                it.chapterTitle?.contains(keyword, ignoreCase = true) == true -> 5
+                else -> 1
+            }
+        }
+    }
+
+    /**
+     * Looks up a chunk by anchor, with or without the leading `#`.
+     *
+     * Chunk anchors (`#chunk-N`) are tried first. TOC entries produced by this parser carry
+     * heading-derived anchors instead, so as a fallback the first chunk containing a heading
+     * line with that anchor is returned.
+     *
+     * @return The matching chunk, or null when nothing matches
+     */
+    override suspend fun queryChapter(chapterId: String): DocumentChunk? {
+        val byChunkAnchor = currentChunks.find {
+            it.anchor == chapterId || it.anchor == "#$chapterId"
+        }
+        if (byChunkAnchor != null) return byChunkAnchor
+
+        val wanted = if (chapterId.startsWith("#")) chapterId else "#$chapterId"
+        return currentChunks.find { chunk ->
+            chunk.content.lineSequence().any { line ->
+                val text = line.trim()
+                looksLikeHeading(text) && headingAnchor(text) == wanted
+            }
+        }
+    }
+
+    /**
+     * Splits [content] into chunks separated by two or more consecutive blank lines.
+     *
+     * A single blank line does not end a chunk; it is left out of the chunk text.
+     * `startLine`/`endLine` are the 0-based indices, within [content], of the chunk's
+     * first and last non-blank line, so they use the same numbering as the TOC entries.
+     */
+    private fun buildSimpleChunks(
+        content: String,
+        documentPath: String,
+        formatType: DocumentFormatType
+    ): List<DocumentChunk> {
+        if (content.isBlank()) return emptyList()
+
+        val chunks = mutableListOf<DocumentChunk>()
+        val pending = mutableListOf<String>()
+        var firstLineIndex = 0
+        var lastLineIndex = 0
+        var blankRun = 0
+
+        // Turns the collected lines into a chunk; does nothing when no lines are pending.
+        fun flushPending() {
+            if (pending.isEmpty()) return
+            chunks.add(
+                createChunk(chunks.size, pending, firstLineIndex, lastLineIndex, documentPath, formatType)
+            )
+            pending.clear()
+        }
+
+        content.lines().forEachIndexed { lineIndex, line ->
+            if (line.isBlank()) {
+                blankRun++
+                if (blankRun >= 2) flushPending()
+            } else {
+                blankRun = 0
+                if (pending.isEmpty()) firstLineIndex = lineIndex
+                pending.add(line)
+                lastLineIndex = lineIndex
+            }
+        }
+        flushPending()
+
+        return chunks
+    }
+
+    /**
+     * Builds the chunk for one block of non-blank lines.
+     * The first line becomes the title when it looks like a heading.
+     */
+    private fun createChunk(
+        index: Int,
+        paragraphLines: List<String>,
+        startLine: Int,
+        endLine: Int,
+        documentPath: String,
+        formatType: DocumentFormatType
+    ): DocumentChunk {
+        val title = paragraphLines.first().trim().takeIf { looksLikeHeading(it) }
+
+        val positionMetadata = PositionMetadata(
+            documentPath = documentPath,
+            formatType = formatType,
+            position = DocumentPosition.LineRange(
+                startLine = startLine,
+                endLine = endLine
+            )
+        )
+
+        return DocumentChunk(
+            documentPath = documentPath,
+            chapterTitle = title,
+            content = paragraphLines.joinToString("\n").trim(),
+            anchor = "#chunk-$index",
+            startLine = startLine,
+            endLine = endLine,
+            position = positionMetadata
+        )
+    }
+
+    /**
+     * Extracts a flat TOC: every line that looks like a heading becomes a level-1 entry
+     * whose `lineNumber` is the 0-based index of that line in [content].
+     */
+    private fun extractSimpleTOC(content: String): List<TOCItem> =
+        content.lines().mapIndexedNotNull { index, line ->
+            val heading = line.trim()
+            if (looksLikeHeading(heading)) {
+                TOCItem(
+                    level = 1,
+                    title = heading,
+                    anchor = headingAnchor(heading),
+                    lineNumber = index
+                )
+            } else {
+                null
+            }
+        }
+
+    /**
+     * Heading heuristic shared by chunk titles and TOC entries.
+     *
+     * A line qualifies when it is non-empty, shorter than [MAX_HEADING_LENGTH], and either
+     * ends with a colon or consists only of uppercase letters, whitespace, digits and
+     * [HEADING_PUNCTUATION]. The second form needs at least one uppercase letter, so bare
+     * numbers such as page numbers are not mistaken for headings.
+     */
+    private fun looksLikeHeading(line: String): Boolean {
+        val text = line.trim()
+        if (text.isEmpty() || text.length >= MAX_HEADING_LENGTH) return false
+        if (text.endsWith(":")) return true
+        return text.any { it.isUpperCase() } &&
+            text.all { it.isUpperCase() || it.isWhitespace() || it.isDigit() || it in HEADING_PUNCTUATION }
+    }
+
+    /** Derives the anchor of a heading: lowercase, with each run of other characters replaced by `-`. */
+    private fun headingAnchor(title: String): String =
+        "#${title.lowercase().replace(NON_ANCHOR_CHARS, "-")}"
+
+    private companion object {
+        /** Lines of this length or longer are never treated as headings. */
+        const val MAX_HEADING_LENGTH = 100
+
+        /** Punctuation tolerated inside an all-uppercase heading. */
+        const val HEADING_PUNCTUATION = ".,()[]"
+
+        /** Matches every run of characters that may not appear in a heading anchor. */
+        val NON_ANCHOR_CHARS = Regex("[^a-z0-9]+")
+    }
+}
+
+/**
+ * Registers [TikaDocumentParser] with [DocumentParserFactory] for the PDF, DOCX and
+ * PLAIN_TEXT formats, replacing any provider previously registered for them.
+ *
+ * NOTE(review): the original comment says this runs automatically during JVM platform
+ * initialization; the call site is not visible here, so confirm where it is invoked.
+ */
+fun initializeTikaParser() {
+    val tikaFormats = listOf(
+        DocumentFormatType.PDF,
+        DocumentFormatType.DOCX,
+        DocumentFormatType.PLAIN_TEXT
+    )
+    for (format in tikaFormats) {
+        // Each registration gets its own provider lambda creating a new parser per request.
+        DocumentParserFactory.registerParser(format) { TikaDocumentParser() }
+    }
+
+    logger.info { "Tika parser registered for formats: PDF, DOCX, PLAIN_TEXT" }
+}
+
Original file line number	Diff line number	Diff line change
`@@ -178,6 +178,10 @@ kotlin {`
`178`	`178`
`179`	`179`	`// JediTerm for terminal emulation (uses pty4j under the hood)`
`180`	`180`	`implementation("org.jetbrains.pty4j:pty4j:0.13.10")`
	`181`	`+`
	`182`	`+ // Apache Tika for document parsing (PDF, DOC, DOCX, PPT, etc.)`
	`183`	`+ implementation("org.apache.tika:tika-core:3.2.3")`
	`184`	`+ implementation("org.apache.tika:tika-parsers-standard-package:3.2.3")`
`181`	`185`	`}`
`182`	`186`	`}`
`183`	`187`
Original file line number	Diff line number	Diff line change
`@@ -40,7 +40,8 @@ data class DocumentMetadata(`
`40`	`40`	`val lastModified: Long, // 最后修改时间戳`
`41`	`41`	`val fileSize: Long, // 文件大小（字节）`
`42`	`42`	`val language: String? = null, // 文档语言（如 "markdown", "kotlin"）`
`43`		`- val mimeType: String? = null // MIME 类型`
	`43`	`+ val mimeType: String? = null, // MIME 类型`
	`44`	`+ val formatType: DocumentFormatType = DocumentFormatType.PLAIN_TEXT // 文档格式类型`
`44`	`45`	`)`
`45`	`46`
`46`	`47`	`/**`