namtroi
diff --git a/apps/ai-worker/src/chunkers/document_chunker.py b/apps/ai-worker/src/chunkers/document_chunker.py
Lines changed: 35 additions & 36 deletions
diff --git a/apps/ai-worker/src/chunkers/presentation_chunker.py b/apps/ai-worker/src/chunkers/presentation_chunker.py
Lines changed: 4 additions & 7 deletions
diff --git a/apps/ai-worker/src/chunkers/tabular_chunker.py b/apps/ai-worker/src/chunkers/tabular_chunker.py
Lines changed: 1 addition & 12 deletions
diff --git a/apps/ai-worker/src/converters/base.py b/apps/ai-worker/src/converters/base.py
Lines changed: 20 additions & 0 deletions
diff --git a/apps/ai-worker/src/converters/pdf_converter.py b/apps/ai-worker/src/converters/pdf_converter.py
Lines changed: 1 addition & 1 deletion
diff --git a/apps/ai-worker/src/converters/pymupdf_converter.py b/apps/ai-worker/src/converters/pymupdf_converter.py
Lines changed: 15 additions & 1 deletion
@@ -16,16 +16,25 @@ class DocumentChunker:
     Split Markdown documents by headers while maintaining hierarchy context.
     """
 
-    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 100):
+    # (marker, metadata key) pairs for every Markdown header level, H1-H6, shallowest first
+    ALL_HEADERS = [
+        ("#", "Header 1"),
+        ("##", "Header 2"),
+        ("###", "Header 3"),
+        ("####", "Header 4"),
+        ("#####", "Header 5"),
+        ("######", "Header 6"),
+    ]
+
+    def __init__(
+        self, chunk_size: int = 1000, chunk_overlap: int = 100, header_levels: int = 3
+    ):
         self.chunk_size = chunk_size
         self.chunk_overlap = chunk_overlap
+        self.header_levels = min(max(header_levels, 1), 6)  # Clamp to 1-6
 
-        # Split by H1, H2, H3
-        headers_to_split_on = [
-            ("#", "Header 1"),
-            ("##", "Header 2"),
-            ("###", "Header 3"),
-        ]
+        # Split only on the first `header_levels` entries of ALL_HEADERS (H1 down to the configured depth)
+        headers_to_split_on = self.ALL_HEADERS[: self.header_levels]
         self.header_splitter = MarkdownHeaderTextSplitter(
             headers_to_split_on=headers_to_split_on
         )
@@ -35,6 +44,18 @@ def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 100):
             separators=["\n\n", "\n", ". ", " ", ""],
         )
 
+    def _create_chunk(
+        self, content: str, breadcrumbs: List[str], index: int
+    ) -> Dict[str, Any]:
+        """Build a chunk dict: `content` plus `metadata` holding `breadcrumbs` and `index`."""
+        return {
+            "content": content,
+            "metadata": {
+                "breadcrumbs": breadcrumbs,
+                "index": index,
+            },
+        }
+
     def chunk(self, text: str) -> List[Dict[str, Any]]:
         """
         Split Markdown text into chunks with breadcrumbs metadata.
@@ -46,18 +67,14 @@ def chunk(self, text: str) -> List[Dict[str, Any]]:
         header_splits = self.header_splitter.split_text(text)
 
         final_chunks = []
-        cumulative_pos = 0  # Track position for charStart/charEnd
 
-        for i, split in enumerate(header_splits):
+        for split in header_splits:
             # Extract headers from metadata to build breadcrumbs
-            # MarkdownHeaderTextSplitter returns metadata like {"Header 1": "Title", ...}
             breadcrumbs = []
-            if "Header 1" in split.metadata:
-                breadcrumbs.append(split.metadata["Header 1"])
-            if "Header 2" in split.metadata:
-                breadcrumbs.append(split.metadata["Header 2"])
-            if "Header 3" in split.metadata:
-                breadcrumbs.append(split.metadata["Header 3"])
+            for i in range(1, self.header_levels + 1):
+                key = f"Header {i}"
+                if key in split.metadata:
+                    breadcrumbs.append(split.metadata[key])
 
             # 2. Format breadcrumbs as a context header
             context_prefix = ""
@@ -73,29 +90,11 @@ def chunk(self, text: str) -> List[Dict[str, Any]]:
                 for sub_text in sub_chunks:
                     content = context_prefix + sub_text
                     final_chunks.append(
-                        {
-                            "content": content,
-                            "metadata": {
-                                "breadcrumbs": breadcrumbs,
-                                "index": len(final_chunks),
-                                "charStart": cumulative_pos,
-                                "charEnd": cumulative_pos + len(content),
-                            },
-                        }
+                        self._create_chunk(content, breadcrumbs, len(final_chunks))
                     )
-                    cumulative_pos += len(content)
             else:
                 final_chunks.append(
-                    {
-                        "content": chunk_content,
-                        "metadata": {
-                            "breadcrumbs": breadcrumbs,
-                            "index": len(final_chunks),
-                            "charStart": cumulative_pos,
-                            "charEnd": cumulative_pos + len(chunk_content),
-                        },
-                    }
+                    self._create_chunk(chunk_content, breadcrumbs, len(final_chunks))
                 )
-                cumulative_pos += len(chunk_content)
 
         return final_chunks
@@ -37,7 +37,6 @@ def chunk(self, text: str) -> List[Dict[str, Any]]:
         final_chunks = []
         current_accumulation = []
         current_indices = []
-        cumulative_pos = 0  # Track position for charStart/charEnd
 
         for i, slide_content in enumerate(raw_slides):
             slide_nr = i + 1
@@ -55,24 +54,23 @@ def chunk(self, text: str) -> List[Dict[str, Any]]:
             # If we met the minimum size or this is the last slide, emit a chunk
             if total_size >= self.min_chunk_size or i == len(raw_slides) - 1:
                 chunk = self._create_chunk(
-                    current_accumulation, current_indices, cumulative_pos
+                    current_accumulation, current_indices, len(final_chunks)
                 )
-                cumulative_pos = chunk["metadata"]["charEnd"]
                 final_chunks.append(chunk)
                 current_accumulation = []
                 current_indices = []
 
         # If anything is left over (e.g., the last slide was empty but we had an accumulation)
         if current_accumulation:
             chunk = self._create_chunk(
-                current_accumulation, current_indices, cumulative_pos
+                current_accumulation, current_indices, len(final_chunks)
             )
             final_chunks.append(chunk)
 
         return final_chunks
 
     def _create_chunk(
-        self, contents: List[str], indices: List[int], char_start: int
+        self, contents: List[str], indices: List[int], index: int
     ) -> Dict[str, Any]:
         """
         Helper to format a chunk dictionary.
@@ -91,8 +89,7 @@ def _create_chunk(
             "type": "presentation",
             "hasTitle": bool(title),
             "title": title,
-            "charStart": char_start,
-            "charEnd": char_start + len(combined_content),
+            "index": index,
         }
 
         # Backward compatibility: ensure slide_number is set even if grouped
 
@@ -16,16 +16,13 @@ def chunk(self, text: str) -> List[Dict[str, Any]]:
             return []
 
         # 1. Split by --- (multiple sheets)
-        # FIX: Chỉ split khi có xuống dòng rõ ràng (\n\n---\n\n).
-        # Tuyệt đối không split "---" khơi khơi vì sẽ cắt nát Markdown Table (|---|).
         delimiter = "\n\n---\n\n"
         if delimiter in text:
             sections = [s.strip() for s in text.split(delimiter) if s.strip()]
         else:
             sections = [text.strip()]
 
         final_chunks = []
-        cumulative_pos = 0
 
         for section in sections:
             # 2. Extract breadcrumbs (Sheet Name from H1)
@@ -50,7 +47,7 @@ def chunk(self, text: str) -> List[Dict[str, Any]]:
             # Check for Markdown Table syntax (| col | col | + separator |---|)
             is_markdown_table = (
                 "|" in content_text
-                and "\n|-" in content_text  # Check kỹ hơn chút để tránh nhầm
+                and "\n|-" in content_text
                 and content_text.count("|") > 2
             )
 
@@ -63,12 +60,9 @@ def chunk(self, text: str) -> List[Dict[str, Any]]:
                             "breadcrumbs": breadcrumbs,
                             "chunk_type": "tabular",
                             "index": len(final_chunks),
-                            "charStart": cumulative_pos,
-                            "charEnd": cumulative_pos + len(section),
                         },
                     }
                 )
-                cumulative_pos += len(section)
             else:
                 # STRATEGY B: Sentence Format -> Split by rows
                 # Assumes rows are separated by double newlines (from converter)
@@ -82,8 +76,6 @@ def chunk(self, text: str) -> List[Dict[str, Any]]:
                     batch = rows[i : i + self.rows_per_chunk]
                     batch_text = "\n\n".join(batch)
 
-                    # FIX: Inject Header an toàn hơn
-                    # Nếu có breadcrumb (Sheet name), tái tạo lại header cho mỗi chunk
                     if breadcrumbs:
                         header_line = f"# {breadcrumbs[0]}"
                         chunk_display_text = f"{header_line}\n\n{batch_text}"
@@ -97,11 +89,8 @@ def chunk(self, text: str) -> List[Dict[str, Any]]:
                                 "breadcrumbs": breadcrumbs,
                                 "chunk_type": "tabular",
                                 "index": len(final_chunks),
-                                "charStart": cumulative_pos,
-                                "charEnd": cumulative_pos + len(chunk_display_text),
                             },
                         }
                     )
-                    cumulative_pos += len(chunk_display_text)
 
         return final_chunks
@@ -63,6 +63,26 @@ def _post_process(self, markdown: str) -> str:
         """
         return self._normalizer.normalize(markdown)
 
+    def _post_process_pdf(self, markdown: str) -> str:
+        """
+        Post-process for PDF: normalize, then remove page artifacts and junk code blocks.
+        """
+        markdown = self._normalizer.normalize(markdown)
+        markdown = self._normalizer.remove_page_artifacts(markdown)
+        markdown = self._normalizer.remove_junk_code_blocks(markdown)
+        return markdown
+
+    def _post_process_pymupdf(self, markdown: str) -> str:
+        """
+        Post-process for PyMuPDF: normalize, merge soft linebreaks, then remove page artifacts and junk code blocks.
+        PyMuPDF4LLM output keeps more of the PDF's hard line breaks than Docling's does.
+        """
+        markdown = self._normalizer.normalize(markdown)
+        markdown = self._normalizer.merge_soft_linebreaks(markdown)
+        markdown = self._normalizer.remove_page_artifacts(markdown)
+        markdown = self._normalizer.remove_junk_code_blocks(markdown)
+        return markdown
+
     async def process(self, file_path: str, *args, **kwargs) -> ProcessorOutput:
         """Backward-compatible alias for to_markdown()."""
         return await self.to_markdown(file_path, *args, **kwargs)
@@ -130,7 +130,7 @@ async def _convert_internal(
 
             markdown = result.document.export_to_markdown()
             markdown = self._sanitize_raw(markdown)
-            markdown = self._post_process(markdown)
+            markdown = self._post_process_pdf(markdown)
             page_count = (
                 len(result.document.pages) if hasattr(result.document, "pages") else 1
             )
 
@@ -2,6 +2,7 @@
 """Fast PDF converter using PyMuPDF4LLM."""
 
 import gc
+import re
 from pathlib import Path
 
 from src.logging_config import get_logger
@@ -51,7 +52,8 @@ async def to_markdown(self, file_path: str) -> ProcessorOutput:
 
             # Sanitize and normalize
             markdown = self._sanitize_raw(markdown)
-            markdown = self._post_process(markdown)
+            markdown = self._strip_hidden_links(markdown)
+            markdown = self._post_process_pymupdf(markdown)
 
             # Get page count
             page_count = self._get_page_count(path)
@@ -95,3 +97,15 @@ def _get_page_count(self, path: Path) -> int:
                 return len(doc)
         except Exception:
             return 1
+
+    def _strip_hidden_links(self, markdown: str) -> str:
+        """
+        Strip markdown link formatting, keeping display text only.
+        PyMuPDF4LLM extracts hidden hyperlinks; we remove the link syntax.
+        Example: [Click here](https://...) -> Click here
+        Handles nested brackets: [Text [note]](url) -> Text [note]
+        """
+        # Link text may contain one level of nested brackets: [text [note] text](url).
+        # NOTE(review): also matches image syntax ![alt](src) (leaving "!alt") and stops at the first ")" in a URL - confirm intended.
+        pattern = r"\[([^\[\]]*(?:\[[^\[\]]*\][^\[\]]*)*)\]\([^)]+\)"
+        return re.sub(pattern, r"\1", markdown)
Original file line number	Diff line number	Diff line change
`@@ -130,7 +130,7 @@ async def _convert_internal(`
`130`	`130`
`131`	`131`	`markdown = result.document.export_to_markdown()`
`132`	`132`	`markdown = self._sanitize_raw(markdown)`
`133`		`- markdown = self._post_process(markdown)`
	`133`	`+ markdown = self._post_process_pdf(markdown)`
`134`	`134`	`page_count = (`
`135`	`135`	`len(result.document.pages) if hasattr(result.document, "pages") else 1`
`136`	`136`	`)`