Skip to content

Commit c61922d

Browse files
authored
Merge pull request #68 from namtroi/pdf/optimize
Pdf/optimize
2 parents c268d08 + c470d76 commit c61922d

39 files changed

Lines changed: 905 additions & 246 deletions

apps/ai-worker/src/chunkers/document_chunker.py

Lines changed: 35 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -16,16 +16,25 @@ class DocumentChunker:
1616
Split Markdown documents by headers while maintaining hierarchy context.
1717
"""
1818

19-
def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 100):
19+
# All supported header levels (H1-H6)
20+
ALL_HEADERS = [
21+
("#", "Header 1"),
22+
("##", "Header 2"),
23+
("###", "Header 3"),
24+
("####", "Header 4"),
25+
("#####", "Header 5"),
26+
("######", "Header 6"),
27+
]
28+
29+
def __init__(
30+
self, chunk_size: int = 1000, chunk_overlap: int = 100, header_levels: int = 3
31+
):
2032
self.chunk_size = chunk_size
2133
self.chunk_overlap = chunk_overlap
34+
self.header_levels = min(max(header_levels, 1), 6) # Clamp to 1-6
2235

23-
# Split by H1, H2, H3
24-
headers_to_split_on = [
25-
("#", "Header 1"),
26-
("##", "Header 2"),
27-
("###", "Header 3"),
28-
]
36+
# Build headers_to_split_on based on configured levels
37+
headers_to_split_on = self.ALL_HEADERS[: self.header_levels]
2938
self.header_splitter = MarkdownHeaderTextSplitter(
3039
headers_to_split_on=headers_to_split_on
3140
)
@@ -35,6 +44,18 @@ def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 100):
3544
separators=["\n\n", "\n", ". ", " ", ""],
3645
)
3746

47+
def _create_chunk(
48+
self, content: str, breadcrumbs: List[str], index: int
49+
) -> Dict[str, Any]:
50+
"""Create a chunk dict with content and metadata."""
51+
return {
52+
"content": content,
53+
"metadata": {
54+
"breadcrumbs": breadcrumbs,
55+
"index": index,
56+
},
57+
}
58+
3859
def chunk(self, text: str) -> List[Dict[str, Any]]:
3960
"""
4061
Split Markdown text into chunks with breadcrumbs metadata.
@@ -46,18 +67,14 @@ def chunk(self, text: str) -> List[Dict[str, Any]]:
4667
header_splits = self.header_splitter.split_text(text)
4768

4869
final_chunks = []
49-
cumulative_pos = 0 # Track position for charStart/charEnd
5070

51-
for i, split in enumerate(header_splits):
71+
for split in header_splits:
5272
# Extract headers from metadata to build breadcrumbs
53-
# MarkdownHeaderTextSplitter returns metadata like {"Header 1": "Title", ...}
5473
breadcrumbs = []
55-
if "Header 1" in split.metadata:
56-
breadcrumbs.append(split.metadata["Header 1"])
57-
if "Header 2" in split.metadata:
58-
breadcrumbs.append(split.metadata["Header 2"])
59-
if "Header 3" in split.metadata:
60-
breadcrumbs.append(split.metadata["Header 3"])
74+
for i in range(1, self.header_levels + 1):
75+
key = f"Header {i}"
76+
if key in split.metadata:
77+
breadcrumbs.append(split.metadata[key])
6178

6279
# 2. Format breadcrumbs as a context header
6380
context_prefix = ""
@@ -73,29 +90,11 @@ def chunk(self, text: str) -> List[Dict[str, Any]]:
7390
for sub_text in sub_chunks:
7491
content = context_prefix + sub_text
7592
final_chunks.append(
76-
{
77-
"content": content,
78-
"metadata": {
79-
"breadcrumbs": breadcrumbs,
80-
"index": len(final_chunks),
81-
"charStart": cumulative_pos,
82-
"charEnd": cumulative_pos + len(content),
83-
},
84-
}
93+
self._create_chunk(content, breadcrumbs, len(final_chunks))
8594
)
86-
cumulative_pos += len(content)
8795
else:
8896
final_chunks.append(
89-
{
90-
"content": chunk_content,
91-
"metadata": {
92-
"breadcrumbs": breadcrumbs,
93-
"index": len(final_chunks),
94-
"charStart": cumulative_pos,
95-
"charEnd": cumulative_pos + len(chunk_content),
96-
},
97-
}
97+
self._create_chunk(chunk_content, breadcrumbs, len(final_chunks))
9898
)
99-
cumulative_pos += len(chunk_content)
10099

101100
return final_chunks

apps/ai-worker/src/chunkers/presentation_chunker.py

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,6 @@ def chunk(self, text: str) -> List[Dict[str, Any]]:
3737
final_chunks = []
3838
current_accumulation = []
3939
current_indices = []
40-
cumulative_pos = 0 # Track position for charStart/charEnd
4140

4241
for i, slide_content in enumerate(raw_slides):
4342
slide_nr = i + 1
@@ -55,24 +54,23 @@ def chunk(self, text: str) -> List[Dict[str, Any]]:
5554
# If we met the minimum size or this is the last slide, emit a chunk
5655
if total_size >= self.min_chunk_size or i == len(raw_slides) - 1:
5756
chunk = self._create_chunk(
58-
current_accumulation, current_indices, cumulative_pos
57+
current_accumulation, current_indices, len(final_chunks)
5958
)
60-
cumulative_pos = chunk["metadata"]["charEnd"]
6159
final_chunks.append(chunk)
6260
current_accumulation = []
6361
current_indices = []
6462

6563
# If anything is left over (e.g., the last slide was empty but we had an accumulation)
6664
if current_accumulation:
6765
chunk = self._create_chunk(
68-
current_accumulation, current_indices, cumulative_pos
66+
current_accumulation, current_indices, len(final_chunks)
6967
)
7068
final_chunks.append(chunk)
7169

7270
return final_chunks
7371

7472
def _create_chunk(
75-
self, contents: List[str], indices: List[int], char_start: int
73+
self, contents: List[str], indices: List[int], index: int
7674
) -> Dict[str, Any]:
7775
"""
7876
Helper to format a chunk dictionary.
@@ -91,8 +89,7 @@ def _create_chunk(
9189
"type": "presentation",
9290
"hasTitle": bool(title),
9391
"title": title,
94-
"charStart": char_start,
95-
"charEnd": char_start + len(combined_content),
92+
"index": index,
9693
}
9794

9895
# Backward compatibility: ensure slide_number is set even if grouped

apps/ai-worker/src/chunkers/tabular_chunker.py

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -16,16 +16,13 @@ def chunk(self, text: str) -> List[Dict[str, Any]]:
1616
return []
1717

1818
# 1. Split by --- (multiple sheets)
19-
# FIX: Chỉ split khi có xuống dòng rõ ràng (\n\n---\n\n).
20-
# Tuyệt đối không split "---" khơi khơi vì sẽ cắt nát Markdown Table (|---|).
2119
delimiter = "\n\n---\n\n"
2220
if delimiter in text:
2321
sections = [s.strip() for s in text.split(delimiter) if s.strip()]
2422
else:
2523
sections = [text.strip()]
2624

2725
final_chunks = []
28-
cumulative_pos = 0
2926

3027
for section in sections:
3128
# 2. Extract breadcrumbs (Sheet Name from H1)
@@ -50,7 +47,7 @@ def chunk(self, text: str) -> List[Dict[str, Any]]:
5047
# Check for Markdown Table syntax (| col | col | + separator |---|)
5148
is_markdown_table = (
5249
"|" in content_text
53-
and "\n|-" in content_text # Check kỹ hơn chút để tránh nhầm
50+
and "\n|-" in content_text
5451
and content_text.count("|") > 2
5552
)
5653

@@ -63,12 +60,9 @@ def chunk(self, text: str) -> List[Dict[str, Any]]:
6360
"breadcrumbs": breadcrumbs,
6461
"chunk_type": "tabular",
6562
"index": len(final_chunks),
66-
"charStart": cumulative_pos,
67-
"charEnd": cumulative_pos + len(section),
6863
},
6964
}
7065
)
71-
cumulative_pos += len(section)
7266
else:
7367
# STRATEGY B: Sentence Format -> Split by rows
7468
# Assumes rows are separated by double newlines (from converter)
@@ -82,8 +76,6 @@ def chunk(self, text: str) -> List[Dict[str, Any]]:
8276
batch = rows[i : i + self.rows_per_chunk]
8377
batch_text = "\n\n".join(batch)
8478

85-
# FIX: Inject Header an toàn hơn
86-
# Nếu có breadcrumb (Sheet name), tái tạo lại header cho mỗi chunk
8779
if breadcrumbs:
8880
header_line = f"# {breadcrumbs[0]}"
8981
chunk_display_text = f"{header_line}\n\n{batch_text}"
@@ -97,11 +89,8 @@ def chunk(self, text: str) -> List[Dict[str, Any]]:
9789
"breadcrumbs": breadcrumbs,
9890
"chunk_type": "tabular",
9991
"index": len(final_chunks),
100-
"charStart": cumulative_pos,
101-
"charEnd": cumulative_pos + len(chunk_display_text),
10292
},
10393
}
10494
)
105-
cumulative_pos += len(chunk_display_text)
10695

10796
return final_chunks

apps/ai-worker/src/converters/base.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,26 @@ def _post_process(self, markdown: str) -> str:
6363
"""
6464
return self._normalizer.normalize(markdown)
6565

66+
def _post_process_pdf(self, markdown: str) -> str:
67+
"""
68+
Post-process for PDF: normalize + remove page artifacts + junk code blocks.
69+
"""
70+
markdown = self._normalizer.normalize(markdown)
71+
markdown = self._normalizer.remove_page_artifacts(markdown)
72+
markdown = self._normalizer.remove_junk_code_blocks(markdown)
73+
return markdown
74+
75+
def _post_process_pymupdf(self, markdown: str) -> str:
76+
"""
77+
Post-process for PyMuPDF: includes soft linebreak merge.
78+
PyMuPDF4LLM preserves PDF hard line breaks more than Docling.
79+
"""
80+
markdown = self._normalizer.normalize(markdown)
81+
markdown = self._normalizer.merge_soft_linebreaks(markdown)
82+
markdown = self._normalizer.remove_page_artifacts(markdown)
83+
markdown = self._normalizer.remove_junk_code_blocks(markdown)
84+
return markdown
85+
6686
async def process(self, file_path: str, *args, **kwargs) -> ProcessorOutput:
6787
"""Backward-compatible alias for to_markdown()."""
6888
return await self.to_markdown(file_path, *args, **kwargs)

apps/ai-worker/src/converters/pdf_converter.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ async def _convert_internal(
130130

131131
markdown = result.document.export_to_markdown()
132132
markdown = self._sanitize_raw(markdown)
133-
markdown = self._post_process(markdown)
133+
markdown = self._post_process_pdf(markdown)
134134
page_count = (
135135
len(result.document.pages) if hasattr(result.document, "pages") else 1
136136
)

apps/ai-worker/src/converters/pymupdf_converter.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
"""Fast PDF converter using PyMuPDF4LLM."""
33

44
import gc
5+
import re
56
from pathlib import Path
67

78
from src.logging_config import get_logger
@@ -51,7 +52,8 @@ async def to_markdown(self, file_path: str) -> ProcessorOutput:
5152

5253
# Sanitize and normalize
5354
markdown = self._sanitize_raw(markdown)
54-
markdown = self._post_process(markdown)
55+
markdown = self._strip_hidden_links(markdown)
56+
markdown = self._post_process_pymupdf(markdown)
5557

5658
# Get page count
5759
page_count = self._get_page_count(path)
@@ -95,3 +97,15 @@ def _get_page_count(self, path: Path) -> int:
9597
return len(doc)
9698
except Exception:
9799
return 1
100+
101+
def _strip_hidden_links(self, markdown: str) -> str:
102+
"""
103+
Strip markdown link formatting, keeping display text only.
104+
PyMuPDF4LLM extracts hidden hyperlinks; we remove the link syntax.
105+
Example: [Click here](https://...) -> Click here
106+
Handles nested brackets: [Text [note]](url) -> Text [note]
107+
"""
108+
# Pattern handles one level of nested brackets in link text
109+
# [non-brackets [non-brackets] non-brackets](url)
110+
pattern = r"\[([^\[\]]*(?:\[[^\[\]]*\][^\[\]]*)*)\]\([^)]+\)"
111+
return re.sub(pattern, r"\1", markdown)

0 commit comments

Comments
 (0)