@@ -16,16 +16,25 @@ class DocumentChunker:
1616 Split Markdown documents by headers while maintaining hierarchy context.
1717 """
1818
19- def __init__ (self , chunk_size : int = 1000 , chunk_overlap : int = 100 ):
# Markdown heading markers paired with their metadata keys, H1 through H6.
# Entry N-1 is ("#" repeated N times, "Header N"); order is shallowest first.
ALL_HEADERS = [
    ("#" * depth, f"Header {depth}")
    for depth in range(1, 7)
]
28+
29+ def __init__ (
30+ self , chunk_size : int = 1000 , chunk_overlap : int = 100 , header_levels : int = 3
31+ ):
2032 self .chunk_size = chunk_size
2133 self .chunk_overlap = chunk_overlap
34+ self .header_levels = min (max (header_levels , 1 ), 6 ) # Clamp to 1-6
2235
23- # Split by H1, H2, H3
24- headers_to_split_on = [
25- ("#" , "Header 1" ),
26- ("##" , "Header 2" ),
27- ("###" , "Header 3" ),
28- ]
36+ # Build headers_to_split_on based on configured levels
37+ headers_to_split_on = self .ALL_HEADERS [: self .header_levels ]
2938 self .header_splitter = MarkdownHeaderTextSplitter (
3039 headers_to_split_on = headers_to_split_on
3140 )
@@ -35,6 +44,18 @@ def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 100):
3544 separators = ["\n \n " , "\n " , ". " , " " , "" ],
3645 )
3746
47+ def _create_chunk (
48+ self , content : str , breadcrumbs : List [str ], index : int
49+ ) -> Dict [str , Any ]:
50+ """Create a chunk dict with content and metadata."""
51+ return {
52+ "content" : content ,
53+ "metadata" : {
54+ "breadcrumbs" : breadcrumbs ,
55+ "index" : index ,
56+ },
57+ }
58+
3859 def chunk (self , text : str ) -> List [Dict [str , Any ]]:
3960 """
4061 Split Markdown text into chunks with breadcrumbs metadata.
@@ -46,18 +67,14 @@ def chunk(self, text: str) -> List[Dict[str, Any]]:
4667 header_splits = self .header_splitter .split_text (text )
4768
4869 final_chunks = []
49- cumulative_pos = 0 # Track position for charStart/charEnd
5070
51- for i , split in enumerate ( header_splits ) :
71+ for split in header_splits :
5272 # Extract headers from metadata to build breadcrumbs
53- # MarkdownHeaderTextSplitter returns metadata like {"Header 1": "Title", ...}
5473 breadcrumbs = []
55- if "Header 1" in split .metadata :
56- breadcrumbs .append (split .metadata ["Header 1" ])
57- if "Header 2" in split .metadata :
58- breadcrumbs .append (split .metadata ["Header 2" ])
59- if "Header 3" in split .metadata :
60- breadcrumbs .append (split .metadata ["Header 3" ])
74+ for i in range (1 , self .header_levels + 1 ):
75+ key = f"Header { i } "
76+ if key in split .metadata :
77+ breadcrumbs .append (split .metadata [key ])
6178
6279 # 2. Format breadcrumbs as a context header
6380 context_prefix = ""
@@ -73,29 +90,11 @@ def chunk(self, text: str) -> List[Dict[str, Any]]:
7390 for sub_text in sub_chunks :
7491 content = context_prefix + sub_text
7592 final_chunks .append (
76- {
77- "content" : content ,
78- "metadata" : {
79- "breadcrumbs" : breadcrumbs ,
80- "index" : len (final_chunks ),
81- "charStart" : cumulative_pos ,
82- "charEnd" : cumulative_pos + len (content ),
83- },
84- }
93+ self ._create_chunk (content , breadcrumbs , len (final_chunks ))
8594 )
86- cumulative_pos += len (content )
8795 else :
8896 final_chunks .append (
89- {
90- "content" : chunk_content ,
91- "metadata" : {
92- "breadcrumbs" : breadcrumbs ,
93- "index" : len (final_chunks ),
94- "charStart" : cumulative_pos ,
95- "charEnd" : cumulative_pos + len (chunk_content ),
96- },
97- }
97+ self ._create_chunk (chunk_content , breadcrumbs , len (final_chunks ))
9898 )
99- cumulative_pos += len (chunk_content )
10099
101100 return final_chunks
0 commit comments