# Standard
from pathlib import Path
from tempfile import mkdtemp
-from typing import Dict, List, Tuple, Union
+from typing import Dict, List, Union
import glob
import logging
import os
import re

# Third Party
from datasets import Dataset
-
-# pylint: disable=no-name-in-module
-from docling_parse.docling_parse import pdf_parser_v1
from instructlab.schema.taxonomy import DEFAULT_TAXONOMY_FOLDERS as TAXONOMY_FOLDERS
from instructlab.schema.taxonomy import (
    TaxonomyMessageFormat,
@@ -27,9 +24,6 @@
# Local
from .chunkers import DocumentChunker

-# Initialize the pdf parser
-PDFParser = pdf_parser_v1()
-
logger = logging.getLogger(__name__)

@@ -126,9 +120,9 @@ def _get_documents(
    source: Dict[str, Union[str, List[str]]],
    skip_checkout: bool = False,
    document_output_dir: Path = None,
-) -> Tuple[List[str], List[Path]]:
+) -> List[Path]:
    """
-    Retrieve the content of files (Markdown and PDF) from a Git repository.
+    Retrieve the file paths of files (Markdown and PDF) from a Git repository.

    Args:
        source (dict): Source info containing repository URL, commit hash, and list of file patterns.
@@ -147,14 +141,13 @@ def _get_documents(
    repo_url = source.get("repo")
    commit_hash = source.get("commit")
    file_patterns = source.get("patterns", [])
-
-    try:  # pylint: disable=too-many-nested-blocks
+    # pylint: disable=too-many-nested-blocks
+    try:
        repo = git.Repo.clone_from(repo_url, document_output_dir)

        if not skip_checkout and commit_hash:
            repo.git.checkout(commit_hash)

-        file_contents = []
        filepaths = []

        logger.info("Processing files...")
@@ -170,7 +163,6 @@ def _get_documents(
                    logger.info(f"Processing file: {file_path}")
                    try:
                        if file_path.lower().endswith(".md"):
-                            # Process Markdown files
                            with open(file_path, "r", encoding="utf-8") as file:
                                content = file.read()
                                if _string_contains_html(content):
@@ -179,75 +171,19 @@ def _get_documents(
                                        "NOTE: Continuing this might affect your data generation quality."
                                        "To get best results please format your markdown documents without the use of HTML or use a different document filetype."
                                    )
-                            file_contents.append(content)
-                            filepaths.append(Path(file_path))
-                            logger.info(
-                                f"Appended Markdown content from {file_path}"
-                            )
-
-                        elif file_path.lower().endswith(".pdf"):
-                            # Process PDF files using docling_parse's pdf_parser_v1
-                            doc_key = f"key_{os.path.basename(file_path)}"  # Unique document key
-                            logger.info(f"Loading PDF document from {file_path}")
-
-                            success = PDFParser.load_document(doc_key, file_path)
-                            if not success:
-                                logger.warning(
-                                    f"Failed to load PDF document: {file_path}"
-                                )
-                                continue
-
-                            num_pages = PDFParser.number_of_pages(doc_key)
-                            logger.info(f"PDF '{file_path}' has {num_pages} pages.")
-
-                            pdf_text = ""
-
-                            for page in range(num_pages):
-                                try:
-                                    json_doc = PDFParser.parse_pdf_from_key_on_page(
-                                        doc_key, page
-                                    )
-                                    if "pages" not in json_doc or not json_doc["pages"]:
-                                        logger.warning(
-                                            f"Page {page + 1} could not be parsed in '{file_path}'"
-                                        )
-                                        continue
-
-                                    json_page = json_doc["pages"][0]
-
-                                    # Extract text from cells
-                                    for cell in json_page.get("cells", []):
-                                        text = cell.get("content", {}).get(
-                                            "rnormalized", ""
-                                        )
-                                        if text.strip():  # Only append non-empty text
-                                            pdf_text += text.strip() + "\n"
-                                except Exception as page_error:  # pylint: disable=broad-exception-caught
-                                    logger.warning(
-                                        f"Error parsing page {page + 1} of '{file_path}': {page_error}"
-                                    )
-                                    continue
-
-                            if pdf_text:
-                                file_contents.append(pdf_text)
-                                filepaths.append(Path(file_path))
-
-                            # Unload the document to free memory
-                            PDFParser.unload_document(doc_key)
-                            logger.info(f"Unloaded PDF document: {file_path}")
-
-                        else:
-                            logger.info(f"Skipping unsupported file type: {file_path}")
-                    except Exception as file_error:  # pylint: disable=broad-exception-caught
+                        filepaths.append(Path(file_path))
+                        logger.info(f"Collected filepath: {file_path}")
+                    # pylint: disable=broad-exception-caught
+                    except Exception as file_error:
                        logger.error(
                            f"Error processing file '{file_path}': {file_error}"
                        )
                        continue
                else:
                    logger.info(f"Skipping non-file path: {file_path}")

-        if file_contents:
-            return file_contents, filepaths
+        if filepaths:
+            return filepaths
        raise SystemExit("Couldn't find knowledge documents")

    except (OSError, git.exc.GitCommandError, FileNotFoundError) as e:
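With this change `_get_documents` stops parsing PDFs inline and no longer returns document text at all: it clones the knowledge repository, optionally checks out the pinned commit, and returns only the matching file paths, leaving content extraction to the chunking stage. A minimal sketch of the new call contract follows; the module path, repository URL, commit, patterns, and output directory are placeholders for illustration, not values taken from this PR.

# Illustrative usage of the reworked _get_documents; all concrete values are placeholders.
from pathlib import Path

from instructlab.sdg.utils.taxonomy import _get_documents  # module path assumed

source = {
    "repo": "https://github.com/example-org/example-docs.git",  # placeholder repo
    "commit": "abc1234",                                         # placeholder commit
    "patterns": ["*.md", "*.pdf"],
}

filepaths = _get_documents(
    source=source,
    document_output_dir=Path("/tmp/knowledge-docs"),  # placeholder directory
)

# The return value is now a List[Path]; file contents are read later by the
# document chunker instead of being loaded eagerly here.
for path in filepaths:
    print(path)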
@@ -281,13 +217,13 @@ def _read_taxonomy_file(
    task_description = contents.get("task_description", None)
    domain = contents.get("domain")
    documents = contents.get("document")
-    document_contents, doc_filepaths = None, None
+    doc_filepaths = None
    if documents:
        os.makedirs(document_output_dir, exist_ok=True)
        unique_output_dir = mkdtemp(
            prefix=f"{leaf_node_path}_", dir=document_output_dir
        )
-        document_contents, doc_filepaths = _get_documents(
+        doc_filepaths = _get_documents(
            source=documents,
            document_output_dir=unique_output_dir,
        )
@@ -302,7 +238,6 @@ def _read_taxonomy_file(
            "questions_and_answers": question_answer_list,
            "context": context,
            "taxonomy_path": tax_path,
-            "documents": document_contents,
            "filepaths": doc_filepaths,
            "domain": domain,
            "document_outline": contents.get("document_outline"),
@@ -493,7 +428,8 @@ def leaf_node_to_samples(
    docling_model_path=None,
):
    samples = []
-    if leaf_node and leaf_node[0].get("documents"):
+    # check if the leaf node has document filepaths, if so, it's a knowledge leaf node
+    if leaf_node and (leaf_node[0].get("filepaths")):
        samples = _knowledge_leaf_node_to_samples(
            leaf_node,
            server_ctx_size,
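Downstream, the leaf node entries built by `_read_taxonomy_file` carry a `filepaths` list instead of the raw `documents` text, and `leaf_node_to_samples` keys off that field to recognize a knowledge leaf node. A rough sketch of the effect; the entry values are placeholders, and the skill-node behaviour is inferred from context rather than shown in this diff.

from pathlib import Path

# Placeholder shape of one knowledge leaf node entry after this PR.
entry = {
    "questions_and_answers": [],                            # parsed from the node's qna.yaml
    "context": "",
    "taxonomy_path": "knowledge->example->topic",           # placeholder
    "filepaths": [Path("docs/a.md"), Path("docs/b.pdf")],   # collected by _get_documents
    "domain": "example",                                    # placeholder
    "document_outline": None,
}

# leaf_node_to_samples branches on the presence of filepaths:
if entry.get("filepaths"):
    print("knowledge leaf node: chunk the documents, then generate samples")
else:
    print("skill leaf node: no documents to chunk")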