1313from extract_thinker .document_loader .loader_interceptor import LoaderInterceptor
1414from extract_thinker .document_loader .llm_interceptor import LlmInterceptor
1515
16- from extract_thinker .utils import get_image_type
16+ from extract_thinker .utils import get_file_extension
1717
1818
# File-type identifiers that `Extractor.classify` treats as images and routes
# to `classify_from_path`.
1919SUPPORTED_IMAGE_FORMATS = ["jpeg" , "png" , "bmp" , "tiff" ]
# Spreadsheet-style extensions that `Extractor.classify` routes to
# `classify_from_excel`.
# NOTE(review): these entries carry a leading dot while the image entries above
# do not, yet `classify` tests the same `get_file_extension(...)` result against
# both lists. Only one convention can match -- confirm which form the helper
# returns (with or without the dot) and align the two lists accordingly.
20+ SUPPORTED_EXCEL_FORMATS = ['.xls' , '.xlsx' , '.xlsm' , '.xlsb' , '.odf' , '.ods' , '.odt' , '.csv' ]
2021
2122
2223class Extractor :
@@ -111,6 +112,13 @@ def classify_from_stream(self, stream: IO, classifications: List[Classification]
111112 content = self .document_loader .load_content_from_stream (stream )
112113 self ._classify (content , classifications )
113114
def classify_from_excel(self, path: Union[str, IO], classifications: List[Classification]):
    """Classify a spreadsheet supplied either as a file path or as a stream.

    Args:
        path: A ``str`` is handed to the document loader's
            ``load_content_from_file``; any other value is handed to
            ``load_content_from_stream``.
        classifications: Candidate classifications forwarded unchanged to
            ``self._classify``.

    Returns:
        Whatever ``self._classify`` returns for the loaded content.
    """
    loader = self.document_loader
    # Pick the loader entry point from the argument type, then load once.
    if not isinstance(path, str):
        load = loader.load_content_from_stream
    else:
        load = loader.load_content_from_file
    loaded = load(path)
    return self._classify(loaded, classifications)
121+
114122 def _classify (self , content : str , classifications : List [Classification ]):
115123 messages = [
116124 {
@@ -136,9 +144,11 @@ def classify(self, input: Union[str, IO], classifications: List[Classification])
136144 if isinstance (input , str ):
137145 # Check if the input is a valid file path
138146 if os .path .isfile (input ):
139- file_type = get_image_type (input )
147+ file_type = get_file_extension (input )
140148 if file_type in SUPPORTED_IMAGE_FORMATS :
141149 return self .classify_from_path (input , classifications )
150+ elif file_type in SUPPORTED_EXCEL_FORMATS :
151+ return self .classify_from_excel (input , classifications )
142152 else :
143153 raise ValueError (f"Unsupported file type: { input } " )
144154 else :
@@ -149,6 +159,9 @@ def classify(self, input: Union[str, IO], classifications: List[Classification])
149159 else :
150160 raise ValueError ("Input must be a file path or a stream." )
151161
async def classify_async(self, input: Union[str, IO], classifications: List[Classification]):
    """Awaitable wrapper around the synchronous ``classify``.

    ``self.classify`` is run through ``asyncio.to_thread`` with the same
    ``input`` and ``classifications``.

    Args:
        input: File path or stream, passed straight through to ``classify``.
        classifications: Candidate classifications, passed straight through.

    Returns:
        The value produced by ``self.classify``.
    """
    pending = asyncio.to_thread(self.classify, input, classifications)
    outcome = await pending
    return outcome
164+
152165 def _extract (
153166 self , content , file_or_stream , response_model , vision = False , is_stream = False
154167 ):
0 commit comments