Skip to content

Commit 85e3a99

Browse files
authored
Merge pull request #266 from enoch3712/263-bug-extractor-can_handle
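Refactor get_document_loader: return the primary loader when it can handle the source; otherwise try the loader registered for the file extension, then fall back to the first registered loader that can handle the source.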
2 parents 0a82d79 + 4db03d1 commit 85e3a99

File tree

1 file changed

+14
-4
lines changed

1 file changed

+14
-4
lines changed

extract_thinker/extractor.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -92,12 +92,22 @@ def get_document_loader(self, source: Union[str, IO]) -> Optional[DocumentLoader
9292
Returns:
9393
Optional[DocumentLoader]: The suitable document loader if available.
9494
"""
95+
# First, if a primary document loader is set and it can handle the source, return it.
96+
if self.document_loader and self.document_loader.can_handle(source):
97+
return self.document_loader
98+
99+
# If source is a string, attempt an extension-based lookup.
95100
if isinstance(source, str):
96101
_, ext = os.path.splitext(source)
97-
return self.document_loaders_by_file_type.get(ext, self.document_loader)
98-
elif hasattr(source, 'read'):
99-
# Implement logic to determine the loader based on the stream if necessary
100-
return self.document_loader
102+
loader = self.document_loaders_by_file_type.get(ext)
103+
if loader and loader.can_handle(source):
104+
return loader
105+
106+
# As a fallback, iterate over all registered loaders and return the first that supports the source.
107+
for loader in self.document_loaders_by_file_type.values():
108+
if loader.can_handle(source):
109+
return loader
110+
101111
return None
102112

103113
def load_document_loader(self, document_loader: DocumentLoader) -> None:

0 commit comments

Comments
 (0)