|
3 | 3 | import types |
4 | 4 | from dataclasses import dataclass, field |
5 | 5 | from heapq import heappop, heappush |
6 | | -from typing import Type, TypeAlias |
| 6 | +from typing import List, Type, TypeAlias |
7 | 7 |
|
8 | 8 | from quivr_core.files.file import FileExtension |
9 | 9 |
|
@@ -49,37 +49,41 @@ class ProcEntry: |
49 | 49 |
|
def _append_proc_mapping(
    mapping: ProcMapping,
    file_exts: List[FileExtension] | List[str] | FileExtension | str,
    cls_mod: str,
    errtxt: str,
    priority: int | None,
):
    """Register the processor ``cls_mod`` for every extension in ``file_exts``.

    Each value of ``mapping`` is a list maintained with ``heapq``. For every
    extension a new ``ProcEntry`` is created and added to that list; an
    extension not yet present in ``mapping`` gets a fresh one-element list.

    Args:
        mapping: Registry to update in place, keyed by file extension.
        file_exts: Extensions to register the processor for. A single
            extension (not wrapped in a list) is accepted as well.
        cls_mod: Dotted path of the processor class, stored on the entry.
        errtxt: Message stored on the entry as ``err``.
        priority: Explicit priority for the new entries. When ``None``, the
            priority is one less than the current smallest entry for that
            extension, or ``_LOWEST_PRIORITY`` when there is no entry yet.
            NOTE(review): this presumably makes the newest registration win,
            assuming ``ProcEntry`` orders by ``priority`` -- confirm against
            the ``ProcEntry`` definition.
    """
    # A lone extension must not be iterated: a ``str`` would otherwise be
    # split into characters and register one bogus key per character.
    if isinstance(file_exts, (str, FileExtension)):
        file_exts = [file_exts]

    for file_ext in file_exts:
        heap = mapping[file_ext] if file_ext in mapping else None

        if priority is not None:
            entry_priority = priority
        elif heap:
            # heap[0] is the smallest item of a heapq-managed list; peeking
            # avoids popping the previous processor, so it cannot be lost if
            # building the new entry fails.
            entry_priority = heap[0].priority - 1
        else:
            # Unknown extension, or a known one whose list is empty.
            entry_priority = _LOWEST_PRIORITY

        proc_entry = ProcEntry(
            priority=entry_priority,
            cls_mod=cls_mod,
            err=errtxt,
        )

        if heap is None:
            mapping[file_ext] = [proc_entry]
        else:
            heappush(heap, proc_entry)
83 | 87 |
|
84 | 88 |
|
85 | 89 | def defaults_to_proc_entries( |
@@ -109,21 +113,38 @@ def defaults_to_proc_entries( |
109 | 113 | ext_str = ext.value if isinstance(ext, FileExtension) else ext |
110 | 114 | _append_proc_mapping( |
111 | 115 | mapping=base_processors, |
112 | | - file_ext=ext, |
| 116 | + file_exts=[ext], |
113 | 117 | cls_mod=f"quivr_core.processor.implementations.default.{processor_name}", |
114 | 118 | errtxt=f"can't import {processor_name}. Please install quivr-core[{ext_str}] to access {processor_name}", |
115 | 119 | priority=None, |
116 | 120 | ) |
117 | 121 |
|
118 | 122 | # TODO(@aminediro): Megaparse should register itself |
119 | 123 | # Append Megaparse |
120 | | - # _append_proc_mapping( |
121 | | - # mapping=base_processors, |
122 | | - # file_ext=FileExtension.pdf, |
123 | | - # cls_mod="quivr_core.processor.implementations.megaparse_processor.MegaparseProcessor", |
124 | | - # errtxt=f"can't import MegaparseProcessor. Please install quivr-core[{ext_str}] to access MegaparseProcessor", |
125 | | - # priority=None, |
126 | | - # ) |
| 124 | + _append_proc_mapping( |
| 125 | + mapping=base_processors, |
| 126 | + file_exts=[ |
| 127 | + FileExtension.pdf, |
| 128 | + FileExtension.docx, |
| 129 | + FileExtension.doc, |
| 130 | + FileExtension.pptx, |
| 131 | + FileExtension.xls, |
| 132 | + FileExtension.xlsx, |
| 133 | + FileExtension.csv, |
| 134 | + FileExtension.epub, |
| 135 | + FileExtension.bib, |
| 136 | + FileExtension.odt, |
| 137 | + FileExtension.html, |
| 138 | + FileExtension.py, |
| 139 | + FileExtension.markdown, |
| 140 | + FileExtension.md, |
| 141 | + FileExtension.mdx, |
| 142 | + FileExtension.ipynb, |
| 143 | + ], |
| 144 | + cls_mod="quivr_core.processor.implementations.megaparse_processor.MegaparseProcessor", |
| 145 | + errtxt=f"can't import MegaparseProcessor. Please install quivr-core[{ext_str}] to access MegaparseProcessor", |
| 146 | + priority=None, |
| 147 | + ) |
127 | 148 | return base_processors |
128 | 149 |
|
129 | 150 |
|
@@ -181,7 +202,7 @@ def register_processor( |
181 | 202 | if all(proc_cls != proc.cls_mod for proc in known_processors[file_ext]): |
182 | 203 | _append_proc_mapping( |
183 | 204 | known_processors, |
184 | | - file_ext=file_ext, |
| 205 | + file_exts=[file_ext], |
185 | 206 | cls_mod=proc_cls, |
186 | 207 | errtxt=errtxt |
187 | 208 | or f"{proc_cls} import failed for processor of {file_ext}", |
|
0 commit comments