Skip to content

Commit f0af8e4

Browse files
committed
small fixes
1 parent fd0cb35 commit f0af8e4

8 files changed

Lines changed: 31 additions & 17 deletions

File tree

docs/index_api.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -173,7 +173,7 @@ Returns the file with binary content.
173173
- File types supported:
174174

175175
```
176-
.pdf, .docx, .pptx, .md, .txt, .xlsx, .xls, .csv, .mp4, .avi, .mov, .mkv, .mp3, .wav, .aac, .eml, .html
176+
.pdf, .docx, .pptx, .md, .txt, .xlsx, .xls, .csv, .mp4, .avi, .mov, .mkv, .mp3, .wav, .aac, .eml, .html, .htm
177177
```
178178
179179

examples/rag/config_api.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@ rag:
88
# Retriever Config
99
retriever:
1010
db:
11-
uri: ./proc_demo3.db
11+
uri: ./proc_demo.db
1212
name: my_db
1313
hybrid_search_weight: 0.5
1414
k: 5

src/mmore/process/dispatcher.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -85,7 +85,7 @@ def from_dict(config: Dict) -> "DispatcherConfig":
8585
use_fast_processors=config.get("use_fast_processors", True),
8686
distributed=config.get("distributed", False),
8787
scheduler_file=config.get("scheduler_file"),
88-
processor_config=config.get("processor"),
88+
processor_config=config.get("processor_config"),
8989
process_batch_sizes=config.get("process_batch_sizes"),
9090
batch_multiplier=config.get("batch_multiplier", 1),
9191
extract_images=config.get("extract_images", False),
@@ -111,7 +111,7 @@ def to_dict(self) -> Dict:
111111
"distributed": self.distributed,
112112
"scheduler_file": self.scheduler_file,
113113
"output_path": self.output_path,
114-
"processor": self.processor_config,
114+
"processor_config": self.processor_config,
115115
"process_batch_sizes": self.process_batch_sizes,
116116
"batch_multiplier": self.batch_multiplier,
117117
"extract_images": self.extract_images,

src/mmore/process/processors/pdf_processor.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -47,7 +47,7 @@ def load_models(disable_image_extraction: bool = False):
4747
config=config_parser.generate_config_dict(),
4848
)
4949

50-
converter.initialize_processors(converter.default_processors)
50+
converter.initialize_processors(list(converter.default_processors))
5151

5252
return converter
5353

@@ -131,7 +131,11 @@ def process_batch(
131131

132132
def process(self, file_path: str) -> MultimodalSample:
133133
if self.converter is None:
134-
self.converter = PDFProcessor.load_models()
134+
self.converter = PDFProcessor.load_models(
135+
disable_image_extraction=not self.config.custom_config.get(
136+
"extract_images", True
137+
)
138+
)
135139

136140
rendered = self.converter(file_path)
137141
text, _, images = text_from_rendered(rendered)

src/mmore/rag/retriever.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -53,7 +53,8 @@ def from_config(cls, config: str | RetrieverConfig):
5353

5454
if not client.has_collection(config.collection_name):
5555
raise ValueError(
56-
"The Milvus database has not been initialized yet. Ensure the path is valid with a database that was already populated with the indexer."
56+
f"The Milvus database has not been initialized yet / does not have a collection {config.collection_name}. "
57+
"Ensure the path is valid with a database that was already populated with the indexer."
5758
)
5859

5960
# Init models

src/mmore/run_index_api.py

Lines changed: 14 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -99,7 +99,11 @@ async def upload_file(
9999
doc.id = doc.id.replace(defDocId, fileId)
100100

101101
# Get indexer and index the document
102-
indexer = get_indexer(COLLECTION_NAME, MILVUS_URI, MILVUS_DB)
102+
try:
103+
indexer = get_indexer(COLLECTION_NAME, MILVUS_URI, MILVUS_DB)
104+
except Exception as e:
105+
raise HTTPException(status_code=500, detail=str(e))
106+
103107
indexer.index_documents(
104108
documents=documents, collection_name=COLLECTION_NAME
105109
)
@@ -180,7 +184,11 @@ async def upload_files(
180184

181185
logging.info("Indexing the files")
182186

183-
indexer = get_indexer(COLLECTION_NAME, MILVUS_URI, MILVUS_DB)
187+
try:
188+
indexer = get_indexer(COLLECTION_NAME, MILVUS_URI, MILVUS_DB)
189+
except Exception as e:
190+
raise HTTPException(status_code=500, detail=str(e))
191+
184192
indexer.index_documents(
185193
documents=modified_documents, collection_name=COLLECTION_NAME
186194
)
@@ -240,7 +248,10 @@ async def update_file(
240248
doc.id = id
241249

242250
# Get indexer and reindex the document
243-
indexer = get_indexer(COLLECTION_NAME, MILVUS_URI, MILVUS_DB)
251+
try:
252+
indexer = get_indexer(COLLECTION_NAME, MILVUS_URI, MILVUS_DB)
253+
except Exception as e:
254+
raise HTTPException(status_code=500, detail=str(e))
244255

245256
# First delete the existing document
246257
try:

src/mmore/run_process.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -65,6 +65,8 @@ def process(config_file: str):
6565
".wav",
6666
".aac", # Audio files
6767
".eml", # Emails
68+
".html",
69+
".htm", # HTML pages
6870
],
6971
output_path=config.dispatcher_config.output_path,
7072
)

src/mmore/utils.py

Lines changed: 3 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,6 @@
33

44
import yaml
55
from dacite import from_dict
6-
from fastapi import HTTPException
76
from pymilvus import MilvusClient
87

98
if TYPE_CHECKING:
@@ -70,9 +69,7 @@ def create_new_indexer(collection_name: str, uri: str, db_name: str) -> "Indexer
7069
)
7170
return indexer
7271
except Exception as e:
73-
raise HTTPException(
74-
status_code=500, detail=f"Failed to create new indexer: {str(e)}"
75-
)
72+
raise Exception(f"Unable to create a new indexer: {str(e)}")
7673

7774

7875
def get_indexer(collection_name: str, uri: str, db_name: str) -> "Indexer":
@@ -114,9 +111,8 @@ def get_indexer(collection_name: str, uri: str, db_name: str) -> "Indexer":
114111

115112
return indexer
116113
except Exception as e:
117-
raise HTTPException(
118-
status_code=404,
119-
detail=f"Collection {collection_name} not found or could not be loaded: {str(e)}",
114+
raise Exception(
115+
f"Collection {collection_name} not found or could not be loaded: {str(e)}"
120116
)
121117

122118

0 commit comments

Comments
 (0)