Skip to content

Commit 904efd9

Browse files
authored
No crash if the processor is asked to process an empty folder (#130)
* better default config file for processing
* removed useless run_index.load_results
* stop processing if crawl_result is empty
* removed useless import
1 parent 9c4c2e4 commit 904efd9

5 files changed

Lines changed: 10 additions & 31 deletions

File tree

examples/process/config.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
1-
data_path: examples/sample_data/ #put absolute path!
1+
data_path: examples/sample_data/ #put absolute path or relative to the root of the module
22
dispatcher_config:
3-
output_path: examples/process/outputs/ #put absolute path!
3+
output_path: examples/process/outputs/ #put absolute path or relative to the root of the module
44
use_fast_processors: false
55
distributed: false
66
dashboard_backend_url: null
77
extract_images: true
8-
scheduler_file: /mmore/scheduler-file.json #put absolute path!
8+
scheduler_file: null # for instance /path/to/mmore/scheduler-file.json
99
process_batch_sizes:
1010
- URLProcessor: 40
1111
- DOCXProcessor: 100

src/mmore/run_index.py

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import argparse
2-
import json
32
import logging
43
from dataclasses import dataclass
54
from typing import Optional, Union
@@ -28,17 +27,6 @@ class IndexConfig:
2827
documents_path: str
2928

3029

31-
def load_results(path: str):
32-
# Load the results computed and saved by 'run_process.py'
33-
results = []
34-
logger.info(f"Loading results from {path}")
35-
with open(path, "rb") as f:
36-
for line in f:
37-
results.append(MultimodalSample.from_dict(json.loads(line)))
38-
logger.info(f"Loaded {len(results)} results")
39-
return results
40-
41-
4230
def index(
4331
config_file: Union[IndexConfig, str],
4432
documents_path: Optional[str] = None,

src/mmore/run_postprocess.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@ def postprocess(config_file, input_data):
3232

3333
# Load samples
3434
samples = _load_dataset(input_data)
35+
if len(samples) == 0:
36+
logger.warning("⚠️ Found no file to postprocess")
3537

3638
# Run pipeline
3739
samples = pipeline(samples)

src/mmore/run_process.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,10 @@ def process(config_file: str):
8383
crawl_time = crawl_end_time - crawl_start_time
8484
logger.info(f"Crawling completed in {crawl_time:.2f} seconds")
8585

86+
if len(crawl_result) == 0:
87+
logger.warning("⚠️ Found no file to process")
88+
return
89+
8690
dispatcher_config: DispatcherConfig = config.dispatcher_config
8791

8892
url = dispatcher_config.dashboard_backend_url

tests/test_indexer.py

Lines changed: 1 addition & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from mmore.index.indexer import Indexer, IndexerConfig
1010

1111
# Import run_index from the correct package path:
12-
from mmore.run_index import index, load_results
12+
from mmore.run_index import index
1313
from mmore.type import MultimodalSample
1414

1515

@@ -32,21 +32,6 @@ def sample_jsonl(tmp_path):
3232
return path
3333

3434

35-
def test_load_results(sample_jsonl):
36-
"""
37-
Tests that load_results() properly reads JSONL files and returns a list of MultimodalSample objects
38-
"""
39-
results = load_results(str(sample_jsonl))
40-
assert len(results) == 2, "Should load exactly 2 documents"
41-
print(type(results[0]), MultimodalSample)
42-
assert isinstance(results[0], MultimodalSample), (
43-
"Should return MultimodalSample objects"
44-
)
45-
# If your code overrides the .id, don't check for '1':
46-
assert "Document text 1" in results[0].text
47-
assert results[1].metadata.get("author") == "Alice"
48-
49-
5035
@patch("mmore.run_index.Indexer.from_documents")
5136
def test_index_invocation(mock_from_documents, sample_jsonl):
5237
"""

0 commit comments

Comments
 (0)