ruff formatting

fabnemEPFL · fabnemEPFL · commit 34a234316aa4 · 2025-06-18T17:53:25.000+02:00
diff --git a/examples/rag/evaluation/rag_evaluator_example.py b/examples/rag/evaluation/rag_evaluator_example.py
@@ -6,28 +6,44 @@
 
 load_dotenv()
 
-MOCK_EVALUATOR_CONFIG = './examples/rag/evaluation/rag_eval_example_config.yaml'
-MOCK_INDEXER_CONFIG = './examples/rag/evaluation/indexer_eval_example_config.yaml'
-MOCK_RAG_CONFIG = './examples/rag/evaluation/rag_evaluated_example_config.yaml'
+MOCK_EVALUATOR_CONFIG = "./examples/rag/evaluation/rag_eval_example_config.yaml"
+MOCK_INDEXER_CONFIG = "./examples/rag/evaluation/indexer_eval_example_config.yaml"
+MOCK_RAG_CONFIG = "./examples/rag/evaluation/rag_evaluated_example_config.yaml"
+
 
 def get_args():
-    parser = argparse.ArgumentParser(description='Run RAG Evaluation pipeline with specified parameters or use default mock data')
-    parser.add_argument('--eval-config', type=str, default=MOCK_EVALUATOR_CONFIG, help='Path to a rag evaluator config file.')
-    parser.add_argument('--indexer-config', type=str, default=MOCK_INDEXER_CONFIG, help='Path to an Indexer config file.')
-    parser.add_argument('--rag-config', type=str, default=MOCK_RAG_CONFIG, help='Path to a rag config file.')
+    parser = argparse.ArgumentParser(
+        description="Run RAG Evaluation pipeline with specified parameters or use default mock data"
+    )
+    parser.add_argument(
+        "--eval-config",
+        type=str,
+        default=MOCK_EVALUATOR_CONFIG,
+        help="Path to a rag evaluator config file.",
+    )
+    parser.add_argument(
+        "--indexer-config",
+        type=str,
+        default=MOCK_INDEXER_CONFIG,
+        help="Path to an Indexer config file.",
+    )
+    parser.add_argument(
+        "--rag-config",
+        type=str,
+        default=MOCK_RAG_CONFIG,
+        help="Path to a rag config file.",
+    )
 
     return parser.parse_args()
 
+
 if __name__ == "__main__":
     args = get_args()
 
     # Instantiate RAGEvaluator
     evaluator = RAGEvaluator.from_config(args.eval_config)
 
     # Run the evaluation
-    result = evaluator(
-        indexer_config = args.indexer_config,
-        rag_config = args.rag_config
-    )
+    result = evaluator(indexer_config=args.indexer_config, rag_config=args.rag_config)
 
-    print(result)
+    print(result)
diff --git a/scripts/data_extractor.py b/scripts/data_extractor.py
@@ -20,4 +20,4 @@
 
 # Extract 100 files, and copy them in '0000_small' folder
 for i in range(100):
-    os.system(f'cp 0000/{os.listdir("0000")[i]} 0000_small')
+    os.system(f"cp 0000/{os.listdir('0000')[i]} 0000_small")
diff --git a/src/mmore/index/indexer.py b/src/mmore/index/indexer.py
@@ -115,9 +115,7 @@ def _create_collection_with_schema(self, collection_name: str):
             FieldSchema(
                 name="id", dtype=DataType.VARCHAR, is_primary=True, max_length=128
             ),
-            FieldSchema(
-                name="document_id", dtype=DataType.VARCHAR, max_length=128
-            ),
+            FieldSchema(name="document_id", dtype=DataType.VARCHAR, max_length=128),
             FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=65535),
             FieldSchema(
                 name="dense_embedding",
diff --git a/src/mmore/process/execution_state.py b/src/mmore/process/execution_state.py
@@ -31,15 +31,15 @@ def initialize(distributed_mode=False, client=None):
         """
         if ExecutionState._use_dask is not None:
             raise Exception("Execution state already initialized")
-        assert (
-            distributed_mode is not None
-        ), "Distributed mode must be set to True or False"
+        assert distributed_mode is not None, (
+            "Distributed mode must be set to True or False"
+        )
         ExecutionState._use_dask = distributed_mode
 
         if distributed_mode:
-            assert (
-                client is not None
-            ), "You must be in the context of a dask client to use distributed mode"
+            assert client is not None, (
+                "You must be in the context of a dask client to use distributed mode"
+            )
             ExecutionState._dask_var = Variable("should_stop_execution", client=client)
             ExecutionState._dask_var.set(False)
             logger.info("Execution state initialized (distributed mode)")
diff --git a/src/mmore/process/post_processor/chunker/multimodal.py b/src/mmore/process/post_processor/chunker/multimodal.py
@@ -81,7 +81,10 @@ def chunk(self, sample: MultimodalSample) -> List[MultimodalSample]:
         chunks = []
         for i, (chunk, mods) in enumerate(zip(text_chunks, modalities_chunks)):
             s = MultimodalSample(
-                text=chunk.text, modalities=mods, metadata=sample.metadata, id=f"{sample.id}+{i}"
+                text=chunk.text,
+                modalities=mods,
+                metadata=sample.metadata,
+                id=f"{sample.id}+{i}",
             )
             chunks.append(s)
 
diff --git a/src/mmore/process/post_processor/pipeline.py b/src/mmore/process/post_processor/pipeline.py
@@ -49,7 +49,7 @@ def _log_plan(self):
         logger.info("-" * 50)
         logger.info("PP Pipeline:")
         for i, processor in enumerate(self.post_processors):
-            logger.info(f"  > {i+1}. {processor.name}")
+            logger.info(f"  > {i + 1}. {processor.name}")
         logger.info("-" * 50)
 
     @classmethod
@@ -75,7 +75,7 @@ def run(self, samples: List[MultimodalSample]) -> List[MultimodalSample]:
         for i, processor in enumerate(self.post_processors):
             samples = processor.batch_process(samples)
             if self.output_config.save_each_step:
-                self.save_results(samples, f"{i+1}___{processor.name}.jsonl")
+                self.save_results(samples, f"{i + 1}___{processor.name}.jsonl")
         self.save_results(samples, "final_pp.jsonl")
         return samples
 
diff --git a/src/mmore/process/processors/media_processor.py b/src/mmore/process/processors/media_processor.py
@@ -42,7 +42,8 @@ def accepts(cls, file: FileDescriptor) -> bool:
 
     @staticmethod
     def load_models(
-        self=None, fast_mode=False  # pyright: ignore[reportSelfClsParameterName]
+        self=None,  # pyright: ignore[reportSelfClsParameterName]
+        fast_mode=False,
     ):
         if self:
             model_name = (
diff --git a/src/mmore/process/processors/pdf_processor.py b/src/mmore/process/processors/pdf_processor.py
@@ -20,7 +20,7 @@
 
 
 class PDFProcessor(Processor):
-    artifact_dict = None #create_model_dict()
+    artifact_dict = None
 
     def __init__(self, config=None):
         super().__init__(config=config or ProcessorConfig())
@@ -34,7 +34,7 @@ def accepts(cls, file: FileDescriptor) -> bool:
     def load_models(disable_image_extraction: bool = False):
         if PDFProcessor.artifact_dict is None:
             PDFProcessor.artifact_dict = create_model_dict()
-        
+
         marker_config = {
             "disable_image_extraction": disable_image_extraction,
             "languages": None,
@@ -46,9 +46,9 @@ def load_models(disable_image_extraction: bool = False):
             artifact_dict=PDFProcessor.artifact_dict,
             config=config_parser.generate_config_dict(),
         )
-        
+
         converter.initialize_processors(converter.default_processors)
-        
+
         return converter
 
     # overwriting the process_batch
@@ -178,8 +178,8 @@ def _extract_images(pdf_doc, xref) -> Optional[Image.Image]:
             if self.config.custom_config.get("extract_images", True):
                 for img_info in page.get_images(full=False):
                     image = _extract_images(pdf_doc, img_info[0])
-                    if image and clean_image(
-                        image
+                    if (
+                        image and clean_image(image)
                     ):  # clean image filters images below size 512x512 and variance below 100, these are defaults and can be changed
                         embedded_images.append(image)
                         all_text.append(self.config.attachment_tag)
@@ -209,7 +209,7 @@ def _process_parallel(
     ):
         try:
             torch.cuda.set_device(gpu_id)
-            
+
             if PDFProcessor.artifact_dict is None:
                 PDFProcessor.artifact_dict = create_model_dict()
 
diff --git a/src/mmore/rag/llm.py b/src/mmore/rag/llm.py
@@ -74,7 +74,9 @@ def __post_init__(self):
                     else (
                         "COHERE"
                         if self.llm_name in _COHERE_MODELS
-                        else "HF" if self.base_url is None else None
+                        else "HF"
+                        if self.base_url is None
+                        else None
                     )
                 )
             )
diff --git a/src/mmore/rag/retriever.py b/src/mmore/rag/retriever.py
@@ -122,9 +122,9 @@ def retrieve(
             return []
 
         # Validate that the specified search type is allowed
-        assert search_type in get_args(
-            self._search_types
-        ), f"Invalid search_type: {search_type}. Must be 'dense', 'sparse', or 'hybrid'"
+        assert search_type in get_args(self._search_types), (
+            f"Invalid search_type: {search_type}. Must be 'dense', 'sparse', or 'hybrid'"
+        )
 
         # Determine the weight used to combine dense and sparse search scores
         search_weight = self._search_weights.get(search_type, self.hybrid_search_weight)
diff --git a/src/mmore/run_index_api.py b/src/mmore/run_index_api.py
@@ -92,7 +92,7 @@ async def upload_file(
                 # Process and index the file
                 file_extension = FilePath(file.filename).suffix.lower()
                 documents = process_files(temp_dir, COLLECTION_NAME, [file_extension])
-                
+
                 for doc in documents:
                     defDocId = doc.document_id
                     doc.document_id = fileId
@@ -178,7 +178,6 @@ async def upload_files(
                     doc.id = doc.id.replace(defDocId, docId)
                     modified_documents.append(doc)
 
-
                 logging.info("Indexing the files")
 
                 indexer = get_indexer(COLLECTION_NAME, MILVUS_URI, MILVUS_DB)
@@ -323,7 +322,7 @@ async def delete_file(id: str = Path(..., description="ID of the file to delete"
 
     @router.get("/v1/files/{id}", tags=["File Operations"])
     async def download_file(
-        id: str = Path(..., description="ID of the file to download")
+        id: str = Path(..., description="ID of the file to download"),
     ):
         """
         Download a file from the system.
diff --git a/src/mmore/utils.py b/src/mmore/utils.py
@@ -183,7 +183,6 @@ def process_files(
         ".html",
     ],
 ) -> List["MultimodalSample"]:
-
     from .process.crawler import Crawler, CrawlerConfig
     from .process.dispatcher import Dispatcher, DispatcherConfig
     from .process.post_processor.pipeline import PPPipeline, PPPipelineConfig
diff --git a/tests/test_indexer.py b/tests/test_indexer.py
@@ -22,7 +22,12 @@ def sample_jsonl(tmp_path):
     path = tmp_path / "sample_docs.jsonl"
     sample_data = [
         {"id": "1", "text": "Document text 1", "modalities": [], "metadata": {}},
-        {"id": "2", "text": "Document text 2", "modalities": [], "metadata": {"author": "Alice"}},
+        {
+            "id": "2",
+            "text": "Document text 2",
+            "modalities": [],
+            "metadata": {"author": "Alice"},
+        },
     ]
     with open(path, "w", encoding="utf-8") as f:
         for entry in sample_data:
@@ -36,7 +41,9 @@ def test_load_results(sample_jsonl):
     """
     results = load_results(str(sample_jsonl))
     assert len(results) == 2, "Should load exactly 2 documents"
-    assert isinstance(results[0], MultimodalSample), "Should return MultimodalSample objects"
+    assert isinstance(results[0], MultimodalSample), (
+        "Should return MultimodalSample objects"
+    )
     # If your code overrides the .id, don't check for '1':
     assert "Document text 1" in results[0].text
     assert results[1].metadata.get("author") == "Alice"
@@ -65,21 +72,18 @@ def test_index_invocation(mock_from_documents, sample_jsonl):
 @patch("src.mmore.index.indexer.DenseModel.from_config")
 @patch("src.mmore.index.indexer.SparseModel.from_config")
 def test_indexer_integration(
-    mock_sparse_model,
-    mock_dense_model,
-    mock_milvus_client,
-    sample_jsonl
+    mock_sparse_model, mock_dense_model, mock_milvus_client, sample_jsonl
 ):
     """
     Tests the Indexer class with mocked embeddings & Milvus.
     """
     mock_dense_model.return_value.embed_documents.return_value = [
         np.array([0.01, 0.02]),
-        np.array([0.03, 0.04])
+        np.array([0.03, 0.04]),
     ]
     mock_sparse_model.return_value.embed_documents.return_value = [
         np.array([0, 1]),
-        np.array([1, 0])
+        np.array([1, 0]),
     ]
 
     # Mock Milvus
@@ -91,7 +95,9 @@ def test_indexer_integration(
     dense_cfg = MagicMock()
     sparse_cfg = MagicMock()
     db_cfg = MagicMock()
-    test_indexer_config = IndexerConfig(dense_model=dense_cfg, sparse_model=sparse_cfg, db=db_cfg)
+    test_indexer_config = IndexerConfig(
+        dense_model=dense_cfg, sparse_model=sparse_cfg, db=db_cfg
+    )
 
     # Load sample documents
     documents = MultimodalSample.from_jsonl(str(sample_jsonl))
@@ -101,11 +107,13 @@ def test_indexer_integration(
         config=test_indexer_config,
         documents=documents,
         collection_name="test_collection",
-        batch_size=2
+        batch_size=2,
     )
 
     # Verify the client did what we expect
-    assert client_instance.create_collection.called, "Should create collection if it does not exist"
+    assert client_instance.create_collection.called, (
+        "Should create collection if it does not exist"
+    )
     assert client_instance.insert.called, "Should insert documents into Milvus"
 
 
@@ -121,14 +129,17 @@ def test_index_documents_error(mock_milvus_client, sample_jsonl):
 
     # Minimal config
     test_indexer_config = IndexerConfig(
-        dense_model=MagicMock(),
-        sparse_model=MagicMock()
+        dense_model=MagicMock(), sparse_model=MagicMock()
     )
 
     # Patch the embeddings to return arrays
-    with patch("src.mmore.index.indexer.DenseModel.from_config") as mock_dense_model,\
-         patch("src.mmore.index.indexer.SparseModel.from_config") as mock_sparse_model:
-        mock_dense_model.return_value.embed_documents.return_value = [np.array([0.01, 0.02])]
+    with (
+        patch("src.mmore.index.indexer.DenseModel.from_config") as mock_dense_model,
+        patch("src.mmore.index.indexer.SparseModel.from_config") as mock_sparse_model,
+    ):
+        mock_dense_model.return_value.embed_documents.return_value = [
+            np.array([0.01, 0.02])
+        ]
         mock_sparse_model.return_value.embed_documents.return_value = [np.array([0, 1])]
 
         indexer = Indexer(
diff --git a/tests/test_postprocessors.py b/tests/test_postprocessors.py
diff --git a/tests/test_processors_local.py b/tests/test_processors_local.py

Original file line number	Diff line number	Diff line change
`@@ -74,7 +74,9 @@ def __post_init__(self):`
`74`	`74`	`else (`
`75`	`75`	`"COHERE"`
`76`	`76`	`if self.llm_name in _COHERE_MODELS`
`77`		`- else "HF" if self.base_url is None else None`
	`77`	`+ else "HF"`
	`78`	`+ if self.base_url is None`
	`79`	`+ else None`
`78`	`80`	`)`
`79`	`81`	`)`
`80`	`82`	`)`