Reformat test assertions and lambda patches for ruff format compliance

fabnemEPFL · fabnemEPFL · commit 6c6a2762fcf9 · 2025-06-20T15:51:38.000+02:00
diff --git a/tests/test_indexer.py b/tests/test_indexer.py
@@ -39,9 +39,9 @@ def test_load_results(sample_jsonl):
     results = load_results(str(sample_jsonl))
     assert len(results) == 2, "Should load exactly 2 documents"
     print(type(results[0]), MultimodalSample)
-    assert isinstance(
-        results[0], MultimodalSample
-    ), "Should return MultimodalSample objects"
+    assert isinstance(results[0], MultimodalSample), (
+        "Should return MultimodalSample objects"
+    )
     # If your code overrides the .id, don't check for '1':
     assert "Document text 1" in results[0].text
     assert results[1].metadata.get("author") == "Alice"
@@ -109,9 +109,9 @@ def test_indexer_integration(
     )
 
     # Verify the client did what we expect
-    assert (
-        client_instance.create_collection.called
-    ), "Should create collection if it does not exist"
+    assert client_instance.create_collection.called, (
+        "Should create collection if it does not exist"
+    )
     assert client_instance.insert.called, "Should insert documents into Milvus"
 
 
diff --git a/tests/test_postprocessors.py b/tests/test_postprocessors.py
@@ -27,9 +27,9 @@ def test_chunker_from_load_postprocessor():
     config_args = {"chunking_strategy": "sentence", "text_chunker_config": {}}
     base_config = BasePostProcessorConfig(type="chunker", args=config_args)
     processor = load_postprocessor(base_config)
-    assert isinstance(
-        processor, MultimodalChunker
-    ), "Expected a MultimodalChunker instance."
+    assert isinstance(processor, MultimodalChunker), (
+        "Expected a MultimodalChunker instance."
+    )
 
 
 def test_chunker_process():
@@ -47,12 +47,12 @@ def test_chunker_process():
     chunks = chunker.process(sample)
     # Expect 2 chunks for the 2 sentences
     assert len(chunks) == 2, f"Expected 2 chunks, got {len(chunks)}"
-    assert (
-        chunks[0].text.strip() == "Hello world."
-    ), f"Unexpected first chunk: {chunks[0].text}"
-    assert (
-        chunks[1].text.strip() == "This is a test."
-    ), f"Unexpected second chunk: {chunks[1].text}"
+    assert chunks[0].text.strip() == "Hello world.", (
+        f"Unexpected first chunk: {chunks[0].text}"
+    )
+    assert chunks[1].text.strip() == "This is a test.", (
+        f"Unexpected second chunk: {chunks[1].text}"
+    )
 
 
 # ------------------ Filter Tests ------------------
@@ -117,16 +117,16 @@ def filter(self, sample: MultimodalSample) -> bool:
     accept_filter = DummyAcceptFilter("dummy_accept")
     accepted = accept_filter.process(sample)
     # When filter returns True, process() should return the sample wrapped in a list.
-    assert accepted == [
-        sample
-    ], f"Expected sample to be kept when filter returns True, got {accepted}"
+    assert accepted == [sample], (
+        f"Expected sample to be kept when filter returns True, got {accepted}"
+    )
 
     reject_filter = DummyRejectFilter("dummy_reject")
     rejected = reject_filter.process(sample)
     # When filter returns False, process() should return an empty list.
-    assert (
-        rejected == []
-    ), f"Expected sample to be rejected when filter returns False, got {rejected}"
+    assert rejected == [], (
+        f"Expected sample to be rejected when filter returns False, got {rejected}"
+    )
 
 
 # ------------------ NER Tests ------------------
@@ -147,9 +147,7 @@ def test_ner_from_config():
     """
     # Patch LLM.from_config to return our dummy LLM regardless of input.
     original_llm_from_config = LLM.from_config
-    LLM.from_config = (
-        lambda cfg: DummyLLM()
-    )  # pyright: ignore[reportAttributeAccessIssue]
+    LLM.from_config = lambda cfg: DummyLLM()  # pyright: ignore[reportAttributeAccessIssue]
 
     config = NERExtractorConfig(
         llm=LLMConfig("dummy"),  # dummy config; our lambda ignores it
@@ -174,9 +172,7 @@ def test_ner_process():
     which should add to the sample's metadata a list with one dictionary.
     """
     original_llm_from_config = LLM.from_config
-    LLM.from_config = (
-        lambda cfg: DummyLLM()
-    )  # pyright: ignore[reportAttributeAccessIssue]
+    LLM.from_config = lambda cfg: DummyLLM()  # pyright: ignore[reportAttributeAccessIssue]
 
     config = NERExtractorConfig(
         llm=LLMConfig("dummy"),
@@ -204,15 +200,15 @@ def test_ner_process():
     # We expect one entity: HELLO WORLD as an ORGANIZATION with the given description.
     assert len(ner_entities) == 1, f"Expected 1 entity, got {len(ner_entities)}."
     entity_info: dict[str, str] = ner_entities[0]
-    assert (
-        entity_info.get("entity") == "HELLO WORLD"
-    ), f"Unexpected entity name: {entity_info.get('entity')}"
-    assert (
-        entity_info.get("type") == "ORGANIZATION"
-    ), f"Unexpected entity type: {entity_info.get('type')}"
-    assert entity_info.get("description") == [
-        "A SAMPLE ORGANIZATION"
-    ], f"Unexpected entity description: {entity_info.get('description')}"
+    assert entity_info.get("entity") == "HELLO WORLD", (
+        f"Unexpected entity name: {entity_info.get('entity')}"
+    )
+    assert entity_info.get("type") == "ORGANIZATION", (
+        f"Unexpected entity type: {entity_info.get('type')}"
+    )
+    assert entity_info.get("description") == ["A SAMPLE ORGANIZATION"], (
+        f"Unexpected entity description: {entity_info.get('description')}"
+    )
 
     # Restore the original LLM.from_config
     LLM.from_config = original_llm_from_config
@@ -253,9 +249,9 @@ def test_tagger_from_load_tagger_modalities():
     """
     config = BaseTaggerConfig(type="modalities_counter", args={})
     tagger = load_tagger(config)
-    assert isinstance(
-        tagger, ModalitiesCounter
-    ), "Expected a ModalitiesCounter instance."
+    assert isinstance(tagger, ModalitiesCounter), (
+        "Expected a ModalitiesCounter instance."
+    )
 
 
 def test_tagger_from_load_tagger_lang_detector():
@@ -292,9 +288,9 @@ def test_tagger_process_words_counter():
     processed = tagger.process(sample)
     expected_count = len(sample.text.split())
     # WordsCounter's default metadata_key is set in its __init__ to 'word_count'
-    assert (
-        sample.metadata.get("word_count") == expected_count
-    ), f"Expected word_count {expected_count}, got {sample.metadata.get('word_count')}"
+    assert sample.metadata.get("word_count") == expected_count, (
+        f"Expected word_count {expected_count}, got {sample.metadata.get('word_count')}"
+    )
     assert isinstance(processed, list), "Expected process() to return a list."
 
 
@@ -318,9 +314,9 @@ def test_tagger_process_modalities_counter():
     processed = tagger.process(sample)
     expected_count = len(sample.modalities)
     # ModalitiesCounter's default metadata_key is 'modalities_count'
-    assert (
-        sample.metadata.get("modalities_count") == expected_count
-    ), f"Expected modalities_count {expected_count}, got {sample.metadata.get('modalities_count')}"
+    assert sample.metadata.get("modalities_count") == expected_count, (
+        f"Expected modalities_count {expected_count}, got {sample.metadata.get('modalities_count')}"
+    )
     assert isinstance(processed, list), "Expected process() to return a list."
 
 
diff --git a/tests/test_processors_local.py b/tests/test_processors_local.py
@@ -63,13 +63,13 @@ def test_docx_no_image_extraction():
     )
 
     # Ensure that the attachment placeholder is not present.
-    assert (
-        "<attachment>" not in combined_text
-    ), "Attachment tag should not appear when image extraction is disabled."
+    assert "<attachment>" not in combined_text, (
+        "Attachment tag should not appear when image extraction is disabled."
+    )
     # Verify that no images were extracted.
-    assert (
-        len(result.modalities) == 0
-    ), "Expected no images when image extraction is disabled."
+    assert len(result.modalities) == 0, (
+        "Expected no images when image extraction is disabled."
+    )
 
 
 #  ------------------ EML Processor Tests ------------------
@@ -142,14 +142,14 @@ def test_md_image_extraction():
     )
     # Count the number of attachment placeholders inserted in text
     placeholder_count = combined_text.count(custom_attachment_tag)
-    assert (
-        placeholder_count == 2
-    ), f"Expected 2 attachment placeholders, found {placeholder_count}"
+    assert placeholder_count == 2, (
+        f"Expected 2 attachment placeholders, found {placeholder_count}"
+    )
     # Assert that modalities is a list and that two images were extracted
     assert isinstance(result.modalities, list), "Modalities should be a list"
-    assert (
-        len(result.modalities) == 2
-    ), f"Expected 2 images in modalities, found {len(result.modalities)}"
+    assert len(result.modalities) == 2, (
+        f"Expected 2 images in modalities, found {len(result.modalities)}"
+    )
 
 
 # ------------------ Media Processor Tests ------------------
@@ -192,9 +192,9 @@ def test_media_process_batch():
     # Call process_batch with a dummy num_workers value
     results = processor.process_batch(files, fast_mode=False, num_workers=1)
     # Verify that each file in the batch produces a result with non-empty text and a list of modalities.
-    assert len(results) == len(
-        files
-    ), "Number of results should match number of files processed."
+    assert len(results) == len(files), (
+        "Number of results should match number of files processed."
+    )
     for result in results:
         assert result.text, "Text should not be empty"
         assert isinstance(result.modalities, list), "Modalities should be a list"
@@ -230,9 +230,9 @@ def test_pptx_extract_notes():
     )
 
     expected_text = "Data analysis has multiple facets and approaches"
-    assert (
-        expected_text in combined_text
-    ), f"Expected notes not found in extracted text: {combined_text}"
+    assert expected_text in combined_text, (
+        f"Expected notes not found in extracted text: {combined_text}"
+    )
 
 
 # ------------------ Spreadsheet Processor Tests ------------------
@@ -273,9 +273,9 @@ def test_spreadsheet_multi_sheet_content():
     # 1) Confirm that the names of each sheet appear in the extracted text
     expected_sheet_names = ["Form Responses 1"]
     for sheet_name in expected_sheet_names:
-        assert (
-            sheet_name in combined_text
-        ), f"Didn't find '{sheet_name}' in extracted text."
+        assert sheet_name in combined_text, (
+            f"Didn't find '{sheet_name}' in extracted text."
+        )
 
     # 2) Check for specific cell content that should exist in the file
     expected_snippets = [
@@ -285,9 +285,9 @@ def test_spreadsheet_multi_sheet_content():
         "Female",
     ]
     for snippet in expected_snippets:
-        assert (
-            snippet in combined_text
-        ), f"Expected '{snippet}' not found in spreadsheet text."
+        assert snippet in combined_text, (
+            f"Expected '{snippet}' not found in spreadsheet text."
+        )
 
     # 3) Since there are no images, confirm modalities is empty
     assert isinstance(result.modalities, list), "Modalities should be a list."
@@ -352,9 +352,9 @@ def test_text_process_standard():
     result = processor.process(sample_file)
     # Verify that some text is extracted and no image modalities are returned.
     assert result.text, "Text should not be empty"
-    assert (
-        isinstance(result.modalities, list) and len(result.modalities) == 0
-    ), "Modalities should be an empty list"
+    assert isinstance(result.modalities, list) and len(result.modalities) == 0, (
+        "Modalities should be an empty list"
+    )
 
 
 # ------------------ URL Processor Tests ------------------
@@ -373,9 +373,9 @@ def test_url_process_standard():
         " ".join(result.text) if isinstance(result.text, list) else result.text
     )
     # Expect that the text from example.com contains "illustrative examples".
-    assert (
-        "illustrative examples" in combined_text
-    ), "Expected 'illustrative examples' in extracted text from http://example.com"
+    assert "illustrative examples" in combined_text, (
+        "Expected 'illustrative examples' in extracted text from http://example.com"
+    )
     assert isinstance(result.modalities, list), "Modalities should be a list"
 
 
@@ -392,6 +392,6 @@ def test_url_process_invalid():
     result = processor.process(sample_url)
     # If URL processing fails, expect empty text and no modalities.
     assert not result.text, "Expected empty text for invalid URL"
-    assert (
-        isinstance(result.modalities, list) and len(result.modalities) == 0
-    ), "Expected no modalities for invalid URL"
+    assert isinstance(result.modalities, list) and len(result.modalities) == 0, (
+        "Expected no modalities for invalid URL"
+    )