Write test for externally classified documents and handle `external_sdg` in the document classifier

lpi-tn · lpi-tn · commit da5c7e451cdd · 2025-04-01T18:26:08.000+02:00
diff --git a/tests/document_classifier/test_document_classifier.py b/tests/document_classifier/test_document_classifier.py
@@ -212,3 +212,92 @@ def test_main_no_specific_sdg(
 
         # There is only one state by doc because the rest of steps were mocked
         self.assertEqual(state_in_db[0].title, Step.DOCUMENT_CLASSIFIED_NON_SDG.value)
+
+
+    @patch(
+        "welearn_datastack.nodes_workflow.DocumentClassifier.document_classifier.n_classify_slices"
+    )
+    @patch(
+        "welearn_datastack.nodes_workflow.DocumentClassifier.document_classifier.bi_classify_slices"
+    )
+    @patch(
+        "welearn_datastack.nodes_workflow.DocumentClassifier.document_classifier.retrieve_models"
+    )
+    @patch(
+        "welearn_datastack.nodes_workflow.DocumentClassifier.document_classifier.create_db_session"
+    )
+    @patch(
+        "welearn_datastack.nodes_workflow.DocumentClassifier.document_classifier.retrieve_ids_from_csv"
+    )
+    def test_main_externally_classified(
+        self,
+        mock_retrieve_ids,
+        mock_create_session,
+        mock_retrieve_models,
+        mock_bi_classify,
+        mock_n_classify,
+    ):
+        """Document with ``external_sdg`` details ends up SDG-classified with that SDG."""
+        # n-classifier stub yields no SDG, so a stored Sdg should come from ``external_sdg``
+        mock_bi_classify.return_value = True
+        mock_n_classify.return_value = []
+
+        doc_test_id = uuid.uuid4()
+
+        local_engine = create_engine("sqlite://")
+        s_maker = sessionmaker(local_engine)
+        handle_schema_with_sqlite(local_engine)
+
+        test_session = s_maker()
+        Base.metadata.create_all(test_session.get_bind())
+
+        mock_retrieve_ids.return_value = [doc_test_id]
+        session = test_session
+        mock_create_session.return_value = session
+        mock_retrieve_models.return_value = [Mock(lang="en", title="model_name")]
+
+        corpus_source_name = "test_corpus"
+
+        corpus_test = Corpus(
+            id=uuid.uuid4(),
+            source_name=corpus_source_name,
+            is_fix=True,
+            is_active=True,
+        )
+        doc_test = WeLearnDocument(
+            id=doc_test_id,
+            url="https://example.org",
+            corpus_id=corpus_test.id,
+            title="test",
+            lang="en",
+            full_content="test",
+            description="test",
+            details={"test": "test", "external_sdg": [10]},  # SDG 10 is asserted below
+            trace=1,
+        )
+
+        slice_test_id = uuid.uuid4()
+        slice_test = DocumentSlice(
+            id=slice_test_id,
+            document_id=doc_test.id,
+            embedding=numpy.array([1, 2, 3]),
+            body="test",
+            order_sequence=0,
+            embedding_model_name="test",
+            embedding_model_id=uuid.uuid4(),
+        )
+
+        test_session.add(corpus_test)
+        test_session.add(doc_test)
+        test_session.add(slice_test)
+        test_session.commit()
+
+        document_classifier.main()
+
+        state_in_db = session.query(ProcessState).all()
+
+        # Only one state exists per doc because the remaining steps were mocked
+        self.assertEqual(state_in_db[0].title, Step.DOCUMENT_CLASSIFIED_SDG.value)
+
+        sdg_in_db = session.query(Sdg).all()
+        self.assertEqual(sdg_in_db[0].sdg_number, 10)
diff --git a/welearn_datastack/nodes_workflow/DocumentClassifier/document_classifier.py b/welearn_datastack/nodes_workflow/DocumentClassifier/document_classifier.py
@@ -3,7 +3,7 @@
 import uuid
 from itertools import groupby
 from typing import List, Set
-from uuid import UUID
+from uuid import UUID, uuid4
 
 from sqlalchemy.orm import Session
 
@@ -72,9 +72,10 @@ def main() -> None:
     sdg_docs_ids: List[UUID] = []
     specific_sdgs: List[Sdg] = []
     logger.info("Starting bi-classification")
+    key_external_sdg = "external_sdg"
     slices_per_docs = sorted(slices, key=lambda x: x.document_id)  # type: ignore
     for k, g in groupby(slices_per_docs, lambda x: x.document_id):
-        doc_slices = list(g)
+        doc_slices: List[DocumentSlice] = list(g)  # type: ignore
         lang = doc_slices[0].document.lang
         bi_model = bi_model_by_lang.get(lang)
         if not bi_model:
@@ -85,8 +86,22 @@ def main() -> None:
             # No SDG found, process it later
             non_sdg_docs_ids.add(k)
             continue
-
-        doc_sdgs = n_classify_slices(doc_slices, n_model_by_lang.get(lang))  # type: ignore
+        if key_external_sdg in doc_slices[0].document.details:
+            logger.info(f"Document {doc_slices[0].document_id} was externally classified")
+            doc_sdgs: List[Sdg] = []
+            for sdg_number in doc_slices[0].document.details[key_external_sdg]:
+                for local_slice in doc_slices:
+                    doc_sdgs.append(
+                        Sdg(
+                            slice_id=local_slice.id,
+                            sdg_number=sdg_number,
+                            id=uuid4(),
+                            bi_classifier_model_id=uuid4(),
+                            n_classifier_model_id=uuid4()
+                        )
+                    )
+        else:
+            doc_sdgs = n_classify_slices(doc_slices, n_model_by_lang.get(lang))  # type: ignore
         if not doc_sdgs:
             # No SDG found, process it later
             non_sdg_docs_ids.add(k)