mypy

lpi-tn · lpi-tn · commit 8341ee704ce0 · 2025-05-26T15:48:08.000+02:00
diff --git a/Makefile b/Makefile
@@ -29,7 +29,7 @@ bandit-lint:
 .PHONY: mypy-lint
 mypy-lint:
 	echo "== mpypy lint =="
-	python -m mypy --exclude .venv/ --exclude .mypy_cache/ --exclude locustfiles/ --exclude alembic/ --show-error-codes --verbose .
+	python -m mypy --exclude .venv/ --exclude .mypy_cache/ --exclude locustfiles/ --exclude alembic/ --show-error-codes .
 	echo "== end mypy lint =="
 	echo "====================="
 
diff --git a/welearn_datastack/collectors/open_alex_collector.py b/welearn_datastack/collectors/open_alex_collector.py
@@ -26,7 +26,7 @@ def _get_oa_json(http_session, params):
         json_from_oa = resp_from_openalex.json()
         return json_from_oa
 
-    def _generate_api_query_params(self) -> Dict[str, str | bool | int]:
+    def _generate_api_query_params(self) -> Dict[str, str | bool | int | None]:
         """
         Generate the API query to get the OpenAlex works
         :return: the API query to get the OpenAlex works
@@ -58,7 +58,7 @@ def _generate_api_query_params(self) -> Dict[str, str | bool | int]:
         lang = "languages/en|languages/fr"
         type_ = "types/article|types/report|types/book|types/book-chapter"
 
-        params: Dict[str, str | bool | int] = {
+        params: Dict[str, str | bool | int | None] = {
             "filter": f"best_oa_location.license:{licenses},"
             f"is_retracted:{is_retracted},"
             f"open_access.oa_status:{oa_status},"
diff --git a/welearn_datastack/modules/retrieve_data_from_database.py b/welearn_datastack/modules/retrieve_data_from_database.py
@@ -1,7 +1,6 @@
 import logging
-from collections import defaultdict
 from datetime import datetime, timedelta
-from typing import Collection, Dict, List, Literal, Type
+from typing import Collection, Dict, List, Type, TypedDict
 from uuid import UUID
 
 from sqlalchemy import Column, desc
@@ -32,6 +31,17 @@
 logger = logging.getLogger(__name__)
 
 
+# Typing: shape of the per-document model lookup returned by retrieve_models
+class ModelInfo(TypedDict):
+    model_id: UUID  # retrieve_models fills this from column 2 of each result row
+    model_name: str  # read via .get("model_name") by the workflow nodes
+
+
+ModelsDict = Dict[UUID, ModelInfo]  # keyed by column 0 of each row (presumably the document id)
+
+# Logic
+
+
 def _generate_process_state_sub_query(session):
     """
     Generate subquery to retrieve the last process state for each document
@@ -277,7 +287,7 @@ def retrieve_random_documents_ids_according_process_title(
 
 def retrieve_models(
     documents_ids: list[UUID], db_session, ml_type: MLModelsType
-) -> dict[UUID, dict[Literal["model_id"] | Literal["model_name"], UUID | str]]:
+) -> ModelsDict:
     """
     Retrieve the most recent model (per document) based on corpus and used_since.
 
@@ -326,9 +336,7 @@ def retrieve_models(
     # List of (document_id, model_title)
     ret_from_db = ranked_query.all()
 
-    ret: dict[UUID, dict[Literal["model_id"] | Literal["model_name"], UUID | str]] = (
-        defaultdict(dict)
-    )
+    ret: ModelsDict = {}
     for i in ret_from_db:
         ret[i[0]] = {
             "model_id": i[2],
diff --git a/welearn_datastack/nodes_workflow/DocumentClassifier/document_classifier.py b/welearn_datastack/nodes_workflow/DocumentClassifier/document_classifier.py
@@ -89,12 +89,12 @@ def main() -> None:
     ):
         doc_slices: List[DocumentSlice] = list(group_doc_slices)  # type: ignore
 
-        bi_model_name: str = bi_model_by_docid.get(key_doc_id, dict()).get("model_name")
+        bi_model_name = bi_model_by_docid.get(key_doc_id, dict()).get("model_name")
         bi_model_id: UUID = bi_model_by_docid.get(key_doc_id, dict()).get("model_id")
-        if not bi_model_name:
+        if not isinstance(bi_model_name, str) or not bi_model_name:  # skip None and empty names
             logger.warning("No bi-classifier model found for document %s", key_doc_id)
             continue
-        if not bi_model_id:
+        if not isinstance(bi_model_id, UUID):  # UUIDs are always truthy; isinstance narrows for mypy
             logger.warning(
                 "No bi-classifier model id found for document %s", key_doc_id
             )
diff --git a/welearn_datastack/nodes_workflow/KeywordsExtractor/keywords_extractor.py b/welearn_datastack/nodes_workflow/KeywordsExtractor/keywords_extractor.py
@@ -73,15 +73,18 @@ def main() -> None:
         db_session.query(WeLearnDocumentKeyword).filter(
             WeLearnDocumentKeyword.welearn_document_id == wld.id
         ).delete()
-        embedding_model_name_from_db = emb_model_by_docid.get(wld.id)
+        embedding_model_name_from_db = emb_model_by_docid.get(wld.id, dict()).get(
+            "model_name"
+        )
         if not embedding_model_name_from_db:
             logger.warning(
                 "No embedding model found for document ID '%s'. Skipping keywords extraction.",
                 wld.id,
             )
             continue
         kwds = extract_keywords(
-            wld, embedding_model_name_from_db=embedding_model_name_from_db
+            wld,
+            embedding_model_name_from_db=embedding_model_name_from_db,
         )
         for kw in kwds:
             existing_keyword = db_session.query(Keyword).filter_by(keyword=kw).first()