feat: adiciona pipeline embedding_rerank para re-execução ad-hoc do BERT

diraol · claude · diraol · commit db9a515559a1 · 2026-04-14T01:26:22.000-03:00
Permite reprocessar excertos temáticos que não possuem excerpt_embedding_score
sem precisar reprocessar todos os diários desde o início.

Uso: python main -p embedding_rerank

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
diff --git a/main/__main__.py b/main/__main__.py
@@ -75,6 +75,24 @@ def gazette_texts_pipeline():
         run_task("tag_entities_in_excerpts", theme, themed_excerpt_ids, index)
 
 
+def embedding_rerank_pipeline():
+    """
+    Re-executa o reranqueamento de embedding para excertos sem pontuação.
+    Útil para processar excertos de execuções anteriores em que o BERT falhou
+    ou não foi executado.
+    """
+    index = create_index_interface()
+    themes = run_task("get_themes")
+
+    for theme in themes:
+        excerpt_ids = run_task("get_themed_excerpt_ids_without_embedding", theme, index)
+        if not excerpt_ids:
+            logging.info(f"Sem excertos pendentes para o tema {theme['index']}")
+            continue
+        logging.info(f"{len(excerpt_ids)} excertos para reranquear em {theme['index']}")
+        run_task("embedding_rerank_excerpts", theme, excerpt_ids, index)
+
+
 def aggregates_pipeline():
     database = create_database_interface()
     storage = create_storage_interface()
@@ -100,6 +118,8 @@ def execute_pipeline(pipeline):
             gazette_texts_pipeline()
         elif pipeline == "aggregates":
             aggregates_pipeline()
+        elif pipeline == "embedding_rerank":
+            embedding_rerank_pipeline()
         else:
             raise ValueError("Pipeline inválido.")
     finally:
diff --git a/tasks/__init__.py b/tasks/__init__.py
@@ -9,6 +9,7 @@
     "extract_text_from_gazettes": "tasks.gazette_text_extraction",
     "extract_themed_excerpts_from_gazettes": "tasks.gazette_themed_excerpts_extraction",
     "get_gazettes_to_be_processed": "tasks.list_gazettes_to_be_processed",
+    "get_themed_excerpt_ids_without_embedding": "tasks.list_themed_excerpts",
     "get_themes": "tasks.gazette_themes_listing",
     "get_territories": "tasks.list_territories",
     "tag_entities_in_excerpts": "tasks.gazette_excerpts_entities_tagging",
diff --git a/tasks/list_themed_excerpts.py b/tasks/list_themed_excerpts.py
@@ -0,0 +1,33 @@
+"""Tarefa para listar excertos temáticos existentes no índice para re-execução do embedding"""
+
+import logging
+from typing import Dict, List
+
+from index import IndexInterface
+
+
+def get_themed_excerpt_ids_without_embedding(
+    theme: Dict, index: IndexInterface
+) -> List[str]:
+    """
+    Retorna IDs de todos os excertos do índice temático que ainda não possuem
+    excerpt_embedding_score, para permitir re-execução ad-hoc do reranqueamento.
+    """
+    index_name = theme["index"]
+    logging.info(f"Buscando excertos sem embedding no índice {index_name}")
+
+    query = {
+        "query": {
+            "bool": {"must_not": {"exists": {"field": "excerpt_embedding_score"}}}
+        },
+        "_source": False,
+        "size": 1000,
+    }
+
+    ids = []
+    for page in index.paginated_search(query, index=index_name):
+        for hit in page["hits"]["hits"]:
+            ids.append(hit["_id"])
+
+    logging.info(f"Encontrados {len(ids)} excertos sem embedding em {index_name}")
+    return ids