Skip to content

Commit db9a515

Browse files
diraol and claude committed
feat: adiciona pipeline embedding_rerank para re-execução ad-hoc do BERT
Permite reprocessar excertos temáticos que não possuem excerpt_embedding_score sem precisar reprocessar todos os diários desde o início.

Uso: python main -p embedding_rerank

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 6376c1f commit db9a515

3 files changed

Lines changed: 54 additions & 0 deletions

File tree

main/__main__.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,24 @@ def gazette_texts_pipeline():
7575
run_task("tag_entities_in_excerpts", theme, themed_excerpt_ids, index)
7676

7777

78+
def embedding_rerank_pipeline():
    """
    Re-run the embedding rerank step for excerpts that have no score yet.

    Useful for handling excerpts left over from earlier runs in which BERT
    failed or was never executed.

    For every theme returned by the "get_themes" task, the pending excerpt
    IDs are fetched through "get_themed_excerpt_ids_without_embedding". When
    that result is non-empty it is handed to "embedding_rerank_excerpts";
    otherwise the theme is only logged and skipped.
    """
    # The index interface is created once, before listing themes, and shared
    # by every task call below.
    index = create_index_interface()

    for theme in run_task("get_themes"):
        excerpt_ids = run_task("get_themed_excerpt_ids_without_embedding", theme, index)
        if excerpt_ids:
            logging.info(f"{len(excerpt_ids)} excertos para reranquear em {theme['index']}")
            run_task("embedding_rerank_excerpts", theme, excerpt_ids, index)
        else:
            # Nothing pending for this theme; move on to the next one.
            logging.info(f"Sem excertos pendentes para o tema {theme['index']}")
94+
95+
7896
def aggregates_pipeline():
7997
database = create_database_interface()
8098
storage = create_storage_interface()
@@ -100,6 +118,8 @@ def execute_pipeline(pipeline):
100118
gazette_texts_pipeline()
101119
elif pipeline == "aggregates":
102120
aggregates_pipeline()
121+
elif pipeline == "embedding_rerank":
122+
embedding_rerank_pipeline()
103123
else:
104124
raise ValueError("Pipeline inválido.")
105125
finally:

tasks/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
"extract_text_from_gazettes": "tasks.gazette_text_extraction",
1010
"extract_themed_excerpts_from_gazettes": "tasks.gazette_themed_excerpts_extraction",
1111
"get_gazettes_to_be_processed": "tasks.list_gazettes_to_be_processed",
12+
"get_themed_excerpt_ids_without_embedding": "tasks.list_themed_excerpts",
1213
"get_themes": "tasks.gazette_themes_listing",
1314
"get_territories": "tasks.list_territories",
1415
"tag_entities_in_excerpts": "tasks.gazette_excerpts_entities_tagging",

tasks/list_themed_excerpts.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
"""Tarefa para listar excertos temáticos existentes no índice para re-execução do embedding"""
2+
3+
import logging
4+
from typing import Dict, List
5+
6+
from index import IndexInterface
7+
8+
9+
def get_themed_excerpt_ids_without_embedding(
    theme: Dict, index: IndexInterface
) -> List[str]:
    """
    Collect the IDs of themed excerpts that lack ``excerpt_embedding_score``.

    The result lets the embedding rerank be re-run ad hoc on just those
    excerpts.

    Args:
        theme: Theme configuration; its ``"index"`` entry names the index
            that is searched.
        index: Index interface; only ``paginated_search`` is used here.

    Returns:
        The ``_id`` of every hit, in the order the pages and hits are
        yielded by ``index.paginated_search``.
    """
    index_name = theme["index"]
    logging.info(f"Buscando excertos sem embedding no índice {index_name}")

    # Match documents in which the score field does not exist.
    score_field_exists = {"exists": {"field": "excerpt_embedding_score"}}
    search_body = {
        "query": {"bool": {"must_not": score_field_exists}},
        # Only the hit IDs are read below, so the document source is not requested.
        "_source": False,
        "size": 1000,
    }

    ids = [
        hit["_id"]
        for page in index.paginated_search(search_body, index=index_name)
        for hit in page["hits"]["hits"]
    ]

    logging.info(f"Encontrados {len(ids)} excertos sem embedding em {index_name}")
    return ids

0 commit comments

Comments
 (0)