77from django .conf import settings
88from django .utils import timezone
99
10- from etl .models import EtlPipelineConfig
11- from etl .services import enqueue_etl_item
1210from search_gateway .client import get_opensearch_client
1311
12+ from .bronze_transform import transform_after_indexing
1413from .exception_logs import ExceptionContext
15- from .models import HarvestStatus
14+ from .models import HarvestStatus , IndexStatus
1615from .utils import source_hash
1716
1817logger = logging .getLogger (__name__ )
1918
2019
def get_index_name(model_name=None, instance=None, type_data=None):
    """Return the OpenSearch index name configured for a harvested model.

    Args:
        model_name: Class name of the harvested model. When it is falsy and
            ``instance`` is given, the name of the instance's class is used.
        instance: Optional model instance. It supplies the class name and,
            for ``HarvestedSciELOData``, a fallback ``type_data`` attribute.
        type_data: Used only for ``HarvestedSciELOData``; selects between
            the ``"dataset"`` and ``"dataverse"`` indexes. When truthy it
            takes precedence over ``instance.type_data``.

    Returns:
        The value of the matching Django setting, or ``None`` when no model
        name can be determined, the model name or type is not mapped, or
        the setting is not defined.
    """
    if instance is not None and not model_name:
        model_name = instance.__class__.__name__

    if not model_name:
        return None

    if model_name == "HarvestedSciELOData":
        # An explicit type_data argument wins over the instance attribute.
        effective_type = type_data or getattr(instance, "type_data", None)
        setting_name = {
            "dataset": "OS_INDEX_RAW_SCIELO_DATA_DATASET",
            "dataverse": "OS_INDEX_RAW_SCIELO_DATA_DATAVERSE",
        }.get(effective_type)
    else:
        setting_name = {
            "HarvestedPreprint": "OS_INDEX_RAW_PREPRINT",
            "HarvestedBooks": "OS_INDEX_RAW_BOOK",
        }.get(model_name)

    if setting_name is None:
        return None

    # Setting names must be exact: a stray trailing space would make the
    # lookup silently fall back to None for every model.
    return getattr(settings, setting_name, None)
4143
4244
@@ -60,10 +62,24 @@ def index_harvested_raw_data(model, index_name=None, only_success=True, refresh=
6062 queryset = model .objects .all ()
6163 if status_filter :
6264 queryset = queryset .filter (harvest_status__in = status_filter )
65+
66+ queryset = queryset .exclude (index_status = IndexStatus .SUCCESS )
67+
6368 for obj in queryset .iterator ():
64- index_name = _get_index_name (model_name = model .__name__ , instance = obj )
69+ index_name = get_index_name (model_name = model .__name__ , instance = obj )
6570 index_harvested_instance (instance = obj , index_name = index_name , refresh = False )
6671
72+ if obj .index_status == IndexStatus .SUCCESS and obj .raw_data :
73+ try :
74+ transform_after_indexing (instance = obj , model_name = model .__name__ )
75+ except Exception as exc :
76+ logger .warning (
77+ "Falha na transformação bronze %s (%s): %s" ,
78+ model .__name__ ,
79+ obj .identifier ,
80+ exc ,
81+ )
82+
6783
6884def index_harvested_instance (instance , index_name = None , refresh = False ):
6985 """
@@ -74,10 +90,12 @@ def index_harvested_instance(instance, index_name=None, refresh=False):
7490 log_model = instance .harvest_error_log .model ,
7591 fk_field = _get_error_log_fk_field (instance ),
7692 )
93+
7794 client = get_opensearch_client ()
7895 if client is None :
7996 logger .warning ("OpenSearch client não configurado." )
8097 return
98+
8199 if not index_name :
82100 logger .warning (
83101 f"Index name não configurado para { instance .__class__ .__name__ } ({ instance .identifier } )."
@@ -99,13 +117,8 @@ def index_harvested_instance(instance, index_name=None, refresh=False):
99117 },
100118 refresh = False ,
101119 )
102- if EtlPipelineConfig .objects .select_for_source (index_name , instance .raw_data ):
103- enqueue_etl_item (
104- source_index = index_name ,
105- external_id = instance .identifier ,
106- source_payload = instance .raw_data ,
107- )
108120 instance .mark_as_indexed (index_name = index_name )
121+
109122 except Exception as exc :
110123 logger .warning (
111124 f"Falha ao indexar em { index_name } { instance .__class__ .__name__ } ({ instance .identifier } ): { exc } "
@@ -119,14 +132,16 @@ def index_harvested_instance(instance, index_name=None, refresh=False):
119132
120133
121134def delete_harvested_document (model_name , identifier , refresh = False ):
122- index_name = _get_index_name (model_name = model_name )
135+ index_name = get_index_name (model_name = model_name )
123136 if not index_name :
124137 logger .warning (f"Index name não configurado para { model_name } ." )
125138 return
139+
126140 client = get_opensearch_client ()
127141 if client is None :
128142 logger .warning ("OpenSearch client não configurado." )
129143 return
144+
130145 try :
131146 logging .info (f"Removendo documento { identifier } do indice { index_name } " )
132147 client .delete (index = index_name , id = identifier , refresh = refresh )
0 commit comments