 from poliloom.database import get_engine
 from poliloom.logging import setup_logging
 from sqlalchemy.orm import Session
-from sqlalchemy import exists, func
+from sqlalchemy import exists, text
 from poliloom.models import (
     Country,
     CurrentImportEntity,
     Property,
     WikidataDump,
     WikidataEntity,
-    WikidataEntityLabel,
 )

 # Configure logging
@@ -983,8 +982,8 @@ def index_delete(confirm):
 def index_build(batch_size, rebuild):
     """Build Meilisearch index from database.

-    Indexes all documents from Location, Country, Language, and Politician tables.
-    Uses upsert semantics - unchanged documents won't be re-embedded.
+    Indexes all searchable entities with aggregated types. Each entity appears
+    once with all its types (e.g., an entity can be both Location and Country).

     Use --rebuild to delete and recreate the index from scratch.
     """
@@ -1002,75 +1001,87 @@ def index_build(batch_size, rebuild):
     if search_service.ensure_index():
         click.echo(f" Created index '{INDEX_NAME}'")

-    # Reindex all documents from each model type
+    # Get all searchable models
     models = _get_search_indexed_models()
+    click.echo(f" Types: {', '.join(m.__name__ for m in models)}")
+
+    # Build dynamic SQL query
+    # LEFT JOIN each model table and build types array from which ones match
+    left_joins = []
+    case_statements = []
+    group_by_columns = ["we.wikidata_id"]
+
+    for model in models:
+        table_name = model.__tablename__
+        left_joins.append(
+            f"LEFT JOIN {table_name} ON we.wikidata_id = {table_name}.wikidata_id"
+        )
+        case_statements.append(
+            f"CASE WHEN {table_name}.wikidata_id IS NOT NULL THEN '{model.__name__}' END"
+        )
+        group_by_columns.append(f"{table_name}.wikidata_id")
+
+    array_expr = f"array_remove(ARRAY[{', '.join(case_statements)}], NULL)"
+
+    base_sql = f"""
+        SELECT
+            we.wikidata_id,
+            array_agg(DISTINCT wel.label) as labels,
+            {array_expr} as types
+        FROM wikidata_entities we
+        JOIN wikidata_entity_labels wel ON we.wikidata_id = wel.entity_id
+        {chr(10).join(left_joins)}
+        WHERE we.deleted_at IS NULL
+        GROUP BY {", ".join(group_by_columns)}
+        HAVING array_length({array_expr}, 1) > 0
+    """
+
     total_indexed = 0
     task_uids = []

     with Session(get_engine()) as session:
-        for model in models:
-            entity_type = model.__tablename__
-            click.echo(f"⏳ Indexing {entity_type}...")
-
-            # Count total
-            total = (
-                session.query(model)
-                .join(WikidataEntity, model.wikidata_id == WikidataEntity.wikidata_id)
-                .filter(WikidataEntity.deleted_at.is_(None))
-                .count()
-            )
+        # Count total
+        count_result = session.execute(text(f"SELECT COUNT(*) FROM ({base_sql}) subq"))
+        total = count_result.scalar()

-            if total == 0:
-                click.echo(f" No {entity_type} to index")
-                continue
-
-            indexed = 0
-            offset = 0
+        if total == 0:
+            click.echo(" No entities to index")
+            return

-            while offset < total:
-                # Fetch only wikidata_id and labels (no ORM objects, no timestamps)
-                rows = (
-                    session.query(
-                        model.wikidata_id,
-                        func.array_agg(WikidataEntityLabel.label),
-                    )
-                    .join(
-                        WikidataEntity, model.wikidata_id == WikidataEntity.wikidata_id
-                    )
-                    .join(
-                        WikidataEntityLabel,
-                        WikidataEntity.wikidata_id == WikidataEntityLabel.entity_id,
-                    )
-                    .filter(WikidataEntity.deleted_at.is_(None))
-                    .group_by(model.wikidata_id)
-                    .offset(offset)
-                    .limit(batch_size)
-                    .all()
-                )
+        click.echo(f" Found {total:,} entities to index")

-                if not rows:
-                    break
+        # Process in batches
+        offset_val = 0
+        while offset_val < total:
+            paginated_sql = f"{base_sql} OFFSET :offset LIMIT :limit"
+            rows = session.execute(
+                text(paginated_sql),
+                {"offset": offset_val, "limit": batch_size},
+            ).fetchall()

-                # Build search documents directly from rows
-                documents = [
-                    SearchDocument(id=wikidata_id, type=entity_type, labels=labels)
-                    for wikidata_id, labels in rows
-                ]
+            if not rows:
+                break

-                # Send batch without waiting (enables Meilisearch auto-batching)
-                task_uid = search_service.index_documents(documents)
-                if task_uid is not None:
-                    task_uids.append(task_uid)
+            # Build search documents
+            documents = [
+                SearchDocument(
+                    id=row.wikidata_id, types=list(row.types), labels=list(row.labels)
+                )
+                for row in rows
+            ]

-                indexed += len(documents)
-                offset += batch_size
+            # Send batch without waiting (enables Meilisearch auto-batching)
+            task_uid = search_service.index_documents(documents)
+            if task_uid is not None:
+                task_uids.append(task_uid)

-            click.echo(f" Sent: {indexed}/{total}")
+            total_indexed += len(documents)
+            offset_val += batch_size

-            total_indexed += indexed
+            click.echo(f" Sent: {total_indexed:,}/{total:,}")

     click.echo(
-        f"✅ Sent {total_indexed} documents for indexing ({len(task_uids)} tasks)"
+        f"✅ Sent {total_indexed:,} documents for indexing ({len(task_uids)} tasks)"
     )
     click.echo(
         " Indexing continues in the background. Use 'poliloom index-stats' to check progress."
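
For reference, this is roughly what base_sql renders to when only two models are indexed, assuming their __tablename__ values are locations and countries (hypothetical names for illustration; whitespace adjusted for readability). An entity present in both tables comes back once with types = {Location,Country}, and entities matching none of the models are dropped by the HAVING clause:

    SELECT
        we.wikidata_id,
        array_agg(DISTINCT wel.label) as labels,
        array_remove(ARRAY[
            CASE WHEN locations.wikidata_id IS NOT NULL THEN 'Location' END,
            CASE WHEN countries.wikidata_id IS NOT NULL THEN 'Country' END
        ], NULL) as types
    FROM wikidata_entities we
    JOIN wikidata_entity_labels wel ON we.wikidata_id = wel.entity_id
    LEFT JOIN locations ON we.wikidata_id = locations.wikidata_id
    LEFT JOIN countries ON we.wikidata_id = countries.wikidata_id
    WHERE we.deleted_at IS NULL
    GROUP BY we.wikidata_id, locations.wikidata_id, countries.wikidata_id
    HAVING array_length(array_remove(ARRAY[
        CASE WHEN locations.wikidata_id IS NOT NULL THEN 'Location' END,
        CASE WHEN countries.wikidata_id IS NOT NULL THEN 'Country' END
    ], NULL), 1) > 0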