Commit 56c0680

build index after pipe is done, remove importer indexer complexity
1 parent d9d0246 commit 56c0680

8 files changed (+151 / -392 lines)

poliloom/poliloom/cli.py

Lines changed: 69 additions & 58 deletions
@@ -14,7 +14,7 @@
 from poliloom.database import get_engine
 from poliloom.logging import setup_logging
 from sqlalchemy.orm import Session
-from sqlalchemy import exists, func
+from sqlalchemy import exists, text
 from poliloom.models import (
     Country,
     CurrentImportEntity,
@@ -28,7 +28,6 @@
     Property,
     WikidataDump,
     WikidataEntity,
-    WikidataEntityLabel,
 )

 # Configure logging
@@ -983,8 +982,8 @@ def index_delete(confirm):
 def index_build(batch_size, rebuild):
     """Build Meilisearch index from database.

-    Indexes all documents from Location, Country, Language, and Politician tables.
-    Uses upsert semantics - unchanged documents won't be re-embedded.
+    Indexes all searchable entities with aggregated types. Each entity appears
+    once with all its types (e.g., an entity can be both Location and Country).

     Use --rebuild to delete and recreate the index from scratch.
     """
@@ -1002,75 +1001,87 @@ def index_build(batch_size, rebuild):
     if search_service.ensure_index():
         click.echo(f" Created index '{INDEX_NAME}'")

-    # Reindex all documents from each model type
+    # Get all searchable models
     models = _get_search_indexed_models()
+    click.echo(f" Types: {', '.join(m.__name__ for m in models)}")
+
+    # Build dynamic SQL query
+    # LEFT JOIN each model table and build types array from which ones match
+    left_joins = []
+    case_statements = []
+    group_by_columns = ["we.wikidata_id"]
+
+    for model in models:
+        table_name = model.__tablename__
+        left_joins.append(
+            f"LEFT JOIN {table_name} ON we.wikidata_id = {table_name}.wikidata_id"
+        )
+        case_statements.append(
+            f"CASE WHEN {table_name}.wikidata_id IS NOT NULL THEN '{model.__name__}' END"
+        )
+        group_by_columns.append(f"{table_name}.wikidata_id")
+
+    array_expr = f"array_remove(ARRAY[{', '.join(case_statements)}], NULL)"
+
+    base_sql = f"""
+        SELECT
+            we.wikidata_id,
+            array_agg(DISTINCT wel.label) as labels,
+            {array_expr} as types
+        FROM wikidata_entities we
+        JOIN wikidata_entity_labels wel ON we.wikidata_id = wel.entity_id
+        {chr(10).join(left_joins)}
+        WHERE we.deleted_at IS NULL
+        GROUP BY {", ".join(group_by_columns)}
+        HAVING array_length({array_expr}, 1) > 0
+    """
+
     total_indexed = 0
     task_uids = []

     with Session(get_engine()) as session:
-        for model in models:
-            entity_type = model.__tablename__
-            click.echo(f"⏳ Indexing {entity_type}...")
-
-            # Count total
-            total = (
-                session.query(model)
-                .join(WikidataEntity, model.wikidata_id == WikidataEntity.wikidata_id)
-                .filter(WikidataEntity.deleted_at.is_(None))
-                .count()
-            )
+        # Count total
+        count_result = session.execute(text(f"SELECT COUNT(*) FROM ({base_sql}) subq"))
+        total = count_result.scalar()

-            if total == 0:
-                click.echo(f" No {entity_type} to index")
-                continue
-
-            indexed = 0
-            offset = 0
+        if total == 0:
+            click.echo(" No entities to index")
+            return

-            while offset < total:
-                # Fetch only wikidata_id and labels (no ORM objects, no timestamps)
-                rows = (
-                    session.query(
-                        model.wikidata_id,
-                        func.array_agg(WikidataEntityLabel.label),
-                    )
-                    .join(
-                        WikidataEntity, model.wikidata_id == WikidataEntity.wikidata_id
-                    )
-                    .join(
-                        WikidataEntityLabel,
-                        WikidataEntity.wikidata_id == WikidataEntityLabel.entity_id,
-                    )
-                    .filter(WikidataEntity.deleted_at.is_(None))
-                    .group_by(model.wikidata_id)
-                    .offset(offset)
-                    .limit(batch_size)
-                    .all()
-                )
+        click.echo(f" Found {total:,} entities to index")

-                if not rows:
-                    break
+        # Process in batches
+        offset_val = 0
+        while offset_val < total:
+            paginated_sql = f"{base_sql} OFFSET :offset LIMIT :limit"
+            rows = session.execute(
+                text(paginated_sql),
+                {"offset": offset_val, "limit": batch_size},
+            ).fetchall()

-                # Build search documents directly from rows
-                documents = [
-                    SearchDocument(id=wikidata_id, type=entity_type, labels=labels)
-                    for wikidata_id, labels in rows
-                ]
+            if not rows:
+                break

-                # Send batch without waiting (enables Meilisearch auto-batching)
-                task_uid = search_service.index_documents(documents)
-                if task_uid is not None:
-                    task_uids.append(task_uid)
+            # Build search documents
+            documents = [
+                SearchDocument(
+                    id=row.wikidata_id, types=list(row.types), labels=list(row.labels)
+                )
+                for row in rows
+            ]

-                indexed += len(documents)
-                offset += batch_size
+            # Send batch without waiting (enables Meilisearch auto-batching)
+            task_uid = search_service.index_documents(documents)
+            if task_uid is not None:
+                task_uids.append(task_uid)

-                click.echo(f" Sent: {indexed}/{total}")
+            total_indexed += len(documents)
+            offset_val += batch_size

-            total_indexed += indexed
+            click.echo(f" Sent: {total_indexed:,}/{total:,}")

     click.echo(
-        f"✅ Sent {total_indexed} documents for indexing ({len(task_uids)} tasks)"
+        f"✅ Sent {total_indexed:,} documents for indexing ({len(task_uids)} tasks)"
     )
     click.echo(
         " Indexing continues in the background. Use 'poliloom index-stats' to check progress."

poliloom/poliloom/importer/entity.py

Lines changed: 6 additions & 24 deletions
@@ -19,7 +19,6 @@
     WikidataEntityLabel,
     WikidataRelation,
 )
-from ..search import SearchDocument, SearchService
 from ..wikidata_entity_processor import WikidataEntityProcessor

 logger = logging.getLogger(__name__)
@@ -53,27 +52,15 @@ def batch_size(self) -> int:
         """Get current batch size."""
         return len(self.entities)

-    def insert(self, session: Session, search_service: SearchService) -> None:
-        """Insert entities and relations into database, then index to search.
+    def insert(self, session: Session) -> None:
+        """Insert entities and relations into database.

-        Commits the transaction and indexes to search service after successful commit.
-        Clears the batch after completion.
+        Commits the transaction and clears the batch after completion.
+        Search indexing is handled separately by the index-build command.
         """
         if not self.has_entities():
             return

-        # Build search documents BEFORE modifying entities (labels get popped later)
-        entity_type = self.model_class.__tablename__
-        search_documents = [
-            SearchDocument(
-                id=entity["wikidata_id"],
-                type=entity_type,
-                labels=entity["labels"],
-            )
-            for entity in self.entities
-            if entity.get("labels")
-        ]
-
         # Insert WikidataEntity records first (without labels)
         entity_data = [
             {
@@ -117,10 +104,6 @@ def insert(self, session: Session, search_service: SearchService) -> None:

         session.commit()

-        # Index to search after successful commit
-        if search_documents:
-            search_service.index_documents(search_documents)
-
         logger.debug(
             f"Processed {len(self.entities)} {self.model_class.__name__.lower()}s "
             f"with {len(self.relations)} relations"
@@ -158,7 +141,6 @@ def _process_supporting_entities_chunk(
     # Create fresh connections for this worker process
     engine = create_engine(pool_size=2, max_overflow=3)
     session = Session(engine)
-    search_service = SearchService()

     # Entity collections organized by type, built from worker_config
     entity_collections = [
@@ -239,7 +221,7 @@ def _process_supporting_entities_chunk(
             # Process batches when they reach the batch size
             for collection in entity_collections:
                 if collection.batch_size() >= batch_size:
-                    collection.insert(session, search_service)
+                    collection.insert(session)

     except Exception as e:
         logger.error(f"Worker {worker_id}: error processing chunk: {e}")
@@ -250,7 +232,7 @@ def _process_supporting_entities_chunk(
     # Process remaining entities in final batches on successful completion
     for collection in entity_collections:
         if collection.has_entities():
-            collection.insert(session, search_service)
+            collection.insert(session)

     session.close()
     logger.info(f"Worker {worker_id}: finished processing {entity_count} entities")

poliloom/poliloom/importer/politician.py

Lines changed: 7 additions & 23 deletions
@@ -21,7 +21,6 @@
     WikipediaLink,
     WikipediaProject,
 )
-from ..search import SearchDocument, SearchService
 from ..wikidata_entity_processor import WikidataEntityProcessor

 logger = logging.getLogger(__name__)
@@ -109,24 +108,14 @@ def _should_import_politician(entity: WikidataEntityProcessor) -> bool:
     return True


-def _insert_politicians_batch(
-    politicians: list[dict], session: Session, search_service: SearchService
-) -> None:
-    """Insert a batch of politicians into the database and index to Meilisearch."""
+def _insert_politicians_batch(politicians: list[dict], session: Session) -> None:
+    """Insert a batch of politicians into the database.
+
+    Search indexing is handled separately by the index-build command.
+    """
     if not politicians:
         return

-    # Build search documents BEFORE modifying politicians (labels get used later)
-    search_documents: list[SearchDocument] = [
-        SearchDocument(
-            id=p["wikidata_id"],
-            type=Politician.__tablename__,
-            labels=p["labels"],
-        )
-        for p in politicians
-        if p.get("labels")
-    ]
-
     # First, ensure WikidataEntity records exist for all politicians (without labels)
     wikidata_data = [
         {
@@ -211,10 +200,6 @@ def _insert_politicians_batch(

     session.commit()

-    # Index to Meilisearch after successful DB commit
-    if search_documents:
-        search_service.index_documents(search_documents)
-
     logger.debug(f"Processed {len(politicians)} politicians (upserted)")


@@ -233,7 +218,6 @@ def _process_politicians_chunk(
     """
     # Create fresh connections for this worker process
     engine = create_engine(pool_size=2, max_overflow=3)
-    search_service = SearchService()

     politicians = []
     politician_count = 0
@@ -419,7 +403,7 @@ def _process_politicians_chunk(
             # Process batches when they reach the batch size
             if len(politicians) >= batch_size:
                 with Session(engine) as session:
-                    _insert_politicians_batch(politicians, session, search_service)
+                    _insert_politicians_batch(politicians, session)
                 politicians = []

     except Exception as e:
@@ -429,7 +413,7 @@ def _process_politicians_chunk(
     # Process remaining entities in final batch on successful completion
     if politicians:
         with Session(engine) as session:
-            _insert_politicians_batch(politicians, session, search_service)
+            _insert_politicians_batch(politicians, session)

     logger.info(f"Worker {worker_id}: finished processing {entity_count} entities")

poliloom/poliloom/models/wikidata.py

Lines changed: 1 addition & 1 deletion
@@ -137,7 +137,7 @@ def find_similar(
         """
         return search_service.search(
             query,
-            entity_type=cls.__tablename__,
+            entity_type=cls.__name__,
             limit=limit,
             semantic_ratio=cls._search_semantic_ratio,
         )
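
The filter value has to line up with what index-build now writes into each document's types array, which is built from model.__name__ rather than model.__tablename__. A tiny hypothetical check of that invariant (the class and values below are made up for illustration):

# Hypothetical stand-in for a searchable model; only these two attributes matter here.
class Location:
    __tablename__ = "locations"

# index-build's CASE expressions emit class names into the document's `types` array.
doc_types = ["Location", "Country"]

assert Location.__name__ in doc_types            # new filter value matches the index
assert Location.__tablename__ not in doc_types   # the old table-name filter would miss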

poliloom/poliloom/search.py

Lines changed: 5 additions & 5 deletions
@@ -28,7 +28,7 @@ class SearchDocument(TypedDict):
     """Document format for Meilisearch indexing."""

     id: str
-    type: str  # Entity type (e.g., 'locations', 'politicians')
+    types: list[str]  # Entity types (e.g., ['Location', 'Country'])
     labels: list[str]


@@ -66,8 +66,8 @@ def create_index(self) -> None:
         task = index.update_settings(
             {
                 "searchableAttributes": ["labels"],
-                "filterableAttributes": ["type"],
-                "displayedAttributes": ["id", "type", "labels"],
+                "filterableAttributes": ["types"],
+                "displayedAttributes": ["id", "types", "labels"],
             }
         )
         self.client.wait_for_task(task.task_uid)
@@ -179,7 +179,7 @@ def search(

         Args:
             query: Search query text
-            entity_type: Optional type filter (e.g., 'locations', 'politicians')
+            entity_type: Optional type filter (e.g., 'Location', 'Politician')
             limit: Maximum number of results
             semantic_ratio: Balance between keyword (0.0) and semantic (1.0) search.
                 Default 0.0 uses pure keyword search for backward compatibility.
@@ -191,7 +191,7 @@ def search(
         index = self.client.index(INDEX_NAME)
         search_params: dict = {"limit": limit}
         if entity_type:
-            search_params["filter"] = f"type = '{entity_type}'"
+            search_params["filter"] = f"types = '{entity_type}'"

         # Use hybrid search when semantic_ratio > 0
         if semantic_ratio > 0:
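
Putting the schema change together, a hedged usage sketch: it assumes a reachable Meilisearch instance behind SearchService and uses made-up IDs and labels; the method names (index_documents, search) are the ones from this module:

from poliloom.search import SearchDocument, SearchService

service = SearchService()

# Documents now carry a list of types instead of a single type string.
doc = SearchDocument(
    id="Q64",                        # made-up Wikidata ID
    types=["Location", "Country"],   # one entity, several types
    labels=["Berlin", "Berlín"],
)
task_uid = service.index_documents([doc])  # returns a Meilisearch task uid (or None)

# A filter like "types = 'Location'" matches when the array contains the value,
# so this document is found under either type.
as_location = service.search("Berlin", entity_type="Location", limit=5)
as_country = service.search("Berlin", entity_type="Country", limit=5)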
