Commit d2d7d99

test calls to index doc create delete
1 parent dd35a7d

File tree

6 files changed: +459 -32 lines changed


poliloom/poliloom/cli.py

Lines changed: 4 additions & 14 deletions

@@ -716,22 +716,12 @@ def garbage_collect():
         f"📋 Previous dump timestamp: {previous_dump.last_modified.strftime('%Y-%m-%d %H:%M:%S')} UTC"
     )
 
-    # Clean up missing entities
+    # Clean up missing entities (also removes from search index)
     click.echo("⏳ Cleaning up entities using two-dump validation...")
-    deleted_entity_ids = CurrentImportEntity.cleanup_missing(
+    deleted_entity_count = CurrentImportEntity.cleanup_missing(
         session, previous_dump.last_modified
     )
-    click.echo(f" • Soft-deleted {len(deleted_entity_ids)} entities")
-
-    # Remove deleted entities from search index
-    if deleted_entity_ids:
-        from poliloom.search import SearchService
-
-        search_service = SearchService()
-        search_service.delete_documents(deleted_entity_ids)
-        click.echo(
-            f" • Removed {len(deleted_entity_ids)} entities from search index"
-        )
+    click.echo(f" • Soft-deleted {deleted_entity_count} entities")
 
     # Clean up missing statements
     click.echo("⏳ Cleaning up statements using two-dump validation...")

@@ -746,7 +736,7 @@ def garbage_collect():
     )
 
     total_deleted = (
-        len(deleted_entity_ids)
+        deleted_entity_count
         + statement_counts["properties_marked_deleted"]
        + statement_counts["relations_marked_deleted"]
    )
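
Note: after this change the CLI never touches the search index directly; callers only receive a count. A minimal sketch of the new contract as seen from a caller (here `session` and `ts` are assumed to come from the surrounding garbage_collect command):

    # Sketch of the caller's view after this commit; `ts` stands in for
    # previous_dump.last_modified in the command above.
    deleted_entity_count = CurrentImportEntity.cleanup_missing(session, ts)
    assert isinstance(deleted_entity_count, int)  # the IDs are no longer exposed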

poliloom/poliloom/models/wikidata.py

Lines changed: 17 additions & 4 deletions

@@ -824,20 +824,26 @@ class CurrentImportEntity(Base):
 
     @classmethod
     def cleanup_missing(
-        cls, session: Session, previous_dump_timestamp: datetime
-    ) -> list[str]:
+        cls,
+        session: Session,
+        previous_dump_timestamp: datetime,
+    ) -> int:
         """
         Soft-delete entities using two-dump validation strategy.
         Only deletes entities missing from current dump AND older than previous dump.
         This prevents race conditions from incorrectly deleting recently added entities.
 
+        Also removes deleted entities from the search index.
+
         Args:
             session: Database session
             previous_dump_timestamp: Last modified timestamp of the previous dump.
 
         Returns:
-            List of wikidata_ids that were soft-deleted
+            Number of entities that were soft-deleted
         """
+        from poliloom.search import SearchService
+
         # Only delete if: NOT in current dump AND older than previous dump
         # Convert timezone-aware timestamp to naive for database comparison
         previous_dump_naive = previous_dump_timestamp.replace(tzinfo=None)

@@ -855,7 +861,14 @@ def cleanup_missing(
             {"previous_dump_timestamp": previous_dump_naive},
         )
 
-        return [row[0] for row in deleted_result.fetchall()]
+        deleted_ids = [row[0] for row in deleted_result.fetchall()]
+
+        # Remove deleted entities from search index
+        if deleted_ids:
+            search_service = SearchService()
+            search_service.delete_documents(deleted_ids)
+
+        return len(deleted_ids)
 
     @classmethod
     def clear_tracking_table(cls, session: Session) -> None:
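
The model method now encapsulates the search-index dependency. Only two SearchService methods appear anywhere in this commit; a hypothetical stub of that assumed interface (the real class in poliloom/search.py is not part of this diff):

    # Hypothetical stub; the real SearchService in poliloom.search is not shown here.
    class SearchServiceStub:
        def index_documents(self, documents: list[dict]) -> None:
            # Documents are shaped like {"id": "Q60", "type": "locations",
            # "labels": [...]}, per the importer tests below.
            ...

        def delete_documents(self, document_ids: list[str]) -> None:
            # Called with a list of wikidata_ids, e.g. ["Q300", "Q301"].
            ...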

poliloom/tests/models/test_wikidata.py

Lines changed: 150 additions & 0 deletions

@@ -1,5 +1,7 @@
 """Tests for WikidataEntity model."""
 
+from unittest.mock import patch, Mock
+
 from sqlalchemy.dialects.postgresql import insert
 
 from poliloom.models import WikidataEntity, WikidataRelation, RelationType

@@ -856,3 +858,151 @@ def test_entity_with_subclass_relation_to_hierarchy(self, db_session):
         # Position should remain
         count = db_session.execute(text("SELECT COUNT(*) FROM positions")).scalar()
         assert count == 1
+
+
+class TestCleanupOutsideHierarchySearchService:
+    """Test that cleanup_outside_hierarchy calls search service for deletion.
+
+    Uses Location because it has SearchIndexedMixin (Position does not).
+    """
+
+    def _create_hierarchy(self, db_session, root_id, child_ids):
+        """Helper to create a hierarchy with root and children."""
+        # Create wikidata entities
+        entities = [{"wikidata_id": root_id, "name": f"Root {root_id}"}]
+        for child_id in child_ids:
+            entities.append({"wikidata_id": child_id, "name": f"Child {child_id}"})
+
+        stmt = insert(WikidataEntity).values(entities)
+        stmt = stmt.on_conflict_do_nothing(index_elements=["wikidata_id"])
+        db_session.execute(stmt)
+
+        # Create subclass relations
+        relations = []
+        for child_id in child_ids:
+            relations.append(
+                {
+                    "parent_entity_id": root_id,
+                    "child_entity_id": child_id,
+                    "relation_type": RelationType.SUBCLASS_OF,
+                    "statement_id": f"{child_id}$subclass-of-{root_id}",
+                }
+            )
+
+        if relations:
+            stmt = insert(WikidataRelation).values(relations)
+            stmt = stmt.on_conflict_do_nothing(index_elements=["statement_id"])
+            db_session.execute(stmt)
+
+        db_session.flush()
+
+    def _create_location_in_hierarchy(self, db_session, location_id, class_id):
+        """Helper to create a location that is an instance of a hierarchy class."""
+        from poliloom.models import Location
+
+        # Create wikidata entity for the location
+        stmt = insert(WikidataEntity).values(
+            [{"wikidata_id": location_id, "name": f"Location {location_id}"}]
+        )
+        stmt = stmt.on_conflict_do_nothing(index_elements=["wikidata_id"])
+        db_session.execute(stmt)
+
+        # Create location record
+        stmt = insert(Location.__table__).values([{"wikidata_id": location_id}])
+        stmt = stmt.on_conflict_do_nothing(index_elements=["wikidata_id"])
+        db_session.execute(stmt)
+
+        # Create instance_of relation to the class
+        stmt = insert(WikidataRelation).values(
+            [
+                {
+                    "parent_entity_id": class_id,
+                    "child_entity_id": location_id,
+                    "relation_type": RelationType.INSTANCE_OF,
+                    "statement_id": f"{location_id}$instance-of-{class_id}",
+                }
+            ]
+        )
+        stmt = stmt.on_conflict_do_nothing(index_elements=["statement_id"])
+        db_session.execute(stmt)
+        db_session.flush()
+
+    def _create_orphan_location(self, db_session, location_id):
+        """Helper to create a location with no hierarchy relations."""
+        from poliloom.models import Location
+
+        # Create wikidata entity
+        stmt = insert(WikidataEntity).values(
+            [{"wikidata_id": location_id, "name": f"Orphan Location {location_id}"}]
+        )
+        stmt = stmt.on_conflict_do_nothing(index_elements=["wikidata_id"])
+        db_session.execute(stmt)
+
+        # Create location record (no relations)
+        stmt = insert(Location.__table__).values([{"wikidata_id": location_id}])
+        stmt = stmt.on_conflict_do_nothing(index_elements=["wikidata_id"])
+        db_session.execute(stmt)
+        db_session.flush()
+
+    def test_cleanup_calls_delete_documents(self, db_session):
+        """Test that cleanup_outside_hierarchy calls delete_documents on search service."""
+        from poliloom.models import Location
+
+        # Create hierarchy (Q486972 is "human settlement" in Location._hierarchy_roots)
+        self._create_hierarchy(db_session, "Q486972", ["Q100"])
+
+        # Create a valid location
+        self._create_location_in_hierarchy(db_session, "Q200", "Q100")
+
+        # Create orphan locations that will be deleted
+        self._create_orphan_location(db_session, "Q300")
+        self._create_orphan_location(db_session, "Q301")
+
+        # Mock the SearchService (imported locally in cleanup_outside_hierarchy)
+        mock_search_service = Mock()
+        with patch("poliloom.search.SearchService", return_value=mock_search_service):
+            stats = Location.cleanup_outside_hierarchy(db_session, dry_run=False)
+
+        # Verify delete_documents was called with the orphan IDs
+        assert stats["entities_removed"] == 2
+        mock_search_service.delete_documents.assert_called_once()
+        deleted_ids = mock_search_service.delete_documents.call_args[0][0]
+        assert set(deleted_ids) == {"Q300", "Q301"}
+
+    def test_cleanup_does_not_call_delete_when_nothing_removed(self, db_session):
+        """Test that delete_documents is not called when no entities are removed."""
+        from poliloom.models import Location
+
+        # Create hierarchy
+        self._create_hierarchy(db_session, "Q486972", ["Q100"])
+
+        # Create only valid locations (no orphans)
+        self._create_location_in_hierarchy(db_session, "Q200", "Q100")
+
+        # Mock the SearchService
+        mock_search_service = Mock()
+        with patch("poliloom.search.SearchService", return_value=mock_search_service):
+            stats = Location.cleanup_outside_hierarchy(db_session, dry_run=False)
+
+        # No entities removed, so delete_documents should not be called
+        assert stats["entities_removed"] == 0
+        mock_search_service.delete_documents.assert_not_called()
+
+    def test_dry_run_does_not_call_delete_documents(self, db_session):
+        """Test that dry_run=True does not call delete_documents."""
+        from poliloom.models import Location
+
+        # Create hierarchy
+        self._create_hierarchy(db_session, "Q486972", ["Q100"])
+
+        # Create orphan location
+        self._create_orphan_location(db_session, "Q300")
+
+        # Mock the SearchService
+        mock_search_service = Mock()
+        with patch("poliloom.search.SearchService", return_value=mock_search_service):
+            stats = Location.cleanup_outside_hierarchy(db_session, dry_run=True)
+
+        # Dry run reports what would be removed but doesn't call delete_documents
+        assert stats["entities_removed"] == 1
+        mock_search_service.delete_documents.assert_not_called()
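
These tests patch "poliloom.search.SearchService" at its source module rather than a name inside the models module; that is what makes the mock take effect, because cleanup_outside_hierarchy imports SearchService locally, so the name is resolved in poliloom.search at call time. A self-contained illustration of the same pattern, with hypothetical names:

    import sys
    import types
    from unittest.mock import Mock, patch

    # Register a tiny stand-in module so this sketch runs on its own.
    search_mod = types.ModuleType("demopkg_search")

    class SearchService:
        def delete_documents(self, ids):
            raise RuntimeError("should be patched in tests")

    search_mod.SearchService = SearchService
    sys.modules["demopkg_search"] = search_mod

    def cleanup():
        # Local import, mirroring cleanup_outside_hierarchy above: the name is
        # looked up in the source module each time cleanup() is called.
        from demopkg_search import SearchService
        SearchService().delete_documents(["Q1"])

    mock_service = Mock()
    with patch("demopkg_search.SearchService", return_value=mock_service):
        cleanup()  # resolves the patched class, not the real one
    mock_service.delete_documents.assert_called_once_with(["Q1"])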

poliloom/tests/test_entity_importer.py

Lines changed: 133 additions & 0 deletions

@@ -427,6 +427,139 @@ def test_insert_wikipedia_projects_batch_with_duplicates_handling(
         assert final_projects[0].name == "English Wikipedia Updated"
 
 
+class TestSearchServiceIndexing:
+    """Test that entity importing calls search service for label indexing.
+
+    Note: Only entities with SearchIndexedMixin are indexed (Country, Language, Location).
+    Position uses embedding-based search instead.
+    """
+
+    def test_insert_multiple_locations_indexes_labels(
+        self, db_session, mock_search_service
+    ):
+        """Test that inserting multiple locations calls index_documents with correct data."""
+        locations = [
+            {
+                "wikidata_id": "Q60",
+                "name": "New York City",
+                "description": "City in USA",
+                "labels": ["New York City", "NYC", "New York"],
+            },
+            {
+                "wikidata_id": "Q84",
+                "name": "London",
+                "description": "Capital of UK",
+                "labels": ["London", "Greater London"],
+            },
+        ]
+
+        collection = EntityCollection(model_class=Location, shared_classes=frozenset())
+        for loc in locations:
+            collection.add_entity(loc)
+
+        collection.insert(db_session, mock_search_service)
+
+        # Verify index_documents was called
+        mock_search_service.index_documents.assert_called_once()
+
+        # Verify the documents passed to index_documents
+        call_args = mock_search_service.index_documents.call_args[0][0]
+        assert len(call_args) == 2
+
+        # Check first document
+        doc1 = next(d for d in call_args if d["id"] == "Q60")
+        assert doc1["type"] == "locations"
+        assert set(doc1["labels"]) == {"New York City", "NYC", "New York"}
+
+        # Check second document
+        doc2 = next(d for d in call_args if d["id"] == "Q84")
+        assert doc2["type"] == "locations"
+        assert set(doc2["labels"]) == {"London", "Greater London"}
+
+    def test_insert_locations_indexes_labels(self, db_session, mock_search_service):
+        """Test that inserting locations calls index_documents with correct data."""
+        locations = [
+            {
+                "wikidata_id": "Q60",
+                "name": "New York City",
+                "description": "City in USA",
+                "labels": ["New York City", "NYC", "New York"],
+            },
+        ]
+
+        collection = EntityCollection(model_class=Location, shared_classes=frozenset())
+        for loc in locations:
+            collection.add_entity(loc)
+
+        collection.insert(db_session, mock_search_service)
+
+        # Verify index_documents was called with correct data
+        mock_search_service.index_documents.assert_called_once()
+        call_args = mock_search_service.index_documents.call_args[0][0]
+        assert len(call_args) == 1
+        assert call_args[0]["id"] == "Q60"
+        assert call_args[0]["type"] == "locations"
+        assert set(call_args[0]["labels"]) == {"New York City", "NYC", "New York"}
+
+    def test_insert_countries_indexes_labels(self, db_session, mock_search_service):
+        """Test that inserting countries calls index_documents with correct data."""
+        countries = [
+            {
+                "wikidata_id": "Q30",
+                "name": "United States",
+                "description": "Country",
+                "iso_code": "US",
+                "labels": ["United States", "USA", "America"],
+            },
+        ]
+
+        collection = EntityCollection(model_class=Country, shared_classes=frozenset())
+        for country in countries:
+            collection.add_entity(country)
+
+        collection.insert(db_session, mock_search_service)
+
+        # Verify index_documents was called
+        mock_search_service.index_documents.assert_called_once()
+        call_args = mock_search_service.index_documents.call_args[0][0]
+        assert call_args[0]["id"] == "Q30"
+        assert call_args[0]["type"] == "countries"
+
+    def test_insert_empty_batch_does_not_call_index(
+        self, db_session, mock_search_service
+    ):
+        """Test that empty batch does not call index_documents."""
+        collection = EntityCollection(model_class=Location, shared_classes=frozenset())
+        collection.insert(db_session, mock_search_service)
+
+        # index_documents should not be called for empty batch
+        mock_search_service.index_documents.assert_not_called()
+
+    def test_insert_entities_without_labels_indexes_empty_labels(
+        self, db_session, mock_search_service
+    ):
+        """Test that entities without labels are indexed with empty labels list."""
+        locations = [
+            {
+                "wikidata_id": "Q1",
+                "name": "Some Location",
+                "description": "No labels",
+                # No labels key
+            },
+        ]
+
+        collection = EntityCollection(model_class=Location, shared_classes=frozenset())
+        for loc in locations:
+            collection.add_entity(loc)
+
+        collection.insert(db_session, mock_search_service)
+
+        # Should still call index_documents
+        mock_search_service.index_documents.assert_called_once()
+        call_args = mock_search_service.index_documents.call_args[0][0]
+        assert call_args[0]["labels"] == []
+
+
 class TestWikipediaProjectFiltering:
     """Test Wikipedia project filtering logic in should_import method."""
 
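The importer tests above take a mock_search_service fixture whose definition is outside this diff, presumably in the project's conftest.py. A plausible minimal version, labeled hypothetical:

    # Hypothetical fixture sketch; the project's actual conftest.py is not in this diff.
    import pytest
    from unittest.mock import Mock

    @pytest.fixture
    def mock_search_service():
        # A bare Mock is enough here: the tests only assert on how
        # index_documents was (or was not) called.
        return Mock()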