Skip to content

Commit 36e7055

Browse files
authored
feat: add missing FK indexes, unique constraints, and cleanup orphaned DAG tables (airweave-ai#1506)
* feat: integrate MyPy type checking into CI and pre-commit Adds a dedicated MyPy job to the GitHub Actions workflow, running type checks on changed Python files within the `backend/airweave` directory. This optimizes execution time by focusing only on relevant modifications. Introduces a pre-commit hook for MyPy, ensuring type correctness locally before commits are pushed. Incorporates a `ruff check` step into the `code-quality.yml` workflow for explicit linting, complementing the existing formatting check. * fix: point at /backend * ci: scope ruff lint and format checks to changed files only * feat: add missing FK indexes, unique constraints, and pre-flight check Adds indexes on unindexed foreign keys (sync_job.sync_id, sync_connection, source_connection, connection, collection, entity_relation) and utility indexes (organization.auth0_org_id, expires_at on session tables). Adds unique constraints on sync_connection(sync_id, connection_id) and entity_relation(from, to, name, org_id) to prevent duplicate rows. Removes redundant idx_entity_count_sync_def (duplicate of unique constraint). Includes pre-flight check script to validate no duplicate data exists before applying the migration. * feat: add alembic migration for indexes, constraints, and DAG table cleanup Autogenerated migration that: - Creates missing FK indexes on 7 tables - Adds unique constraints on sync_connection and entity_relation - Drops redundant idx_entity_count_sync_def - Adds utility indexes (auth0_org_id, expires_at) - Drops orphaned DAG tables (sync_dag, dag_node, dag_edge) - Fixes access_control_membership FK to include ondelete CASCADE * feat: drop useless B-tree index on search_queries.query_text B-tree on unbounded Text is only useful for exact match or prefix. Removing it saves write overhead on every INSERT into search_queries. * Delete backend/scripts/preflight_check_model_indexes.py
1 parent ed1aabb commit 36e7055

15 files changed

Lines changed: 215 additions & 32 deletions

backend/airweave/models/collection.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
"""Collection model."""
22

33
import uuid
4-
from typing import TYPE_CHECKING, List, Optional
4+
from typing import TYPE_CHECKING, Optional
55

6-
from sqlalchemy import ForeignKey, String
6+
from sqlalchemy import ForeignKey, Index, String
77
from sqlalchemy.dialects.postgresql import JSONB, UUID
88
from sqlalchemy.orm import Mapped, mapped_column, relationship
99

@@ -32,10 +32,6 @@ class Collection(OrganizationBase, UserMixin):
3232
"VectorDbDeploymentMetadata", lazy="joined"
3333
)
3434

35-
if TYPE_CHECKING:
36-
search_queries: List["SearchQuery"]
37-
source_connections: List["SourceConnection"]
38-
3935
source_connections: Mapped[list["SourceConnection"]] = relationship(
4036
"SourceConnection",
4137
back_populates="collection",
@@ -51,3 +47,5 @@ class Collection(OrganizationBase, UserMixin):
5147
cascade="all, delete-orphan",
5248
passive_deletes=True,
5349
)
50+
51+
__table_args__ = (Index("idx_collection_vdb_metadata_id", "vector_db_deployment_metadata_id"),)

backend/airweave/models/connection.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
"""Connection model."""
22

3-
from typing import TYPE_CHECKING, List, Optional
3+
from typing import TYPE_CHECKING, Any, List, Optional
44
from uuid import UUID
55

6-
from sqlalchemy import CheckConstraint, ForeignKey, String, Text, event
6+
from sqlalchemy import CheckConstraint, ForeignKey, Index, String, Text, event
77
from sqlalchemy import Enum as SQLAlchemyEnum
88
from sqlalchemy.orm import Mapped, Session, mapped_column, relationship
99

@@ -115,8 +115,6 @@ class Connection(Base):
115115
)
116116

117117
__table_args__ = (
118-
# Enforce that organization_id, created_by_email, and modified_by_email are not null
119-
# except for the specific native connections
120118
CheckConstraint(
121119
"""
122120
(short_name IN ('qdrant_native', 'neo4j_native', 'local_text2vec'))
@@ -127,12 +125,14 @@ class Connection(Base):
127125
""",
128126
name="ck_connection_native_or_complete",
129127
),
128+
Index("idx_connection_organization_id", "organization_id"),
129+
Index("idx_connection_integration_credential_id", "integration_credential_id"),
130130
)
131131

132132

133133
# Event to delete integration credential when Connection is deleted
134134
@event.listens_for(Connection, "before_delete")
135-
def delete_integration_credential(mapper, connection, target):
135+
def delete_integration_credential(mapper: Any, connection: Any, target: Any) -> None:
136136
"""When a Connection is deleted, also delete its IntegrationCredential if present."""
137137
if target.integration_credential_id:
138138
# Get the session

backend/airweave/models/connection_init_session.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from typing import TYPE_CHECKING, Optional
55
from uuid import UUID
66

7-
from sqlalchemy import JSON, DateTime, ForeignKey, String
7+
from sqlalchemy import JSON, DateTime, ForeignKey, Index, String
88
from sqlalchemy.orm import Mapped, mapped_column, relationship
99

1010
from airweave.models._base import OrganizationBase
@@ -70,6 +70,8 @@ class ConnectionInitSession(OrganizationBase):
7070

7171
redirect_session: Mapped[Optional["RedirectSession"]] = relationship("RedirectSession")
7272

73+
__table_args__ = (Index("idx_connection_init_session_expires_at", "expires_at"),)
74+
7375
@staticmethod
7476
def default_expires_at(minutes: int = 30) -> datetime:
7577
"""Return a UTC expiry timestamp ``minutes`` from now."""

backend/airweave/models/entity_count.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,5 +53,4 @@ class EntityCount(Base):
5353
),
5454
Index("idx_entity_count_sync_id", "sync_id"),
5555
Index("idx_entity_count_entity_def_id", "entity_definition_id"),
56-
Index("idx_entity_count_sync_def", "sync_id", "entity_definition_id"),
5756
)
Lines changed: 28 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
"""Models for entity relations."""
22

3-
from sqlalchemy import Column, ForeignKey, String
3+
from typing import Optional
4+
from uuid import UUID
5+
6+
from sqlalchemy import ForeignKey, Index, String, UniqueConstraint
7+
from sqlalchemy.orm import Mapped, mapped_column
48

59
from airweave.models._base import Base
610

@@ -10,8 +14,26 @@ class EntityRelation(Base):
1014

1115
__tablename__ = "entity_relation"
1216

13-
name = Column(String, nullable=False)
14-
description = Column(String)
15-
from_entity_definition_id = Column(ForeignKey("entity_definition.id"), nullable=False)
16-
to_entity_definition_id = Column(ForeignKey("entity_definition.id"), nullable=False)
17-
organization_id = Column(ForeignKey("organization.id"), nullable=True)
17+
name: Mapped[str] = mapped_column(String, nullable=False)
18+
description: Mapped[Optional[str]] = mapped_column(String, nullable=True)
19+
from_entity_definition_id: Mapped[UUID] = mapped_column(
20+
ForeignKey("entity_definition.id"), nullable=False
21+
)
22+
to_entity_definition_id: Mapped[UUID] = mapped_column(
23+
ForeignKey("entity_definition.id"), nullable=False
24+
)
25+
organization_id: Mapped[Optional[UUID]] = mapped_column(
26+
ForeignKey("organization.id"), nullable=True
27+
)
28+
29+
__table_args__ = (
30+
UniqueConstraint(
31+
"from_entity_definition_id",
32+
"to_entity_definition_id",
33+
"name",
34+
"organization_id",
35+
name="uq_entity_relation",
36+
),
37+
Index("idx_entity_relation_from", "from_entity_definition_id"),
38+
Index("idx_entity_relation_to", "to_entity_definition_id"),
39+
)

backend/airweave/models/organization.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
from typing import TYPE_CHECKING, List, Optional
44

5-
from sqlalchemy import JSON, String, Text
5+
from sqlalchemy import JSON, Index, String, Text
66
from sqlalchemy.orm import Mapped, mapped_column, relationship
77

88
from airweave.models._base import Base
@@ -77,3 +77,5 @@ class Organization(Base):
7777
cascade="all, delete-orphan",
7878
lazy="noload",
7979
)
80+
81+
__table_args__ = (Index("idx_organization_auth0_org_id", "auth0_org_id"),)

backend/airweave/models/organization_billing.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@ class OrganizationBilling(Base):
1717

1818
__tablename__ = "organization_billing"
1919

20-
# Foreign key to organization
20+
# TODO: organization_id is typed as str but organization.id is UUID.
21+
# Needs a data migration to fix — do not change the type without one.
2122
organization_id: Mapped[str] = mapped_column(
2223
ForeignKey("organization.id", ondelete="CASCADE"), unique=True
2324
)

backend/airweave/models/redirect_session.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
from datetime import datetime, timedelta, timezone
44

5-
from sqlalchemy import DateTime, String, Text
5+
from sqlalchemy import DateTime, Index, String, Text
66
from sqlalchemy.orm import Mapped, mapped_column
77

88
from airweave.models._base import OrganizationBase
@@ -29,6 +29,8 @@ class RedirectSession(OrganizationBase):
2929
# Expiry; entries are consumed (deleted) on first use or when expired
3030
expires_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), nullable=False)
3131

32+
__table_args__ = (Index("idx_redirect_session_expires_at", "expires_at"),)
33+
3234
@staticmethod
3335
def default_expires_at(minutes: int = 5) -> datetime:
3436
"""Generate a default expiration datetime for redirect sessions.

backend/airweave/models/search_query.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,6 @@ class SearchQuery(OrganizationBase, UserMixin):
124124
Index("ix_search_queries_api_key_created", "api_key_id", "created_at"),
125125
Index("ix_search_queries_is_streaming", "is_streaming"),
126126
Index("ix_search_queries_retrieval_strategy", "retrieval_strategy"),
127-
Index("ix_search_queries_query_text", "query_text"), # For text analysis
128127
Index("ix_search_queries_duration", "duration_ms"),
129128
Index("ix_search_queries_results_count", "results_count"),
130129
)

backend/airweave/models/source_connection.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
"""Source connection model."""
22

33
from time import sleep
4-
from typing import TYPE_CHECKING, Optional
4+
from typing import TYPE_CHECKING, Any, Optional
55
from uuid import UUID
66

7-
from sqlalchemy import JSON, Boolean, ForeignKey, String, Text, event
7+
from sqlalchemy import JSON, Boolean, ForeignKey, Index, String, Text, event
88
from sqlalchemy.orm import Mapped, Session, mapped_column, relationship
99

1010
from airweave.models._base import OrganizationBase, UserMixin
@@ -90,10 +90,16 @@ class SourceConnection(OrganizationBase, UserMixin):
9090
lazy="noload",
9191
)
9292

93+
__table_args__ = (
94+
Index("idx_source_connection_sync_id", "sync_id"),
95+
Index("idx_source_connection_connection_id", "connection_id"),
96+
Index("idx_source_connection_collection_id", "readable_collection_id"),
97+
)
98+
9399

94100
# Event to delete parent Sync when SourceConnection is deleted
95101
@event.listens_for(SourceConnection, "before_delete")
96-
def delete_parent_sync_and_connection(mapper, connection, target):
102+
def delete_parent_sync_and_connection(mapper: Any, connection: Any, target: Any) -> None:
97103
"""When a SourceConnection is deleted, also delete its parent Sync and Connection."""
98104
# Delete parent Sync if it exists
99105
if target.sync_id:

0 commit comments

Comments
 (0)