Skip to content

Commit 0264b80

Browse files
authored
feat: add table-existence verification to provisioning completeness (#2123)
* feat: add table-existence verification to provisioning completeness check

  After schema migrations run, verify that the expected tables exist in the tenant schema before transitioning to the active state. This closes the partial-provisioning gap where the schema namespace exists but migrations failed silently.

  Each service can define a SentinelTable (the primary domain table created by its first migration). If set, that specific table is checked. If not set, the service is assumed to have no required tables (e.g., internal-account).

  Default service configs now include sentinel tables for all services with migrations.

* fix: align SentinelTable doc with implementation, remove dead COUNT query

  The doc comment promised "at least one table exists" verification when SentinelTable is empty, but the code intentionally allowed zero tables for services without migrations. Fix the doc to match the behavior and remove the unused COUNT query.

---------

Co-authored-by: Ben Coombs <bjcoombs@users.noreply.github.com>
1 parent e1a4745 commit 0264b80

5 files changed

Lines changed: 256 additions & 18 deletions

File tree

services/tenant/provisioner/errors.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,4 +84,9 @@ var (
8484
// ErrHookPanic indicates a post-provisioning hook panicked.
8585
// The recovered panic value is wrapped in the error chain.
8686
ErrHookPanic = errors.New("post-provisioning hook panicked")
87+
88+
// ErrSchemaVerificationFailed indicates that post-migration table verification failed.
89+
// The tenant schema exists but expected tables were not created, indicating
90+
// migrations ran but did not produce the expected database objects.
91+
ErrSchemaVerificationFailed = errors.New("schema provisioning verification failed: expected tables not found")
8792
)

services/tenant/provisioner/postgres_provisioner.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,14 @@ func (p *PostgresProvisioner) ProvisionSchemas(ctx context.Context, tenantID ten
126126
return err
127127
}
128128

129+
// Verify expected tables exist before transitioning to active.
130+
// This catches partial provisioning where schema exists but migrations failed silently.
131+
if err := p.verifySchemaProvisioned(ctx, tenantID.SchemaName(), logger); err != nil {
132+
logger.Error("schema verification failed after migrations", "error", err)
133+
p.markProvisioningFailed(ctx, status, err.Error())
134+
return err
135+
}
136+
129137
// Mark as active
130138
status.State = StateActive
131139
status.UpdatedAt = timeNow()

services/tenant/provisioner/postgres_provisioner_test.go

Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1896,6 +1896,163 @@ func TestPostgresProvisioner_PostProvisioningHooks_PanicRecovery(t *testing.T) {
18961896
assert.Equal(t, StateActive, status.State)
18971897
}
18981898

1899+
func TestPostgresProvisioner_VerifySchemaProvisioned_SentinelTable(t *testing.T) {
1900+
tc := setupTestContainer(t)
1901+
defer tc.cleanup(t)
1902+
1903+
tenantID := tenant.MustNewTenantID("sentinel_test")
1904+
createTestTenant(t, tc.db, tenantID.String())
1905+
1906+
// Create migration that creates a specific table
1907+
svcDir := filepath.Join(tc.migDir, "sentinel-service")
1908+
require.NoError(t, os.MkdirAll(svcDir, 0o755))
1909+
createTestMigration(t, svcDir, "20251201000000_init.sql", `
1910+
CREATE TABLE my_sentinel (id UUID PRIMARY KEY DEFAULT gen_random_uuid());
1911+
`)
1912+
1913+
config := &Config{
1914+
Services: []ServiceConfig{
1915+
{
1916+
Name: "sentinel-service",
1917+
MigrationPath: svcDir,
1918+
DatabaseURL: tc.connStr,
1919+
SentinelTable: "my_sentinel",
1920+
},
1921+
},
1922+
ProvisioningTimeout: 30 * time.Second,
1923+
}
1924+
1925+
prov, err := NewPostgresProvisioner(tc.db, config)
1926+
require.NoError(t, err)
1927+
defer prov.Close()
1928+
1929+
// Provision should succeed - sentinel table is created by migration
1930+
err = prov.ProvisionSchemas(context.Background(), tenantID)
1931+
require.NoError(t, err)
1932+
1933+
status, err := prov.GetProvisioningStatus(context.Background(), tenantID)
1934+
require.NoError(t, err)
1935+
assert.Equal(t, StateActive, status.State)
1936+
}
1937+
1938+
func TestPostgresProvisioner_VerifySchemaProvisioned_MissingSentinelTable(t *testing.T) {
1939+
tc := setupTestContainer(t)
1940+
defer tc.cleanup(t)
1941+
1942+
tenantID := tenant.MustNewTenantID("missing_sentinel")
1943+
createTestTenant(t, tc.db, tenantID.String())
1944+
1945+
// Create migration that creates a table with a DIFFERENT name than the sentinel
1946+
svcDir := filepath.Join(tc.migDir, "wrong-table-service")
1947+
require.NoError(t, os.MkdirAll(svcDir, 0o755))
1948+
createTestMigration(t, svcDir, "20251201000000_init.sql", `
1949+
CREATE TABLE some_other_table (id UUID PRIMARY KEY DEFAULT gen_random_uuid());
1950+
`)
1951+
1952+
config := &Config{
1953+
Services: []ServiceConfig{
1954+
{
1955+
Name: "wrong-table-service",
1956+
MigrationPath: svcDir,
1957+
DatabaseURL: tc.connStr,
1958+
SentinelTable: "expected_table_that_doesnt_exist",
1959+
},
1960+
},
1961+
ProvisioningTimeout: 30 * time.Second,
1962+
}
1963+
1964+
prov, err := NewPostgresProvisioner(tc.db, config)
1965+
require.NoError(t, err)
1966+
defer prov.Close()
1967+
1968+
// Provision should fail verification - sentinel table doesn't exist
1969+
err = prov.ProvisionSchemas(context.Background(), tenantID)
1970+
require.Error(t, err)
1971+
assert.ErrorIs(t, err, ErrSchemaVerificationFailed)
1972+
1973+
// Status should be failed
1974+
status, err := prov.GetProvisioningStatus(context.Background(), tenantID)
1975+
require.NoError(t, err)
1976+
assert.Equal(t, StateFailed, status.State)
1977+
assert.Contains(t, status.ErrorMessage, "expected_table_that_doesnt_exist")
1978+
}
1979+
1980+
func TestPostgresProvisioner_VerifySchemaProvisioned_NoSentinelWithTables(t *testing.T) {
1981+
tc := setupTestContainer(t)
1982+
defer tc.cleanup(t)
1983+
1984+
tenantID := tenant.MustNewTenantID("no_sentinel_ok")
1985+
createTestTenant(t, tc.db, tenantID.String())
1986+
1987+
// Service with no sentinel table configured but has migrations
1988+
svcDir := filepath.Join(tc.migDir, "no-sentinel-service")
1989+
require.NoError(t, os.MkdirAll(svcDir, 0o755))
1990+
createTestMigration(t, svcDir, "20251201000000_init.sql", `
1991+
CREATE TABLE any_table (id UUID PRIMARY KEY DEFAULT gen_random_uuid());
1992+
`)
1993+
1994+
config := &Config{
1995+
Services: []ServiceConfig{
1996+
{
1997+
Name: "no-sentinel-service",
1998+
MigrationPath: svcDir,
1999+
DatabaseURL: tc.connStr,
2000+
// SentinelTable intentionally empty
2001+
},
2002+
},
2003+
ProvisioningTimeout: 30 * time.Second,
2004+
}
2005+
2006+
prov, err := NewPostgresProvisioner(tc.db, config)
2007+
require.NoError(t, err)
2008+
defer prov.Close()
2009+
2010+
// Should succeed - no sentinel check, tables exist
2011+
err = prov.ProvisionSchemas(context.Background(), tenantID)
2012+
require.NoError(t, err)
2013+
2014+
status, err := prov.GetProvisioningStatus(context.Background(), tenantID)
2015+
require.NoError(t, err)
2016+
assert.Equal(t, StateActive, status.State)
2017+
}
2018+
2019+
func TestPostgresProvisioner_VerifySchemaProvisioned_EmptySchemaNoSentinel(t *testing.T) {
2020+
tc := setupTestContainer(t)
2021+
defer tc.cleanup(t)
2022+
2023+
tenantID := tenant.MustNewTenantID("empty_schema")
2024+
createTestTenant(t, tc.db, tenantID.String())
2025+
2026+
// Service with no sentinel and no migrations - should succeed (e.g., internal-account)
2027+
svcDir := filepath.Join(tc.migDir, "empty-service")
2028+
require.NoError(t, os.MkdirAll(svcDir, 0o755))
2029+
// No migration files
2030+
2031+
config := &Config{
2032+
Services: []ServiceConfig{
2033+
{
2034+
Name: "empty-service",
2035+
MigrationPath: svcDir,
2036+
DatabaseURL: tc.connStr,
2037+
// SentinelTable intentionally empty
2038+
},
2039+
},
2040+
ProvisioningTimeout: 30 * time.Second,
2041+
}
2042+
2043+
prov, err := NewPostgresProvisioner(tc.db, config)
2044+
require.NoError(t, err)
2045+
defer prov.Close()
2046+
2047+
// Should succeed - empty service with no sentinel is OK
2048+
err = prov.ProvisionSchemas(context.Background(), tenantID)
2049+
require.NoError(t, err)
2050+
2051+
status, err := prov.GetProvisioningStatus(context.Background(), tenantID)
2052+
require.NoError(t, err)
2053+
assert.Equal(t, StateActive, status.State)
2054+
}
2055+
18992056
func TestNewPostgresProvisioner_NilPlatformDB(t *testing.T) {
19002057
config := &Config{
19012058
Services: []ServiceConfig{

services/tenant/provisioner/provisioner.go

Lines changed: 31 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -297,6 +297,13 @@ type ServiceConfig struct {
297297
// If empty, falls back to a constructed URL based on service name:
298298
// postgres://meridian_{service}_user@cockroachdb:26257/meridian_{service}?sslmode=disable
299299
DatabaseURL string
300+
301+
// SentinelTable is the name of a table that must exist after migrations complete.
302+
// Used for post-migration verification to detect partial provisioning where
303+
// the schema exists but migrations failed silently.
304+
// If empty, the service is assumed to have no required tables and verification
305+
// is skipped (e.g., services that only need a schema namespace without migrations).
306+
SentinelTable string
300307
}
301308

302309
// PostProvisioningHook is called after successful schema provisioning for a tenant.
@@ -380,32 +387,38 @@ func DefaultConfig() *Config {
380387
}
381388
}
382389

383-
// defaultServiceNames lists all BIAN services that require schema provisioning.
390+
// defaultServiceDefs lists all BIAN services that require schema provisioning.
384391
// Order matters: services are provisioned in the order listed.
385-
var defaultServiceNames = []string{
386-
"party",
387-
"current-account",
388-
"position-keeping",
389-
"financial-accounting",
390-
"payment-order",
391-
"market-information",
392-
"reference-data",
392+
// SentinelTable is the primary domain table created by the first migration;
393+
// its presence confirms migrations ran to completion.
394+
var defaultServiceDefs = []struct {
395+
Name string
396+
SentinelTable string // empty for services with no provisioner-specific migrations
397+
}{
398+
{"party", "party"},
399+
{"current-account", "account"},
400+
{"position-keeping", "financial_position_log"},
401+
{"financial-accounting", "financial_booking_log"},
402+
{"payment-order", "payment_order"},
403+
{"market-information", "data_source"},
404+
{"reference-data", "instrument_definition"},
393405
// Services below require org_<tenant> schemas for tenant-scoped
394406
// queries but have no provisioner-specific migrations.
395-
"internal-account",
396-
"reconciliation",
397-
"identity",
398-
"control-plane",
407+
{"internal-account", ""},
408+
{"reconciliation", ""},
409+
{"identity", ""},
410+
{"control-plane", ""},
399411
}
400412

401413
// buildDefaultServiceConfigs constructs ServiceConfig entries for all default services.
402414
func buildDefaultServiceConfigs(basePath string) []ServiceConfig {
403-
configs := make([]ServiceConfig, 0, len(defaultServiceNames))
404-
for _, name := range defaultServiceNames {
415+
configs := make([]ServiceConfig, 0, len(defaultServiceDefs))
416+
for _, def := range defaultServiceDefs {
405417
configs = append(configs, ServiceConfig{
406-
Name: name,
407-
MigrationPath: basePath + "/" + name,
408-
DatabaseURL: getServiceDatabaseURL(name),
418+
Name: def.Name,
419+
MigrationPath: basePath + "/" + def.Name,
420+
DatabaseURL: getServiceDatabaseURL(def.Name),
421+
SentinelTable: def.SentinelTable,
409422
})
410423
}
411424
return configs

services/tenant/provisioner/provisioner_helpers.go

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,61 @@ func (p *PostgresProvisioner) dropSchemaInAllDBs(ctx context.Context, schemaName
118118
return nil
119119
}
120120

121+
// verifySchemaProvisioned checks that expected tables exist in the tenant schema
122+
// after migrations have been applied. This closes the partial-provisioning gap
123+
// where the schema namespace exists but migrations failed halfway through.
124+
//
125+
// For each service:
126+
// - If SentinelTable is set, verifies that specific table exists
127+
// - If SentinelTable is empty, verification is skipped (service has no required tables)
128+
//
129+
// Returns nil if verification passes, or an error listing which services failed.
130+
func (p *PostgresProvisioner) verifySchemaProvisioned(ctx context.Context, schemaName string, logger *slog.Logger) error {
131+
var failedServices []string
132+
133+
for _, svc := range p.config.Services {
134+
serviceDB, ok := p.serviceDbs[svc.Name]
135+
if !ok {
136+
continue // Already caught by provisionSingleService
137+
}
138+
139+
if svc.SentinelTable != "" {
140+
// Check for specific sentinel table
141+
var exists bool
142+
err := serviceDB.WithContext(bypassCtx(ctx)).Raw(
143+
`SELECT EXISTS(
144+
SELECT 1 FROM information_schema.tables
145+
WHERE table_schema = ? AND table_name = ?
146+
)`, schemaName, svc.SentinelTable,
147+
).Scan(&exists).Error
148+
if err != nil {
149+
logger.Error("failed to verify sentinel table",
150+
"service", svc.Name,
151+
"sentinel_table", svc.SentinelTable,
152+
"error", err)
153+
failedServices = append(failedServices, fmt.Sprintf("%s (query error: %v)", svc.Name, err))
154+
continue
155+
}
156+
if !exists {
157+
logger.Error("sentinel table missing after migration",
158+
"service", svc.Name,
159+
"schema", schemaName,
160+
"sentinel_table", svc.SentinelTable)
161+
failedServices = append(failedServices, fmt.Sprintf("%s (missing table: %s)", svc.Name, svc.SentinelTable))
162+
}
163+
} else {
164+
// No sentinel table configured - service has no required tables
165+
// (e.g., internal-account, reconciliation). Skip verification.
166+
logger.Debug("no sentinel table configured, skipping verification", "service", svc.Name)
167+
}
168+
}
169+
170+
if len(failedServices) > 0 {
171+
return fmt.Errorf("%w: %s", ErrSchemaVerificationFailed, strings.Join(failedServices, "; "))
172+
}
173+
return nil
174+
}
175+
121176
// isAlreadyExistsError checks if the error indicates an object already exists.
122177
//
123178
// IDEMPOTENCY: This is a key idempotency mechanism for migrations. When a migration

0 commit comments

Comments
 (0)