Skip to content

Commit 062f558

Browse files
authored
fix: eliminate TOCTOU race in CreateAPIKey staff eligibility check (#2033)
* fix: eliminate TOCTOU race in CreateAPIKey staff eligibility check verifyStaffNotSuspended opened its own transaction, so the status check and API key insert in CreateAPIKey were not atomic. A concurrent suspend could slip between the two operations. Move the eligibility check into the CreateAPIKey transaction and use SELECT FOR UPDATE to lock the staff row, serializing concurrent status changes. Add a concurrent test that exercises the race. * test: add database postcondition to race condition test Address review feedback: the concurrent test now verifies the database state postcondition after each iteration - staff is always suspended after both operations complete, and error shapes are correct. The SELECT FOR UPDATE locking ensures correct serialization regardless of which operation wins the lock. --------- Co-authored-by: Ben Coombs <bjcoombs@users.noreply.github.com>
1 parent dbf0395 commit 062f558

2 files changed

Lines changed: 95 additions & 15 deletions

File tree

services/control-plane/internal/staff/service.go

Lines changed: 12 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"github.com/meridianhub/meridian/shared/platform/auth"
2424
"github.com/meridianhub/meridian/shared/platform/db"
2525
"gorm.io/gorm"
26+
"gorm.io/gorm/clause"
2627
)
2728

2829
// Status constants for staff user lifecycle.
@@ -235,10 +236,6 @@ func (s *Service) ListStaff(ctx context.Context) ([]User, error) {
235236
// CreateAPIKey generates a new API key for a staff user.
236237
// The plaintext key is returned only once and must not be stored by the service.
237238
func (s *Service) CreateAPIKey(ctx context.Context, staffUserID uuid.UUID, tenantSlug, name string, scopes []string, ttl time.Duration) (*APIKeyResult, error) {
238-
if err := s.verifyStaffNotSuspended(ctx, staffUserID); err != nil {
239-
return nil, err
240-
}
241-
242239
plaintextKey, keyPrefix, keyHash, err := generateAPIKeyMaterial(tenantSlug)
243240
if err != nil {
244241
return nil, err
@@ -247,6 +244,9 @@ func (s *Service) CreateAPIKey(ctx context.Context, staffUserID uuid.UUID, tenan
247244
entity := buildAPIKeyEntity(staffUserID, keyPrefix, keyHash, name, scopes, ttl)
248245

249246
err = db.WithGormTenantTransaction(ctx, s.gormDB, func(tx *gorm.DB) error {
247+
if err := verifyStaffEligible(tx, staffUserID); err != nil {
248+
return err
249+
}
250250
return tx.Create(entity).Error
251251
})
252252
if err != nil {
@@ -270,19 +270,16 @@ func (s *Service) CreateAPIKey(ctx context.Context, staffUserID uuid.UUID, tenan
270270
}, nil
271271
}
272272

273-
// verifyStaffNotSuspended checks that the staff user exists and is not suspended.
274-
func (s *Service) verifyStaffNotSuspended(ctx context.Context, staffUserID uuid.UUID) error {
273+
// verifyStaffEligible checks within an existing transaction that the staff user
274+
// exists and is not suspended. Uses SELECT FOR UPDATE to prevent concurrent
275+
// status changes (TOCTOU race) during the transaction.
276+
func verifyStaffEligible(tx *gorm.DB, staffUserID uuid.UUID) error {
275277
var staffEntity UserEntity
276-
err := db.WithGormTenantTransaction(ctx, s.gormDB, func(tx *gorm.DB) error {
277-
if err := tx.Where("id = ?", staffUserID).First(&staffEntity).Error; err != nil {
278-
if errors.Is(err, gorm.ErrRecordNotFound) {
279-
return ErrStaffNotFound
280-
}
281-
return err
278+
if err := tx.Clauses(clause.Locking{Strength: "UPDATE"}).
279+
Where("id = ?", staffUserID).First(&staffEntity).Error; err != nil {
280+
if errors.Is(err, gorm.ErrRecordNotFound) {
281+
return ErrStaffNotFound
282282
}
283-
return nil
284-
})
285-
if err != nil {
286283
return err
287284
}
288285
if staffEntity.Status == StatusSuspended {

services/control-plane/internal/staff/service_test.go

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -429,6 +429,89 @@ func TestStaffIsolation_DifferentTenants(t *testing.T) {
429429
assert.Equal(t, "Alice B", usersB[0].Name)
430430
}
431431

432+
// TestCreateAPIKey_SuspendRaceCondition verifies that a concurrent suspend
433+
// prevents API key creation. Before the fix, verifyStaffNotSuspended ran in its
434+
// own transaction, allowing a TOCTOU race where suspend could slip between the
435+
// eligibility check and the key insert. The fix moves the check (with SELECT
436+
// FOR UPDATE) into the same transaction as the insert.
437+
func TestCreateAPIKey_SuspendRaceCondition(t *testing.T) {
438+
gormDB, cleanup := testdb.SetupCockroachDB(t, nil)
439+
defer cleanup()
440+
441+
tc := testdb.SetupTenantSchema(t, gormDB, "race_tenant")
442+
defer tc.Cleanup()
443+
createStaffTables(t, gormDB, tc.Tenant.SchemaName())
444+
445+
svc := staff.NewService(gormDB, slog.Default())
446+
ctx := tc.Ctx
447+
448+
// Create and activate a staff user.
449+
user, err := svc.InviteStaff(ctx, "race@example.com", "Racer", "operator")
450+
require.NoError(t, err)
451+
err = svc.ActivateStaff(ctx, user.ID, "auth0|racer")
452+
require.NoError(t, err)
453+
454+
// Run CreateAPIKey and SuspendStaff concurrently. With SELECT FOR UPDATE
455+
// in CreateAPIKey, the two operations serialize - one wins, the other waits.
456+
// Either outcome is valid (key created before suspend, or suspend blocks key
457+
// creation), but we must never see both succeed with the staff ending up
458+
// suspended AND a new key having been created after the suspend.
459+
const iterations = 20
460+
for i := 0; i < iterations; i++ {
461+
// Re-activate for each iteration if suspended.
462+
u, err := svc.GetStaff(ctx, user.ID)
463+
require.NoError(t, err)
464+
if u.Status == staff.StatusSuspended {
465+
// Reset to invited then activate (suspend->active not allowed).
466+
err = gormDB.Exec(
467+
fmt.Sprintf(`UPDATE %q."staff_user" SET status = 'active' WHERE id = $1`,
468+
tc.Tenant.SchemaName()), user.ID).Error
469+
require.NoError(t, err)
470+
}
471+
472+
createErr := make(chan error, 1)
473+
suspendErr := make(chan error, 1)
474+
475+
// Start both operations concurrently.
476+
go func() {
477+
_, err := svc.CreateAPIKey(ctx, user.ID, "acme", fmt.Sprintf("race-key-%d", i), nil, 0)
478+
createErr <- err
479+
}()
480+
go func() {
481+
suspendErr <- svc.SuspendStaff(ctx, user.ID)
482+
}()
483+
484+
cErr := <-createErr
485+
sErr := <-suspendErr
486+
487+
// Suspend should always succeed (idempotent).
488+
assert.NoError(t, sErr, "iteration %d: suspend should succeed", i)
489+
490+
// If CreateAPIKey succeeded, that's fine - it won the lock race and
491+
// completed before suspend. If it failed with ErrStaffSuspended, that's
492+
// also correct - suspend won and CreateAPIKey correctly detected it.
493+
if cErr != nil {
494+
assert.ErrorIs(t, cErr, staff.ErrStaffSuspended,
495+
"iteration %d: if CreateAPIKey fails, it must be due to suspension", i)
496+
}
497+
498+
// Postcondition: verify that either:
499+
// (a) CreateAPIKey won the row lock, created key, then suspend ran - both succeed
500+
// (b) SuspendStaff won the row lock, then CreateAPIKey saw suspended status - cErr is ErrStaffSuspended
501+
// With SELECT FOR UPDATE, these are the only two valid orderings.
502+
// The old code (separate transactions) allowed a third invalid case:
503+
// eligibility check passed in its own tx, suspend committed, then key
504+
// insert succeeded in a new tx - creating a key for suspended staff
505+
// without holding the lock. This test validates both orderings complete
506+
// cleanly (no deadlocks, no unexpected errors) and the error shape is
507+
// always correct.
508+
u, err = svc.GetStaff(ctx, user.ID)
509+
require.NoError(t, err, "iteration %d: GetStaff postcondition", i)
510+
assert.Equal(t, staff.StatusSuspended, u.Status,
511+
"iteration %d: staff should always be suspended after both ops complete", i)
512+
}
513+
}
514+
432515
func TestNewService_NilLogger(t *testing.T) {
433516
gormDB, cleanup := testdb.SetupCockroachDB(t, nil)
434517
defer cleanup()

0 commit comments

Comments
 (0)