🐛 fix(repository): restore slow-path bucket read dropped by shard migration (backport to 0.10.x) (#431)

sodre · web-flow · commit 7d18595e9716 · 2026-06-25T23:36:07.000-04:00
## Summary

Backports the slow-path bucket-read fix from #429 (merged to `main`) to the `release/0.10.x` maintenance branch for a **v0.10.3** patch release.

**Bug:** `Repository.batch_get_entity_and_buckets()` — the `acquire()` slow-path / refill-recovery read — classified `BatchGetItem` response items using the stale `sk.startswith(schema.SK_BUCKET)` (`"#BUCKET#"`) prefix. Bucket state records have used `SK == "#STATE"` (`sk_state()`) since the per-shard partition-key migration (GHSA-76rv-2r9v-c5m6), so the filter never matched and every bucket was silently dropped. The slow path then treated existing buckets as new, the conditional write failed, and `acquire()` raised `RateLimitExceeded` with `retry_after=0.0` instead of refilling.

**Effect:** wait-then-acquire refill recovery was broken.

- With `speculative_writes=True`, the bug breaks the refill fallback (masked in production only when the aggregator Lambda refills out-of-band).
- With `speculative_writes=False` or `--no-aggregator`, refill recovery is fully broken.

This bug shipped in **v0.10.1** and is present in **v0.10.2**; this PR delivers the fix as **v0.10.3**.

**Fix:** one line — `elif sk.startswith(schema.SK_BUCKET):` → `elif sk == schema.sk_state():` (async source `repository.py`; regenerated into `sync_repository.py`). Cherry-picked cleanly from `main` commits `7e61fd3` (fix + unit tests) and `8123fea` (LocalStack tests), with `-x` provenance recorded.

## Test plan

All verified locally on this branch against live LocalStack:

- [x] **unit (moto):** `batch_get_entity_and_buckets` returns existing buckets incl. no-`#META` trigger; wait-then-acquire recovery on speculative + non-speculative paths (async + generated sync) — 8 passed.
- [x] **integration/e2e/benchmark (LocalStack, no-aggregator stacks):** direct read coverage, exhaust→wait→acquire recovery, recovery stress loop — 4 passed.
- [x] **sync-generation:** no drift; `ruff` + `mypy` clean.
## References

- Original fix: #429
- Bug report: #428 (already closed on `main`; referenced here as context only)

This PR targets the `0.10.x` maintenance line.

🤖 Generated with [Claude Code](https://claude.ai/code)
diff --git a/src/zae_limiter/repository.py b/src/zae_limiter/repository.py
@@ -449,6 +449,13 @@ async def _get_caller_identity_arn(self) -> str | None:
 
     def _now_ms(self) -> int:
         """Current time in milliseconds."""
+        # TODO(clock-seam): _now_ms() is not the single source of truth for time.
+        # limiter.py and lease.py compute now via inline `int(time.time() * 1000)`
+        # instead of routing through here, so a single acquire() reads the clock
+        # from multiple places and `_now_ms` cannot be monkeypatched to control
+        # time in tests. Make `_now_ms()` the injectable clock seam: add it to
+        # RepositoryProtocol and route limiter.py/lease.py through
+        # `self._repository._now_ms()` (regenerating all sync twins).
         return int(time.time() * 1000)
 
     # -------------------------------------------------------------------------
@@ -1650,7 +1657,7 @@ async def batch_get_entity_and_buckets(
                 sk = item.get("SK", {}).get("S", "")
                 if sk == schema.sk_meta():
                     entity = self._deserialize_entity(item)
-                elif sk.startswith(schema.SK_BUCKET):
+                elif sk == schema.sk_state():
                     for bucket in self._deserialize_composite_bucket(item):
                         key = (bucket.entity_id, bucket.resource, bucket.limit_name)
                         buckets[key] = bucket
diff --git a/src/zae_limiter/sync_repository.py b/src/zae_limiter/sync_repository.py
@@ -1362,7 +1362,7 @@ def batch_get_entity_and_buckets(
                 sk = item.get("SK", {}).get("S", "")
                 if sk == schema.sk_meta():
                     entity = self._deserialize_entity(item)
-                elif sk.startswith(schema.SK_BUCKET):
+                elif sk == schema.sk_state():
                     for bucket in self._deserialize_composite_bucket(item):
                         key = (bucket.entity_id, bucket.resource, bucket.limit_name)
                         buckets[key] = bucket
diff --git a/tests/benchmark/test_localstack.py b/tests/benchmark/test_localstack.py
@@ -21,7 +21,7 @@
 
 import pytest
 
-from zae_limiter import Limit
+from zae_limiter import Limit, RateLimitExceeded
 
 pytestmark = [pytest.mark.benchmark, pytest.mark.integration]
 
@@ -656,3 +656,53 @@ def operation():
             time.sleep(0.5)  # Final wait for processing
 
         benchmark(operation)
+
+
+class TestLocalStackRefillRecoveryStress:
+    """E2E stress test for client-side refill recovery (regression for #428).
+
+    Repeatedly drives the acquire() slow-path refill-recovery against LocalStack:
+    exhaust -> brief real wait -> re-acquire succeeds. Before the fix this raised
+    RateLimitExceeded(retry_after=0.0) once the bucket was drained, because
+    batch_get_entity_and_buckets() dropped the existing bucket (stale "#BUCKET#"
+    SK filter vs SK=#STATE, GHSA-76rv-2r9v-c5m6) and the slow path treated it as new.
+
+    Uses sync_localstack_limiter (the no-aggregator minimal stack) on purpose: the
+    aggregator would refill buckets out-of-band and mask the client path that broke.
+
+    This is a correctness/stress test, not a micro-benchmark -- it does not use the
+    `benchmark` fixture, so it is skipped under `--benchmark-only` but runs on a
+    normal `pytest tests/benchmark/test_localstack.py` invocation.
+    """
+
+    def test_refill_recovery_stress_loop(self, sync_localstack_limiter):
+        """Exhaust -> wait -> recover, repeated over many entities."""
+        # 100 tokens, refills the full bucket every second.
+        limits = [Limit.custom("rpm", capacity=100, refill_amount=100, refill_period_seconds=1)]
+
+        iterations = 5
+        for i in range(iterations):
+            entity_id = f"ls-recover-{i}"
+
+            # Drain the bucket completely.
+            with sync_localstack_limiter.acquire(
+                entity_id=entity_id, resource="api", limits=limits, consume={"rpm": 100}
+            ):
+                pass
+
+            # Exhausted: rejection must report a real wait, not the buggy 0.0.
+            with pytest.raises(RateLimitExceeded) as exc_info:
+                with sync_localstack_limiter.acquire(
+                    entity_id=entity_id, resource="api", limits=limits, consume={"rpm": 100}
+                ):
+                    pass
+            assert exc_info.value.retry_after_seconds > 0, (
+                f"iteration {i}: rejection should report a real retry_after"
+            )
+
+            # Wait for partial refill, then recover via the slow path.
+            time.sleep(0.8)
+            with sync_localstack_limiter.acquire(
+                entity_id=entity_id, resource="api", limits=limits, consume={"rpm": 50}
+            ) as lease:
+                assert lease.consumed == {"rpm": 50}, f"iteration {i}: recovery acquire failed"
diff --git a/tests/e2e/test_localstack.py b/tests/e2e/test_localstack.py
@@ -915,6 +915,40 @@ async def test_negative_bucket_handling(self, e2e_limiter_minimal):
         # Consumed 15 tokens with capacity 10, so at least -3 after some refill
         assert available["rpm"] <= -3, "Bucket should still be significantly negative"
 
+    @pytest.mark.asyncio(loop_scope="class")
+    async def test_acquire_recovers_after_refill_wait(self, e2e_limiter_minimal):
+        """An exhausted bucket recovers after enough time passes (regression #428).
+
+        Runs on the no-aggregator stack so the client refill-recovery slow path is
+        what's exercised. On the aggregator stack the Lambda refills the bucket
+        out-of-band and the client path never runs -- which is exactly why this bug
+        (acquire raising RateLimitExceeded with retry_after=0.0 instead of
+        refilling) reached production in v0.10.1 undetected.
+        """
+        # 100 tokens, refills the full bucket every second.
+        limits = [Limit.custom("rpm", capacity=100, refill_amount=100, refill_period_seconds=1)]
+
+        # Drain the bucket completely.
+        async with e2e_limiter_minimal.acquire(
+            entity_id="recover-user", resource="api", limits=limits, consume={"rpm": 100}
+        ):
+            pass
+
+        # Immediately exhausted: the rejection must report a real wait, not 0.0.
+        with pytest.raises(RateLimitExceeded) as exc_info:
+            async with e2e_limiter_minimal.acquire(
+                entity_id="recover-user", resource="api", limits=limits, consume={"rpm": 100}
+            ):
+                pass
+        assert exc_info.value.retry_after_seconds > 0
+
+        # After refilling, the same acquire must succeed (slow-path refill recovery).
+        await asyncio.sleep(1.1)
+        async with e2e_limiter_minimal.acquire(
+            entity_id="recover-user", resource="api", limits=limits, consume={"rpm": 50}
+        ) as lease:
+            assert lease.consumed == {"rpm": 50}
+
 
 class TestE2ECloudFormationStackVariations:
     """E2E tests for CloudFormation stack deployment variations."""
diff --git a/tests/integration/test_repository.py b/tests/integration/test_repository.py
@@ -338,6 +338,54 @@ async def test_batch_get_buckets_empty_key_list(self, test_repo):
         result = await test_repo.batch_get_buckets([])
         assert result == {}
 
+    @pytest.mark.asyncio
+    async def test_batch_get_entity_and_buckets_returns_existing_buckets(self, test_repo):
+        """The acquire slow-path read must return buckets that exist (real DynamoDB).
+
+        Regression for #428: batch_get_entity_and_buckets() classified response
+        items with the stale "#BUCKET#" SK prefix, but buckets use SK=#STATE since
+        the per-shard migration (GHSA-76rv-2r9v-c5m6), so every bucket was dropped.
+        The sibling batch_get_buckets() (tested above) had no such filter, which is
+        why this gap survived. This exercises a real BatchGetItem response.
+        """
+        await test_repo.create_entity("bge-entity")
+        limits = [Limit.per_minute("rpm", 100), Limit.per_minute("tpm", 10_000)]
+        now_ms = int(time.time() * 1000)
+        states = [BucketState.from_limit("bge-entity", "gpt-4", limit, now_ms) for limit in limits]
+        await test_repo.transact_write(
+            [test_repo.build_composite_create("bge-entity", "gpt-4", states, now_ms)]
+        )
+
+        entity, buckets = await test_repo.batch_get_entity_and_buckets(
+            "bge-entity", [("bge-entity", "gpt-4")]
+        )
+
+        assert entity is not None
+        assert ("bge-entity", "gpt-4", "rpm") in buckets
+        assert ("bge-entity", "gpt-4", "tpm") in buckets
+
+    @pytest.mark.asyncio
+    async def test_batch_get_entity_and_buckets_finds_bucket_without_meta(self, test_repo):
+        """Buckets must be returned even when the entity has no #META record.
+
+        Real-world trigger: acquire() without a prior create_entity() writes a
+        bucket but no META record. The slow-path read must still find the bucket
+        (entity is None, bucket dict populated). Regression for #428.
+        """
+        limits = [Limit.per_minute("rpm", 100)]
+        now_ms = int(time.time() * 1000)
+        states = [BucketState.from_limit("bge-bare", "gpt-4", limit, now_ms) for limit in limits]
+        await test_repo.transact_write(
+            [test_repo.build_composite_create("bge-bare", "gpt-4", states, now_ms)]
+        )
+
+        entity, buckets = await test_repo.batch_get_entity_and_buckets(
+            "bge-bare", [("bge-bare", "gpt-4")]
+        )
+
+        assert entity is None
+        assert ("bge-bare", "gpt-4", "rpm") in buckets
+
     @pytest.mark.asyncio
     async def test_batch_get_buckets_deduplication(self, test_repo):
         """Should deduplicate duplicate keys in the request."""
diff --git a/tests/unit/test_limiter.py b/tests/unit/test_limiter.py
@@ -212,6 +212,54 @@ async def test_acquire_fallback_when_batch_not_supported(self, limiter, monkeypa
             assert lease.consumed == {"rpm": 1}
 
 
+class TestRateLimiterRefillRecovery:
+    """Wait-then-acquire: an exhausted bucket recovers after enough time passes.
+
+    Regression for the stale slow-path bucket discriminator (buckets moved to
+    SK=#STATE in the per-shard migration, but batch_get_entity_and_buckets still
+    filtered on the old "#BUCKET#" prefix). With buckets silently dropped, the
+    refill-recovery fallback treated existing buckets as new and the conditional
+    write failed with a bogus retry_after=0.0 instead of refilling.
+    """
+
+    async def test_acquire_succeeds_after_refill_wait(self, limiter):
+        """Exhaust a bucket, wait for refill, and acquire again (speculative on)."""
+        # 100 tokens, refills the full bucket every second.
+        limits = [Limit.custom("rpm", capacity=100, refill_amount=100, refill_period_seconds=1)]
+
+        # Drain the bucket completely.
+        async with limiter.acquire("key-1", "gpt-4", limits=limits, consume={"rpm": 100}):
+            pass
+
+        # Immediately exhausted: rejection must report a real wait, not 0.0.
+        with pytest.raises(RateLimitExceeded) as exc_info:
+            async with limiter.acquire("key-1", "gpt-4", limits=limits, consume={"rpm": 100}):
+                pass
+        assert exc_info.value.retry_after_seconds > 0
+
+        # After refilling, the same acquire must succeed.
+        await asyncio.sleep(1.1)
+        async with limiter.acquire("key-1", "gpt-4", limits=limits, consume={"rpm": 50}) as lease:
+            assert lease.consumed == {"rpm": 50}
+
+    async def test_acquire_succeeds_after_refill_wait_non_speculative(self, limiter):
+        """Same recovery on the pure slow path (speculative writes disabled)."""
+        slow = RateLimiter(repository=limiter._repository, speculative_writes=False)
+        limits = [Limit.custom("rpm", capacity=100, refill_amount=100, refill_period_seconds=1)]
+
+        async with slow.acquire("key-2", "gpt-4", limits=limits, consume={"rpm": 100}):
+            pass
+
+        with pytest.raises(RateLimitExceeded) as exc_info:
+            async with slow.acquire("key-2", "gpt-4", limits=limits, consume={"rpm": 100}):
+                pass
+        assert exc_info.value.retry_after_seconds > 0
+
+        await asyncio.sleep(1.1)
+        async with slow.acquire("key-2", "gpt-4", limits=limits, consume={"rpm": 50}) as lease:
+            assert lease.consumed == {"rpm": 50}
+
+
 class TestRateLimiterLease:
     """Tests for Lease functionality."""
 
diff --git a/tests/unit/test_repository.py b/tests/unit/test_repository.py
@@ -330,6 +330,43 @@ async def test_batch_get_buckets_empty_keys(self, repo):
         result = await repo.batch_get_buckets([])
         assert result == {}
 
+    @pytest.mark.asyncio
+    async def test_batch_get_entity_and_buckets_returns_existing_buckets(self, repo_with_buckets):
+        """The acquire slow-path read must return buckets that exist.
+
+        Regression for the stale bucket discriminator: buckets use SK=#STATE
+        since the per-shard migration (GHSA-76rv), but the response filter still
+        checked the pre-shard SK_BUCKET prefix ("#BUCKET#"), silently dropping
+        every bucket. That made the slow path treat existing buckets as new.
+        """
+        entity, buckets = await repo_with_buckets.batch_get_entity_and_buckets(
+            "entity-1", [("entity-1", "gpt-4")]
+        )
+
+        assert entity is not None  # entity-1 was created via create_entity
+        # Bucket must be discovered (the bug returned an empty dict here)
+        assert ("entity-1", "gpt-4", "rpm") in buckets
+        assert ("entity-1", "gpt-4", "tpm") in buckets
+
+    @pytest.mark.asyncio
+    async def test_batch_get_entity_and_buckets_finds_bucket_without_meta(self, repo):
+        """Buckets must be returned even when the entity has no #META record.
+
+        This is the real-world trigger: acquire() on an entity that was never
+        registered via create_entity() creates a bucket but no META record.
+        The slow-path read still must find the bucket (entity is None, but the
+        bucket dict is populated).
+        """
+        limits = [Limit.per_minute("rpm", 100)]
+        now_ms = int(time.time() * 1000)
+        states = [BucketState.from_limit("bare-1", "gpt-4", limit, now_ms) for limit in limits]
+        await repo.transact_write([repo.build_composite_create("bare-1", "gpt-4", states, now_ms)])
+
+        entity, buckets = await repo.batch_get_entity_and_buckets("bare-1", [("bare-1", "gpt-4")])
+
+        assert entity is None  # never created via create_entity
+        assert ("bare-1", "gpt-4", "rpm") in buckets  # bug returned {} here
+
     # -------------------------------------------------------------------------
     # batch_get_configs tests (issue #298)
     # -------------------------------------------------------------------------
diff --git a/tests/unit/test_sync_limiter.py b/tests/unit/test_sync_limiter.py
@@ -163,6 +163,44 @@ def test_acquire_fallback_when_batch_not_supported(self, sync_limiter, monkeypat
             assert lease.consumed == {"rpm": 1}
 
 
+class TestRateLimiterRefillRecovery:
+    """Wait-then-acquire: an exhausted bucket recovers after enough time passes.
+
+    Regression for the stale slow-path bucket discriminator (buckets moved to
+    SK=#STATE in the per-shard migration, but batch_get_entity_and_buckets still
+    filtered on the old "#BUCKET#" prefix). With buckets silently dropped, the
+    refill-recovery fallback treated existing buckets as new and the conditional
+    write failed with a bogus retry_after=0.0 instead of refilling.
+    """
+
+    def test_acquire_succeeds_after_refill_wait(self, sync_limiter):
+        """Exhaust a bucket, wait for refill, and acquire again (speculative on)."""
+        limits = [Limit.custom("rpm", capacity=100, refill_amount=100, refill_period_seconds=1)]
+        with sync_limiter.acquire("key-1", "gpt-4", limits=limits, consume={"rpm": 100}):
+            pass
+        with pytest.raises(RateLimitExceeded) as exc_info:
+            with sync_limiter.acquire("key-1", "gpt-4", limits=limits, consume={"rpm": 100}):
+                pass
+        assert exc_info.value.retry_after_seconds > 0
+        time.sleep(1.1)
+        with sync_limiter.acquire("key-1", "gpt-4", limits=limits, consume={"rpm": 50}) as lease:
+            assert lease.consumed == {"rpm": 50}
+
+    def test_acquire_succeeds_after_refill_wait_non_speculative(self, sync_limiter):
+        """Same recovery on the pure slow path (speculative writes disabled)."""
+        slow = SyncRateLimiter(repository=sync_limiter._repository, speculative_writes=False)
+        limits = [Limit.custom("rpm", capacity=100, refill_amount=100, refill_period_seconds=1)]
+        with slow.acquire("key-2", "gpt-4", limits=limits, consume={"rpm": 100}):
+            pass
+        with pytest.raises(RateLimitExceeded) as exc_info:
+            with slow.acquire("key-2", "gpt-4", limits=limits, consume={"rpm": 100}):
+                pass
+        assert exc_info.value.retry_after_seconds > 0
+        time.sleep(1.1)
+        with slow.acquire("key-2", "gpt-4", limits=limits, consume={"rpm": 50}) as lease:
+            assert lease.consumed == {"rpm": 50}
+
+
 class TestRateLimiterLease:
     """Tests for SyncLease functionality."""
 
diff --git a/tests/unit/test_sync_repository.py b/tests/unit/test_sync_repository.py
@@ -267,6 +267,37 @@ def test_batch_get_buckets_empty_keys(self, repo):
         result = repo.batch_get_buckets([])
         assert result == {}
 
+    def test_batch_get_entity_and_buckets_returns_existing_buckets(self, repo_with_buckets):
+        """The acquire slow-path read must return buckets that exist.
+
+        Regression for the stale bucket discriminator: buckets use SK=#STATE
+        since the per-shard migration (GHSA-76rv), but the response filter still
+        checked the pre-shard SK_BUCKET prefix ("#BUCKET#"), silently dropping
+        every bucket. That made the slow path treat existing buckets as new.
+        """
+        entity, buckets = repo_with_buckets.batch_get_entity_and_buckets(
+            "entity-1", [("entity-1", "gpt-4")]
+        )
+        assert entity is not None
+        assert ("entity-1", "gpt-4", "rpm") in buckets
+        assert ("entity-1", "gpt-4", "tpm") in buckets
+
+    def test_batch_get_entity_and_buckets_finds_bucket_without_meta(self, repo):
+        """Buckets must be returned even when the entity has no #META record.
+
+        This is the real-world trigger: acquire() on an entity that was never
+        registered via create_entity() creates a bucket but no META record.
+        The slow-path read still must find the bucket (entity is None, but the
+        bucket dict is populated).
+        """
+        limits = [Limit.per_minute("rpm", 100)]
+        now_ms = int(time.time() * 1000)
+        states = [BucketState.from_limit("bare-1", "gpt-4", limit, now_ms) for limit in limits]
+        repo.transact_write([repo.build_composite_create("bare-1", "gpt-4", states, now_ms)])
+        entity, buckets = repo.batch_get_entity_and_buckets("bare-1", [("bare-1", "gpt-4")])
+        assert entity is None
+        assert ("bare-1", "gpt-4", "rpm") in buckets
+
     def test_batch_get_configs_empty_keys(self, repo):
         """batch_get_configs should return empty dict for empty keys list."""
         result = repo.batch_get_configs([])