diff --git a/Microsoft.Azure.Cosmos.Encryption/src/EncryptionKeyStoreProviderImpl.cs b/Microsoft.Azure.Cosmos.Encryption/src/EncryptionKeyStoreProviderImpl.cs index 086e9855d8..b19b577bbe 100644 --- a/Microsoft.Azure.Cosmos.Encryption/src/EncryptionKeyStoreProviderImpl.cs +++ b/Microsoft.Azure.Cosmos.Encryption/src/EncryptionKeyStoreProviderImpl.cs @@ -31,7 +31,12 @@ public EncryptionKeyStoreProviderImpl(IKeyEncryptionKeyResolver keyEncryptionKey { this.keyEncryptionKeyResolver = keyEncryptionKeyResolver; this.ProviderName = providerName; - this.DataEncryptionKeyCacheTimeToLive = TimeSpan.Zero; + + // Enable the MDE library's built-in DEK byte cache. When ProtectedDataEncryptionKey cache + // expires (every 1–2 hours), the DEK byte cache still holds the unwrapped key bytes, so + // reconstruction avoids Key Vault HTTP calls. The 2-hour TTL outlives the default + // ProtectedDataEncryptionKey TTL (1 hour), covering most steady-state cache misses. + this.DataEncryptionKeyCacheTimeToLive = TimeSpan.FromHours(2); } public override string ProviderName { get; } diff --git a/Microsoft.Azure.Cosmos.Encryption/tests/Microsoft.Azure.Cosmos.Encryption.Tests/EncryptionKeyStoreProviderImplTests.cs b/Microsoft.Azure.Cosmos.Encryption/tests/Microsoft.Azure.Cosmos.Encryption.Tests/EncryptionKeyStoreProviderImplTests.cs new file mode 100644 index 0000000000..39927acfe9 --- /dev/null +++ b/Microsoft.Azure.Cosmos.Encryption/tests/Microsoft.Azure.Cosmos.Encryption.Tests/EncryptionKeyStoreProviderImplTests.cs @@ -0,0 +1,24 @@ +//------------------------------------------------------------ +// Copyright (c) Microsoft Corporation. All rights reserved. 
+//------------------------------------------------------------ + +namespace Microsoft.Azure.Cosmos.Encryption.Tests +{ + using System; + using global::Azure.Core.Cryptography; + using Microsoft.VisualStudio.TestTools.UnitTesting; + using Moq; + + [TestClass] + public class EncryptionKeyStoreProviderImplTests + { + [TestMethod] + public void Constructor_DekByteCacheEnabled_TwoHours() + { + Mock<IKeyEncryptionKeyResolver> mockResolver = new Mock<IKeyEncryptionKeyResolver>(); + EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(mockResolver.Object, "testProvider"); + + Assert.AreEqual(TimeSpan.FromHours(2), provider.DataEncryptionKeyCacheTimeToLive); + } + } +} diff --git a/openspec/changes/reduce-encryption-contention/.openspec.yaml b/openspec/changes/reduce-encryption-contention/.openspec.yaml new file mode 100644 index 0000000000..8f0b86992e --- /dev/null +++ b/openspec/changes/reduce-encryption-contention/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-03-05 diff --git a/openspec/changes/reduce-encryption-contention/design.md b/openspec/changes/reduce-encryption-contention/design.md new file mode 100644 index 0000000000..487304c072 --- /dev/null +++ b/openspec/changes/reduce-encryption-contention/design.md @@ -0,0 +1,285 @@ +## Context + +Client-side encryption in the Cosmos DB .NET SDK uses a global `SemaphoreSlim(1,1)` that serializes all encryption key resolution operations — specifically, the construction or cache lookup of `ProtectedDataEncryptionKey` and `KeyEncryptionKey` objects via the Microsoft Data Encryption library's `GetOrCreate` methods. These are the only operations guarded by the semaphore; encrypt/decrypt of document payloads happens outside the semaphore. The semaphore guards the Microsoft Data Encryption library's `ProtectedDataEncryptionKey.GetOrCreate` and `KeyEncryptionKey.GetOrCreate` — both of which are cache lookups on the hot path but trigger synchronous Key Vault HTTP calls on cache miss.
`ProtectedDataEncryptionKey` resolution is sync, on the hot path, and makes two blocking HTTP calls to Key Vault (Resolve + UnwrapKey) under the semaphore. + +The call amplification is severe: `DecryptObjectAsync` calls `BuildEncryptionAlgorithmForSettingAsync` per encrypted leaf value per document per page. With concurrent feed iterators, this produces thousands of semaphore acquisitions per second, all serialized to a single permit. Note: both read (decrypt) and write (encrypt) paths acquire the semaphore for key resolution; however, the customer workload triggering this issue is read-heavy (Change Feed Processor with concurrent `FeedIterator`). A dedicated investigation task (1.9) covers whether specific operation types trigger disproportionate contention. + +**Structural constraint**: The Microsoft Data Encryption library's `ProtectedDataEncryptionKey` constructor chain is sync (C# cannot make base-ctor calls async). `EncryptionKeyStoreProvider.UnwrapKey` returns `byte[]` (sync). The Microsoft Data Encryption library's `LocalCache.GetOrCreate` takes `Func<T>` not `Func<Task<T>>`. The sync chain from `ProtectedDataEncryptionKey` → `KeyEncryptionKey.DecryptEncryptionKey` → `UnwrapKey` → `Resolve` + `UnwrapKey` on Key Vault cannot be made async at any point in the Microsoft Data Encryption library's type hierarchy. However, `IKeyEncryptionKeyResolver` exposes `ResolveAsync` and `IKeyEncryptionKey` exposes `UnwrapKeyAsync` — these async variants are reachable from our code, just not through the Microsoft Data Encryption library's call chain. + +## Goals / Non-Goals + +**Goals:** +- Eliminate `OperationCanceledException` cascade when `ProtectedDataEncryptionKey` cache expires under concurrent load +- Reduce semaphore hold time on `ProtectedDataEncryptionKey` cache miss from 200ms–2.4s to microseconds +- No public API changes. No breaking changes.
+- Minimal complexity: prefer the simplest change that solves the customer’s problem + +**Non-Goals:** +- Changing the Microsoft Data Encryption library’s type hierarchy or making the `ProtectedDataEncryptionKey` constructor async +- Removing the semaphore entirely (it guards the Microsoft Data Encryption library internal state mutation — still needed) +- Per-key semaphores (reduces cross-key contention but same-key still serializes; added complexity for marginal gain) +- Client Encryption Key rotation handling (Client Encryption Key replacement requires data migration — re-encrypting every document — and is an offline operation; not a runtime concern for caching) +- Supporting key rotation detection at the cache level (Client Encryption Key rewrap doesn’t change plaintext bytes; new Client Encryption Key policy creates new `EncryptionSettingForProperty` objects) + +## Approach Comparison + +Two approaches were evaluated. Approach A (enable the Microsoft Data Encryption library’s built-in Data Encryption Key byte cache) is the recommended starting point due to its simplicity and minimal risk surface. + +### Approach A: Enable Data Encryption Key Byte Cache (Low-Risk Baseline) + +#### Why this was not considered earlier + +The original design dismissed the Data Encryption Key byte cache with circular reasoning: + +> “Enabling the Microsoft Data Encryption library Data Encryption Key byte cache (`DataEncryptionKeyCacheTimeToLive`) as a standalone layer (redundant when async prefetch is working)” + +This is backward — the question is whether the prefetch is needed at all if enabling the built-in cache solves the problem. The oversight occurred because investigation focused on the `ProtectedDataEncryptionKey` cache layer and the semaphore, without tracing the full call chain through `UnwrapKey` → `GetOrCreateDataEncryptionKey` → `LocalCache.GetOrCreate` to discover that the SDK **explicitly disables** a cache that would prevent the Key Vault calls entirely. 
+ +#### How the built-in cache works + +The Microsoft Data Encryption library’s `EncryptionKeyStoreProvider` base class has a `LocalCache` keyed by the hex-encoded wrapped key bytes. It is checked inside `UnwrapKey` → `GetOrCreateDataEncryptionKey` before the `UnwrapKeyCore` delegate (which calls `Resolve()` + `UnwrapKey()` on Key Vault) is invoked. + +The Cosmos SDK’s `EncryptionKeyStoreProviderImpl` constructor **explicitly disables** this cache: + +```csharp +this.DataEncryptionKeyCacheTimeToLive = TimeSpan.Zero; +``` + +When `TimeToLive <= TimeSpan.Zero`, `LocalCache.GetOrCreate` skips the cache entirely and always invokes the delegate — meaning every `ProtectedDataEncryptionKey` cache miss triggers two Key Vault HTTP calls. + +#### What enabling it solves + +The customer’s problem is the **steady-state `ProtectedDataEncryptionKey` cache time-to-live expiry** (every 1–2 hours) under sustained concurrent load: + +1. `ProtectedDataEncryptionKey` cache time-to-live expires → cache miss → constructor runs under semaphore +2. Constructor → `KeyEncryptionKey.DecryptEncryptionKey` → `UnwrapKey` → **Data Encryption Key byte cache check** +3. **Today (cache disabled):** `LocalCache.GetOrCreate` skips cache → `UnwrapKeyCore` → `Resolve()` + `UnwrapKey()` on Key Vault (200ms–2.4s) +4. **With cache enabled:** `LocalCache.GetOrCreate` finds a warm entry (populated 1 hour ago when the previous `ProtectedDataEncryptionKey` was constructed) → returns instantly → **zero Key Vault calls** +5. Semaphore hold time drops to **microseconds** — identical to the prefetch approach’s goal + +This works because the Data Encryption Key byte cache time-to-live (set to 2 hours or longer) outlives the `ProtectedDataEncryptionKey` cache time-to-live (1–2 hours, defaults to 1 hour via `keyCacheTimeToLive`). 
When the `ProtectedDataEncryptionKey` expires and is reconstructed, the wrapped key bytes haven’t changed — the Data Encryption Key byte cache key is `encryptedKey.ToHexString()`, which is the same wrapped blob — so the cache hits. + +#### Critical limitation: periodic time-to-live alignment causes co-expiry + +However, the Data Encryption Key byte cache uses `AbsoluteExpirationRelativeToNow` — the time-to-live is set once at creation and **not renewed on cache hit**. This means the Data Encryption Key byte cache entry created at T=0 expires at T=2h regardless of how many `ProtectedDataEncryptionKey` reconstructions hit it in between. + +**With Data Encryption Key byte cache time-to-live = 2h, `ProtectedDataEncryptionKey` time-to-live = 1h (defaults):** + +| Time | `ProtectedDataEncryptionKey` cache | Data Encryption Key byte cache | Semaphore hold time | +|------|-----------|----------------|-----| +| T=0h | Populated (expires T=1h) | Populated (expires T=2h) | Cold start — Key Vault I/O | +| T=1h | **MISS** → reconstruct | HIT (not renewed, still expires T=2h) | **Microseconds** | +| T=2h | **MISS** → reconstruct | **MISS** (co-expired) → Key Vault I/O, repopulated (expires T=4h) | **200ms–2.4s — thundering herd possible** | +| T=3h | **MISS** → reconstruct | HIT (not renewed, still expires T=4h) | **Microseconds** | +| T=4h | **MISS** → reconstruct | **MISS** (co-expired) → Key Vault I/O | **200ms–2.4s** | + +**Pattern: every `DataEncryptionKeyCacheTimeToLive` interval, both caches co-expire and the full Key Vault call path is hit under the semaphore.** The thundering herd still occurs at these alignment boundaries — just less frequently (every 2h instead of every 1h with defaults). + +Setting a much longer Data Encryption Key byte cache time-to-live (e.g. 
24h) would reduce the frequency further (1 slow miss per 24h), but: +- The thundering herd at the 24h alignment boundary is identical to today’s problem — just rare +- A 24h stale key material window may raise security review concerns +- If `keyCacheTimeToLive` is configured shorter (e.g. 10 minutes), the alignment frequency increases proportionally + +**Verdict: Approach A reduces frequency of the problem but does not eliminate it.** It’s a low-risk improvement but not a complete fix for customers with sustained high-concurrency workloads and strict cancellation timeouts. + +#### The change + +```csharp +// EncryptionKeyStoreProviderImpl constructor — change ONE line: +this.DataEncryptionKeyCacheTimeToLive = TimeSpan.Zero; +// becomes: +this.DataEncryptionKeyCacheTimeToLive = TimeSpan.FromHours(2); +// (or: keyCacheTimeToLive * 2, ensuring it outlives the ProtectedDataEncryptionKey cache) +``` + +#### Key rotation correctness + +Client Encryption Key rewrap (the only runtime rotation) changes the wrapped key bytes → different `encryptedKey.ToHexString()` → different cache key → cache miss → fresh Key Vault call. The plaintext Data Encryption Key bytes are unchanged, so the result is correct. No cache coherence issue. + +#### KEK revocation detection + +Enabling the Data Encryption Key byte cache introduces a conflict with KEK (Key Encryption Key) revocation detection. The existing error-handling path in `BuildEncryptionAlgorithmForSettingAsync` catches `RequestFailedException(403)` from `UnwrapKey` and triggers a force-refresh flow that fetches the latest Client Encryption Key properties from the backend. With the byte cache enabled, `UnwrapKey` → `GetOrCreateDataEncryptionKey` returns cached unwrapped bytes on a cache hit **without ever calling the key resolver**. The 403 is never thrown. KEK revocation becomes undetectable for the lifetime of the cache entry. 
+ +This is not a theoretical concern — the `VerifyKekRevokeHandling` emulator test explicitly validates this flow: cache a DEK, revoke the KEK, verify that subsequent operations fail with the "needs to be rewrapped" error. With a naïve cache enable (`TimeSpan.Zero` → `TimeSpan.FromHours(2)`), this test fails because the cached bytes mask the revocation. + +Simple eviction doesn't work either — evicting the byte cache entry before every `BuildProtectedDataEncryptionKeyAsync` call renders the cache useless, since that's the only code path that reads it. This is functionally equivalent to `DataEncryptionKeyCacheTimeToLive = TimeSpan.Zero`. + +**This is resolved by the custom cache in Approach B** — see Decision 2 for the `revalidationInterval` design that separates the performance and security time scales. + +#### Summary of Approach A limitations + +| Scenario | Approach A behavior | +|---|---| +| **Steady-state time-to-live expiry (PDEK miss, DEK hit)** | Fast — no Key Vault calls. Covers most PDEK reconstructions. | +| **Periodic co-expiry (PDEK miss AND DEK miss)** | Slow — full Key Vault I/O under semaphore. Thundering herd still possible at alignment boundaries. | +| **Cold start** (first-ever call, no prior cache entry) | Key Vault I/O under semaphore — but at cold start there’s no concurrent load, so no thundering herd | +| **Reducing Resolve calls** | On a true cache miss, still calls `Resolve()` + `UnwrapKey()` (2 Key Vault calls). Does not cache the resolved `IKeyEncryptionKey` | +| **Proactive refresh before expiry** | No proactive refresh — relies on the next access to populate | +| **KEK revocation detection** | Not supported by simple byte cache — cached bytes mask revocation. Resolved by Approach B’s `revalidationInterval` (see Decision 2). | + +### Approach B: Async Prefetch with Background Refresh (Recommended) + +This is the full approach originally proposed. 
It adds a sealed subclass `CachingEncryptionKeyStoreProviderImpl` with async prefetch outside the semaphore, inflight deduplication, a resolved-client cache (caching the `IKeyEncryptionKey` / `CryptographyClient` per Customer Master Key URL), proactive background refresh with jitter, a concurrency limiter, disposal lifecycle with `CancellationTokenSource`, and environment variable gating. + +Approach B eliminates the co-expiry problem because the prefetch cache is refreshed **proactively before time-to-live expiry** — the sync `UnwrapKey` inside the semaphore always finds a warm prefetch entry. It also covers cold start (prefetch runs outside the semaphore), reduces Key Vault calls from 2 to 1 (via resolved-client cache), and handles the background refresh lifecycle cleanly. + +**However**, Approach B is significantly more complex. Consider whether the complexity is justified for the customer’s actual workload. + +#### Can the approaches be combined? + +**Yes — and this is recommended.** Enabling the Data Encryption Key byte cache (Approach A) as a baseline improvement is low-risk and benefits all customers immediately. Approach B layers on top for customers who need zero-contention guarantees: + +- Approach A: default-on, no environment variable needed, reduces slow-miss frequency from every `keyCacheTimeToLive` to every `DataEncryptionKeyCacheTimeToLive` +- Approach B: opt-in via environment variable, eliminates slow misses entirely via proactive prefetch + +The full Approach B design (decisions, risks, migration plan) is retained below. + +## Recommendation + +1. **Ship Approach A (enable Data Encryption Key byte cache) as a default-on baseline.** One-line change, benefits all customers, reduces slow-miss frequency. Low risk. +2. **Ship Approach B (async prefetch) gated behind `AZURE_COSMOS_ENCRYPTION_OPTIMISTIC_DECRYPTION_ENABLED` environment variable** for customers who need zero-contention guarantees under sustained concurrent load. +3. 
**Benchmark both** against the customer workload to validate: (a) Approach A reduces frequency as expected, (b) Approach B eliminates contention entirely. +4. The `AeadAes256CbcHmac256EncryptionAlgorithm.GetOrCreate` cleanup (see Future Considerations) can ship independently with either approach. + + +## Decisions + +### Decision 1: Async prefetch in a sealed subclass, not in the base `EncryptionKeyStoreProviderImpl` + +**Chosen**: New `CachingEncryptionKeyStoreProviderImpl` (sealed, internal) extends `EncryptionKeyStoreProviderImpl`. Base class stays clean sync-only. `EncryptionCosmosClient` instantiates the subclass when environment variable is on. + +**Rationale**: The base class serves all existing customers with zero behavior change. The subclass adds: prefetch `ConcurrentDictionary`, resolved-client cache (a `ConcurrentDictionary` that caches the `CryptographyClient` returned by `IKeyEncryptionKeyResolver.Resolve()` per Customer Master Key URL — called "resolved-client" because it caches the resolved `IKeyEncryptionKey` client object, not the unwrapped key bytes), `CancellationTokenSource`, `Cleanup()` method, and overridden `UnwrapKey` that checks prefetch cache first. Separation keeps the risk surface isolated. + +**Rejected alternative**: Adding prefetch directly to base class with conditional branches — mixes concerns, harder to reason about lifecycle, risk of unintended behavior when environment variable is off. + +### Decision 2: `ConcurrentDictionary` for prefetch cache (not `MemoryCache` or the Microsoft Data Encryption library's `LocalCache`) + +**Chosen**: `ConcurrentDictionary` with manual time-to-live tracking (expiry stored alongside value). + +**Rationale**: The Microsoft Data Encryption library's `LocalCache.GetOrCreate` is get-or-create only — no `Set`, `Remove`, or `Evict` API. `MemoryCache` adds a dependency and has its own concurrency model. `ConcurrentDictionary` is simple, lock-free reads, and we control time-to-live ourselves. 
+ +#### KEK revocation via revalidation interval + +The custom cache also resolves the KEK revocation detection problem described in Approach A's "KEK revocation detection" section. The fundamental tension is between two goals that both go through the same `UnwrapKey` → resolver path: +1. **Performance**: skip the resolver call (avoid Key Vault HTTP I/O under the semaphore) +2. **Security**: call the resolver (detect KEK revocation) + +These are contradictory on any single call. The custom cache resolves this by tracking a `lastValidated` timestamp per entry and introducing a `revalidationInterval`: + +``` +CacheEntry { + byte[] UnwrappedBytes // the cached unwrapped DEK bytes + DateTime LastValidated // when we last confirmed KEK access via the resolver +} +``` + +Lookup behavior: + +| Cache state | Action | +|---|---| +| **Cache miss** | Call resolver (`UnwrapKeyCore`), populate entry with `LastValidated = UtcNow` | +| **Cache hit, within `revalidationInterval`** | Return cached bytes immediately — no resolver call | +| **Cache hit, past `revalidationInterval`** | Call resolver to re-validate KEK access. **Succeeds**: update `LastValidated`, return bytes. **Fails (403)**: evict entry, rethrow — existing force-refresh path handles recovery | + +**Key parameters**: + +- `cacheTtl` (overall entry lifetime): 2 hours — same as the current `DataEncryptionKeyCacheTimeToLive` intent. Controls how long unwrapped bytes are retained at all. +- `revalidationInterval` (KEK access re-check frequency): recommended default matching `ProtectedDataEncryptionKey` time-to-live (1 hour). This is the maximum window during which KEK revocation goes undetected. + +This **separates the time scales**: ~99% of `UnwrapKey` calls (within `revalidationInterval`) are pure cache hits for performance, while periodic calls (at revalidation boundaries) contact the resolver for security validation. 
KEK revocation is detected within `revalidationInterval` — not immediately, but within a bounded and configurable window. The existing `BuildEncryptionAlgorithmForSettingAsync` 403 → force-refresh flow is unchanged. + +**Test compatibility**: The `VerifyKekRevokeHandling` test sets `ProtectedDataEncryptionKey.TimeToLive = TimeSpan.Zero`, meaning every operation rebuilds the PDEK and calls `UnwrapKey`. With `revalidationInterval` derived from the PDEK time-to-live (zero → every call validates), the test passes — every `UnwrapKey` call contacts the resolver and detects revocation immediately. + +**Implementation notes**: +- Override `GetOrCreateDataEncryptionKey` in `EncryptionKeyStoreProviderImpl` (or the `CachingEncryptionKeyStoreProviderImpl` subclass) — the base class method is `protected virtual` +- Set `DataEncryptionKeyCacheTimeToLive = TimeSpan.Zero` on the base class to disable its simple cache; the override handles all caching with revalidation +- Derive `revalidationInterval` from `ProtectedDataEncryptionKey.TimeToLive`: if PDEK time-to-live is zero (no caching), revalidation is zero (every call validates); if PDEK time-to-live is 1 hour, revalidation is 1 hour (one validation per PDEK rebuild cycle) + +### Decision 3: Proactive refresh via `Task.Run`, not `Timer` + +**Chosen**: On cache access, if within a configurable refresh window of expiry, fire `Task.Run` to refresh in background. The refresh window SHALL be **20% of the cache time-to-live, capped at a maximum of 5 minutes**. For example: time-to-live = 1 hour → refresh at 5 minutes before expiry (20% × 60 min = 12 min, capped to 5 min); time-to-live = 10 minutes → refresh at 2 minutes before expiry (20% × 10 min = 2 min). Deduplicate via `ConcurrentDictionary`. + +**Rationale**: `Timer` requires managing timer lifecycle per key, disposal ordering, and thread-pool callbacks.
`Task.Run` on access is simpler — only fires when the key is actually being used (no wasted refresh for unused keys). Tied to `CancellationTokenSource` for cleanup on dispose. The refresh window is percentage-based rather than a fixed duration because the `keyCacheTimeToLive` is customer-configurable (defaults to 1 hour, max 2 hours from `ProtectedDataEncryptionKey.TimeToLive`, but can be set shorter). A fixed 5-minute window would be disproportionate for short time-to-live values. + +**Rejected alternative**: `System.Threading.Timer` per key — more deterministic timing but complex lifecycle management (dispose timers on cache eviction, handle timer callbacks after disposal). + +## Risks / Trade-offs + +- **[Non-risk] Cache coherence on key rotation**: Client Encryption Key rewrap (same plaintext Data Encryption Key, new Customer Master Key wrapper) is the only runtime rotation — all caches remain valid. Client Encryption Key replacement is out of scope (requires data migration, offline operation). + +- **[Risk] Background refresh lifecycle**: `Task.Run` captures `this`, keeping the provider and all its dependencies (resolver, credential chain) GC-rooted until the task completes. → **Mitigation**: `CancellationTokenSource` cancelled on `Cleanup()`/`Dispose()`. `Interlocked.Exchange` for idempotent dispose. Background tasks observe cancellation token. + +- **[Risk] Two copies of plaintext Data Encryption Key in memory**: `ProtectedDataEncryptionKey` internal field + prefetch `ConcurrentDictionary`. → **Mitigation**: Same process, same threat boundary, same time-to-live. No new attack surface. Industry standard (Azure Key Vault SDK recommends caching). + +- **[Risk] Environment variable read at construction only**: If an operator enables the environment variable and restarts the app, old client instances still run without optimization. → **Mitigation**: This is by design — matches SDK `ConfigurationManager` pattern. Client must be reconstructed to pick up changes.
+ +- **[Risk] Azure Key Vault burst on multi-Client Encryption Key proactive refresh**: If multiple Client Encryption Keys have similar time-to-live values (common at startup — all populated in a short window), their proactive refreshes fire simultaneously. Each refresh is deduplicated per key (1 call per Client Encryption Key), but N Client Encryption Keys = N concurrent Azure Key Vault calls. Azure Key Vault throttles at 4000 ops/vault/10s — unlikely to hit for typical customers (1–10 Client Encryption Keys), but possible for large deployments. → **Mitigation (recommended)**: Add jitter to the proactive refresh window (random offset within the percentage-based refresh window, per key) so Client Encryption Keys don’t all refresh at the same instant. Consider a concurrency limiter on background Azure Key Vault calls. + +- **[Trade-off] Prefetch adds complexity for correctness**: Async prefetch + background refresh + disposal lifecycle is more complex than a one-liner (`TimeSpan.Zero` → `TimeSpan.FromHours(1)`). But it solves the actual problem — `ProtectedDataEncryptionKey` resolution is async, off the hot path, and doesn’t overload Azure Key Vault. → **Accept**: Complexity is isolated in a sealed subclass, gated by environment variable. + +## Migration Plan + +1. All changes are gated by `AZURE_COSMOS_ENCRYPTION_OPTIMISTIC_DECRYPTION_ENABLED` (off by default). +2. Ship in next SDK release with release notes documenting the environment variable. +3. Customers experiencing contention enable the environment variable. No code changes to their app. +4. After bake time (1–2 release cycles), consider enabling by default. +5. **Rollback**: Set environment variable to false (or unset it). Immediate revert to current behavior on next client construction. + +## Performance Benchmarking Strategy + +The fix must be validated with quantitative evidence, not just passing tests. The benchmark should reproduce the customer's actual workload — not a synthetic microbenchmark. 
+ +**Customer workload profile (from incident investigation):** +- Service running high-concurrency `CosmosFeedIterator` reads with client-side encryption enabled +- Multiple encrypted properties per document (nested objects/arrays → per-leaf-value semaphore acquisition) +- Azure Key Vault-backed key resolver (`KeyResolver(DefaultTokenCredential)`) with `DefaultAzureCredential` (Managed Identity in production) +- Sustained concurrent load — multiple feed iterators running in parallel +- Failure point: `ProtectedDataEncryptionKey` cache time-to-live expires (every 1–2 hours) → first thread blocks on sync Key Vault I/O inside semaphore → queued threads' cancellation tokens fire → `OperationCanceledException` +- Cancellation timeout: ~5s (inferred from call stack — Change Feed Processor lease rebalance or request timeout) + +**The benchmark must reproduce this pattern**: concurrent feed iterator reads decrypting multi-property documents, crossing a `ProtectedDataEncryptionKey` cache time-to-live boundary, with realistic Key Vault latency. + +**What to measure:** +- **Semaphore hold time** (p50, p95, p99): How long each thread holds the semaphore during `BuildProtectedDataEncryptionKeyAsync`. Baseline (without fix) should show 200ms–2.4s on cache miss. With fix, should drop to microseconds. +- **End-to-end decrypt latency** (p50, p95, p99): Time from `ReadNextAsync` call to decrypted response, under concurrent load. +- **Throughput**: Decrypted documents per second at N concurrent feed iterators. +- **Azure Key Vault call count**: Total `Resolve` + `UnwrapKey` calls to Key Vault over a fixed workload. Should drop from O(`ProtectedDataEncryptionKey` cache misses × 2) to O(Client Encryption Keys × 1 per time-to-live interval). +- **`OperationCanceledException` rate**: With a 5s cancellation timeout (matching customer pattern), count how many operations are cancelled. Baseline should reproduce the customer's issue; with fix, should be zero. 
+ +**Benchmark configuration:** +- Concurrent feed iterators: 1, 10, 50, 100 +- Encrypted properties per document: 1, 5, 10 +- Documents per page: 10, 50 +- Client Encryption Key count: 1, 5 (to test multi-Client Encryption Key refresh burst) +- Key Vault latency: simulated via `InMemoryKeyResolver` with configurable delay (0ms, 100ms, 500ms, 2000ms) — 500ms represents typical Azure Key Vault latency, 2000ms represents cold token + Azure Key Vault worst case +- `ProtectedDataEncryptionKey` cache state: warm (cache hit) and cold (force time-to-live expiry to simulate the 2-hour boundary) +- **Key scenario (must reproduce customer failure)**: Start with warm cache under sustained concurrent reads (50+ feed iterators), then force `ProtectedDataEncryptionKey` cache time-to-live expiry mid-workload with 500ms simulated Azure Key Vault latency and 5s cancellation timeout. Baseline should show `OperationCanceledException` cascade; with fix, should show zero cancellations. + +**Comparison:** +- Environment variable off (baseline — current behavior) vs. environment variable on (with fix) +- Same workload, same concurrency, same simulated Key Vault latency +- Output: CSV with latency percentiles, throughput, Azure Key Vault call counts, cancellation counts + +**Where to run:** +- Unit-level microbenchmarks (in-process, `InMemoryKeyResolver` with configurable delay) for semaphore hold time and throughput +- Emulator end-to-end benchmarks for end-to-end latency with real Cosmos operations (requires emulator) + +## Open Questions + +1. Should the resolved-client cache (`IKeyEncryptionKey`) be invalidated on a timer, or only on Customer Master Key URL mismatch? The `CryptographyClient` is stateless and long-lived, so indefinite caching until URL change seems safe. +2. What should the prefetch cache time-to-live be? Should it match `ProtectedDataEncryptionKey` cache time-to-live (1–2 hours) or be shorter? +3. 
**Azure Key Vault burst on multi-Client Encryption Key refresh**: If a customer has N distinct Client Encryption Keys all created around the same time, their prefetch caches expire roughly together, causing N concurrent `ResolveAsync` + `UnwrapKeyAsync` calls to Azure Key Vault in one burst. Should the proactive refresh window include jitter/stagger (e.g., random offset within the percentage-based refresh window) to spread calls over time? Should there be a rate limiter (max M concurrent background Azure Key Vault calls)? +4. **Single vs. dual `ConcurrentDictionary` for prefetch**: The current design uses two dictionaries — `ConcurrentDictionary` (result cache) and `ConcurrentDictionary` (inflight deduplication). The result cache serves the sync `UnwrapKey` path (needs `byte[]` immediately, can't `await`). The inflight dictionary is ephemeral (populated only during active Azure Key Vault calls, removed on completion). An alternative is a single `ConcurrentDictionary>` where the sync path calls `.Result` (safe when the task is already completed from a prior prefetch). Trade-off: fewer data structures vs. less explicit intent. Should the implementer choose based on readability? +5. **Stale or missing prefetch cache entry at sync `UnwrapKey` time**: If the prefetch cache does not have a refreshed `ProtectedDataEncryptionKey` entry when the sync `UnwrapKey` is called inside the semaphore (e.g. prefetch failed, prefetch was slow, or the entry expired between prefetch and semaphore acquisition), the sync path falls through to the existing `Resolve()` + `UnwrapKey()` Key Vault calls — identical to current behavior. The semaphore hold time in this case is the same as today (200ms–2.4s on cache miss). This is the expected fallback; the optimization is best-effort and the worst case is unchanged from the status quo. 
+ +## Future Considerations + +### Use the Microsoft Data Encryption library's built-in algorithm cache (`AeadAes256CbcHmac256EncryptionAlgorithm.GetOrCreate`) + +The Microsoft Data Encryption library already has a static `LocalCache<Tuple<DataEncryptionKey, EncryptionType>, AeadAes256CbcHmac256EncryptionAlgorithm>` with a `GetOrCreate` method. The SDK currently bypasses it by calling `new AeadAes256CbcHmac256EncryptionAlgorithm(pdek, encType)` directly at `EncryptionSettingForProperty.cs:114`. + +Switching to `GetOrCreate` would eliminate a redundant allocation per encrypted leaf value on the hot path (when the `ProtectedDataEncryptionKey` cache hits, the same `ProtectedDataEncryptionKey` reference produces equal `Tuple<DataEncryptionKey, EncryptionType>` keys → algorithm cache hit). `ProtectedDataEncryptionKey` overrides `Equals`/`GetHashCode` with structural equality (`Name` + `KeyEncryptionKey` + `rootKeyHexString`), so the cache key works correctly. + +**Why it's not in this change**: The `GetOrCreate` call occurs *after* the semaphore is released — it doesn't reduce semaphore contention. To skip the semaphore entirely, the cache check would need to be hoisted above `BuildProtectedDataEncryptionKeyAsync`, which reintroduces the ETag validation complexity and interaction with the Forbidden retry path. The async prefetch already makes the semaphore hold time microseconds, so the marginal benefit of skipping it entirely is small. + +**Low-risk cleanup**: `new` → `GetOrCreate` is a one-line change that saves allocations without changing semantics. Does not conflict with the `injectedAlgorithm` test hook (that path returns before reaching line 114). Could be done independently of this change. 
diff --git a/openspec/changes/reduce-encryption-contention/proposal.md b/openspec/changes/reduce-encryption-contention/proposal.md new file mode 100644 index 0000000000..3e8dd32a9c --- /dev/null +++ b/openspec/changes/reduce-encryption-contention/proposal.md @@ -0,0 +1,29 @@ +## Why + +Customer-side encryption operations throw `OperationCanceledException` under concurrent load because a global `SemaphoreSlim(1,1)` in `BuildProtectedDataEncryptionKeyAsync` serializes construction of `ProtectedDataEncryptionKey` objects — the resolved, unwrapped encryption keys needed for every encrypt/decrypt operation. The semaphore guards `KeyEncryptionKey.GetOrCreate` and `ProtectedDataEncryptionKey.GetOrCreate` — the Microsoft Data Encryption library's static get-or-create cache operations that, on cache miss, invoke the `createItem` delegate which triggers synchronous Key Vault HTTP calls (Resolve + UnwrapKey). The semaphore ensures only one thread at a time performs the expensive key creation, preventing duplicate Key Vault calls for the same key. However, this means every encrypted leaf value in every document on every page contends on this single-permit lock — even when the cache is warm and the hold time would be microseconds. The root cause: `ProtectedDataEncryptionKey` resolution is synchronous, happens on the hot path under the semaphore, and makes two blocking HTTP calls to Key Vault on cache miss. An internal customer is actively impacted. + +## What Changes + +- **Async prefetch of `ProtectedDataEncryptionKey` resolution outside the semaphore** via `ResolveAsync()` + `UnwrapKeyAsync()` before semaphore acquisition. The sync `UnwrapKey` inside the semaphore cannot be removed — the Microsoft Data Encryption library's `ProtectedDataEncryptionKey.GetOrCreate` constructor chain calls it unconditionally. 
Instead, the prefetch populates a cache so that when the Microsoft Data Encryption library's sync `UnwrapKey` fires, it returns cached bytes instantly instead of making HTTP calls to Key Vault. The semaphore is still acquired, but held for microseconds (cache read) instead of 200ms–2.4s (HTTP I/O). +- **Proactive background refresh** of the prefetched unwrapped Data Encryption Key bytes (the plaintext AES-256 key material returned by Key Vault's `UnwrapKey`) approximately 5 minutes before time-to-live expiry, deduplicated to one Azure Key Vault call per key per interval. Prevents thundering herd at time-to-live boundary. +- **Cache the resolved `IKeyEncryptionKey` (CryptographyClient)** per Customer Master Key URL so each refresh makes one Azure Key Vault call (`UnwrapKey`) instead of two (`Resolve` + `UnwrapKey`). +- All changes gated behind an opt-in environment variable (`AZURE_COSMOS_ENCRYPTION_OPTIMISTIC_DECRYPTION_ENABLED`), off by default. No public API changes. No breaking changes. + +## Capabilities + +### New Capabilities +- `async-dek-prefetch`: Async prefetch of `ProtectedDataEncryptionKey` resolution outside the semaphore using `ResolveAsync()` + `UnwrapKeyAsync()`, with a `ConcurrentDictionary` prefetch cache that the sync `UnwrapKey` reads from. Includes proactive background refresh before time-to-live expiry and lifecycle management via `CancellationTokenSource`. +- `resolved-client-cache`: Cache the `IKeyEncryptionKey` (CryptographyClient) returned by `Resolve()` per Customer Master Key URL to eliminate redundant Key Vault HTTP GETs on each refresh. +- `env-var-feature-gate`: Environment variable gate (`AZURE_COSMOS_ENCRYPTION_OPTIMISTIC_DECRYPTION_ENABLED`) to opt in to all caching/prefetch layers. Off by default. Follows existing SDK `ConfigurationManager` pattern. 
+ +### Modified Capabilities + + +## Impact + +- **Files**: `EncryptionKeyStoreProviderImpl.cs` (or new subclass), `EncryptionCosmosClient.cs`, `EncryptionSettingForProperty.cs` (prefetch wiring only) +- **Dependencies**: No new packages. Uses existing `IKeyEncryptionKeyResolver.ResolveAsync()` / `IKeyEncryptionKey.UnwrapKeyAsync()` from `Azure.Core.Cryptography`. +- **APIs**: No public API changes. Internal-only. +- **Security**: Plaintext Data Encryption Key bytes are already cached in-process by the Microsoft Data Encryption library's `ProtectedDataEncryptionKey`. New caches hold the same bytes with the same time-to-live in the same process — no new attack surface. +- **Risk**: New caching layers introduce lifecycle complexity (background refresh, disposal, cache coherence on key rotation). Gated by environment variable for safe rollout. +- **Testing**: Unit tests for each cache layer + concurrency. Emulator end-to-end tests with environment variable enabled. diff --git a/openspec/changes/reduce-encryption-contention/specs/async-dek-prefetch/spec.md b/openspec/changes/reduce-encryption-contention/specs/async-dek-prefetch/spec.md new file mode 100644 index 0000000000..d0313ddd51 --- /dev/null +++ b/openspec/changes/reduce-encryption-contention/specs/async-dek-prefetch/spec.md @@ -0,0 +1,65 @@ +## ADDED Requirements + +### Requirement: Async prefetch of unwrapped Data Encryption Key bytes outside the semaphore +The system SHALL provide a `PrefetchUnwrapKeyAsync` method that calls `ResolveAsync()` + `UnwrapKeyAsync()` asynchronously and stores the result in a `ConcurrentDictionary` prefetch cache. This method SHALL be called before semaphore acquisition in `BuildProtectedDataEncryptionKeyAsync`. 
+ +#### Scenario: Prefetch warms cache before semaphore +- **WHEN** `BuildEncryptionAlgorithmForSettingAsync` enters the cold path and calls `PrefetchUnwrapKeyAsync` before acquiring the semaphore +- **THEN** `ResolveAsync` and `UnwrapKeyAsync` SHALL execute asynchronously (yielding the thread), and the result SHALL be stored in the prefetch cache + +#### Scenario: Sync UnwrapKey reads from prefetch cache +- **WHEN** the Microsoft Data Encryption library's sync `UnwrapKey` is called inside the semaphore and the prefetch cache has a valid entry for the wrapped key +- **THEN** `UnwrapKey` SHALL return the cached bytes immediately without calling `Resolve()` or `UnwrapKey()` on Key Vault + +#### Scenario: Sync fallback on prefetch cache miss +- **WHEN** the Microsoft Data Encryption library's sync `UnwrapKey` is called inside the semaphore and the prefetch cache does NOT have an entry (race condition, prefetch failed, or prefetch not called) +- **THEN** `UnwrapKey` SHALL fall through to the existing sync `Resolve()` + `UnwrapKey()` path (identical to current behavior) + +### Requirement: Concurrent prefetch deduplication +The system SHALL deduplicate concurrent prefetch calls for the same wrapped key so that only one async Key Vault call flies per key at a time. + +#### Scenario: Multiple threads prefetch same key +- **WHEN** N threads simultaneously call `PrefetchUnwrapKeyAsync` for the same wrapped key (N can be any number of concurrent callers) +- **THEN** only one `ResolveAsync` + `UnwrapKeyAsync` call SHALL be made to Key Vault; all N threads SHALL await the same `Task` +- **NOTE**: The deduplication guarantee is independent of the number of concurrent callers. Test scenarios SHOULD use a representative concurrency level (e.g. 50) but the invariant holds for any N ≥ 2. 
+ +### Requirement: Proactive background refresh before time-to-live expiry +The system SHALL schedule a background refresh of the prefetch cache entry when the entry is within the refresh window of its time-to-live expiry (20% of cache time-to-live, capped at 5 minutes maximum), so that the next consumer finds a warm cache. + +#### Scenario: Background refresh fires before expiry +- **WHEN** a prefetch cache entry is within the refresh window (20% of time-to-live, max 5 minutes) of expiry and is accessed +- **THEN** the system SHALL initiate a background `Task.Run` that calls `ResolveAsync` + `UnwrapKeyAsync` and updates the cache entry + +#### Scenario: Background refresh failure does not crash +- **WHEN** the background refresh call fails (Key Vault down, 429 throttle, network error) +- **THEN** the failure SHALL be caught and logged; the existing cache entry SHALL remain until its time-to-live expires; the sync fallback path SHALL handle the next call +- **NOTE**: The background refresh SHALL NOT retry on failure. Retrying with backoff risks spanning past the cache entry's time-to-live expiry — at which point the entry is gone, concurrent threads find no cache hit, and all fall through to the sync `Resolve()` + `UnwrapKey()` path under the semaphore, recreating the thundering herd problem this design prevents. Instead, fail fast: log the failure, keep serving the existing entry until time-to-live expiry, and rely on the next natural prefetch call (on the next cache access) to retry organically. The prefetch path already deduplicates concurrent calls, so retry coordination is built in. No explicit cache invalidation is needed — time-to-live expiry naturally clears stale entries. + +### Requirement: Prefetch cache time-to-live matches `ProtectedDataEncryptionKey` cache time-to-live +The prefetch cache entry time-to-live SHALL match the `ProtectedDataEncryptionKey.TimeToLive` value. 
+ +#### Scenario: Cache entry expires with `ProtectedDataEncryptionKey` +- **WHEN** the `ProtectedDataEncryptionKey` cache time-to-live (1–2 hours) elapses +- **THEN** the prefetch cache entry for the same key SHALL also be expired, ensuring a fresh Key Vault call on the next cold path + +### Requirement: Lifecycle management via CancellationTokenSource +The async prefetch layer SHALL use a `CancellationTokenSource` to cancel in-flight background refresh tasks on disposal. + +#### Scenario: Disposal cancels background tasks +- **WHEN** `EncryptionCosmosClient.Dispose()` is called +- **THEN** the `CancellationTokenSource` SHALL be cancelled, all in-flight background refresh tasks SHALL observe cancellation, and the prefetch cache SHALL be cleared + +#### Scenario: Double-dispose is safe +- **WHEN** `Dispose()` is called multiple times +- **THEN** the second and subsequent calls SHALL be no-ops (idempotent via `Interlocked.Exchange`) + +### Requirement: Prefetch errors do not propagate to callers +The prefetch call in `BuildEncryptionAlgorithmForSettingAsync` SHALL be best-effort. 
+ +#### Scenario: Prefetch throws non-cancellation exception +- **WHEN** `PrefetchUnwrapKeyAsync` throws an exception that is not `OperationCanceledException` +- **THEN** the exception SHALL be caught and swallowed; execution SHALL continue to semaphore acquisition and the normal sync path + +#### Scenario: Prefetch throws OperationCanceledException +- **WHEN** `PrefetchUnwrapKeyAsync` throws `OperationCanceledException` (caller's token fired) +- **THEN** the exception SHALL propagate to the caller (same as current behavior when `WaitAsync` is cancelled) diff --git a/openspec/changes/reduce-encryption-contention/specs/env-var-feature-gate/spec.md b/openspec/changes/reduce-encryption-contention/specs/env-var-feature-gate/spec.md new file mode 100644 index 0000000000..240888252c --- /dev/null +++ b/openspec/changes/reduce-encryption-contention/specs/env-var-feature-gate/spec.md @@ -0,0 +1,34 @@ +## ADDED Requirements + +### Requirement: Single environment variable gates all optimization layers +The system SHALL use a single environment variable `AZURE_COSMOS_ENCRYPTION_OPTIMISTIC_DECRYPTION_ENABLED` to enable or disable all caching and prefetch layers for Data Encryption Key resolution (resolved-client cache, async Data Encryption Key prefetch, proactive background Data Encryption Key refresh). 
+ +#### Scenario: Environment variable not set — all layers disabled +- **WHEN** `AZURE_COSMOS_ENCRYPTION_OPTIMISTIC_DECRYPTION_ENABLED` is not set in the environment +- **THEN** all behavior SHALL be identical to the current codebase: no resolved-client cache, no async prefetch, no proactive background refresh + +#### Scenario: Environment variable set to true — all layers enabled +- **WHEN** `AZURE_COSMOS_ENCRYPTION_OPTIMISTIC_DECRYPTION_ENABLED` is set to `true` (case-insensitive) +- **THEN** all caching and prefetch layers SHALL be active + +#### Scenario: Environment variable set to false or invalid — all layers disabled +- **WHEN** `AZURE_COSMOS_ENCRYPTION_OPTIMISTIC_DECRYPTION_ENABLED` is set to `false`, empty, or any value that does not parse as `true` +- **THEN** all behavior SHALL be identical to the env-var-not-set case + +### Requirement: Environment variable read at EncryptionCosmosClient construction time +The environment variable SHALL be read once during `EncryptionCosmosClient` construction and the result cached for the client's lifetime. Subsequent changes to the environment variable SHALL NOT affect an already-constructed client. + +#### Scenario: Environment variable read once at startup +- **WHEN** `EncryptionCosmosClient` is constructed +- **THEN** the environment variable SHALL be read via the SDK's `ConfigurationManager` pattern (or `Environment.GetEnvironmentVariable`) and the boolean result stored as a readonly field + +#### Scenario: Environment variable change after construction has no effect +- **WHEN** the environment variable is changed after `EncryptionCosmosClient` is constructed +- **THEN** the existing client instance SHALL continue using the value read at construction time + +### Requirement: No public API changes +There SHALL be no new public classes, methods, properties, or parameters exposed. All optimization layers SHALL be entirely internal, activated only by the environment variable. 
+ +#### Scenario: Public API surface unchanged +- **WHEN** the encryption package is built with optimizations enabled +- **THEN** the public API contract (as captured in the contracts file) SHALL be identical to the current version diff --git a/openspec/changes/reduce-encryption-contention/specs/resolved-client-cache/spec.md b/openspec/changes/reduce-encryption-contention/specs/resolved-client-cache/spec.md new file mode 100644 index 0000000000..7588ff514f --- /dev/null +++ b/openspec/changes/reduce-encryption-contention/specs/resolved-client-cache/spec.md @@ -0,0 +1,30 @@ +## ADDED Requirements + +### Requirement: Resolved IKeyEncryptionKey cached per Customer Master Key URL +The system SHALL maintain a `ConcurrentDictionary` keyed by `encryptionKeyId` (the Customer Master Key URL) that caches the `CryptographyClient` returned by `IKeyEncryptionKeyResolver.Resolve()`. + +#### Scenario: First resolve for a Customer Master Key URL +- **WHEN** `UnwrapKey` (or prefetch) is called for a Customer Master Key URL not yet in the resolved-client cache +- **THEN** the system SHALL call `Resolve(keyId)` (or `ResolveAsync(keyId)` on the async path), store the returned `IKeyEncryptionKey` in the cache, and use it for the `UnwrapKey` call + +#### Scenario: Subsequent resolve for the same Customer Master Key URL +- **WHEN** `UnwrapKey` is called for a Customer Master Key URL that is already in the resolved-client cache +- **THEN** the system SHALL skip the `Resolve()` call and use the cached `IKeyEncryptionKey` directly for the `UnwrapKey` call + +#### Scenario: Key Vault calls halved on true cache miss +- **WHEN** a `ProtectedDataEncryptionKey` cache miss triggers `UnwrapKey` with a warm resolved-client cache +- **THEN** only one Key Vault HTTP call SHALL be made (`UnwrapKey` POST) instead of two (`Resolve` GET + `UnwrapKey` POST) + +### Requirement: No secret material in resolved-client cache +The `IKeyEncryptionKey` object SHALL contain only the Key Vault URL, key name, key 
version, and HTTP pipeline configuration. It SHALL NOT contain any private key material. + +#### Scenario: Cache contents are non-secret +- **WHEN** the resolved-client cache is inspected +- **THEN** each entry SHALL be a `CryptographyClient` (or equivalent) containing only the Customer Master Key URL and auth pipeline reference — no RSA private key bytes + +### Requirement: Cache invalidation on Customer Master Key URL change +The resolved-client cache entry SHALL be invalidated when the Customer Master Key URL changes (key rotation where the Client Encryption Key is rewrapped to a different Customer Master Key). + +#### Scenario: Customer Master Key URL changes after rewrap +- **WHEN** `ClientEncryptionKeyProperties.EncryptionKeyWrapMetadata.Value` returns a different Customer Master Key URL than the one cached +- **THEN** the system SHALL call `Resolve()` with the new URL and update the cache entry diff --git a/openspec/changes/reduce-encryption-contention/tasks.md b/openspec/changes/reduce-encryption-contention/tasks.md new file mode 100644 index 0000000000..9e949d921c --- /dev/null +++ b/openspec/changes/reduce-encryption-contention/tasks.md @@ -0,0 +1,78 @@ +## 1. Security Validation — Background Data Encryption Key Refresh & Indefinite IKeyEncryptionKey Caching + +- [ ] 1.1 **Threat model: plaintext Data Encryption Key in prefetch cache.** Document what is cached (plaintext AES-256 bytes), where it lives (in-process `ConcurrentDictionary`), and confirm it is the same key material already held by the Microsoft Data Encryption library's `ProtectedDataEncryptionKey`. Identify any net-new exposure vs. current state. Deliverable: written assessment in this task item or design.md addendum. 
+- [ ] 1.2 **Threat model: background `Task.Run` refresh.** Identify risks of a background task calling `ResolveAsync` + `UnwrapKeyAsync` on a timer-like pattern: (a) GC rooting of provider/credential chain while task is in-flight, (b) token credential refresh happening on a ThreadPool thread outside the caller's context, (c) plaintext bytes being written to cache from a background thread after `Dispose()` is called. For each risk, document mitigation (CancellationTokenSource, Interlocked dispose guard, cancellation token observation). +- [ ] 1.3 **Threat model: indefinite `IKeyEncryptionKey` caching.** Confirm the cached `CryptographyClient` contains no secret material (only vault URL + key name + version + HTTP pipeline reference). Identify the invalidation trigger (Customer Master Key URL change on key rotation). Document whether an indefinite cache (no time-to-live, invalidate only on URL mismatch) is safe, or whether a bounded time-to-live is needed. Check: does the `CryptographyClient` hold a reference to a specific key version, and what happens if the key is disabled/deleted in Key Vault? +- [ ] 1.4 **Key rotation scenario.** Confirm that Client Encryption Key rewrap (the only runtime rotation) does not change plaintext Data Encryption Key bytes, so all caches remain valid. Client Encryption Key replacement is out of scope (requires data migration, offline operation). Document this assumption. +- [ ] 1.5 **Stale key material window.** Quantify the maximum time plaintext Data Encryption Key bytes from a revoked Customer Master Key can remain in memory (prefetch cache time-to-live + `ProtectedDataEncryptionKey` cache time-to-live). Client Encryption Key replacement is out of scope. Confirm the window is acceptable per Azure Key Vault best practices. 
+- [ ] 1.6 **Explore: can `AeadAes256CbcHmac256EncryptionAlgorithm.GetOrCreate` replace `new`?** The Microsoft Data Encryption library has a built-in static `LocalCache` for algorithm instances keyed by `(DataEncryptionKey, EncryptionType)`. The SDK bypasses it by calling `new` directly. Evaluate: (a) does `ProtectedDataEncryptionKey.Equals`/`GetHashCode` produce correct cache hits for the same logical key? (b) is there any thread-safety concern with the Microsoft Data Encryption library's `LocalCache`? (c) could this be a standalone one-line cleanup independent of this change? Document findings. +- [ ] 1.7 **Understand the wrapped Client Encryption Key fetch path from Cosmos DB.** Read `GetClientEncryptionKeyPropertiesAsync` in `EncryptionCosmosClient` and `EncryptionSettingForProperty`. Understand: (a) how the wrapped Client Encryption Key is fetched (gateway-cached read via `AsyncCache`), (b) when it refreshes (force refresh on Forbidden retry, ETag-based staleness check), (c) whether anything in its caching design (AsyncCache pattern, ETag invalidation, gateway cache headers) can inform or be reused for the prefetch cache design. Document any applicable patterns. +- [ ] 1.8 **Review with implementing engineer.** Walk through findings from 1.1–1.7 and 1.9 before proceeding to implementation. Resolve any open questions. Update design.md Risks section if new risks are identified. +- [ ] 1.9 **Operation-type contention analysis.** Investigate whether specific operation types (read/decrypt vs. write/encrypt, point reads vs. feed iterators, Change Feed Processor vs. manual iteration) trigger disproportionate semaphore contention. Map the call path from each operation type to `BuildEncryptionAlgorithmForSettingAsync` and document the semaphore acquisition frequency per operation type. Identify if any operation type can bypass the semaphore (e.g. if the algorithm is already cached at a higher level). + +## 2. 
Environment Variable Feature Gate + +- [ ] 2.1 Read `AZURE_COSMOS_ENCRYPTION_OPTIMISTIC_DECRYPTION_ENABLED` in `EncryptionCosmosClient` constructor (via `Environment.GetEnvironmentVariable`, case-insensitive `true` check). Store as `internal readonly bool IsOptimisticDecryptionEnabled`. +- [ ] 2.2 Verify environment variable is read once at construction and cached — subsequent environment variable changes do not affect existing client instances. +- [ ] 2.3 Add unit test: environment variable not set → `IsOptimisticDecryptionEnabled` is false. +- [ ] 2.4 Add unit test: environment variable set to `true` / `True` / `TRUE` → `IsOptimisticDecryptionEnabled` is true. +- [ ] 2.5 Add unit test: environment variable set to `false` / empty / garbage → `IsOptimisticDecryptionEnabled` is false. + +## 3. Resolved Client Cache (`CachingEncryptionKeyStoreProviderImpl`) + +- [ ] 3.1 Create `CachingEncryptionKeyStoreProviderImpl` as a sealed internal class extending `EncryptionKeyStoreProviderImpl`. +- [ ] 3.2 Add `ConcurrentDictionary<string, IKeyEncryptionKey> resolvedClientCache` field. +- [ ] 3.3 Override `UnwrapKey`: before calling `Resolve(keyId)`, check `resolvedClientCache.TryGetValue(keyId, out client)`. On hit, use cached client directly for `UnwrapKey`. On miss, call `Resolve`, store result, then call `UnwrapKey`. +- [ ] 3.4 Add unit test: first call for a Customer Master Key URL calls `Resolve`; second call skips `Resolve` and calls only `UnwrapKey` (verify via mock). +- [ ] 3.5 Add unit test: different Customer Master Key URLs get independent cache entries; resolving one does not affect the other. + +## 4. Async Data Encryption Key Prefetch (`CachingEncryptionKeyStoreProviderImpl`) + +- [ ] 4.1 Add `ConcurrentDictionary<string, byte[]> prefetchCache` field to `CachingEncryptionKeyStoreProviderImpl`. +- [ ] 4.2 Add `ConcurrentDictionary<string, Task<byte[]>> inflightPrefetches` field for deduplication. 
+- [ ] 4.3 Implement `internal async Task PrefetchUnwrapKeyAsync(string keyId, string algorithm, byte[] wrappedKey, CancellationToken ct)`: check prefetch cache → if miss, check inflight → if miss, call `ResolveAsync` + `UnwrapKeyAsync`, store result in prefetch cache with time-to-live, remove from inflight. +- [ ] 4.4 Update `UnwrapKey` override in `CachingEncryptionKeyStoreProviderImpl`: check prefetch cache first (by `wrappedKey.ToHexString()`). On hit, return immediately. On miss, fall through to resolved-client-cache path then to sync fallback. +- [ ] 4.5 Wire `PrefetchUnwrapKeyAsync` call into `EncryptionSettingForProperty.BuildEncryptionAlgorithmForSettingAsync` — call it BEFORE `SemaphoreSlim.WaitAsync`, wrapped in try/catch (swallow non-cancellation exceptions, propagate `OperationCanceledException`). +- [ ] 4.6 Add unit test: prefetch populates cache → sync `UnwrapKey` returns immediately without Key Vault call. +- [ ] 4.7 Add unit test: prefetch cache miss → sync `UnwrapKey` falls through to sync Resolve + UnwrapKey (existing behavior). +- [ ] 4.8 Add unit test: 50 concurrent threads calling `PrefetchUnwrapKeyAsync` for the same key → only one `ResolveAsync` + `UnwrapKeyAsync` call (deduplication). +- [ ] 4.9 Add unit test: prefetch throws (simulated Key Vault failure) → caller proceeds to semaphore and sync path without error. +- [ ] 4.10 Add unit test: prefetch throws `OperationCanceledException` → exception propagates to caller. + +## 5. Proactive Background Refresh + +- [ ] 5.1 In `CachingEncryptionKeyStoreProviderImpl`: add `CancellationTokenSource backgroundCts` field. +- [ ] 5.2 On prefetch cache access, if entry is within the refresh window before time-to-live expiry (20% of cache time-to-live, capped at 5 minutes maximum, with per-key jitter to stagger multi-Client Encryption Key refreshes), schedule `Task.Run` to call `ResolveAsync` + `UnwrapKeyAsync` and update the cache entry. Use `backgroundCts.Token`. Deduplicate via `inflightPrefetches`. 
Enforce a concurrency limiter (`SemaphoreSlim`, e.g. max 3 concurrent background refresh calls) to prevent unbounded fan-out with many Client Encryption Keys. +- [ ] 5.3 Add `Cleanup()` method: cancel `backgroundCts`, clear `prefetchCache`, clear `resolvedClientCache`, clear `inflightPrefetches`. Use `Interlocked.Exchange(ref disposed, 1)` for idempotent disposal. +- [ ] 5.4 Wire `Cleanup()` into `EncryptionCosmosClient.Dispose(bool disposing)`. +- [ ] 5.5 Add unit test: access a cache entry near time-to-live expiry → background refresh fires (verify via mock that `ResolveAsync` is called). +- [ ] 5.6 Add unit test: background refresh fails → existing cache entry persists until time-to-live; no exception propagated. +- [ ] 5.7 Add unit test: `Cleanup()` cancels in-flight background tasks (verify `OperationCanceledException` observed by background task). +- [ ] 5.8 Add unit test: double `Cleanup()` is safe (no `ObjectDisposedException`). +- [ ] 5.9 Add unit test: concurrency limiter caps background refresh tasks (e.g. 10 Client Encryption Keys trigger refresh simultaneously, verify only max 3 concurrent Azure Key Vault calls via mock). + +## 6. Integration & Wiring + +- [ ] 6.1 In `EncryptionCosmosClient` constructor: if environment variable is on, instantiate `CachingEncryptionKeyStoreProviderImpl` instead of `EncryptionKeyStoreProviderImpl`. +- [ ] 6.2 Expose `PrefetchUnwrapKeyAsync` on base class as a virtual no-op (`return Task.CompletedTask`) so `EncryptionSettingForProperty` can call it without knowing the concrete type. +- [ ] 6.3 Verify build: `dotnet build Microsoft.Azure.Cosmos.Encryption/src/ -c Release` — 0 errors, 0 warnings. +- [ ] 6.4 Run existing unit tests: `dotnet test Microsoft.Azure.Cosmos.Encryption/tests/Microsoft.Azure.Cosmos.Encryption.Tests/ --no-build` — all pass. +- [ ] 6.5 Run emulator end-to-end tests with `AZURE_COSMOS_ENCRYPTION_OPTIMISTIC_DECRYPTION_ENABLED=true`: all existing `MdeEncryptionTests` pass. 
+- [ ] 6.6 Run emulator end-to-end tests without environment variable: all existing `MdeEncryptionTests` pass (verify zero behavior change). + +## 7. Contract & API Validation + +- [ ] 7.1 Verify no public API changes: diff the contracts file before and after. No new public types, methods, or properties. +- [ ] 7.2 Verify `EncryptionKeyStoreProviderImpl` remains `internal`. +- [ ] 7.3 Verify `CachingEncryptionKeyStoreProviderImpl` is `internal sealed`. + +## 8. Performance Benchmarking + +- [ ] 8.1 Create `InMemoryKeyResolver` test harness with configurable delay per `Resolve`/`UnwrapKey` call and call-count tracking. +- [ ] 8.2 Benchmark: semaphore hold time (p50/p95/p99) at 50 concurrent threads, simulated Key Vault latency 500ms, environment variable off vs. on. Expect: off = hundreds of ms, on = microseconds. +- [ ] 8.3 Benchmark: end-to-end decrypt throughput (docs/sec) at 10, 50, 100 concurrent feed iterators with 5 encrypted properties per document. Environment variable off vs. on. +- [ ] 8.4 Benchmark: Azure Key Vault call count over 1000 decrypt operations with 1 Client Encryption Key. Environment variable off (expect 2 per `ProtectedDataEncryptionKey` cache miss) vs. on (expect 1 per time-to-live refresh, resolved client cached). +- [ ] 8.5 Benchmark: `OperationCanceledException` count at 50 concurrent threads with 5s cancellation timeout and simulated 2s Key Vault latency. Environment variable off (expect failures) vs. on (expect zero). +- [ ] 8.6 Benchmark: multi-Client Encryption Key refresh burst — 5 Client Encryption Keys with aligned time-to-live values, verify jitter + concurrency limiter staggers Azure Key Vault calls (measure peak concurrent Azure Key Vault calls; should not exceed concurrency limit). +- [ ] 8.7 Document results in a comparison table (baseline vs. optimized) with all metrics. 
+- [ ] 8.8 **Customer thread pool investigation.** Gather information about the impacted customer's thread pool configuration (min/max threads, thread pool starvation indicators) to inform benchmark concurrency levels and validate that the fix is effective under their actual thread pool constraints.