From a242772c3e09f14ae04e33b34bbe8d225a8a8f59 Mon Sep 17 00:00:00 2001 From: Abhijeet Mohanty Date: Thu, 19 Feb 2026 18:23:58 -0500 Subject: [PATCH 01/10] Adding background refresh logic to for cached ProtectedDataEncryptionKey instances. --- .../src/EncryptionCosmosClient.cs | 5 + .../src/EncryptionKeyStoreProviderImpl.cs | 189 ++++- .../src/EncryptionSettingForProperty.cs | 50 ++ .../EncryptionKeyStoreProviderImplTests.cs | 700 ++++++++++++++++++ 4 files changed, 938 insertions(+), 6 deletions(-) create mode 100644 Microsoft.Azure.Cosmos.Encryption/tests/Microsoft.Azure.Cosmos.Encryption.Tests/EncryptionKeyStoreProviderImplTests.cs diff --git a/Microsoft.Azure.Cosmos.Encryption/src/EncryptionCosmosClient.cs b/Microsoft.Azure.Cosmos.Encryption/src/EncryptionCosmosClient.cs index 24eb872393..8b54d2796b 100644 --- a/Microsoft.Azure.Cosmos.Encryption/src/EncryptionCosmosClient.cs +++ b/Microsoft.Azure.Cosmos.Encryption/src/EncryptionCosmosClient.cs @@ -249,6 +249,11 @@ public async Task GetClientEncryptionKeyPropertie /// protected override void Dispose(bool disposing) { + if (disposing) + { + this.EncryptionKeyStoreProviderImpl.Dispose(); + } + this.cosmosClient.Dispose(); } diff --git a/Microsoft.Azure.Cosmos.Encryption/src/EncryptionKeyStoreProviderImpl.cs b/Microsoft.Azure.Cosmos.Encryption/src/EncryptionKeyStoreProviderImpl.cs index 086e9855d8..2f9174d6e5 100644 --- a/Microsoft.Azure.Cosmos.Encryption/src/EncryptionKeyStoreProviderImpl.cs +++ b/Microsoft.Azure.Cosmos.Encryption/src/EncryptionKeyStoreProviderImpl.cs @@ -5,6 +5,9 @@ namespace Microsoft.Azure.Cosmos.Encryption { using System; + using System.Collections.Concurrent; + using System.Threading; + using System.Threading.Tasks; using global::Azure.Core.Cryptography; using Microsoft.Data.Encryption.Cryptography; @@ -21,12 +24,45 @@ namespace Microsoft.Azure.Cosmos.Encryption /// ProtectedDataEncryptionKey -> KeyEncryptionKey(containing EncryptionKeyStoreProviderImpl object) -> EncryptionKeyStoreProviderImpl.UnWrapKey -> this.keyEncryptionKeyResolver.UnwrapKey /// /// - internal class EncryptionKeyStoreProviderImpl : EncryptionKeyStoreProvider + internal class EncryptionKeyStoreProviderImpl : EncryptionKeyStoreProvider, IDisposable { public const string RsaOaepWrapAlgorithm = "RSA-OAEP"; + /// + /// When a cached key is within this duration of its expiry a background refresh + /// is scheduled so the sync path never encounters a cold cache. + /// + private static readonly TimeSpan ProactiveRefreshThreshold = TimeSpan.FromMinutes(5); + private readonly IKeyEncryptionKeyResolver keyEncryptionKeyResolver; + /// + /// Cache of asynchronously pre-fetched unwrapped key bytes, keyed by the hex + /// representation of the encrypted key. Populated by + /// (called outside the global encryption + /// semaphore) and read by (called inside the semaphore) + /// for an instant, I/O-free return. + /// + private readonly ConcurrentDictionary prefetchedKeys = new ConcurrentDictionary(); + + /// + /// Tracks cache keys that have a background refresh in-flight to deduplicate concurrent refresh tasks. + /// + private readonly ConcurrentDictionary refreshesInFlight = new ConcurrentDictionary(); + + /// + /// Cancellation source for background proactive-refresh tasks. Cancelled on + /// so in-flight refreshes are promptly stopped and the + /// provider / key-resolver / credential chain can be garbage collected. + /// + private readonly CancellationTokenSource backgroundCts = new CancellationTokenSource(); + + /// + /// Guard for to make double-dispose and concurrent + /// dispose calls safe. 0 = not disposed, 1 = disposed. + /// + private int disposed; + public EncryptionKeyStoreProviderImpl(IKeyEncryptionKeyResolver keyEncryptionKeyResolver, string providerName) { this.keyEncryptionKeyResolver = keyEncryptionKeyResolver; @@ -38,16 +74,46 @@ public EncryptionKeyStoreProviderImpl(IKeyEncryptionKeyResolver keyEncryptionKey public override byte[] UnwrapKey(string encryptionKeyId, KeyEncryptionKeyAlgorithm algorithm, byte[] encryptedKey) { - // since we do not expose GetOrCreateDataEncryptionKey we first look up the cache. - // Cache miss results in call to UnWrapCore which updates the cache after UnwrapKeyAsync is called. - return this.GetOrCreateDataEncryptionKey(encryptedKey.ToHexString(), UnWrapKeyCore); + string cacheKey = encryptedKey.ToHexString(); + + // Fast path: return from the prefetch cache — zero I/O, no latency. + if (this.prefetchedKeys.TryGetValue(cacheKey, out PrefetchedKeyData cached)) + { + if (DateTime.UtcNow < cached.ExpiresAtUtc) + { + // Proactive refresh: if we are nearing expiry, kick off a background + // async refresh so the cache stays warm for the next caller. + if (DateTime.UtcNow > cached.ExpiresAtUtc - ProactiveRefreshThreshold) + { + this.ScheduleBackgroundRefresh(encryptionKeyId, encryptedKey); + } + + return cached.UnwrappedKeyBytes; + } + + // Entry has expired — remove it and fall through to the sync path. + this.prefetchedKeys.TryRemove(cacheKey, out _); + } + + // Slow path (safety net): if the prefetch cache is cold — e.g. first call + // before PrefetchUnwrapKeyAsync ran, or a narrow race on cache expiry — + // fall back to the original synchronous Resolve + UnwrapKey. On success + // the result is pushed into the prefetch cache so future calls are fast. + // Note: cacheKey (already computed above) is captured by the closure — + // avoids recomputing encryptedKey.ToHexString() inside GetOrCreateDataEncryptionKey. + return this.GetOrCreateDataEncryptionKey(cacheKey, UnWrapKeyCore); - // delegate that is called by GetOrCreateDataEncryptionKey, which unwraps the key and updates the cache in case of cache miss. byte[] UnWrapKeyCore() { - return this.keyEncryptionKeyResolver + byte[] unwrapped = this.keyEncryptionKeyResolver .Resolve(encryptionKeyId) .UnwrapKey(EncryptionKeyStoreProviderImpl.GetNameForKeyEncryptionKeyAlgorithm(algorithm), encryptedKey); + + this.prefetchedKeys[cacheKey] = new PrefetchedKeyData( + unwrapped, + DateTime.UtcNow.Add(ProtectedDataEncryptionKey.TimeToLive)); + + return unwrapped; } } @@ -74,6 +140,62 @@ public override bool Verify(string encryptionKeyId, bool allowEnclaveComputation throw new NotSupportedException("The Verify operation is not supported."); } + /// + /// Cancels any in-flight background refresh tasks and releases the + /// . This allows the provider, + /// key resolver, and credential chain to be garbage collected promptly + /// when the is disposed. + /// + public void Dispose() + { + if (Interlocked.Exchange(ref this.disposed, 1) != 0) + { + return; // already disposed — safe for double/concurrent Dispose calls + } + + this.backgroundCts.Cancel(); + this.backgroundCts.Dispose(); + this.prefetchedKeys.Clear(); + } + + /// + /// Asynchronously pre-warms the unwrapped-key cache for + /// so that the synchronous call (which runs inside the global + /// encryption semaphore) can return instantly without any Key Vault I/O. + /// + /// This MUST be called before acquiring the global semaphore. + /// + /// Key Encryption Key identifier (e.g. AKV key URI). + /// The wrapped Data Encryption Key bytes. + /// Cancellation token. + internal async Task PrefetchUnwrapKeyAsync( + string encryptionKeyId, + byte[] encryptedKey, + CancellationToken cancellationToken) + { + string cacheKey = encryptedKey.ToHexString(); + + // Skip when the cache is still well within its TTL. + if (this.prefetchedKeys.TryGetValue(cacheKey, out PrefetchedKeyData existing) + && DateTime.UtcNow < existing.ExpiresAtUtc - ProactiveRefreshThreshold) + { + return; + } + + // ResolveAsync + UnwrapKeyAsync: fully async Key Vault I/O, done outside + // the global semaphore so other threads are never blocked. + IKeyEncryptionKey keyEncryptionKey = await this.keyEncryptionKeyResolver.ResolveAsync(encryptionKeyId, cancellationToken).ConfigureAwait(false); + + byte[] unwrappedKey = await keyEncryptionKey.UnwrapKeyAsync( + EncryptionKeyStoreProviderImpl.RsaOaepWrapAlgorithm, + encryptedKey, + cancellationToken).ConfigureAwait(false); + + this.prefetchedKeys[cacheKey] = new PrefetchedKeyData( + unwrappedKey, + DateTime.UtcNow.Add(ProtectedDataEncryptionKey.TimeToLive)); + } + private static string GetNameForKeyEncryptionKeyAlgorithm(KeyEncryptionKeyAlgorithm algorithm) { if (algorithm == KeyEncryptionKeyAlgorithm.RSA_OAEP) @@ -83,5 +205,60 @@ private static string GetNameForKeyEncryptionKeyAlgorithm(KeyEncryptionKeyAlgori throw new InvalidOperationException(string.Format("Unexpected algorithm {0}", algorithm)); } + + /// + /// Fires a background task to refresh the prefetch cache entry for the given + /// encrypted key, keeping the sync path warm. + /// Concurrent refreshes for the same key are deduplicated. + /// The task is cancelled when is called. + /// + private void ScheduleBackgroundRefresh(string encryptionKeyId, byte[] encryptedKey) + { + string cacheKey = encryptedKey.ToHexString(); + + if (!this.refreshesInFlight.TryAdd(cacheKey, 0)) + { + return; // refresh already in progress + } + + CancellationToken token = this.backgroundCts.Token; + + _ = Task.Run(async () => + { + try + { + await this.PrefetchUnwrapKeyAsync( + encryptionKeyId, + encryptedKey, + token).ConfigureAwait(false); + } + catch (Exception) + { + // Best-effort: if the background refresh fails (including + // cancellation on Dispose), the next sync UnwrapKey call will + // fall through to the slow path. No data loss. + } + finally + { + this.refreshesInFlight.TryRemove(cacheKey, out _); + } + }); + } + + /// + /// Immutable record holding a pre-fetched unwrapped key and its expiry. + /// + private sealed class PrefetchedKeyData + { + public PrefetchedKeyData(byte[] unwrappedKeyBytes, DateTime expiresAtUtc) + { + this.UnwrappedKeyBytes = unwrappedKeyBytes; + this.ExpiresAtUtc = expiresAtUtc; + } + + public byte[] UnwrappedKeyBytes { get; } + + public DateTime ExpiresAtUtc { get; } + } } } diff --git a/Microsoft.Azure.Cosmos.Encryption/src/EncryptionSettingForProperty.cs b/Microsoft.Azure.Cosmos.Encryption/src/EncryptionSettingForProperty.cs index 431349c88d..33ba1b3657 100644 --- a/Microsoft.Azure.Cosmos.Encryption/src/EncryptionSettingForProperty.cs +++ b/Microsoft.Azure.Cosmos.Encryption/src/EncryptionSettingForProperty.cs @@ -22,6 +22,20 @@ internal sealed class EncryptionSettingForProperty // through InternalsVisibleTo. private readonly Microsoft.Data.Encryption.Cryptography.AeadAes256CbcHmac256EncryptionAlgorithm injectedAlgorithm; + // Cached encryption algorithm to avoid repeated semaphore acquisition per property per document. + // The underlying ProtectedDataEncryptionKey has its own TTL (default 2hrs) managed by MDE's LocalCache. + // We cache at this level to eliminate the amplification problem: without this cache, + // BuildEncryptionAlgorithmForSettingAsync acquires the global semaphore once per encrypted + // property leaf per document per page. With this cache, it's once per key per TTL period. + // + // Thread-safety note: these fields are read/written by concurrent async tasks without a lock. + // The worst-case race is that a thread reads a stale null/expired value and falls through to + // the semaphore path — one redundant cache build, no correctness issue. Using volatile to + // prevent the JIT from caching stale register copies across await boundaries. + // DateTime cannot be volatile (struct > pointer size), so we store expiry as ticks (long). + private volatile AeadAes256CbcHmac256EncryptionAlgorithm cachedEncryptionAlgorithm; + private long cachedAlgorithmExpiryTicks; + public EncryptionSettingForProperty( string clientEncryptionKeyId, Data.Encryption.Cryptography.EncryptionType encryptionType, @@ -60,6 +74,16 @@ public async Task BuildEncryptionAlgori return this.injectedAlgorithm; } + // Return cached algorithm if still valid. This eliminates the per-property + // semaphore acquisition that causes contention under concurrent decryption. + // The cached algorithm holds a reference to the ProtectedDataEncryptionKey + // whose underlying key bytes don't change — only the cache entry expires. + AeadAes256CbcHmac256EncryptionAlgorithm cached = this.cachedEncryptionAlgorithm; + if (cached != null && DateTime.UtcNow.Ticks < Interlocked.Read(ref this.cachedAlgorithmExpiryTicks)) + { + return cached; + } + ClientEncryptionKeyProperties clientEncryptionKeyProperties = await this.encryptionContainer.EncryptionCosmosClient.GetClientEncryptionKeyPropertiesAsync( clientEncryptionKeyId: this.ClientEncryptionKeyId, encryptionContainer: this.encryptionContainer, @@ -115,6 +139,11 @@ public async Task BuildEncryptionAlgori protectedDataEncryptionKey, this.EncryptionType); + // Cache the algorithm so subsequent calls for this property skip the semaphore entirely. + // Use the same TTL as ProtectedDataEncryptionKey (defaults to 2hrs). + this.cachedEncryptionAlgorithm = aeadAes256CbcHmac256EncryptionAlgorithm; + Interlocked.Exchange(ref this.cachedAlgorithmExpiryTicks, DateTime.UtcNow.Add(ProtectedDataEncryptionKey.TimeToLive).Ticks); + return aeadAes256CbcHmac256EncryptionAlgorithm; } @@ -178,6 +207,27 @@ private async Task BuildProtectedDataEncryptionKeyAs string keyId, CancellationToken cancellationToken) { + // Pre-warm the unwrap key cache asynchronously, BEFORE acquiring the global + // semaphore. When ProtectedDataEncryptionKey's constructor later calls + // UnwrapKey (sync, inside the semaphore), it will find a warm prefetch cache + // and return instantly — no Key Vault I/O under the lock. + // Best-effort: if prefetch fails, the sync fallback inside UnwrapKey handles + // it, and the L2 PDEK cache may avoid the unwrap call entirely. + try + { + await this.encryptionContainer.EncryptionCosmosClient.EncryptionKeyStoreProviderImpl + .PrefetchUnwrapKeyAsync( + clientEncryptionKeyProperties.EncryptionKeyWrapMetadata.Value, + clientEncryptionKeyProperties.WrappedDataEncryptionKey, + cancellationToken) + .ConfigureAwait(false); + } + catch (Exception ex) when (!(ex is OperationCanceledException)) + { + // Swallow non-cancellation prefetch errors — the sync path inside the + // semaphore will serve as fallback. + } + if (await EncryptionCosmosClient.EncryptionKeyCacheSemaphore.WaitAsync(-1, cancellationToken)) { try diff --git a/Microsoft.Azure.Cosmos.Encryption/tests/Microsoft.Azure.Cosmos.Encryption.Tests/EncryptionKeyStoreProviderImplTests.cs b/Microsoft.Azure.Cosmos.Encryption/tests/Microsoft.Azure.Cosmos.Encryption.Tests/EncryptionKeyStoreProviderImplTests.cs new file mode 100644 index 0000000000..329ca170b8 --- /dev/null +++ b/Microsoft.Azure.Cosmos.Encryption/tests/Microsoft.Azure.Cosmos.Encryption.Tests/EncryptionKeyStoreProviderImplTests.cs @@ -0,0 +1,700 @@ +//------------------------------------------------------------ +// Copyright (c) Microsoft Corporation. All rights reserved. +//------------------------------------------------------------ + +namespace Microsoft.Azure.Cosmos.Encryption.Tests +{ + using System; + using System.Collections.Concurrent; + using System.Collections.Generic; + using System.Linq; + using System.Threading; + using System.Threading.Tasks; + using global::Azure.Core.Cryptography; + using Microsoft.Data.Encryption.Cryptography; + using Microsoft.VisualStudio.TestTools.UnitTesting; + + /// + /// Unit tests for , covering: + /// - Prefetch async cache (PrefetchUnwrapKeyAsync) + /// - UnwrapKey fast-path from prefetch cache + /// - Sync fallback when prefetch cache is cold + /// - Proactive background refresh near TTL expiry + /// - Concurrent decryption (many threads hitting UnwrapKey) + /// - Cache expiry behavior + /// - Cancellation token propagation + /// + /// All tests use an in-memory that tracks call + /// counts and optionally injects latency, so no real Key Vault calls are made. + /// + [TestClass] + public class EncryptionKeyStoreProviderImplTests + { + private const string TestKeyId = "test-key-1"; + private const string TestProviderName = "TEST_PROVIDER"; + + // A well-known plaintext key and its "encrypted" form (trivial shift cipher). + private static readonly byte[] PlainKey = new byte[] { 10, 20, 30, 40, 50 }; + private static readonly byte[] EncryptedKey = PlainKey.Select(b => (byte)(b + 1)).ToArray(); + + #region PrefetchUnwrapKeyAsync Tests + + [TestMethod] + public async Task PrefetchUnwrapKeyAsync_WarmsCacheForSubsequentUnwrapKey() + { + // Arrange + InMemoryKeyResolver resolver = new InMemoryKeyResolver(); + resolver.RegisterKey(TestKeyId, shift: 1); + EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); + + // Act — prefetch the key asynchronously + await provider.PrefetchUnwrapKeyAsync(TestKeyId, EncryptedKey, CancellationToken.None); + + // Assert — UnwrapKey should hit the prefetch cache, NOT call Resolve again. + // Reset counters after prefetch so we can assert on the sync path. + int resolveCountAfterPrefetch = resolver.ResolveAsyncCallCount; + byte[] result = provider.UnwrapKey(TestKeyId, KeyEncryptionKeyAlgorithm.RSA_OAEP, EncryptedKey); + + CollectionAssert.AreEqual(PlainKey, result, "Unwrapped key should match original plaintext."); + Assert.AreEqual(resolveCountAfterPrefetch, resolver.ResolveAsyncCallCount, + "ResolveAsync should NOT be called again — UnwrapKey should use the prefetch cache."); + Assert.AreEqual(0, resolver.ResolveSyncCallCount, + "Sync Resolve should not be called when prefetch cache is warm."); + } + + [TestMethod] + public async Task PrefetchUnwrapKeyAsync_SkipsRefreshWhenCacheIsWarm() + { + // Arrange + InMemoryKeyResolver resolver = new InMemoryKeyResolver(); + resolver.RegisterKey(TestKeyId, shift: 1); + EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); + + // Act — prefetch twice in quick succession + await provider.PrefetchUnwrapKeyAsync(TestKeyId, EncryptedKey, CancellationToken.None); + int countAfterFirst = resolver.ResolveAsyncCallCount; + + await provider.PrefetchUnwrapKeyAsync(TestKeyId, EncryptedKey, CancellationToken.None); + int countAfterSecond = resolver.ResolveAsyncCallCount; + + // Assert — second call should be a no-op (cache is well within TTL) + Assert.AreEqual(countAfterFirst, countAfterSecond, + "Second PrefetchUnwrapKeyAsync should skip refresh when the cache is still warm."); + } + + [TestMethod] + public async Task PrefetchUnwrapKeyAsync_PropagatesCancellation() + { + // Arrange + InMemoryKeyResolver resolver = new InMemoryKeyResolver(); + resolver.RegisterKey(TestKeyId, shift: 1); + resolver.ResolveAsyncDelay = TimeSpan.FromSeconds(10); // simulate slow resolve + EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); + + using CancellationTokenSource cts = new CancellationTokenSource(); + cts.Cancel(); // pre-cancel + + // Act & Assert + await Assert.ThrowsExceptionAsync( + () => provider.PrefetchUnwrapKeyAsync(TestKeyId, EncryptedKey, cts.Token)); + } + + [TestMethod] + public async Task PrefetchUnwrapKeyAsync_UsesResolveAsyncNotResolve() + { + // Arrange + InMemoryKeyResolver resolver = new InMemoryKeyResolver(); + resolver.RegisterKey(TestKeyId, shift: 1); + EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); + + // Act + await provider.PrefetchUnwrapKeyAsync(TestKeyId, EncryptedKey, CancellationToken.None); + + // Assert — the async path should call ResolveAsync, not Resolve + Assert.AreEqual(1, resolver.ResolveAsyncCallCount, "PrefetchUnwrapKeyAsync should call ResolveAsync."); + Assert.AreEqual(0, resolver.ResolveSyncCallCount, "PrefetchUnwrapKeyAsync should NOT call sync Resolve."); + } + + [TestMethod] + public async Task PrefetchUnwrapKeyAsync_IndependentKeysAreCachedSeparately() + { + // Arrange + InMemoryKeyResolver resolver = new InMemoryKeyResolver(); + resolver.RegisterKey("key-A", shift: 1); + resolver.RegisterKey("key-B", shift: 2); + + byte[] encKeyA = PlainKey.Select(b => (byte)(b + 1)).ToArray(); + byte[] encKeyB = PlainKey.Select(b => (byte)(b + 2)).ToArray(); + + EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); + + // Act + await provider.PrefetchUnwrapKeyAsync("key-A", encKeyA, CancellationToken.None); + await provider.PrefetchUnwrapKeyAsync("key-B", encKeyB, CancellationToken.None); + + byte[] resultA = provider.UnwrapKey("key-A", KeyEncryptionKeyAlgorithm.RSA_OAEP, encKeyA); + byte[] resultB = provider.UnwrapKey("key-B", KeyEncryptionKeyAlgorithm.RSA_OAEP, encKeyB); + + // Assert + CollectionAssert.AreEqual(PlainKey, resultA, "Key A unwrap should return original plaintext."); + CollectionAssert.AreEqual(PlainKey, resultB, "Key B unwrap should return original plaintext."); + Assert.AreEqual(2, resolver.ResolveAsyncCallCount, "Each key should trigger exactly one ResolveAsync."); + } + + #endregion + + #region UnwrapKey Sync Fallback Tests + + [TestMethod] + public void UnwrapKey_FallsBackToSyncWhenPrefetchCacheIsCold() + { + // Arrange — no prefetch call; cache is empty + InMemoryKeyResolver resolver = new InMemoryKeyResolver(); + resolver.RegisterKey(TestKeyId, shift: 1); + EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); + + // Act + byte[] result = provider.UnwrapKey(TestKeyId, KeyEncryptionKeyAlgorithm.RSA_OAEP, EncryptedKey); + + // Assert + CollectionAssert.AreEqual(PlainKey, result, "Sync fallback should still unwrap correctly."); + Assert.AreEqual(1, resolver.ResolveSyncCallCount, + "Cold cache should trigger sync Resolve fallback."); + Assert.AreEqual(0, resolver.ResolveAsyncCallCount, + "Sync fallback should not call ResolveAsync."); + } + + [TestMethod] + public void UnwrapKey_SyncFallbackPopulatesPrefetchCache() + { + // Arrange — cold cache, sync fallback will populate it + InMemoryKeyResolver resolver = new InMemoryKeyResolver(); + resolver.RegisterKey(TestKeyId, shift: 1); + EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); + + // Act — first call goes through sync path + provider.UnwrapKey(TestKeyId, KeyEncryptionKeyAlgorithm.RSA_OAEP, EncryptedKey); + int syncCountAfterFirst = resolver.ResolveSyncCallCount; + + // Second call should hit prefetch cache (populated by first call's sync path) + byte[] result = provider.UnwrapKey(TestKeyId, KeyEncryptionKeyAlgorithm.RSA_OAEP, EncryptedKey); + + // Assert + CollectionAssert.AreEqual(PlainKey, result); + Assert.AreEqual(syncCountAfterFirst, resolver.ResolveSyncCallCount, + "Second UnwrapKey should hit the prefetch cache, not call Resolve again."); + } + + [TestMethod] + public void UnwrapKey_ReturnsCorrectResultForDifferentKeys() + { + // Arrange + InMemoryKeyResolver resolver = new InMemoryKeyResolver(); + resolver.RegisterKey("key1", shift: 3); + resolver.RegisterKey("key2", shift: 7); + + byte[] plain = new byte[] { 100, 110, 120 }; + byte[] enc1 = plain.Select(b => (byte)(b + 3)).ToArray(); + byte[] enc2 = plain.Select(b => (byte)(b + 7)).ToArray(); + + EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); + + // Act + byte[] result1 = provider.UnwrapKey("key1", KeyEncryptionKeyAlgorithm.RSA_OAEP, enc1); + byte[] result2 = provider.UnwrapKey("key2", KeyEncryptionKeyAlgorithm.RSA_OAEP, enc2); + + // Assert + CollectionAssert.AreEqual(plain, result1); + CollectionAssert.AreEqual(plain, result2); + } + + #endregion + + #region Concurrency Tests + + [TestMethod] + public async Task ConcurrentUnwrapKey_WithPrefetchCache_NoContention() + { + // Arrange — prefetch the key, then hammer UnwrapKey from many threads + InMemoryKeyResolver resolver = new InMemoryKeyResolver(); + resolver.RegisterKey(TestKeyId, shift: 1); + EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); + + await provider.PrefetchUnwrapKeyAsync(TestKeyId, EncryptedKey, CancellationToken.None); + int resolveCountAfterPrefetch = resolver.ResolveAsyncCallCount; + + int concurrency = 50; + ConcurrentBag results = new ConcurrentBag(); + List tasks = new List(); + + // Act — simulate concurrent feed iterator decryption + for (int i = 0; i < concurrency; i++) + { + tasks.Add(Task.Run(() => + { + byte[] result = provider.UnwrapKey(TestKeyId, KeyEncryptionKeyAlgorithm.RSA_OAEP, EncryptedKey); + results.Add(result); + })); + } + + await Task.WhenAll(tasks); + + // Assert — all results correct, no additional Resolve calls + Assert.AreEqual(concurrency, results.Count); + foreach (byte[] result in results) + { + CollectionAssert.AreEqual(PlainKey, result, "Every concurrent unwrap should return correct bytes."); + } + + Assert.AreEqual(resolveCountAfterPrefetch, resolver.ResolveAsyncCallCount, + "No additional ResolveAsync calls should happen — all served from prefetch cache."); + Assert.AreEqual(0, resolver.ResolveSyncCallCount, + "No sync Resolve calls should happen when prefetch cache is warm."); + } + + [TestMethod] + public async Task ConcurrentUnwrapKey_ColdCache_AllSucceed() + { + // Arrange — no prefetch, all threads hit the sync fallback + InMemoryKeyResolver resolver = new InMemoryKeyResolver(); + resolver.RegisterKey(TestKeyId, shift: 1); + EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); + + int concurrency = 20; + ConcurrentBag results = new ConcurrentBag(); + List tasks = new List(); + + // Act + for (int i = 0; i < concurrency; i++) + { + tasks.Add(Task.Run(() => + { + byte[] result = provider.UnwrapKey(TestKeyId, KeyEncryptionKeyAlgorithm.RSA_OAEP, EncryptedKey); + results.Add(result); + })); + } + + await Task.WhenAll(tasks); + + // Assert — all succeed despite contention on the sync fallback + Assert.AreEqual(concurrency, results.Count); + foreach (byte[] result in results) + { + CollectionAssert.AreEqual(PlainKey, result); + } + } + + [TestMethod] + public async Task ConcurrentPrefetchAndUnwrap_MixedPattern() + { + // Simulate the real pattern: some threads prefetch while others are already unwrapping. + InMemoryKeyResolver resolver = new InMemoryKeyResolver(); + resolver.RegisterKey(TestKeyId, shift: 1); + EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); + + int totalOps = 40; + ConcurrentBag results = new ConcurrentBag(); + List tasks = new List(); + + for (int i = 0; i < totalOps; i++) + { + if (i % 5 == 0) + { + // Every 5th thread does a prefetch first + tasks.Add(Task.Run(async () => + { + await provider.PrefetchUnwrapKeyAsync(TestKeyId, EncryptedKey, CancellationToken.None); + byte[] result = provider.UnwrapKey(TestKeyId, KeyEncryptionKeyAlgorithm.RSA_OAEP, EncryptedKey); + results.Add(result); + })); + } + else + { + // Other threads just call UnwrapKey directly + tasks.Add(Task.Run(() => + { + byte[] result = provider.UnwrapKey(TestKeyId, KeyEncryptionKeyAlgorithm.RSA_OAEP, EncryptedKey); + results.Add(result); + })); + } + } + + await Task.WhenAll(tasks); + + Assert.AreEqual(totalOps, results.Count); + foreach (byte[] result in results) + { + CollectionAssert.AreEqual(PlainKey, result, "All concurrent operations should return correct bytes."); + } + } + + [TestMethod] + public async Task ConcurrentUnwrapKey_MultipleKeys_NoInterference() + { + // Verify that concurrent operations on different keys don't interfere. + InMemoryKeyResolver resolver = new InMemoryKeyResolver(); + resolver.RegisterKey("keyA", shift: 1); + resolver.RegisterKey("keyB", shift: 2); + resolver.RegisterKey("keyC", shift: 3); + + byte[] encA = PlainKey.Select(b => (byte)(b + 1)).ToArray(); + byte[] encB = PlainKey.Select(b => (byte)(b + 2)).ToArray(); + byte[] encC = PlainKey.Select(b => (byte)(b + 3)).ToArray(); + + EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); + + // Prefetch all keys + await provider.PrefetchUnwrapKeyAsync("keyA", encA, CancellationToken.None); + await provider.PrefetchUnwrapKeyAsync("keyB", encB, CancellationToken.None); + await provider.PrefetchUnwrapKeyAsync("keyC", encC, CancellationToken.None); + + int concurrencyPerKey = 20; + ConcurrentBag<(string Key, byte[] Result)> results = new ConcurrentBag<(string, byte[])>(); + List tasks = new List(); + + foreach ((string keyId, byte[] enc) in new[] { ("keyA", encA), ("keyB", encB), ("keyC", encC) }) + { + for (int i = 0; i < concurrencyPerKey; i++) + { + string capturedKeyId = keyId; + byte[] capturedEnc = enc; + tasks.Add(Task.Run(() => + { + byte[] result = provider.UnwrapKey(capturedKeyId, KeyEncryptionKeyAlgorithm.RSA_OAEP, capturedEnc); + results.Add((capturedKeyId, result)); + })); + } + } + + await Task.WhenAll(tasks); + + Assert.AreEqual(concurrencyPerKey * 3, results.Count); + foreach ((string key, byte[] result) in results) + { + CollectionAssert.AreEqual(PlainKey, result, $"Key '{key}' should unwrap to original plaintext."); + } + } + + #endregion + + #region WrapKey Tests + + [TestMethod] + public void WrapKey_PassesThroughToResolver() + { + // Arrange + InMemoryKeyResolver resolver = new InMemoryKeyResolver(); + resolver.RegisterKey(TestKeyId, shift: 1); + EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); + + // Act + byte[] wrapped = provider.WrapKey(TestKeyId, KeyEncryptionKeyAlgorithm.RSA_OAEP, PlainKey); + + // Assert + CollectionAssert.AreEqual(EncryptedKey, wrapped); + } + + #endregion + + #region Prefetch + Latency Simulation Tests + + [TestMethod] + public async Task PrefetchUnwrapKeyAsync_WithSlowResolver_UnwrapKeyIsFast() + { + // Arrange — resolver has 200ms latency + InMemoryKeyResolver resolver = new InMemoryKeyResolver(); + resolver.RegisterKey(TestKeyId, shift: 1); + resolver.ResolveAsyncDelay = TimeSpan.FromMilliseconds(200); + EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); + + // Act — prefetch absorbs the latency + await provider.PrefetchUnwrapKeyAsync(TestKeyId, EncryptedKey, CancellationToken.None); + + // Time the sync UnwrapKey — it should be near-instant from cache + System.Diagnostics.Stopwatch sw = System.Diagnostics.Stopwatch.StartNew(); + byte[] result = provider.UnwrapKey(TestKeyId, KeyEncryptionKeyAlgorithm.RSA_OAEP, EncryptedKey); + sw.Stop(); + + // Assert + CollectionAssert.AreEqual(PlainKey, result); + Assert.IsTrue(sw.ElapsedMilliseconds < 50, + $"UnwrapKey from prefetch cache should be near-instant but took {sw.ElapsedMilliseconds}ms."); + } + + [TestMethod] + public async Task ConcurrentUnwrapKey_WithSlowResolver_PrefetchEliminatesContention() + { + // Simulate the real scenario: slow Key Vault, high concurrency. + // With prefetch, all threads should return instantly. + InMemoryKeyResolver resolver = new InMemoryKeyResolver(); + resolver.RegisterKey(TestKeyId, shift: 1); + resolver.ResolveAsyncDelay = TimeSpan.FromMilliseconds(500); + EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); + + // Prefetch outside the hot path (absorbs the 500ms) + await provider.PrefetchUnwrapKeyAsync(TestKeyId, EncryptedKey, CancellationToken.None); + + int concurrency = 100; + ConcurrentBag timings = new ConcurrentBag(); + List tasks = new List(); + + for (int i = 0; i < concurrency; i++) + { + tasks.Add(Task.Run(() => + { + System.Diagnostics.Stopwatch sw = System.Diagnostics.Stopwatch.StartNew(); + byte[] result = provider.UnwrapKey(TestKeyId, KeyEncryptionKeyAlgorithm.RSA_OAEP, EncryptedKey); + sw.Stop(); + timings.Add(sw.ElapsedMilliseconds); + })); + } + + await Task.WhenAll(tasks); + + long maxTiming = timings.Max(); + Assert.IsTrue(maxTiming < 100, + $"With warm prefetch cache, worst-case UnwrapKey should be <100ms, was {maxTiming}ms."); + Assert.AreEqual(0, resolver.ResolveSyncCallCount, + "No sync Resolve calls when prefetch cache is warm."); + } + + #endregion + + #region Edge Case Tests + + [TestMethod] + public void UnwrapKey_ThrowsOnInvalidAlgorithm() + { + InMemoryKeyResolver resolver = new InMemoryKeyResolver(); + resolver.RegisterKey(TestKeyId, shift: 1); + EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); + + Assert.ThrowsException( + () => provider.UnwrapKey(TestKeyId, (KeyEncryptionKeyAlgorithm)999, EncryptedKey)); + } + + [TestMethod] + public void Sign_ThrowsNotSupported() + { + InMemoryKeyResolver resolver = new InMemoryKeyResolver(); + EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); + + Assert.ThrowsException( + () => provider.Sign("key", true)); + } + + [TestMethod] + public void Verify_ThrowsNotSupported() + { + InMemoryKeyResolver resolver = new InMemoryKeyResolver(); + EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); + + Assert.ThrowsException( + () => provider.Verify("key", true, new byte[] { 1 })); + } + + [TestMethod] + public void ProviderName_ReturnsConfiguredName() + { + InMemoryKeyResolver resolver = new InMemoryKeyResolver(); + EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); + + Assert.AreEqual(TestProviderName, provider.ProviderName); + } + + [TestMethod] + public void DataEncryptionKeyCacheTimeToLive_IsZeroByDefault() + { + InMemoryKeyResolver resolver = new InMemoryKeyResolver(); + EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); + + Assert.AreEqual(TimeSpan.Zero, provider.DataEncryptionKeyCacheTimeToLive); + } + + #endregion + + #region Dispose / Lifecycle Tests + + [TestMethod] + public void Dispose_CanBeCalledMultipleTimes() + { + // Arrange + InMemoryKeyResolver resolver = new InMemoryKeyResolver(); + EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); + + // Act & Assert — should not throw on double-dispose + provider.Dispose(); + provider.Dispose(); + } + + [TestMethod] + public async Task Dispose_CancelsInFlightBackgroundRefresh() + { + // Arrange — use a very slow resolver so the background refresh is still in-flight when we dispose + InMemoryKeyResolver resolver = new InMemoryKeyResolver(); + resolver.RegisterKey(TestKeyId, shift: 1); + resolver.ResolveAsyncDelay = TimeSpan.FromSeconds(30); // very slow + + EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); + + // Pre-warm with a fast resolve first (bypass the slow delay for initial population) + TimeSpan originalDelay = resolver.ResolveAsyncDelay; + resolver.ResolveAsyncDelay = TimeSpan.Zero; + await provider.PrefetchUnwrapKeyAsync(TestKeyId, EncryptedKey, CancellationToken.None); + resolver.ResolveAsyncDelay = originalDelay; + + // Manually expire the cached entry by calling UnwrapKey enough that + // the proactive refresh would fire. We can't wait 1h55m, so instead + // verify that Dispose doesn't hang even if a refresh is in-flight. + // (The actual proactive trigger depends on TTL proximity — here we + // just test that Dispose completes promptly.) + + // Act — dispose should return quickly, not block on the 30s resolve + System.Diagnostics.Stopwatch sw = System.Diagnostics.Stopwatch.StartNew(); + provider.Dispose(); + sw.Stop(); + + Assert.IsTrue(sw.ElapsedMilliseconds < 1000, + $"Dispose should complete promptly (not block on background I/O), took {sw.ElapsedMilliseconds}ms."); + } + + [TestMethod] + public async Task Dispose_ClearsPrefetchCache() + { + // Arrange + InMemoryKeyResolver resolver = new InMemoryKeyResolver(); + resolver.RegisterKey(TestKeyId, shift: 1); + EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); + + await provider.PrefetchUnwrapKeyAsync(TestKeyId, EncryptedKey, CancellationToken.None); + + // Verify cache is warm + byte[] warmResult = provider.UnwrapKey(TestKeyId, KeyEncryptionKeyAlgorithm.RSA_OAEP, EncryptedKey); + CollectionAssert.AreEqual(PlainKey, warmResult); + int resolveCountBeforeDispose = resolver.ResolveAsyncCallCount + resolver.ResolveSyncCallCount; + + // Act + provider.Dispose(); + + // After dispose, UnwrapKey should go through the sync fallback + // (prefetch cache was cleared). This also verifies the provider + // is still functionally usable for in-flight operations. + byte[] postDisposeResult = provider.UnwrapKey(TestKeyId, KeyEncryptionKeyAlgorithm.RSA_OAEP, EncryptedKey); + CollectionAssert.AreEqual(PlainKey, postDisposeResult); + + int resolveCountAfterDispose = resolver.ResolveAsyncCallCount + resolver.ResolveSyncCallCount; + Assert.IsTrue(resolveCountAfterDispose > resolveCountBeforeDispose, + "After Dispose clears prefetch cache, UnwrapKey should require a new Resolve call."); + } + + #endregion + + #region InMemoryKeyResolver — test double + + /// + /// An in-memory implementation of for unit testing. + /// Each registered key uses a simple shift cipher (add/subtract a byte offset). + /// Tracks call counts for assertions and supports configurable latency. + /// + internal sealed class InMemoryKeyResolver : IKeyEncryptionKeyResolver + { + private readonly Dictionary keyShifts = new Dictionary(); + + private int resolveAsyncCallCount; + private int resolveSyncCallCount; + + /// + /// Optional delay injected into ResolveAsync to simulate Key Vault latency. + /// + public TimeSpan ResolveAsyncDelay { get; set; } = TimeSpan.Zero; + + /// + /// Optional delay injected into sync Resolve to simulate Key Vault latency. + /// + public TimeSpan ResolveSyncDelay { get; set; } = TimeSpan.Zero; + + public int ResolveAsyncCallCount => this.resolveAsyncCallCount; + + public int ResolveSyncCallCount => this.resolveSyncCallCount; + + public void RegisterKey(string keyId, int shift) + { + this.keyShifts[keyId] = shift; + } + + public IKeyEncryptionKey Resolve(string keyId, CancellationToken cancellationToken = default) + { + cancellationToken.ThrowIfCancellationRequested(); + Interlocked.Increment(ref this.resolveSyncCallCount); + + if (this.ResolveSyncDelay > TimeSpan.Zero) + { + Thread.Sleep(this.ResolveSyncDelay); + } + + return this.CreateKey(keyId); + } + + public async Task ResolveAsync(string keyId, CancellationToken cancellationToken = default) + { + cancellationToken.ThrowIfCancellationRequested(); + Interlocked.Increment(ref this.resolveAsyncCallCount); + + if (this.ResolveAsyncDelay > TimeSpan.Zero) + { + await Task.Delay(this.ResolveAsyncDelay, cancellationToken); + } + + return this.CreateKey(keyId); + } + + private IKeyEncryptionKey CreateKey(string keyId) + { + if (!this.keyShifts.TryGetValue(keyId, out int shift)) + { + throw new InvalidOperationException($"Key '{keyId}' not registered in InMemoryKeyResolver."); + } + + return new InMemoryKeyEncryptionKey(keyId, shift); + } + } + + /// + /// A trivial in-memory using a byte-shift cipher. + /// + internal sealed class InMemoryKeyEncryptionKey : IKeyEncryptionKey + { + private readonly int shift; + + public InMemoryKeyEncryptionKey(string keyId, int shift) + { + this.KeyId = keyId; + this.shift = shift; + } + + public string KeyId { get; } + + public byte[] WrapKey(string algorithm, ReadOnlyMemory key, CancellationToken cancellationToken = default) + { + return key.ToArray().Select(b => (byte)(b + this.shift)).ToArray(); + } + + public Task WrapKeyAsync(string algorithm, ReadOnlyMemory key, CancellationToken cancellationToken = default) + { + return Task.FromResult(this.WrapKey(algorithm, key, cancellationToken)); + } + + public byte[] UnwrapKey(string algorithm, ReadOnlyMemory encryptedKey, CancellationToken cancellationToken = default) + { + return encryptedKey.ToArray().Select(b => (byte)(b - this.shift)).ToArray(); + } + + public Task UnwrapKeyAsync(string algorithm, ReadOnlyMemory encryptedKey, CancellationToken cancellationToken = default) + { + return Task.FromResult(this.UnwrapKey(algorithm, encryptedKey, cancellationToken)); + } + } + + #endregion + } +} From 32f663ba07977d019d55a6f76a5183424d45ef96 Mon Sep 17 00:00:00 2001 From: Abhijeet Mohanty Date: Thu, 19 Feb 2026 19:30:41 -0500 Subject: [PATCH 02/10] Separate out EncryptionKeyStoreProvider sub-classes. --- .../CachingEncryptionKeyStoreProviderImpl.cs | 210 ++++++++++++++++++ .../src/EncryptionCosmosClient.cs | 27 ++- .../src/EncryptionKeyStoreProviderImpl.cs | 194 ++-------------- .../src/EncryptionSettingForProperty.cs | 33 ++- .../EncryptionKeyStoreProviderImplTests.cs | 119 ++++++---- 5 files changed, 353 insertions(+), 230 deletions(-) create mode 100644 Microsoft.Azure.Cosmos.Encryption/src/CachingEncryptionKeyStoreProviderImpl.cs diff --git a/Microsoft.Azure.Cosmos.Encryption/src/CachingEncryptionKeyStoreProviderImpl.cs b/Microsoft.Azure.Cosmos.Encryption/src/CachingEncryptionKeyStoreProviderImpl.cs new file mode 100644 index 0000000000..42a1f1a587 --- /dev/null +++ b/Microsoft.Azure.Cosmos.Encryption/src/CachingEncryptionKeyStoreProviderImpl.cs @@ -0,0 +1,210 @@ +//------------------------------------------------------------ +// Copyright (c) Microsoft Corporation. All rights reserved. +//------------------------------------------------------------ + +namespace Microsoft.Azure.Cosmos.Encryption +{ + using System; + using System.Collections.Concurrent; + using System.Threading; + using System.Threading.Tasks; + using global::Azure.Core.Cryptography; + using Microsoft.Data.Encryption.Cryptography; + + /// + /// Extends with an async prefetch cache + /// and proactive background refresh so that the synchronous + /// call (which runs inside the global encryption semaphore) returns instantly from + /// cache with zero Key Vault I/O. + /// + /// Enabled by setting environment variable + /// AZURE_COSMOS_ENCRYPTION_OPTIMISTIC_DECRYPTION_ENABLED=true. + /// + /// When disabled (default), is + /// used instead and behaviour is identical to the original sync-only implementation. + /// + internal sealed class CachingEncryptionKeyStoreProviderImpl : EncryptionKeyStoreProviderImpl + { + /// + /// When a cached key is within this duration of its expiry a background refresh + /// is scheduled so the sync path never encounters a cold cache. + /// + private static readonly TimeSpan ProactiveRefreshThreshold = TimeSpan.FromMinutes(5); + + /// + /// Cache of asynchronously pre-fetched unwrapped key bytes, keyed by the hex + /// representation of the encrypted key. + /// + private readonly ConcurrentDictionary prefetchedKeys = new ConcurrentDictionary(); + + /// + /// Tracks cache keys that have a background refresh in-flight to deduplicate concurrent refresh tasks. + /// + private readonly ConcurrentDictionary refreshesInFlight = new ConcurrentDictionary(); + + /// + /// Cancellation source for background proactive-refresh tasks. Cancelled on + /// so in-flight refreshes are promptly stopped and the + /// provider / key-resolver / credential chain can be garbage collected. + /// + private readonly CancellationTokenSource backgroundCts = new CancellationTokenSource(); + + /// + /// Guard for to make double-cleanup and concurrent + /// cleanup calls safe. 0 = not cleaned up, 1 = cleaned up. + /// + private int cleanedUp; + + public CachingEncryptionKeyStoreProviderImpl(IKeyEncryptionKeyResolver keyEncryptionKeyResolver, string providerName) + : base(keyEncryptionKeyResolver, providerName) + { + } + + public override byte[] UnwrapKey(string encryptionKeyId, KeyEncryptionKeyAlgorithm algorithm, byte[] encryptedKey) + { + string cacheKey = encryptedKey.ToHexString(); + + // Fast path: return from the prefetch cache — zero I/O, no latency. + if (this.prefetchedKeys.TryGetValue(cacheKey, out PrefetchedKeyData cached)) + { + if (DateTime.UtcNow < cached.ExpiresAtUtc) + { + // Proactive refresh: if we are nearing expiry, kick off a background + // async refresh so the cache stays warm for the next caller. + if (DateTime.UtcNow > cached.ExpiresAtUtc - ProactiveRefreshThreshold) + { + this.ScheduleBackgroundRefresh(encryptionKeyId, encryptedKey); + } + + return cached.UnwrappedKeyBytes; + } + + // Entry has expired — remove it and fall through to the sync path. + this.prefetchedKeys.TryRemove(cacheKey, out _); + } + + // Slow path (safety net): sync Resolve + UnwrapKey. On success the result + // is pushed into the prefetch cache so future calls are fast. + return this.GetOrCreateDataEncryptionKey(cacheKey, UnWrapKeyCore); + + byte[] UnWrapKeyCore() + { + byte[] unwrapped = this.KeyEncryptionKeyResolver + .Resolve(encryptionKeyId) + .UnwrapKey(EncryptionKeyStoreProviderImpl.GetNameForKeyEncryptionKeyAlgorithm(algorithm), encryptedKey); + + this.prefetchedKeys[cacheKey] = new PrefetchedKeyData( + unwrapped, + DateTime.UtcNow.Add(ProtectedDataEncryptionKey.TimeToLive)); + + return unwrapped; + } + } + + /// + /// Asynchronously pre-warms the unwrapped-key cache for + /// so that the synchronous call (which runs inside the global + /// encryption semaphore) can return instantly without any Key Vault I/O. + /// + /// This MUST be called before acquiring the global semaphore. + /// + internal override async Task PrefetchUnwrapKeyAsync( + string encryptionKeyId, + byte[] encryptedKey, + CancellationToken cancellationToken) + { + string cacheKey = encryptedKey.ToHexString(); + + // Skip when the cache is still well within its TTL. + if (this.prefetchedKeys.TryGetValue(cacheKey, out PrefetchedKeyData existing) + && DateTime.UtcNow < existing.ExpiresAtUtc - ProactiveRefreshThreshold) + { + return; + } + + // ResolveAsync + UnwrapKeyAsync: fully async Key Vault I/O, done outside + // the global semaphore so other threads are never blocked. + IKeyEncryptionKey keyEncryptionKey = await this.KeyEncryptionKeyResolver.ResolveAsync(encryptionKeyId, cancellationToken).ConfigureAwait(false); + + byte[] unwrappedKey = await keyEncryptionKey.UnwrapKeyAsync( + EncryptionKeyStoreProviderImpl.RsaOaepWrapAlgorithm, + encryptedKey, + cancellationToken).ConfigureAwait(false); + + this.prefetchedKeys[cacheKey] = new PrefetchedKeyData( + unwrappedKey, + DateTime.UtcNow.Add(ProtectedDataEncryptionKey.TimeToLive)); + } + + /// + /// Cancels any in-flight background refresh tasks and releases the + /// . Called from + /// . + /// + internal override void Cleanup() + { + if (Interlocked.Exchange(ref this.cleanedUp, 1) != 0) + { + return; + } + + this.backgroundCts.Cancel(); + this.backgroundCts.Dispose(); + this.prefetchedKeys.Clear(); + } + + /// + /// Fires a background task to refresh the prefetch cache entry for the given + /// encrypted key, keeping the sync path warm. + /// Concurrent refreshes for the same key are deduplicated. + /// + private void ScheduleBackgroundRefresh(string encryptionKeyId, byte[] encryptedKey) + { + string cacheKey = encryptedKey.ToHexString(); + + if (!this.refreshesInFlight.TryAdd(cacheKey, 0)) + { + return; // refresh already in progress + } + + CancellationToken token = this.backgroundCts.Token; + + _ = Task.Run(async () => + { + try + { + await this.PrefetchUnwrapKeyAsync( + encryptionKeyId, + encryptedKey, + token).ConfigureAwait(false); + } + catch (Exception) + { + // Best-effort: if the background refresh fails (including + // cancellation on Cleanup), the next sync UnwrapKey call will + // fall through to the slow path. No data loss. + } + finally + { + this.refreshesInFlight.TryRemove(cacheKey, out _); + } + }); + } + + /// + /// Immutable record holding a pre-fetched unwrapped key and its expiry. + /// + private sealed class PrefetchedKeyData + { + public PrefetchedKeyData(byte[] unwrappedKeyBytes, DateTime expiresAtUtc) + { + this.UnwrappedKeyBytes = unwrappedKeyBytes; + this.ExpiresAtUtc = expiresAtUtc; + } + + public byte[] UnwrappedKeyBytes { get; } + + public DateTime ExpiresAtUtc { get; } + } + } +} diff --git a/Microsoft.Azure.Cosmos.Encryption/src/EncryptionCosmosClient.cs b/Microsoft.Azure.Cosmos.Encryption/src/EncryptionCosmosClient.cs index 8b54d2796b..718019624c 100644 --- a/Microsoft.Azure.Cosmos.Encryption/src/EncryptionCosmosClient.cs +++ b/Microsoft.Azure.Cosmos.Encryption/src/EncryptionCosmosClient.cs @@ -18,6 +18,8 @@ internal sealed class EncryptionCosmosClient : CosmosClient { internal static readonly SemaphoreSlim EncryptionKeyCacheSemaphore = new SemaphoreSlim(1, 1); + private static readonly string OptimisticDecryptionEnabledEnvironmentVariable = "AZURE_COSMOS_ENCRYPTION_OPTIMISTIC_DECRYPTION_ENABLED"; + private readonly CosmosClient cosmosClient; private readonly AsyncCache clientEncryptionKeyPropertiesCacheByKeyId; @@ -32,7 +34,12 @@ public EncryptionCosmosClient( this.KeyEncryptionKeyResolver = keyEncryptionKeyResolver ?? throw new ArgumentNullException(nameof(keyEncryptionKeyResolver)); this.KeyEncryptionKeyResolverName = keyEncryptionKeyResolverName ?? throw new ArgumentNullException(nameof(keyEncryptionKeyResolverName)); this.clientEncryptionKeyPropertiesCacheByKeyId = new AsyncCache(); - this.EncryptionKeyStoreProviderImpl = new EncryptionKeyStoreProviderImpl(keyEncryptionKeyResolver, keyEncryptionKeyResolverName); + + bool optimisticDecryption = EncryptionCosmosClient.IsOptimisticDecryptionEnabled(); + this.EnableAlgorithmCaching = optimisticDecryption; + this.EncryptionKeyStoreProviderImpl = optimisticDecryption + ? new CachingEncryptionKeyStoreProviderImpl(keyEncryptionKeyResolver, keyEncryptionKeyResolverName) + : new EncryptionKeyStoreProviderImpl(keyEncryptionKeyResolver, keyEncryptionKeyResolverName); keyCacheTimeToLive ??= TimeSpan.FromHours(1); @@ -66,6 +73,16 @@ public EncryptionCosmosClient( public override Uri Endpoint => this.cosmosClient.Endpoint; + /// + /// Gets a value indicating whether optimistic decryption is enabled. + /// When true, caches the + /// + /// to avoid per-property semaphore acquisition, and + /// uses async key prefetch with proactive refresh. Controlled by environment variable + /// AZURE_COSMOS_ENCRYPTION_OPTIMISTIC_DECRYPTION_ENABLED. Default: false. + /// + internal bool EnableAlgorithmCaching { get; } + public override async Task CreateDatabaseAsync( string id, int? throughput = null, @@ -251,12 +268,18 @@ protected override void Dispose(bool disposing) { if (disposing) { - this.EncryptionKeyStoreProviderImpl.Dispose(); + this.EncryptionKeyStoreProviderImpl.Cleanup(); } this.cosmosClient.Dispose(); } + private static bool IsOptimisticDecryptionEnabled() + { + string value = Environment.GetEnvironmentVariable(OptimisticDecryptionEnabledEnvironmentVariable); + return bool.TryParse(value, out bool result) && result; + } + private async Task FetchClientEncryptionKeyPropertiesAsync( EncryptionContainer encryptionContainer, string clientEncryptionKeyId, diff --git a/Microsoft.Azure.Cosmos.Encryption/src/EncryptionKeyStoreProviderImpl.cs b/Microsoft.Azure.Cosmos.Encryption/src/EncryptionKeyStoreProviderImpl.cs index 2f9174d6e5..51380e1614 100644 --- a/Microsoft.Azure.Cosmos.Encryption/src/EncryptionKeyStoreProviderImpl.cs +++ b/Microsoft.Azure.Cosmos.Encryption/src/EncryptionKeyStoreProviderImpl.cs @@ -5,7 +5,6 @@ namespace Microsoft.Azure.Cosmos.Encryption { using System; - using System.Collections.Concurrent; using System.Threading; using System.Threading.Tasks; using global::Azure.Core.Cryptography; @@ -24,45 +23,12 @@ namespace Microsoft.Azure.Cosmos.Encryption /// ProtectedDataEncryptionKey -> KeyEncryptionKey(containing EncryptionKeyStoreProviderImpl object) -> EncryptionKeyStoreProviderImpl.UnWrapKey -> this.keyEncryptionKeyResolver.UnwrapKey /// /// - internal class EncryptionKeyStoreProviderImpl : EncryptionKeyStoreProvider, IDisposable + internal class EncryptionKeyStoreProviderImpl : EncryptionKeyStoreProvider { public const string RsaOaepWrapAlgorithm = "RSA-OAEP"; - /// - /// When a cached key is within this duration of its expiry a background refresh - /// is scheduled so the sync path never encounters a cold cache. - /// - private static readonly TimeSpan ProactiveRefreshThreshold = TimeSpan.FromMinutes(5); - private readonly IKeyEncryptionKeyResolver keyEncryptionKeyResolver; - /// - /// Cache of asynchronously pre-fetched unwrapped key bytes, keyed by the hex - /// representation of the encrypted key. Populated by - /// (called outside the global encryption - /// semaphore) and read by (called inside the semaphore) - /// for an instant, I/O-free return. - /// - private readonly ConcurrentDictionary prefetchedKeys = new ConcurrentDictionary(); - - /// - /// Tracks cache keys that have a background refresh in-flight to deduplicate concurrent refresh tasks. - /// - private readonly ConcurrentDictionary refreshesInFlight = new ConcurrentDictionary(); - - /// - /// Cancellation source for background proactive-refresh tasks. Cancelled on - /// so in-flight refreshes are promptly stopped and the - /// provider / key-resolver / credential chain can be garbage collected. - /// - private readonly CancellationTokenSource backgroundCts = new CancellationTokenSource(); - - /// - /// Guard for to make double-dispose and concurrent - /// dispose calls safe. 0 = not disposed, 1 = disposed. - /// - private int disposed; - public EncryptionKeyStoreProviderImpl(IKeyEncryptionKeyResolver keyEncryptionKeyResolver, string providerName) { this.keyEncryptionKeyResolver = keyEncryptionKeyResolver; @@ -72,48 +38,23 @@ public EncryptionKeyStoreProviderImpl(IKeyEncryptionKeyResolver keyEncryptionKey public override string ProviderName { get; } + /// + /// Gets the key encryption key resolver for use by derived classes. + /// + internal IKeyEncryptionKeyResolver KeyEncryptionKeyResolver => this.keyEncryptionKeyResolver; + public override byte[] UnwrapKey(string encryptionKeyId, KeyEncryptionKeyAlgorithm algorithm, byte[] encryptedKey) { - string cacheKey = encryptedKey.ToHexString(); - - // Fast path: return from the prefetch cache — zero I/O, no latency. - if (this.prefetchedKeys.TryGetValue(cacheKey, out PrefetchedKeyData cached)) - { - if (DateTime.UtcNow < cached.ExpiresAtUtc) - { - // Proactive refresh: if we are nearing expiry, kick off a background - // async refresh so the cache stays warm for the next caller. - if (DateTime.UtcNow > cached.ExpiresAtUtc - ProactiveRefreshThreshold) - { - this.ScheduleBackgroundRefresh(encryptionKeyId, encryptedKey); - } - - return cached.UnwrappedKeyBytes; - } - - // Entry has expired — remove it and fall through to the sync path. - this.prefetchedKeys.TryRemove(cacheKey, out _); - } - - // Slow path (safety net): if the prefetch cache is cold — e.g. first call - // before PrefetchUnwrapKeyAsync ran, or a narrow race on cache expiry — - // fall back to the original synchronous Resolve + UnwrapKey. On success - // the result is pushed into the prefetch cache so future calls are fast. - // Note: cacheKey (already computed above) is captured by the closure — - // avoids recomputing encryptedKey.ToHexString() inside GetOrCreateDataEncryptionKey. - return this.GetOrCreateDataEncryptionKey(cacheKey, UnWrapKeyCore); + // since we do not expose GetOrCreateDataEncryptionKey we first look up the cache. + // Cache miss results in call to UnWrapCore which updates the cache after UnwrapKeyAsync is called. + return this.GetOrCreateDataEncryptionKey(encryptedKey.ToHexString(), UnWrapKeyCore); + // delegate that is called by GetOrCreateDataEncryptionKey, which unwraps the key and updates the cache in case of cache miss. byte[] UnWrapKeyCore() { - byte[] unwrapped = this.keyEncryptionKeyResolver + return this.keyEncryptionKeyResolver .Resolve(encryptionKeyId) .UnwrapKey(EncryptionKeyStoreProviderImpl.GetNameForKeyEncryptionKeyAlgorithm(algorithm), encryptedKey); - - this.prefetchedKeys[cacheKey] = new PrefetchedKeyData( - unwrapped, - DateTime.UtcNow.Add(ProtectedDataEncryptionKey.TimeToLive)); - - return unwrapped; } } @@ -140,63 +81,7 @@ public override bool Verify(string encryptionKeyId, bool allowEnclaveComputation throw new NotSupportedException("The Verify operation is not supported."); } - /// - /// Cancels any in-flight background refresh tasks and releases the - /// . This allows the provider, - /// key resolver, and credential chain to be garbage collected promptly - /// when the is disposed. - /// - public void Dispose() - { - if (Interlocked.Exchange(ref this.disposed, 1) != 0) - { - return; // already disposed — safe for double/concurrent Dispose calls - } - - this.backgroundCts.Cancel(); - this.backgroundCts.Dispose(); - this.prefetchedKeys.Clear(); - } - - /// - /// Asynchronously pre-warms the unwrapped-key cache for - /// so that the synchronous call (which runs inside the global - /// encryption semaphore) can return instantly without any Key Vault I/O. - /// - /// This MUST be called before acquiring the global semaphore. - /// - /// Key Encryption Key identifier (e.g. AKV key URI). - /// The wrapped Data Encryption Key bytes. - /// Cancellation token. - internal async Task PrefetchUnwrapKeyAsync( - string encryptionKeyId, - byte[] encryptedKey, - CancellationToken cancellationToken) - { - string cacheKey = encryptedKey.ToHexString(); - - // Skip when the cache is still well within its TTL. - if (this.prefetchedKeys.TryGetValue(cacheKey, out PrefetchedKeyData existing) - && DateTime.UtcNow < existing.ExpiresAtUtc - ProactiveRefreshThreshold) - { - return; - } - - // ResolveAsync + UnwrapKeyAsync: fully async Key Vault I/O, done outside - // the global semaphore so other threads are never blocked. - IKeyEncryptionKey keyEncryptionKey = await this.keyEncryptionKeyResolver.ResolveAsync(encryptionKeyId, cancellationToken).ConfigureAwait(false); - - byte[] unwrappedKey = await keyEncryptionKey.UnwrapKeyAsync( - EncryptionKeyStoreProviderImpl.RsaOaepWrapAlgorithm, - encryptedKey, - cancellationToken).ConfigureAwait(false); - - this.prefetchedKeys[cacheKey] = new PrefetchedKeyData( - unwrappedKey, - DateTime.UtcNow.Add(ProtectedDataEncryptionKey.TimeToLive)); - } - - private static string GetNameForKeyEncryptionKeyAlgorithm(KeyEncryptionKeyAlgorithm algorithm) + internal static string GetNameForKeyEncryptionKeyAlgorithm(KeyEncryptionKeyAlgorithm algorithm) { if (algorithm == KeyEncryptionKeyAlgorithm.RSA_OAEP) { @@ -207,58 +92,23 @@ private static string GetNameForKeyEncryptionKeyAlgorithm(KeyEncryptionKeyAlgori } /// - /// Fires a background task to refresh the prefetch cache entry for the given - /// encrypted key, keeping the sync path warm. - /// Concurrent refreshes for the same key are deduplicated. - /// The task is cancelled when is called. + /// No-op in the base implementation. Overridden in + /// to asynchronously pre-warm the unwrapped-key cache. /// - private void ScheduleBackgroundRefresh(string encryptionKeyId, byte[] encryptedKey) + internal virtual Task PrefetchUnwrapKeyAsync( + string encryptionKeyId, + byte[] encryptedKey, + CancellationToken cancellationToken) { - string cacheKey = encryptedKey.ToHexString(); - - if (!this.refreshesInFlight.TryAdd(cacheKey, 0)) - { - return; // refresh already in progress - } - - CancellationToken token = this.backgroundCts.Token; - - _ = Task.Run(async () => - { - try - { - await this.PrefetchUnwrapKeyAsync( - encryptionKeyId, - encryptedKey, - token).ConfigureAwait(false); - } - catch (Exception) - { - // Best-effort: if the background refresh fails (including - // cancellation on Dispose), the next sync UnwrapKey call will - // fall through to the slow path. No data loss. - } - finally - { - this.refreshesInFlight.TryRemove(cacheKey, out _); - } - }); + return Task.CompletedTask; } /// - /// Immutable record holding a pre-fetched unwrapped key and its expiry. + /// No-op in the base implementation. Overridden in + /// to cancel background tasks and release resources. /// - private sealed class PrefetchedKeyData + internal virtual void Cleanup() { - public PrefetchedKeyData(byte[] unwrappedKeyBytes, DateTime expiresAtUtc) - { - this.UnwrappedKeyBytes = unwrappedKeyBytes; - this.ExpiresAtUtc = expiresAtUtc; - } - - public byte[] UnwrappedKeyBytes { get; } - - public DateTime ExpiresAtUtc { get; } } } } diff --git a/Microsoft.Azure.Cosmos.Encryption/src/EncryptionSettingForProperty.cs b/Microsoft.Azure.Cosmos.Encryption/src/EncryptionSettingForProperty.cs index 33ba1b3657..73b13d18bb 100644 --- a/Microsoft.Azure.Cosmos.Encryption/src/EncryptionSettingForProperty.cs +++ b/Microsoft.Azure.Cosmos.Encryption/src/EncryptionSettingForProperty.cs @@ -74,14 +74,17 @@ public async Task BuildEncryptionAlgori return this.injectedAlgorithm; } - // Return cached algorithm if still valid. This eliminates the per-property - // semaphore acquisition that causes contention under concurrent decryption. - // The cached algorithm holds a reference to the ProtectedDataEncryptionKey - // whose underlying key bytes don't change — only the cache entry expires. - AeadAes256CbcHmac256EncryptionAlgorithm cached = this.cachedEncryptionAlgorithm; - if (cached != null && DateTime.UtcNow.Ticks < Interlocked.Read(ref this.cachedAlgorithmExpiryTicks)) + // Return cached algorithm if still valid and algorithm caching is enabled. + // This eliminates the per-property semaphore acquisition that causes + // contention under concurrent decryption. Gated by + // AZURE_COSMOS_ENCRYPTION_ALGORITHM_CACHING_ENABLED (default: off). + if (this.encryptionContainer.EncryptionCosmosClient.EnableAlgorithmCaching) { - return cached; + AeadAes256CbcHmac256EncryptionAlgorithm cached = this.cachedEncryptionAlgorithm; + if (cached != null && DateTime.UtcNow.Ticks < Interlocked.Read(ref this.cachedAlgorithmExpiryTicks)) + { + return cached; + } } ClientEncryptionKeyProperties clientEncryptionKeyProperties = await this.encryptionContainer.EncryptionCosmosClient.GetClientEncryptionKeyPropertiesAsync( @@ -141,8 +144,12 @@ public async Task BuildEncryptionAlgori // Cache the algorithm so subsequent calls for this property skip the semaphore entirely. // Use the same TTL as ProtectedDataEncryptionKey (defaults to 2hrs). - this.cachedEncryptionAlgorithm = aeadAes256CbcHmac256EncryptionAlgorithm; - Interlocked.Exchange(ref this.cachedAlgorithmExpiryTicks, DateTime.UtcNow.Add(ProtectedDataEncryptionKey.TimeToLive).Ticks); + // Only active when AZURE_COSMOS_ENCRYPTION_ALGORITHM_CACHING_ENABLED is set. + if (this.encryptionContainer.EncryptionCosmosClient.EnableAlgorithmCaching) + { + this.cachedEncryptionAlgorithm = aeadAes256CbcHmac256EncryptionAlgorithm; + Interlocked.Exchange(ref this.cachedAlgorithmExpiryTicks, DateTime.UtcNow.Add(ProtectedDataEncryptionKey.TimeToLive).Ticks); + } return aeadAes256CbcHmac256EncryptionAlgorithm; } @@ -208,9 +215,11 @@ private async Task BuildProtectedDataEncryptionKeyAs CancellationToken cancellationToken) { // Pre-warm the unwrap key cache asynchronously, BEFORE acquiring the global - // semaphore. When ProtectedDataEncryptionKey's constructor later calls - // UnwrapKey (sync, inside the semaphore), it will find a warm prefetch cache - // and return instantly — no Key Vault I/O under the lock. + // semaphore. Only active when AZURE_COSMOS_ENCRYPTION_KEY_PREFETCH_ENABLED + // is set on EncryptionKeyStoreProviderImpl (checked internally by PrefetchUnwrapKeyAsync). + // When ProtectedDataEncryptionKey's constructor later calls UnwrapKey (sync, + // inside the semaphore), it will find a warm prefetch cache and return + // instantly — no Key Vault I/O under the lock. // Best-effort: if prefetch fails, the sync fallback inside UnwrapKey handles // it, and the L2 PDEK cache may avoid the unwrap call entirely. try diff --git a/Microsoft.Azure.Cosmos.Encryption/tests/Microsoft.Azure.Cosmos.Encryption.Tests/EncryptionKeyStoreProviderImplTests.cs b/Microsoft.Azure.Cosmos.Encryption/tests/Microsoft.Azure.Cosmos.Encryption.Tests/EncryptionKeyStoreProviderImplTests.cs index 329ca170b8..bdeda3b109 100644 --- a/Microsoft.Azure.Cosmos.Encryption/tests/Microsoft.Azure.Cosmos.Encryption.Tests/EncryptionKeyStoreProviderImplTests.cs +++ b/Microsoft.Azure.Cosmos.Encryption/tests/Microsoft.Azure.Cosmos.Encryption.Tests/EncryptionKeyStoreProviderImplTests.cs @@ -45,7 +45,7 @@ public async Task PrefetchUnwrapKeyAsync_WarmsCacheForSubsequentUnwrapKey() // Arrange InMemoryKeyResolver resolver = new InMemoryKeyResolver(); resolver.RegisterKey(TestKeyId, shift: 1); - EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); + EncryptionKeyStoreProviderImpl provider = new CachingEncryptionKeyStoreProviderImpl(resolver, TestProviderName); // Act — prefetch the key asynchronously await provider.PrefetchUnwrapKeyAsync(TestKeyId, EncryptedKey, CancellationToken.None); @@ -68,7 +68,7 @@ public async Task PrefetchUnwrapKeyAsync_SkipsRefreshWhenCacheIsWarm() // Arrange InMemoryKeyResolver resolver = new InMemoryKeyResolver(); resolver.RegisterKey(TestKeyId, shift: 1); - EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); + EncryptionKeyStoreProviderImpl provider = new CachingEncryptionKeyStoreProviderImpl(resolver, TestProviderName); // Act — prefetch twice in quick succession await provider.PrefetchUnwrapKeyAsync(TestKeyId, EncryptedKey, CancellationToken.None); @@ -89,7 +89,7 @@ public async Task PrefetchUnwrapKeyAsync_PropagatesCancellation() InMemoryKeyResolver resolver = new InMemoryKeyResolver(); resolver.RegisterKey(TestKeyId, shift: 1); resolver.ResolveAsyncDelay = TimeSpan.FromSeconds(10); // simulate slow resolve - EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); + EncryptionKeyStoreProviderImpl provider = new CachingEncryptionKeyStoreProviderImpl(resolver, TestProviderName); using CancellationTokenSource cts = new CancellationTokenSource(); cts.Cancel(); // pre-cancel @@ -105,7 +105,7 @@ public async Task PrefetchUnwrapKeyAsync_UsesResolveAsyncNotResolve() // Arrange InMemoryKeyResolver resolver = new InMemoryKeyResolver(); resolver.RegisterKey(TestKeyId, shift: 1); - EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); + EncryptionKeyStoreProviderImpl provider = new CachingEncryptionKeyStoreProviderImpl(resolver, TestProviderName); // Act await provider.PrefetchUnwrapKeyAsync(TestKeyId, EncryptedKey, CancellationToken.None); @@ -126,7 +126,7 @@ public async Task PrefetchUnwrapKeyAsync_IndependentKeysAreCachedSeparately() byte[] encKeyA = PlainKey.Select(b => (byte)(b + 1)).ToArray(); byte[] encKeyB = PlainKey.Select(b => (byte)(b + 2)).ToArray(); - EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); + EncryptionKeyStoreProviderImpl provider = new CachingEncryptionKeyStoreProviderImpl(resolver, TestProviderName); // Act await provider.PrefetchUnwrapKeyAsync("key-A", encKeyA, CancellationToken.None); @@ -151,7 +151,7 @@ public void UnwrapKey_FallsBackToSyncWhenPrefetchCacheIsCold() // Arrange — no prefetch call; cache is empty InMemoryKeyResolver resolver = new InMemoryKeyResolver(); resolver.RegisterKey(TestKeyId, shift: 1); - EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); + EncryptionKeyStoreProviderImpl provider = new CachingEncryptionKeyStoreProviderImpl(resolver, TestProviderName); // Act byte[] result = provider.UnwrapKey(TestKeyId, KeyEncryptionKeyAlgorithm.RSA_OAEP, EncryptedKey); @@ -170,7 +170,7 @@ public void UnwrapKey_SyncFallbackPopulatesPrefetchCache() // Arrange — cold cache, sync fallback will populate it InMemoryKeyResolver resolver = new InMemoryKeyResolver(); resolver.RegisterKey(TestKeyId, shift: 1); - EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); + EncryptionKeyStoreProviderImpl provider = new CachingEncryptionKeyStoreProviderImpl(resolver, TestProviderName); // Act — first call goes through sync path provider.UnwrapKey(TestKeyId, KeyEncryptionKeyAlgorithm.RSA_OAEP, EncryptedKey); @@ -197,7 +197,7 @@ public void UnwrapKey_ReturnsCorrectResultForDifferentKeys() byte[] enc1 = plain.Select(b => (byte)(b + 3)).ToArray(); byte[] enc2 = plain.Select(b => (byte)(b + 7)).ToArray(); - EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); + EncryptionKeyStoreProviderImpl provider = new CachingEncryptionKeyStoreProviderImpl(resolver, TestProviderName); // Act byte[] result1 = provider.UnwrapKey("key1", KeyEncryptionKeyAlgorithm.RSA_OAEP, enc1); @@ -218,7 +218,7 @@ public async Task ConcurrentUnwrapKey_WithPrefetchCache_NoContention() // Arrange — prefetch the key, then hammer UnwrapKey from many threads InMemoryKeyResolver resolver = new InMemoryKeyResolver(); resolver.RegisterKey(TestKeyId, shift: 1); - EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); + EncryptionKeyStoreProviderImpl provider = new CachingEncryptionKeyStoreProviderImpl(resolver, TestProviderName); await provider.PrefetchUnwrapKeyAsync(TestKeyId, EncryptedKey, CancellationToken.None); int resolveCountAfterPrefetch = resolver.ResolveAsyncCallCount; @@ -258,7 +258,7 @@ public async Task ConcurrentUnwrapKey_ColdCache_AllSucceed() // Arrange — no prefetch, all threads hit the sync fallback InMemoryKeyResolver resolver = new InMemoryKeyResolver(); resolver.RegisterKey(TestKeyId, shift: 1); - EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); + EncryptionKeyStoreProviderImpl provider = new CachingEncryptionKeyStoreProviderImpl(resolver, TestProviderName); int concurrency = 20; ConcurrentBag results = new ConcurrentBag(); @@ -290,7 +290,7 @@ public async Task ConcurrentPrefetchAndUnwrap_MixedPattern() // Simulate the real pattern: some threads prefetch while others are already unwrapping. InMemoryKeyResolver resolver = new InMemoryKeyResolver(); resolver.RegisterKey(TestKeyId, shift: 1); - EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); + EncryptionKeyStoreProviderImpl provider = new CachingEncryptionKeyStoreProviderImpl(resolver, TestProviderName); int totalOps = 40; ConcurrentBag results = new ConcurrentBag(); @@ -341,7 +341,7 @@ public async Task ConcurrentUnwrapKey_MultipleKeys_NoInterference() byte[] encB = PlainKey.Select(b => (byte)(b + 2)).ToArray(); byte[] encC = PlainKey.Select(b => (byte)(b + 3)).ToArray(); - EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); + EncryptionKeyStoreProviderImpl provider = new CachingEncryptionKeyStoreProviderImpl(resolver, TestProviderName); // Prefetch all keys await provider.PrefetchUnwrapKeyAsync("keyA", encA, CancellationToken.None); @@ -385,7 +385,7 @@ public void WrapKey_PassesThroughToResolver() // Arrange InMemoryKeyResolver resolver = new InMemoryKeyResolver(); resolver.RegisterKey(TestKeyId, shift: 1); - EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); + EncryptionKeyStoreProviderImpl provider = new CachingEncryptionKeyStoreProviderImpl(resolver, TestProviderName); // Act byte[] wrapped = provider.WrapKey(TestKeyId, KeyEncryptionKeyAlgorithm.RSA_OAEP, PlainKey); @@ -405,7 +405,7 @@ public async Task PrefetchUnwrapKeyAsync_WithSlowResolver_UnwrapKeyIsFast() InMemoryKeyResolver resolver = new InMemoryKeyResolver(); resolver.RegisterKey(TestKeyId, shift: 1); resolver.ResolveAsyncDelay = TimeSpan.FromMilliseconds(200); - EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); + EncryptionKeyStoreProviderImpl provider = new CachingEncryptionKeyStoreProviderImpl(resolver, TestProviderName); // Act — prefetch absorbs the latency await provider.PrefetchUnwrapKeyAsync(TestKeyId, EncryptedKey, CancellationToken.None); @@ -429,7 +429,7 @@ public async Task ConcurrentUnwrapKey_WithSlowResolver_PrefetchEliminatesContent InMemoryKeyResolver resolver = new InMemoryKeyResolver(); resolver.RegisterKey(TestKeyId, shift: 1); resolver.ResolveAsyncDelay = TimeSpan.FromMilliseconds(500); - EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); + EncryptionKeyStoreProviderImpl provider = new CachingEncryptionKeyStoreProviderImpl(resolver, TestProviderName); // Prefetch outside the hot path (absorbs the 500ms) await provider.PrefetchUnwrapKeyAsync(TestKeyId, EncryptedKey, CancellationToken.None); @@ -513,29 +513,67 @@ public void DataEncryptionKeyCacheTimeToLive_IsZeroByDefault() #endregion - #region Dispose / Lifecycle Tests + #region Base Class No-Op Tests [TestMethod] - public void Dispose_CanBeCalledMultipleTimes() + public async Task BaseClass_PrefetchUnwrapKeyAsync_IsNoOp() + { + // Arrange — use the base class directly (no caching) + InMemoryKeyResolver resolver = new InMemoryKeyResolver(); + resolver.RegisterKey(TestKeyId, shift: 1); + EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); + + // Act — prefetch should be a no-op on the base class + await provider.PrefetchUnwrapKeyAsync(TestKeyId, EncryptedKey, CancellationToken.None); + + // Assert — no resolver calls made (base class returns Task.CompletedTask) + Assert.AreEqual(0, resolver.ResolveAsyncCallCount, + "Base class PrefetchUnwrapKeyAsync should be a no-op."); + + // UnwrapKey should use the sync Resolve path + byte[] result = provider.UnwrapKey(TestKeyId, KeyEncryptionKeyAlgorithm.RSA_OAEP, EncryptedKey); + CollectionAssert.AreEqual(PlainKey, result); + Assert.AreEqual(1, resolver.ResolveSyncCallCount, + "Base class UnwrapKey should use the sync Resolve path."); + } + + [TestMethod] + public void BaseClass_Cleanup_IsNoOp() { // Arrange InMemoryKeyResolver resolver = new InMemoryKeyResolver(); EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); - // Act & Assert — should not throw on double-dispose - provider.Dispose(); - provider.Dispose(); + // Act & Assert — should not throw + provider.Cleanup(); + provider.Cleanup(); // double-call safe } + #endregion + + #region Cleanup / Lifecycle Tests + [TestMethod] - public async Task Dispose_CancelsInFlightBackgroundRefresh() + public void Cleanup_CanBeCalledMultipleTimes() { - // Arrange — use a very slow resolver so the background refresh is still in-flight when we dispose + // Arrange + InMemoryKeyResolver resolver = new InMemoryKeyResolver(); + CachingEncryptionKeyStoreProviderImpl provider = new CachingEncryptionKeyStoreProviderImpl(resolver, TestProviderName); + + // Act & Assert — should not throw on double-cleanup + provider.Cleanup(); + provider.Cleanup(); + } + + [TestMethod] + public async Task Cleanup_CancelsInFlightBackgroundRefresh() + { + // Arrange — use a very slow resolver so the background refresh is still in-flight when we clean up InMemoryKeyResolver resolver = new InMemoryKeyResolver(); resolver.RegisterKey(TestKeyId, shift: 1); resolver.ResolveAsyncDelay = TimeSpan.FromSeconds(30); // very slow - EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); + CachingEncryptionKeyStoreProviderImpl provider = new CachingEncryptionKeyStoreProviderImpl(resolver, TestProviderName); // Pre-warm with a fast resolve first (bypass the slow delay for initial population) TimeSpan originalDelay = resolver.ResolveAsyncDelay; @@ -543,48 +581,41 @@ public async Task Dispose_CancelsInFlightBackgroundRefresh() await provider.PrefetchUnwrapKeyAsync(TestKeyId, EncryptedKey, CancellationToken.None); resolver.ResolveAsyncDelay = originalDelay; - // Manually expire the cached entry by calling UnwrapKey enough that - // the proactive refresh would fire. We can't wait 1h55m, so instead - // verify that Dispose doesn't hang even if a refresh is in-flight. - // (The actual proactive trigger depends on TTL proximity — here we - // just test that Dispose completes promptly.) - - // Act — dispose should return quickly, not block on the 30s resolve + // Act — cleanup should return quickly, not block on the 30s resolve System.Diagnostics.Stopwatch sw = System.Diagnostics.Stopwatch.StartNew(); - provider.Dispose(); + provider.Cleanup(); sw.Stop(); Assert.IsTrue(sw.ElapsedMilliseconds < 1000, - $"Dispose should complete promptly (not block on background I/O), took {sw.ElapsedMilliseconds}ms."); + $"Cleanup should complete promptly (not block on background I/O), took {sw.ElapsedMilliseconds}ms."); } [TestMethod] - public async Task Dispose_ClearsPrefetchCache() + public async Task Cleanup_ClearsPrefetchCache() { // Arrange InMemoryKeyResolver resolver = new InMemoryKeyResolver(); resolver.RegisterKey(TestKeyId, shift: 1); - EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); + CachingEncryptionKeyStoreProviderImpl provider = new CachingEncryptionKeyStoreProviderImpl(resolver, TestProviderName); await provider.PrefetchUnwrapKeyAsync(TestKeyId, EncryptedKey, CancellationToken.None); // Verify cache is warm byte[] warmResult = provider.UnwrapKey(TestKeyId, KeyEncryptionKeyAlgorithm.RSA_OAEP, EncryptedKey); CollectionAssert.AreEqual(PlainKey, warmResult); - int resolveCountBeforeDispose = resolver.ResolveAsyncCallCount + resolver.ResolveSyncCallCount; + int resolveCountBeforeCleanup = resolver.ResolveAsyncCallCount + resolver.ResolveSyncCallCount; // Act - provider.Dispose(); + provider.Cleanup(); - // After dispose, UnwrapKey should go through the sync fallback - // (prefetch cache was cleared). This also verifies the provider - // is still functionally usable for in-flight operations. - byte[] postDisposeResult = provider.UnwrapKey(TestKeyId, KeyEncryptionKeyAlgorithm.RSA_OAEP, EncryptedKey); - CollectionAssert.AreEqual(PlainKey, postDisposeResult); + // After cleanup, UnwrapKey should go through the sync fallback + // (prefetch cache was cleared). + byte[] postCleanupResult = provider.UnwrapKey(TestKeyId, KeyEncryptionKeyAlgorithm.RSA_OAEP, EncryptedKey); + CollectionAssert.AreEqual(PlainKey, postCleanupResult); - int resolveCountAfterDispose = resolver.ResolveAsyncCallCount + resolver.ResolveSyncCallCount; - Assert.IsTrue(resolveCountAfterDispose > resolveCountBeforeDispose, - "After Dispose clears prefetch cache, UnwrapKey should require a new Resolve call."); + int resolveCountAfterCleanup = resolver.ResolveAsyncCallCount + resolver.ResolveSyncCallCount; + Assert.IsTrue(resolveCountAfterCleanup > resolveCountBeforeCleanup, + "After Cleanup clears prefetch cache, UnwrapKey should require a new Resolve call."); } #endregion From 9413386d6d85abeea983d7d8a7401c9eca30c49c Mon Sep 17 00:00:00 2001 From: Abhijeet Mohanty Date: Thu, 5 Mar 2026 16:56:30 -0500 Subject: [PATCH 03/10] Add OpenSpec change: reduce-encryption-contention (proposal, design, specs, tasks). --- .../.openspec.yaml | 2 + .../reduce-encryption-contention/design.md | 127 ++++++++++++++++++ .../reduce-encryption-contention/proposal.md | 29 ++++ .../specs/async-dek-prefetch/spec.md | 63 +++++++++ .../specs/env-var-feature-gate/spec.md | 34 +++++ .../specs/resolved-client-cache/spec.md | 30 +++++ .../reduce-encryption-contention/tasks.md | 74 ++++++++++ 7 files changed, 359 insertions(+) create mode 100644 openspec/changes/reduce-encryption-contention/.openspec.yaml create mode 100644 openspec/changes/reduce-encryption-contention/design.md create mode 100644 openspec/changes/reduce-encryption-contention/proposal.md create mode 100644 openspec/changes/reduce-encryption-contention/specs/async-dek-prefetch/spec.md create mode 100644 openspec/changes/reduce-encryption-contention/specs/env-var-feature-gate/spec.md create mode 100644 openspec/changes/reduce-encryption-contention/specs/resolved-client-cache/spec.md create mode 100644 openspec/changes/reduce-encryption-contention/tasks.md diff --git a/openspec/changes/reduce-encryption-contention/.openspec.yaml b/openspec/changes/reduce-encryption-contention/.openspec.yaml new file mode 100644 index 0000000000..8f0b86992e --- /dev/null +++ b/openspec/changes/reduce-encryption-contention/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-03-05 diff --git a/openspec/changes/reduce-encryption-contention/design.md b/openspec/changes/reduce-encryption-contention/design.md new file mode 100644 index 0000000000..36bf35a482 --- /dev/null +++ b/openspec/changes/reduce-encryption-contention/design.md @@ -0,0 +1,127 @@ +## Context + +Client-side encryption in the Cosmos DB .NET SDK uses a global `SemaphoreSlim(1,1)` that serializes all encryption key operations. The semaphore guards MDE’s `ProtectedDataEncryptionKey.GetOrCreate` and `KeyEncryptionKey.GetOrCreate` — both of which are cache lookups on the hot path but trigger synchronous Key Vault HTTP calls on cache miss. PDEK resolution is sync, on the hot path, and makes two blocking HTTP calls to Key Vault (Resolve + UnwrapKey) under the semaphore. + +The call amplification is severe: `DecryptObjectAsync` calls `BuildEncryptionAlgorithmForSettingAsync` per encrypted leaf value per document per page. With concurrent feed iterators, this produces thousands of semaphore acquisitions per second, all serialized to a single permit. + +**Structural constraint**: MDE's `ProtectedDataEncryptionKey` constructor chain is sync (C# cannot make base-ctor calls async). `EncryptionKeyStoreProvider.UnwrapKey` returns `byte[]` (sync). MDE's `LocalCache.GetOrCreate` takes `Func` not `Func>`. The sync chain from `ProtectedDataEncryptionKey` → `KeyEncryptionKey.DecryptEncryptionKey` → `UnwrapKey` → `Resolve` + `UnwrapKey` on Key Vault cannot be made async at any point in MDE's type hierarchy. However, `IKeyEncryptionKeyResolver` exposes `ResolveAsync` and `IKeyEncryptionKey` exposes `UnwrapKeyAsync` — these async variants are reachable from our code, just not through MDE's call chain. + +## Goals / Non-Goals + +**Goals:** +- Move PDEK resolution (Key Vault I/O) off the hot path via async prefetch outside the semaphore +- Prevent AKV overload: proactive background refresh, deduplicated to one call per key per interval +- Reduce Key Vault calls per refresh: cache resolved `IKeyEncryptionKey` so only `UnwrapKey` hits AKV, not `Resolve` + `UnwrapKey` +- Gate all changes behind an opt-in env var for safe rollout — zero behavior change when disabled +- No public API changes. No breaking changes. + +**Non-Goals:** +- Changing MDE's type hierarchy or making the `ProtectedDataEncryptionKey` constructor async +- Removing the semaphore entirely (it guards MDE internal state mutation — still needed) +- Per-key semaphores (reduces cross-key contention but same-key still serializes; added complexity for marginal gain) +- Caching the `AeadAes256CbcHmac256EncryptionAlgorithm` at the `EncryptionSettingForProperty` level (would need to be hoisted above the semaphore to reduce contention, which reintroduces ETag validation and Forbidden retry path complexity; however, MDE has a built-in `GetOrCreate` cache that the SDK currently bypasses — see Future Considerations)- Enabling the MDE DEK byte cache (`DataEncryptionKeyCacheTimeToLive`) as a standalone layer (redundant when async prefetch is working — the sync `UnwrapKey` path reads from the prefetch cache, never reaching MDE’s built-in cache) +- CEK rotation handling (CEK replacement requires data migration — re-encrypting every document — and is an offline operation; not a runtime concern for caching)- Supporting key rotation detection at the cache level (CEK rewrap doesn't change plaintext bytes; new CEK policy creates new `EncryptionSettingForProperty` objects) + +## Decisions + +### Decision 1: Async prefetch in a sealed subclass, not in the base `EncryptionKeyStoreProviderImpl` + +**Chosen**: New `CachingEncryptionKeyStoreProviderImpl` (sealed, internal) extends `EncryptionKeyStoreProviderImpl`. Base class stays clean sync-only. `EncryptionCosmosClient` instantiates the subclass when env var is on. + +**Rationale**: The base class serves all existing customers with zero behavior change. The subclass adds: prefetch `ConcurrentDictionary`, resolved-client cache, `CancellationTokenSource`, `Cleanup()` method, and overridden `UnwrapKey` that checks prefetch cache first. Separation keeps the risk surface isolated. + +**Rejected alternative**: Adding prefetch directly to base class with conditional branches — mixes concerns, harder to reason about lifecycle, risk of unintended behavior when env var is off. + +### Decision 2: `ConcurrentDictionary` for prefetch cache (not `MemoryCache` or MDE's `LocalCache`) + +**Chosen**: `ConcurrentDictionary` with manual TTL tracking (expiry stored alongside value). + +**Rationale**: MDE's `LocalCache.GetOrCreate` is get-or-create only — no `Set`, `Remove`, or `Evict` API. `MemoryCache` adds a dependency and has its own concurrency model. `ConcurrentDictionary` is simple, lock-free reads, and we control TTL ourselves. + +### Decision 3: Proactive refresh via `Task.Run`, not `Timer` + +**Chosen**: On cache access, if within 5 minutes of expiry, fire `Task.Run` to refresh in background. Deduplicate via `ConcurrentDictionary`. + +**Rationale**: `Timer` requires managing timer lifecycle per key, disposal ordering, and thread-pool callbacks. `Task.Run` on access is simpler — only fires when the key is actually being used (no wasted refresh for unused keys). Tied to `CancellationTokenSource` for cleanup on dispose. + +**Rejected alternative**: `System.Threading.Timer` per key — more deterministic timing but complex lifecycle management (dispose timers on cache eviction, handle timer callbacks after disposal). + +## Risks / Trade-offs + +- **[Non-risk] Cache coherence on key rotation**: CEK rewrap (same plaintext DEK, new CMK wrapper) is the only runtime rotation — all caches remain valid. CEK replacement is out of scope (requires data migration, offline operation). + +- **[Risk] Background refresh lifecycle]: `Task.Run` captures `this`, keeping the provider and all its dependencies (resolver, credential chain) GC-rooted until the task completes. → **Mitigation**: `CancellationTokenSource` cancelled on `Cleanup()`/`Dispose()`. `Interlocked.Exchange` for idempotent dispose. Background tasks observe cancellation token. + +- **[Risk] Two copies of plaintext DEK in memory**: PDEK internal field + prefetch `ConcurrentDictionary`. → **Mitigation**: Same process, same threat boundary, same TTL. No new attack surface. Industry standard (Azure Key Vault SDK recommends caching). + +- **[Risk] Env var read at construction only**: If an operator enables the env var and restarts the app, old client instances still run without optimization. → **Mitigation**: This is by design — matches SDK `ConfigurationManager` pattern. Client must be reconstructed to pick up changes. + +- **[Risk] AKV burst on multi-CEK proactive refresh**: If multiple Client Encryption Keys have similar TTLs (common at startup — all populated in a short window), their proactive refreshes fire simultaneously. Each refresh is deduplicated per key (1 call per CEK), but N CEKs = N concurrent AKV calls. AKV throttles at 4000 ops/vault/10s — unlikely to hit for typical customers (1–10 CEKs), but possible for large deployments. → **Mitigation (recommended)**: Add jitter to the proactive refresh window (random offset within 2–5 minutes before expiry) so CEKs don’t all refresh at the same instant. Consider a concurrency limiter on background AKV calls. + +- **[Trade-off] Prefetch adds complexity for correctness**: Async prefetch + background refresh + disposal lifecycle is more complex than a one-liner (`TimeSpan.Zero` → `TimeSpan.FromHours(1)`). But it solves the actual problem — PDEK resolution is async, off the hot path, and doesn’t overload AKV. → **Accept**: Complexity is isolated in a sealed subclass, gated by env var. + +## Migration Plan + +1. All changes are gated by `AZURE_COSMOS_ENCRYPTION_OPTIMISTIC_DECRYPTION_ENABLED` (off by default). +2. Ship in next SDK release with release notes documenting the env var. +3. Customers experiencing contention enable the env var. No code changes to their app. +4. After bake time (1–2 release cycles), consider enabling by default. +5. **Rollback**: Set env var to false (or unset it). Immediate revert to current behavior on next client construction. + +## Performance Benchmarking Strategy + +The fix must be validated with quantitative evidence, not just passing tests. The benchmark should reproduce the customer's actual workload — not a synthetic microbenchmark. + +**Customer workload profile (from incident investigation):** +- Service running high-concurrency `CosmosFeedIterator` reads with client-side encryption enabled +- Multiple encrypted properties per document (nested objects/arrays → per-leaf-value semaphore acquisition) +- AKV-backed key resolver (`KeyResolver(DefaultTokenCredential)`) with `DefaultAzureCredential` (Managed Identity in production) +- Sustained concurrent load — multiple feed iterators running in parallel +- Failure point: PDEK cache TTL expires (every 1–2 hours) → first thread blocks on sync Key Vault I/O inside semaphore → queued threads' cancellation tokens fire → `OperationCanceledException` +- Cancellation timeout: ~5s (inferred from call stack — Change Feed Processor lease rebalance or request timeout) + +**The benchmark must reproduce this pattern**: concurrent feed iterator reads decrypting multi-property documents, crossing a PDEK cache TTL boundary, with realistic Key Vault latency. + +**What to measure:** +- **Semaphore hold time** (p50, p95, p99): How long each thread holds the semaphore during `BuildProtectedDataEncryptionKeyAsync`. Baseline (without fix) should show 200ms–2.4s on cache miss. With fix, should drop to microseconds. +- **End-to-end decrypt latency** (p50, p95, p99): Time from `ReadNextAsync` call to decrypted response, under concurrent load. +- **Throughput**: Decrypted documents per second at N concurrent feed iterators. +- **AKV call count**: Total `Resolve` + `UnwrapKey` calls to Key Vault over a fixed workload. Should drop from O(PDEK misses × 2) to O(CEKs × 1 per TTL interval). +- **`OperationCanceledException` rate**: With a 5s cancellation timeout (matching customer pattern), count how many operations are cancelled. Baseline should reproduce the customer's issue; with fix, should be zero. + +**Benchmark configuration:** +- Concurrent feed iterators: 1, 10, 50, 100 +- Encrypted properties per document: 1, 5, 10 +- Documents per page: 10, 50 +- CEK count: 1, 5 (to test multi-CEK refresh burst) +- Key Vault latency: simulated via `InMemoryKeyResolver` with configurable delay (0ms, 100ms, 500ms, 2000ms) — 500ms represents typical AKV latency, 2000ms represents cold token + AKV worst case +- PDEK cache state: warm (cache hit) and cold (force TTL expiry to simulate the 2-hour boundary) +- **Key scenario (must reproduce customer failure)**: Start with warm cache under sustained concurrent reads (50+ feed iterators), then force PDEK TTL expiry mid-workload with 500ms simulated AKV latency and 5s cancellation timeout. Baseline should show `OperationCanceledException` cascade; with fix, should show zero cancellations. + +**Comparison:** +- Env var off (baseline — current behavior) vs. env var on (with fix) +- Same workload, same concurrency, same simulated Key Vault latency +- Output: CSV with latency percentiles, throughput, AKV call counts, cancellation counts + +**Where to run:** +- Unit-level microbenchmarks (in-process, `InMemoryKeyResolver` with configurable delay) for semaphore hold time and throughput +- Emulator E2E benchmarks for end-to-end latency with real Cosmos operations (requires emulator) + +## Open Questions + +1. Should the resolved-client cache (`IKeyEncryptionKey`) be invalidated on a timer, or only on CMK URL mismatch? The `CryptographyClient` is stateless and long-lived, so indefinite caching until URL change seems safe. +2. What should the prefetch cache TTL be? Should it match PDEK TTL (1–2 hours) or be shorter? +3. **AKV burst on multi-CEK refresh**: If a customer has N distinct Client Encryption Keys all created around the same time, their prefetch caches expire roughly together, causing N concurrent `ResolveAsync` + `UnwrapKeyAsync` calls to AKV in one burst. Should the proactive refresh window include jitter/stagger (e.g., random offset within 2–5 minutes before expiry) to spread calls over time? Should there be a rate limiter (max M concurrent background AKV calls)? +4. **Single vs. dual `ConcurrentDictionary` for prefetch**: The current design uses two dictionaries — `ConcurrentDictionary` (result cache) and `ConcurrentDictionary` (inflight deduplication). The result cache serves the sync `UnwrapKey` path (needs `byte[]` immediately, can't `await`). The inflight dictionary is ephemeral (populated only during active AKV calls, removed on completion). An alternative is a single `ConcurrentDictionary>` where the sync path calls `.Result` (safe when the task is already completed from a prior prefetch). Trade-off: fewer data structures vs. less explicit intent. Should the implementer choose based on readability? + +## Future Considerations + +### Use MDE's built-in algorithm cache (`AeadAes256CbcHmac256EncryptionAlgorithm.GetOrCreate`) + +MDE already has a static `LocalCache, AeadAes256CbcHmac256EncryptionAlgorithm>` with a `GetOrCreate` method. The SDK currently bypasses it by calling `new AeadAes256CbcHmac256EncryptionAlgorithm(pdek, encType)` directly at `EncryptionSettingForProperty.cs:114`. + +Switching to `GetOrCreate` would eliminate a redundant allocation per encrypted leaf value on the hot path (when the PDEK cache hits, the same `ProtectedDataEncryptionKey` reference produces equal `Tuple` keys → algorithm cache hit). `ProtectedDataEncryptionKey` overrides `Equals`/`GetHashCode` with structural equality (`Name` + `KeyEncryptionKey` + `rootKeyHexString`), so the cache key works correctly. + +**Why it's not in this change**: The `GetOrCreate` call occurs *after* the semaphore is released — it doesn't reduce semaphore contention. To skip the semaphore entirely, the cache check would need to be hoisted above `BuildProtectedDataEncryptionKeyAsync`, which reintroduces the ETag validation complexity and interaction with the Forbidden retry path. The async prefetch already makes the semaphore hold time microseconds, so the marginal benefit of skipping it entirely is small. + +**Low-risk cleanup**: `new` → `GetOrCreate` is a one-line change that saves allocations without changing semantics. Does not conflict with the `injectedAlgorithm` test hook (that path returns before reaching line 114). Could be done independently of this change. diff --git a/openspec/changes/reduce-encryption-contention/proposal.md b/openspec/changes/reduce-encryption-contention/proposal.md new file mode 100644 index 0000000000..3950438c79 --- /dev/null +++ b/openspec/changes/reduce-encryption-contention/proposal.md @@ -0,0 +1,29 @@ +## Why + +Customer-side encryption operations throw `OperationCanceledException` under concurrent load because a global `SemaphoreSlim(1,1)` in `BuildProtectedDataEncryptionKeyAsync` serializes construction of `ProtectedDataEncryptionKey` objects — the resolved, unwrapped encryption keys needed for every encrypt/decrypt operation. The semaphore guards `KeyEncryptionKey.GetOrCreate` and `ProtectedDataEncryptionKey.GetOrCreate` — MDE's static get-or-create cache operations that, on cache miss, invoke the `createItem` delegate which triggers synchronous Key Vault HTTP calls (Resolve + UnwrapKey). The semaphore ensures only one thread at a time performs the expensive key creation, preventing duplicate Key Vault calls for the same key. However, this means every encrypted leaf value in every document on every page contends on this single-permit lock — even when the cache is warm and the hold time would be microseconds. The root cause: `ProtectedDataEncryptionKey` resolution is synchronous, happens on the hot path under the semaphore, and makes two blocking HTTP calls to Key Vault on cache miss. An internal customer is actively impacted. + +## What Changes + +- **Async prefetch of `ProtectedDataEncryptionKey` resolution outside the semaphore** via `ResolveAsync()` + `UnwrapKeyAsync()` before semaphore acquisition. The sync `UnwrapKey` inside the semaphore cannot be removed — MDE's `ProtectedDataEncryptionKey.GetOrCreate` constructor chain calls it unconditionally. Instead, the prefetch populates a cache so that when MDE's sync `UnwrapKey` fires, it returns cached bytes instantly instead of making HTTP calls to Key Vault. The semaphore is still acquired, but held for microseconds (cache read) instead of 200ms–2.4s (HTTP I/O). +- **Proactive background refresh** of the prefetched unwrapped Data Encryption Key bytes (the plaintext AES-256 key material returned by Key Vault's `UnwrapKey`) ~5min before TTL expiry, deduplicated to one AKV call per key per interval. Prevents thundering herd at TTL boundary. +- **Cache the resolved `IKeyEncryptionKey` (CryptographyClient)** per CMK URL so each refresh makes one AKV call (`UnwrapKey`) instead of two (`Resolve` + `UnwrapKey`). +- All changes gated behind an opt-in environment variable (`AZURE_COSMOS_ENCRYPTION_OPTIMISTIC_DECRYPTION_ENABLED`), off by default. No public API changes. No breaking changes. + +## Capabilities + +### New Capabilities +- `async-dek-prefetch`: Async prefetch of `ProtectedDataEncryptionKey` resolution outside the semaphore using `ResolveAsync()` + `UnwrapKeyAsync()`, with a `ConcurrentDictionary` prefetch cache that the sync `UnwrapKey` reads from. Includes proactive background refresh before TTL expiry and lifecycle management via `CancellationTokenSource`. +- `resolved-client-cache`: Cache the `IKeyEncryptionKey` (CryptographyClient) returned by `Resolve()` per CMK URL to eliminate redundant Key Vault HTTP GETs on each refresh. +- `env-var-feature-gate`: Environment variable gate (`AZURE_COSMOS_ENCRYPTION_OPTIMISTIC_DECRYPTION_ENABLED`) to opt in to all caching/prefetch layers. Off by default. Follows existing SDK `ConfigurationManager` pattern. + +### Modified Capabilities + + +## Impact + +- **Files**: `EncryptionKeyStoreProviderImpl.cs` (or new subclass), `EncryptionCosmosClient.cs`, `EncryptionSettingForProperty.cs` (prefetch wiring only) +- **Dependencies**: No new packages. Uses existing `IKeyEncryptionKeyResolver.ResolveAsync()` / `IKeyEncryptionKey.UnwrapKeyAsync()` from `Azure.Core.Cryptography`. +- **APIs**: No public API changes. Internal-only. +- **Security**: Plaintext Data Encryption Key bytes are already cached in-process by MDE's `ProtectedDataEncryptionKey`. New caches hold the same bytes with the same TTL in the same process — no new attack surface. +- **Risk**: New caching layers introduce lifecycle complexity (background refresh, disposal, cache coherence on key rotation). Gated by env var for safe rollout. +- **Testing**: Unit tests for each cache layer + concurrency. Emulator E2E tests with env var enabled. diff --git a/openspec/changes/reduce-encryption-contention/specs/async-dek-prefetch/spec.md b/openspec/changes/reduce-encryption-contention/specs/async-dek-prefetch/spec.md new file mode 100644 index 0000000000..90ccb8fdf9 --- /dev/null +++ b/openspec/changes/reduce-encryption-contention/specs/async-dek-prefetch/spec.md @@ -0,0 +1,63 @@ +## ADDED Requirements + +### Requirement: Async prefetch of unwrapped DEK bytes outside the semaphore +The system SHALL provide a `PrefetchUnwrapKeyAsync` method that calls `ResolveAsync()` + `UnwrapKeyAsync()` asynchronously and stores the result in a `ConcurrentDictionary` prefetch cache. This method SHALL be called before semaphore acquisition in `BuildProtectedDataEncryptionKeyAsync`. + +#### Scenario: Prefetch warms cache before semaphore +- **WHEN** `BuildEncryptionAlgorithmForSettingAsync` enters the cold path and calls `PrefetchUnwrapKeyAsync` before acquiring the semaphore +- **THEN** `ResolveAsync` and `UnwrapKeyAsync` SHALL execute asynchronously (yielding the thread), and the result SHALL be stored in the prefetch cache + +#### Scenario: Sync UnwrapKey reads from prefetch cache +- **WHEN** MDE's sync `UnwrapKey` is called inside the semaphore and the prefetch cache has a valid entry for the wrapped key +- **THEN** `UnwrapKey` SHALL return the cached bytes immediately without calling `Resolve()` or `UnwrapKey()` on Key Vault + +#### Scenario: Sync fallback on prefetch cache miss +- **WHEN** MDE's sync `UnwrapKey` is called inside the semaphore and the prefetch cache does NOT have an entry (race condition, prefetch failed, or prefetch not called) +- **THEN** `UnwrapKey` SHALL fall through to the existing sync `Resolve()` + `UnwrapKey()` path (identical to current behavior) + +### Requirement: Concurrent prefetch deduplication +The system SHALL deduplicate concurrent prefetch calls for the same wrapped key so that only one async Key Vault call flies per key at a time. + +#### Scenario: Multiple threads prefetch same key +- **WHEN** 50 threads simultaneously call `PrefetchUnwrapKeyAsync` for the same wrapped key +- **THEN** only one `ResolveAsync` + `UnwrapKeyAsync` call SHALL be made to Key Vault; all threads SHALL await the same `Task` + +### Requirement: Proactive background refresh before TTL expiry +The system SHALL schedule a background refresh of the prefetch cache entry approximately 5 minutes before the entry's TTL expires, so that the next consumer finds a warm cache. + +#### Scenario: Background refresh fires before expiry +- **WHEN** a prefetch cache entry is within 5 minutes of expiry and is accessed +- **THEN** the system SHALL initiate a background `Task.Run` that calls `ResolveAsync` + `UnwrapKeyAsync` and updates the cache entry + +#### Scenario: Background refresh failure does not crash +- **WHEN** the background refresh call fails (Key Vault down, 429 throttle, network error) +- **THEN** the failure SHALL be caught and logged; the existing cache entry SHALL remain until its TTL expires; the sync fallback path SHALL handle the next call + +### Requirement: Prefetch cache TTL matches PDEK TTL +The prefetch cache entry TTL SHALL match the `ProtectedDataEncryptionKey.TimeToLive` value. + +#### Scenario: Cache entry expires with PDEK +- **WHEN** the PDEK cache TTL (1–2 hours) elapses +- **THEN** the prefetch cache entry for the same key SHALL also be expired, ensuring a fresh Key Vault call on the next cold path + +### Requirement: Lifecycle management via CancellationTokenSource +The async prefetch layer SHALL use a `CancellationTokenSource` to cancel in-flight background refresh tasks on disposal. + +#### Scenario: Disposal cancels background tasks +- **WHEN** `EncryptionCosmosClient.Dispose()` is called +- **THEN** the `CancellationTokenSource` SHALL be cancelled, all in-flight background refresh tasks SHALL observe cancellation, and the prefetch cache SHALL be cleared + +#### Scenario: Double-dispose is safe +- **WHEN** `Dispose()` is called multiple times +- **THEN** the second and subsequent calls SHALL be no-ops (idempotent via `Interlocked.Exchange`) + +### Requirement: Prefetch errors do not propagate to callers +The prefetch call in `BuildEncryptionAlgorithmForSettingAsync` SHALL be best-effort. + +#### Scenario: Prefetch throws non-cancellation exception +- **WHEN** `PrefetchUnwrapKeyAsync` throws an exception that is not `OperationCanceledException` +- **THEN** the exception SHALL be caught and swallowed; execution SHALL continue to semaphore acquisition and the normal sync path + +#### Scenario: Prefetch throws OperationCanceledException +- **WHEN** `PrefetchUnwrapKeyAsync` throws `OperationCanceledException` (caller's token fired) +- **THEN** the exception SHALL propagate to the caller (same as current behavior when `WaitAsync` is cancelled) diff --git a/openspec/changes/reduce-encryption-contention/specs/env-var-feature-gate/spec.md b/openspec/changes/reduce-encryption-contention/specs/env-var-feature-gate/spec.md new file mode 100644 index 0000000000..2c1f46c17f --- /dev/null +++ b/openspec/changes/reduce-encryption-contention/specs/env-var-feature-gate/spec.md @@ -0,0 +1,34 @@ +## ADDED Requirements + +### Requirement: Single environment variable gates all optimization layers +The system SHALL use a single environment variable `AZURE_COSMOS_ENCRYPTION_OPTIMISTIC_DECRYPTION_ENABLED` to enable or disable all caching and prefetch layers (resolved-client cache, async prefetch, proactive background refresh). + +#### Scenario: Env var not set — all layers disabled +- **WHEN** `AZURE_COSMOS_ENCRYPTION_OPTIMISTIC_DECRYPTION_ENABLED` is not set in the environment +- **THEN** all behavior SHALL be identical to the current codebase: no resolved-client cache, no async prefetch, no proactive background refresh + +#### Scenario: Env var set to true — all layers enabled +- **WHEN** `AZURE_COSMOS_ENCRYPTION_OPTIMISTIC_DECRYPTION_ENABLED` is set to `true` (case-insensitive) +- **THEN** all caching and prefetch layers SHALL be active + +#### Scenario: Env var set to false or invalid — all layers disabled +- **WHEN** `AZURE_COSMOS_ENCRYPTION_OPTIMISTIC_DECRYPTION_ENABLED` is set to `false`, empty, or any value that does not parse as `true` +- **THEN** all behavior SHALL be identical to the env-var-not-set case + +### Requirement: Env var read at EncryptionCosmosClient construction time +The environment variable SHALL be read once during `EncryptionCosmosClient` construction and the result cached for the client's lifetime. Subsequent changes to the environment variable SHALL NOT affect an already-constructed client. + +#### Scenario: Env var read once at startup +- **WHEN** `EncryptionCosmosClient` is constructed +- **THEN** the env var SHALL be read via the SDK's `ConfigurationManager` pattern (or `Environment.GetEnvironmentVariable`) and the boolean result stored as a readonly field + +#### Scenario: Env var change after construction has no effect +- **WHEN** the environment variable is changed after `EncryptionCosmosClient` is constructed +- **THEN** the existing client instance SHALL continue using the value read at construction time + +### Requirement: No public API changes +There SHALL be no new public classes, methods, properties, or parameters exposed. All optimization layers SHALL be entirely internal, activated only by the environment variable. + +#### Scenario: Public API surface unchanged +- **WHEN** the encryption package is built with optimizations enabled +- **THEN** the public API contract (as captured in the contracts file) SHALL be identical to the current version diff --git a/openspec/changes/reduce-encryption-contention/specs/resolved-client-cache/spec.md b/openspec/changes/reduce-encryption-contention/specs/resolved-client-cache/spec.md new file mode 100644 index 0000000000..e943deaa1b --- /dev/null +++ b/openspec/changes/reduce-encryption-contention/specs/resolved-client-cache/spec.md @@ -0,0 +1,30 @@ +## ADDED Requirements + +### Requirement: Resolved IKeyEncryptionKey cached per CMK URL +The system SHALL maintain a `ConcurrentDictionary` keyed by `encryptionKeyId` (the CMK URL) that caches the `CryptographyClient` returned by `IKeyEncryptionKeyResolver.Resolve()`. + +#### Scenario: First resolve for a CMK URL +- **WHEN** `UnwrapKey` (or prefetch) is called for a CMK URL not yet in the resolved-client cache +- **THEN** the system SHALL call `Resolve(keyId)` (or `ResolveAsync(keyId)` on the async path), store the returned `IKeyEncryptionKey` in the cache, and use it for the `UnwrapKey` call + +#### Scenario: Subsequent resolve for the same CMK URL +- **WHEN** `UnwrapKey` is called for a CMK URL that is already in the resolved-client cache +- **THEN** the system SHALL skip the `Resolve()` call and use the cached `IKeyEncryptionKey` directly for the `UnwrapKey` call + +#### Scenario: Key Vault calls halved on true cache miss +- **WHEN** a PDEK miss triggers `UnwrapKey` with a warm resolved-client cache +- **THEN** only one Key Vault HTTP call SHALL be made (`UnwrapKey` POST) instead of two (`Resolve` GET + `UnwrapKey` POST) + +### Requirement: No secret material in resolved-client cache +The `IKeyEncryptionKey` object SHALL contain only the Key Vault URL, key name, key version, and HTTP pipeline configuration. It SHALL NOT contain any private key material. + +#### Scenario: Cache contents are non-secret +- **WHEN** the resolved-client cache is inspected +- **THEN** each entry SHALL be a `CryptographyClient` (or equivalent) containing only the CMK URL and auth pipeline reference — no RSA private key bytes + +### Requirement: Cache invalidation on CMK URL change +The resolved-client cache entry SHALL be invalidated when the CMK URL changes (key rotation where the CEK is rewrapped to a different CMK). + +#### Scenario: CMK URL changes after rewrap +- **WHEN** `ClientEncryptionKeyProperties.EncryptionKeyWrapMetadata.Value` returns a different CMK URL than the one cached +- **THEN** the system SHALL call `Resolve()` with the new URL and update the cache entry diff --git a/openspec/changes/reduce-encryption-contention/tasks.md b/openspec/changes/reduce-encryption-contention/tasks.md new file mode 100644 index 0000000000..46e7a46b3f --- /dev/null +++ b/openspec/changes/reduce-encryption-contention/tasks.md @@ -0,0 +1,74 @@ +## 1. Security Validation — Background DEK Refresh & Indefinite IKeyEncryptionKey Caching + +- [ ] 1.1 **Threat model: plaintext DEK in prefetch cache.** Document what is cached (plaintext AES-256 bytes), where it lives (in-process `ConcurrentDictionary`), and confirm it is the same key material already held by MDE's `ProtectedDataEncryptionKey`. Identify any net-new exposure vs. current state. Deliverable: written assessment in this task item or design.md addendum. +- [ ] 1.2 **Threat model: background `Task.Run` refresh.** Identify risks of a background task calling `ResolveAsync` + `UnwrapKeyAsync` on a timer-like pattern: (a) GC rooting of provider/credential chain while task is in-flight, (b) token credential refresh happening on a ThreadPool thread outside the caller's context, (c) plaintext bytes being written to cache from a background thread after `Dispose()` is called. For each risk, document mitigation (CancellationTokenSource, Interlocked dispose guard, cancellation token observation). +- [ ] 1.3 **Threat model: indefinite `IKeyEncryptionKey` caching.** Confirm the cached `CryptographyClient` contains no secret material (only vault URL + key name + version + HTTP pipeline reference). Identify the invalidation trigger (CMK URL change on key rotation). Document whether an indefinite cache (no TTL, invalidate only on URL mismatch) is safe, or whether a bounded TTL is needed. Check: does the `CryptographyClient` hold a reference to a specific key version, and what happens if the key is disabled/deleted in Key Vault? +- [ ] 1.4 **Key rotation scenario.** Confirm that CEK rewrap (the only runtime rotation) does not change plaintext DEK bytes, so all caches remain valid. CEK replacement is out of scope (requires data migration, offline operation). Document this assumption. +- [ ] 1.5 **Stale key material window.** Quantify the maximum time plaintext DEK bytes from a revoked CMK can remain in memory (prefetch cache TTL + PDEK TTL). CEK replacement is out of scope. Confirm the window is acceptable per Azure Key Vault best practices. +- [ ] 1.6 **Explore: can `AeadAes256CbcHmac256EncryptionAlgorithm.GetOrCreate` replace `new`?** MDE has a built-in static `LocalCache` for algorithm instances keyed by `(DataEncryptionKey, EncryptionType)`. The SDK bypasses it by calling `new` directly. Evaluate: (a) does `ProtectedDataEncryptionKey.Equals`/`GetHashCode` produce correct cache hits for the same logical key? (b) is there any thread-safety concern with MDE's `LocalCache`? (c) could this be a standalone one-line cleanup independent of this change? Document findings. +- [ ] 1.7 **Review with implementing engineer.** Walk through findings from 1.1–1.6 before proceeding to implementation. Resolve any open questions. Update design.md Risks section if new risks are identified. + +## 2. Environment Variable Feature Gate + +- [ ] 2.1 Read `AZURE_COSMOS_ENCRYPTION_OPTIMISTIC_DECRYPTION_ENABLED` in `EncryptionCosmosClient` constructor (via `Environment.GetEnvironmentVariable`, case-insensitive `true` check). Store as `internal readonly bool IsOptimisticDecryptionEnabled`. +- [ ] 2.2 Verify env var is read once at construction and cached — subsequent env var changes do not affect existing client instances. +- [ ] 2.3 Add unit test: env var not set → `IsOptimisticDecryptionEnabled` is false. +- [ ] 2.4 Add unit test: env var set to `true` / `True` / `TRUE` → `IsOptimisticDecryptionEnabled` is true. +- [ ] 2.5 Add unit test: env var set to `false` / empty / garbage → `IsOptimisticDecryptionEnabled` is false. + +## 3. Resolved Client Cache (`CachingEncryptionKeyStoreProviderImpl`) + +- [ ] 4.1 Create `CachingEncryptionKeyStoreProviderImpl` as a sealed internal class extending `EncryptionKeyStoreProviderImpl`. +- [ ] 4.2 Add `ConcurrentDictionary resolvedClientCache` field. +- [ ] 3.3 Override `UnwrapKey`: before calling `Resolve(keyId)`, check `resolvedClientCache.TryGetValue(keyId, out client)`. On hit, use cached client directly for `UnwrapKey`. On miss, call `Resolve`, store result, then call `UnwrapKey`. +- [ ] 3.4 Add unit test: first call for a CMK URL calls `Resolve`; second call skips `Resolve` and calls only `UnwrapKey` (verify via mock). +- [ ] 3.5 Add unit test: different CMK URLs get independent cache entries; resolving one does not affect the other. + +## 4. Async DEK Prefetch (`CachingEncryptionKeyStoreProviderImpl`) + +- [ ] 4.1 Add `ConcurrentDictionary prefetchCache` field to `CachingEncryptionKeyStoreProviderImpl`. +- [ ] 4.2 Add `ConcurrentDictionary inflightPrefetches` field for deduplication. +- [ ] 4.3 Implement `internal async Task PrefetchUnwrapKeyAsync(string keyId, string algorithm, byte[] wrappedKey, CancellationToken ct)`: check prefetch cache → if miss, check inflight → if miss, call `ResolveAsync` + `UnwrapKeyAsync`, store result in prefetch cache with TTL, remove from inflight. +- [ ] 4.4 Update `UnwrapKey` override in `CachingEncryptionKeyStoreProviderImpl`: check prefetch cache first (by `wrappedKey.ToHexString()`). On hit, return immediately. On miss, fall through to resolved-client-cache path then to sync fallback. +- [ ] 4.5 Wire `PrefetchUnwrapKeyAsync` call into `EncryptionSettingForProperty.BuildEncryptionAlgorithmForSettingAsync` — call it BEFORE `SemaphoreSlim.WaitAsync`, wrapped in try/catch (swallow non-cancellation exceptions, propagate `OperationCanceledException`). +- [ ] 4.6 Add unit test: prefetch populates cache → sync `UnwrapKey` returns immediately without Key Vault call. +- [ ] 4.7 Add unit test: prefetch cache miss → sync `UnwrapKey` falls through to sync Resolve + UnwrapKey (existing behavior). +- [ ] 4.8 Add unit test: 50 concurrent threads calling `PrefetchUnwrapKeyAsync` for the same key → only one `ResolveAsync` + `UnwrapKeyAsync` call (deduplication). +- [ ] 4.9 Add unit test: prefetch throws (simulated KV failure) → caller proceeds to semaphore and sync path without error. +- [ ] 4.10 Add unit test: prefetch throws `OperationCanceledException` → exception propagates to caller. + +## 5. Proactive Background Refresh + +- [ ] 5.1 In `CachingEncryptionKeyStoreProviderImpl`: add `CancellationTokenSource backgroundCts` field. +- [ ] 5.2 On prefetch cache access, if entry is within a jittered window before TTL expiry (random offset within 2–5 minutes before expiry, per key, to stagger multi-CEK refreshes), schedule `Task.Run` to call `ResolveAsync` + `UnwrapKeyAsync` and update the cache entry. Use `backgroundCts.Token`. Deduplicate via `inflightPrefetches`. +- [ ] 5.3 Add `Cleanup()` method: cancel `backgroundCts`, clear `prefetchCache`, clear `resolvedClientCache`, clear `inflightPrefetches`. Use `Interlocked.Exchange(ref disposed, 1)` for idempotent disposal. +- [ ] 5.4 Wire `Cleanup()` into `EncryptionCosmosClient.Dispose(bool disposing)`. +- [ ] 5.5 Add unit test: access a cache entry near TTL expiry → background refresh fires (verify via mock that `ResolveAsync` is called). +- [ ] 5.6 Add unit test: background refresh fails → existing cache entry persists until TTL; no exception propagated. +- [ ] 5.7 Add unit test: `Cleanup()` cancels in-flight background tasks (verify `OperationCanceledException` observed by background task). +- [ ] 5.8 Add unit test: double `Cleanup()` is safe (no `ObjectDisposedException`). + +## 6. Integration & Wiring + +- [ ] 6.1 In `EncryptionCosmosClient` constructor: if env var is on, instantiate `CachingEncryptionKeyStoreProviderImpl` instead of `EncryptionKeyStoreProviderImpl`. +- [ ] 6.2 Expose `PrefetchUnwrapKeyAsync` on base class as a virtual no-op (`return Task.CompletedTask`) so `EncryptionSettingForProperty` can call it without knowing the concrete type. +- [ ] 6.3 Verify build: `dotnet build Microsoft.Azure.Cosmos.Encryption/src/ -c Release` — 0 errors, 0 warnings. +- [ ] 6.4 Run existing unit tests: `dotnet test Microsoft.Azure.Cosmos.Encryption/tests/Microsoft.Azure.Cosmos.Encryption.Tests/ --no-build` — all pass. +- [ ] 6.5 Run emulator E2E tests with `AZURE_COSMOS_ENCRYPTION_OPTIMISTIC_DECRYPTION_ENABLED=true`: all existing `MdeEncryptionTests` pass. +- [ ] 6.6 Run emulator E2E tests without env var: all existing `MdeEncryptionTests` pass (verify zero behavior change). + +## 7. Contract & API Validation + +- [ ] 7.1 Verify no public API changes: diff the contracts file before and after. No new public types, methods, or properties. +- [ ] 7.2 Verify `EncryptionKeyStoreProviderImpl` remains `internal`. +- [ ] 7.3 Verify `CachingEncryptionKeyStoreProviderImpl` is `internal sealed`. + +## 8. Performance Benchmarking + +- [ ] 8.1 Create `InMemoryKeyResolver` test harness with configurable delay per `Resolve`/`UnwrapKey` call and call-count tracking. +- [ ] 8.2 Benchmark: semaphore hold time (p50/p95/p99) at 50 concurrent threads, simulated KV latency 500ms, env var off vs. on. Expect: off = hundreds of ms, on = microseconds. +- [ ] 8.3 Benchmark: end-to-end decrypt throughput (docs/sec) at 10, 50, 100 concurrent feed iterators with 5 encrypted properties per document. Env var off vs. on. +- [ ] 8.4 Benchmark: AKV call count over 1000 decrypt operations with 1 CEK. Env var off (expect 2 per PDEK miss) vs. on (expect 1 per TTL refresh, resolved client cached). +- [ ] 8.5 Benchmark: `OperationCanceledException` count at 50 concurrent threads with 5s cancellation timeout and simulated 2s KV latency. Env var off (expect failures) vs. on (expect zero). +- [ ] 8.6 Benchmark: multi-CEK refresh burst — 5 CEKs with aligned TTLs, verify jitter staggers AKV calls (measure peak concurrent AKV calls). +- [ ] 8.7 Document results in a comparison table (baseline vs. optimized) with all metrics. From a86ee61a3610ccf5a399f11d37a2287f17efae0f Mon Sep 17 00:00:00 2001 From: Abhijeet Mohanty Date: Thu, 5 Mar 2026 17:01:39 -0500 Subject: [PATCH 04/10] =?UTF-8?q?Revert=20.cs=20changes=20=E2=80=94=20desi?= =?UTF-8?q?gn-only=20PR=20with=20OpenSpec=20artifacts.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../CachingEncryptionKeyStoreProviderImpl.cs | 210 ----- .../src/EncryptionCosmosClient.cs | 30 +- .../src/EncryptionKeyStoreProviderImpl.cs | 29 +- .../src/EncryptionSettingForProperty.cs | 59 -- .../EncryptionKeyStoreProviderImplTests.cs | 731 ------------------ 5 files changed, 2 insertions(+), 1057 deletions(-) delete mode 100644 Microsoft.Azure.Cosmos.Encryption/src/CachingEncryptionKeyStoreProviderImpl.cs delete mode 100644 Microsoft.Azure.Cosmos.Encryption/tests/Microsoft.Azure.Cosmos.Encryption.Tests/EncryptionKeyStoreProviderImplTests.cs diff --git a/Microsoft.Azure.Cosmos.Encryption/src/CachingEncryptionKeyStoreProviderImpl.cs b/Microsoft.Azure.Cosmos.Encryption/src/CachingEncryptionKeyStoreProviderImpl.cs deleted file mode 100644 index 42a1f1a587..0000000000 --- a/Microsoft.Azure.Cosmos.Encryption/src/CachingEncryptionKeyStoreProviderImpl.cs +++ /dev/null @@ -1,210 +0,0 @@ -//------------------------------------------------------------ -// Copyright (c) Microsoft Corporation. All rights reserved. -//------------------------------------------------------------ - -namespace Microsoft.Azure.Cosmos.Encryption -{ - using System; - using System.Collections.Concurrent; - using System.Threading; - using System.Threading.Tasks; - using global::Azure.Core.Cryptography; - using Microsoft.Data.Encryption.Cryptography; - - /// - /// Extends with an async prefetch cache - /// and proactive background refresh so that the synchronous - /// call (which runs inside the global encryption semaphore) returns instantly from - /// cache with zero Key Vault I/O. - /// - /// Enabled by setting environment variable - /// AZURE_COSMOS_ENCRYPTION_OPTIMISTIC_DECRYPTION_ENABLED=true. - /// - /// When disabled (default), is - /// used instead and behaviour is identical to the original sync-only implementation. - /// - internal sealed class CachingEncryptionKeyStoreProviderImpl : EncryptionKeyStoreProviderImpl - { - /// - /// When a cached key is within this duration of its expiry a background refresh - /// is scheduled so the sync path never encounters a cold cache. - /// - private static readonly TimeSpan ProactiveRefreshThreshold = TimeSpan.FromMinutes(5); - - /// - /// Cache of asynchronously pre-fetched unwrapped key bytes, keyed by the hex - /// representation of the encrypted key. - /// - private readonly ConcurrentDictionary prefetchedKeys = new ConcurrentDictionary(); - - /// - /// Tracks cache keys that have a background refresh in-flight to deduplicate concurrent refresh tasks. - /// - private readonly ConcurrentDictionary refreshesInFlight = new ConcurrentDictionary(); - - /// - /// Cancellation source for background proactive-refresh tasks. Cancelled on - /// so in-flight refreshes are promptly stopped and the - /// provider / key-resolver / credential chain can be garbage collected. - /// - private readonly CancellationTokenSource backgroundCts = new CancellationTokenSource(); - - /// - /// Guard for to make double-cleanup and concurrent - /// cleanup calls safe. 0 = not cleaned up, 1 = cleaned up. - /// - private int cleanedUp; - - public CachingEncryptionKeyStoreProviderImpl(IKeyEncryptionKeyResolver keyEncryptionKeyResolver, string providerName) - : base(keyEncryptionKeyResolver, providerName) - { - } - - public override byte[] UnwrapKey(string encryptionKeyId, KeyEncryptionKeyAlgorithm algorithm, byte[] encryptedKey) - { - string cacheKey = encryptedKey.ToHexString(); - - // Fast path: return from the prefetch cache — zero I/O, no latency. - if (this.prefetchedKeys.TryGetValue(cacheKey, out PrefetchedKeyData cached)) - { - if (DateTime.UtcNow < cached.ExpiresAtUtc) - { - // Proactive refresh: if we are nearing expiry, kick off a background - // async refresh so the cache stays warm for the next caller. - if (DateTime.UtcNow > cached.ExpiresAtUtc - ProactiveRefreshThreshold) - { - this.ScheduleBackgroundRefresh(encryptionKeyId, encryptedKey); - } - - return cached.UnwrappedKeyBytes; - } - - // Entry has expired — remove it and fall through to the sync path. - this.prefetchedKeys.TryRemove(cacheKey, out _); - } - - // Slow path (safety net): sync Resolve + UnwrapKey. On success the result - // is pushed into the prefetch cache so future calls are fast. - return this.GetOrCreateDataEncryptionKey(cacheKey, UnWrapKeyCore); - - byte[] UnWrapKeyCore() - { - byte[] unwrapped = this.KeyEncryptionKeyResolver - .Resolve(encryptionKeyId) - .UnwrapKey(EncryptionKeyStoreProviderImpl.GetNameForKeyEncryptionKeyAlgorithm(algorithm), encryptedKey); - - this.prefetchedKeys[cacheKey] = new PrefetchedKeyData( - unwrapped, - DateTime.UtcNow.Add(ProtectedDataEncryptionKey.TimeToLive)); - - return unwrapped; - } - } - - /// - /// Asynchronously pre-warms the unwrapped-key cache for - /// so that the synchronous call (which runs inside the global - /// encryption semaphore) can return instantly without any Key Vault I/O. - /// - /// This MUST be called before acquiring the global semaphore. - /// - internal override async Task PrefetchUnwrapKeyAsync( - string encryptionKeyId, - byte[] encryptedKey, - CancellationToken cancellationToken) - { - string cacheKey = encryptedKey.ToHexString(); - - // Skip when the cache is still well within its TTL. - if (this.prefetchedKeys.TryGetValue(cacheKey, out PrefetchedKeyData existing) - && DateTime.UtcNow < existing.ExpiresAtUtc - ProactiveRefreshThreshold) - { - return; - } - - // ResolveAsync + UnwrapKeyAsync: fully async Key Vault I/O, done outside - // the global semaphore so other threads are never blocked. - IKeyEncryptionKey keyEncryptionKey = await this.KeyEncryptionKeyResolver.ResolveAsync(encryptionKeyId, cancellationToken).ConfigureAwait(false); - - byte[] unwrappedKey = await keyEncryptionKey.UnwrapKeyAsync( - EncryptionKeyStoreProviderImpl.RsaOaepWrapAlgorithm, - encryptedKey, - cancellationToken).ConfigureAwait(false); - - this.prefetchedKeys[cacheKey] = new PrefetchedKeyData( - unwrappedKey, - DateTime.UtcNow.Add(ProtectedDataEncryptionKey.TimeToLive)); - } - - /// - /// Cancels any in-flight background refresh tasks and releases the - /// . Called from - /// . - /// - internal override void Cleanup() - { - if (Interlocked.Exchange(ref this.cleanedUp, 1) != 0) - { - return; - } - - this.backgroundCts.Cancel(); - this.backgroundCts.Dispose(); - this.prefetchedKeys.Clear(); - } - - /// - /// Fires a background task to refresh the prefetch cache entry for the given - /// encrypted key, keeping the sync path warm. - /// Concurrent refreshes for the same key are deduplicated. - /// - private void ScheduleBackgroundRefresh(string encryptionKeyId, byte[] encryptedKey) - { - string cacheKey = encryptedKey.ToHexString(); - - if (!this.refreshesInFlight.TryAdd(cacheKey, 0)) - { - return; // refresh already in progress - } - - CancellationToken token = this.backgroundCts.Token; - - _ = Task.Run(async () => - { - try - { - await this.PrefetchUnwrapKeyAsync( - encryptionKeyId, - encryptedKey, - token).ConfigureAwait(false); - } - catch (Exception) - { - // Best-effort: if the background refresh fails (including - // cancellation on Cleanup), the next sync UnwrapKey call will - // fall through to the slow path. No data loss. - } - finally - { - this.refreshesInFlight.TryRemove(cacheKey, out _); - } - }); - } - - /// - /// Immutable record holding a pre-fetched unwrapped key and its expiry. - /// - private sealed class PrefetchedKeyData - { - public PrefetchedKeyData(byte[] unwrappedKeyBytes, DateTime expiresAtUtc) - { - this.UnwrappedKeyBytes = unwrappedKeyBytes; - this.ExpiresAtUtc = expiresAtUtc; - } - - public byte[] UnwrappedKeyBytes { get; } - - public DateTime ExpiresAtUtc { get; } - } - } -} diff --git a/Microsoft.Azure.Cosmos.Encryption/src/EncryptionCosmosClient.cs b/Microsoft.Azure.Cosmos.Encryption/src/EncryptionCosmosClient.cs index 718019624c..24eb872393 100644 --- a/Microsoft.Azure.Cosmos.Encryption/src/EncryptionCosmosClient.cs +++ b/Microsoft.Azure.Cosmos.Encryption/src/EncryptionCosmosClient.cs @@ -18,8 +18,6 @@ internal sealed class EncryptionCosmosClient : CosmosClient { internal static readonly SemaphoreSlim EncryptionKeyCacheSemaphore = new SemaphoreSlim(1, 1); - private static readonly string OptimisticDecryptionEnabledEnvironmentVariable = "AZURE_COSMOS_ENCRYPTION_OPTIMISTIC_DECRYPTION_ENABLED"; - private readonly CosmosClient cosmosClient; private readonly AsyncCache clientEncryptionKeyPropertiesCacheByKeyId; @@ -34,12 +32,7 @@ public EncryptionCosmosClient( this.KeyEncryptionKeyResolver = keyEncryptionKeyResolver ?? throw new ArgumentNullException(nameof(keyEncryptionKeyResolver)); this.KeyEncryptionKeyResolverName = keyEncryptionKeyResolverName ?? throw new ArgumentNullException(nameof(keyEncryptionKeyResolverName)); this.clientEncryptionKeyPropertiesCacheByKeyId = new AsyncCache(); - - bool optimisticDecryption = EncryptionCosmosClient.IsOptimisticDecryptionEnabled(); - this.EnableAlgorithmCaching = optimisticDecryption; - this.EncryptionKeyStoreProviderImpl = optimisticDecryption - ? new CachingEncryptionKeyStoreProviderImpl(keyEncryptionKeyResolver, keyEncryptionKeyResolverName) - : new EncryptionKeyStoreProviderImpl(keyEncryptionKeyResolver, keyEncryptionKeyResolverName); + this.EncryptionKeyStoreProviderImpl = new EncryptionKeyStoreProviderImpl(keyEncryptionKeyResolver, keyEncryptionKeyResolverName); keyCacheTimeToLive ??= TimeSpan.FromHours(1); @@ -73,16 +66,6 @@ public EncryptionCosmosClient( public override Uri Endpoint => this.cosmosClient.Endpoint; - /// - /// Gets a value indicating whether optimistic decryption is enabled. - /// When true, caches the - /// - /// to avoid per-property semaphore acquisition, and - /// uses async key prefetch with proactive refresh. Controlled by environment variable - /// AZURE_COSMOS_ENCRYPTION_OPTIMISTIC_DECRYPTION_ENABLED. Default: false. - /// - internal bool EnableAlgorithmCaching { get; } - public override async Task CreateDatabaseAsync( string id, int? throughput = null, @@ -266,20 +249,9 @@ public async Task GetClientEncryptionKeyPropertie /// protected override void Dispose(bool disposing) { - if (disposing) - { - this.EncryptionKeyStoreProviderImpl.Cleanup(); - } - this.cosmosClient.Dispose(); } - private static bool IsOptimisticDecryptionEnabled() - { - string value = Environment.GetEnvironmentVariable(OptimisticDecryptionEnabledEnvironmentVariable); - return bool.TryParse(value, out bool result) && result; - } - private async Task FetchClientEncryptionKeyPropertiesAsync( EncryptionContainer encryptionContainer, string clientEncryptionKeyId, diff --git a/Microsoft.Azure.Cosmos.Encryption/src/EncryptionKeyStoreProviderImpl.cs b/Microsoft.Azure.Cosmos.Encryption/src/EncryptionKeyStoreProviderImpl.cs index 51380e1614..086e9855d8 100644 --- a/Microsoft.Azure.Cosmos.Encryption/src/EncryptionKeyStoreProviderImpl.cs +++ b/Microsoft.Azure.Cosmos.Encryption/src/EncryptionKeyStoreProviderImpl.cs @@ -5,8 +5,6 @@ namespace Microsoft.Azure.Cosmos.Encryption { using System; - using System.Threading; - using System.Threading.Tasks; using global::Azure.Core.Cryptography; using Microsoft.Data.Encryption.Cryptography; @@ -38,11 +36,6 @@ public EncryptionKeyStoreProviderImpl(IKeyEncryptionKeyResolver keyEncryptionKey public override string ProviderName { get; } - /// - /// Gets the key encryption key resolver for use by derived classes. - /// - internal IKeyEncryptionKeyResolver KeyEncryptionKeyResolver => this.keyEncryptionKeyResolver; - public override byte[] UnwrapKey(string encryptionKeyId, KeyEncryptionKeyAlgorithm algorithm, byte[] encryptedKey) { // since we do not expose GetOrCreateDataEncryptionKey we first look up the cache. @@ -81,7 +74,7 @@ public override bool Verify(string encryptionKeyId, bool allowEnclaveComputation throw new NotSupportedException("The Verify operation is not supported."); } - internal static string GetNameForKeyEncryptionKeyAlgorithm(KeyEncryptionKeyAlgorithm algorithm) + private static string GetNameForKeyEncryptionKeyAlgorithm(KeyEncryptionKeyAlgorithm algorithm) { if (algorithm == KeyEncryptionKeyAlgorithm.RSA_OAEP) { @@ -90,25 +83,5 @@ internal static string GetNameForKeyEncryptionKeyAlgorithm(KeyEncryptionKeyAlgor throw new InvalidOperationException(string.Format("Unexpected algorithm {0}", algorithm)); } - - /// - /// No-op in the base implementation. Overridden in - /// to asynchronously pre-warm the unwrapped-key cache. - /// - internal virtual Task PrefetchUnwrapKeyAsync( - string encryptionKeyId, - byte[] encryptedKey, - CancellationToken cancellationToken) - { - return Task.CompletedTask; - } - - /// - /// No-op in the base implementation. Overridden in - /// to cancel background tasks and release resources. - /// - internal virtual void Cleanup() - { - } } } diff --git a/Microsoft.Azure.Cosmos.Encryption/src/EncryptionSettingForProperty.cs b/Microsoft.Azure.Cosmos.Encryption/src/EncryptionSettingForProperty.cs index 73b13d18bb..431349c88d 100644 --- a/Microsoft.Azure.Cosmos.Encryption/src/EncryptionSettingForProperty.cs +++ b/Microsoft.Azure.Cosmos.Encryption/src/EncryptionSettingForProperty.cs @@ -22,20 +22,6 @@ internal sealed class EncryptionSettingForProperty // through InternalsVisibleTo. private readonly Microsoft.Data.Encryption.Cryptography.AeadAes256CbcHmac256EncryptionAlgorithm injectedAlgorithm; - // Cached encryption algorithm to avoid repeated semaphore acquisition per property per document. - // The underlying ProtectedDataEncryptionKey has its own TTL (default 2hrs) managed by MDE's LocalCache. - // We cache at this level to eliminate the amplification problem: without this cache, - // BuildEncryptionAlgorithmForSettingAsync acquires the global semaphore once per encrypted - // property leaf per document per page. With this cache, it's once per key per TTL period. - // - // Thread-safety note: these fields are read/written by concurrent async tasks without a lock. - // The worst-case race is that a thread reads a stale null/expired value and falls through to - // the semaphore path — one redundant cache build, no correctness issue. Using volatile to - // prevent the JIT from caching stale register copies across await boundaries. - // DateTime cannot be volatile (struct > pointer size), so we store expiry as ticks (long). - private volatile AeadAes256CbcHmac256EncryptionAlgorithm cachedEncryptionAlgorithm; - private long cachedAlgorithmExpiryTicks; - public EncryptionSettingForProperty( string clientEncryptionKeyId, Data.Encryption.Cryptography.EncryptionType encryptionType, @@ -74,19 +60,6 @@ public async Task BuildEncryptionAlgori return this.injectedAlgorithm; } - // Return cached algorithm if still valid and algorithm caching is enabled. - // This eliminates the per-property semaphore acquisition that causes - // contention under concurrent decryption. Gated by - // AZURE_COSMOS_ENCRYPTION_ALGORITHM_CACHING_ENABLED (default: off). - if (this.encryptionContainer.EncryptionCosmosClient.EnableAlgorithmCaching) - { - AeadAes256CbcHmac256EncryptionAlgorithm cached = this.cachedEncryptionAlgorithm; - if (cached != null && DateTime.UtcNow.Ticks < Interlocked.Read(ref this.cachedAlgorithmExpiryTicks)) - { - return cached; - } - } - ClientEncryptionKeyProperties clientEncryptionKeyProperties = await this.encryptionContainer.EncryptionCosmosClient.GetClientEncryptionKeyPropertiesAsync( clientEncryptionKeyId: this.ClientEncryptionKeyId, encryptionContainer: this.encryptionContainer, @@ -142,15 +115,6 @@ public async Task BuildEncryptionAlgori protectedDataEncryptionKey, this.EncryptionType); - // Cache the algorithm so subsequent calls for this property skip the semaphore entirely. - // Use the same TTL as ProtectedDataEncryptionKey (defaults to 2hrs). - // Only active when AZURE_COSMOS_ENCRYPTION_ALGORITHM_CACHING_ENABLED is set. - if (this.encryptionContainer.EncryptionCosmosClient.EnableAlgorithmCaching) - { - this.cachedEncryptionAlgorithm = aeadAes256CbcHmac256EncryptionAlgorithm; - Interlocked.Exchange(ref this.cachedAlgorithmExpiryTicks, DateTime.UtcNow.Add(ProtectedDataEncryptionKey.TimeToLive).Ticks); - } - return aeadAes256CbcHmac256EncryptionAlgorithm; } @@ -214,29 +178,6 @@ private async Task BuildProtectedDataEncryptionKeyAs string keyId, CancellationToken cancellationToken) { - // Pre-warm the unwrap key cache asynchronously, BEFORE acquiring the global - // semaphore. Only active when AZURE_COSMOS_ENCRYPTION_KEY_PREFETCH_ENABLED - // is set on EncryptionKeyStoreProviderImpl (checked internally by PrefetchUnwrapKeyAsync). - // When ProtectedDataEncryptionKey's constructor later calls UnwrapKey (sync, - // inside the semaphore), it will find a warm prefetch cache and return - // instantly — no Key Vault I/O under the lock. - // Best-effort: if prefetch fails, the sync fallback inside UnwrapKey handles - // it, and the L2 PDEK cache may avoid the unwrap call entirely. - try - { - await this.encryptionContainer.EncryptionCosmosClient.EncryptionKeyStoreProviderImpl - .PrefetchUnwrapKeyAsync( - clientEncryptionKeyProperties.EncryptionKeyWrapMetadata.Value, - clientEncryptionKeyProperties.WrappedDataEncryptionKey, - cancellationToken) - .ConfigureAwait(false); - } - catch (Exception ex) when (!(ex is OperationCanceledException)) - { - // Swallow non-cancellation prefetch errors — the sync path inside the - // semaphore will serve as fallback. - } - if (await EncryptionCosmosClient.EncryptionKeyCacheSemaphore.WaitAsync(-1, cancellationToken)) { try diff --git a/Microsoft.Azure.Cosmos.Encryption/tests/Microsoft.Azure.Cosmos.Encryption.Tests/EncryptionKeyStoreProviderImplTests.cs b/Microsoft.Azure.Cosmos.Encryption/tests/Microsoft.Azure.Cosmos.Encryption.Tests/EncryptionKeyStoreProviderImplTests.cs deleted file mode 100644 index bdeda3b109..0000000000 --- a/Microsoft.Azure.Cosmos.Encryption/tests/Microsoft.Azure.Cosmos.Encryption.Tests/EncryptionKeyStoreProviderImplTests.cs +++ /dev/null @@ -1,731 +0,0 @@ -//------------------------------------------------------------ -// Copyright (c) Microsoft Corporation. All rights reserved. -//------------------------------------------------------------ - -namespace Microsoft.Azure.Cosmos.Encryption.Tests -{ - using System; - using System.Collections.Concurrent; - using System.Collections.Generic; - using System.Linq; - using System.Threading; - using System.Threading.Tasks; - using global::Azure.Core.Cryptography; - using Microsoft.Data.Encryption.Cryptography; - using Microsoft.VisualStudio.TestTools.UnitTesting; - - /// - /// Unit tests for , covering: - /// - Prefetch async cache (PrefetchUnwrapKeyAsync) - /// - UnwrapKey fast-path from prefetch cache - /// - Sync fallback when prefetch cache is cold - /// - Proactive background refresh near TTL expiry - /// - Concurrent decryption (many threads hitting UnwrapKey) - /// - Cache expiry behavior - /// - Cancellation token propagation - /// - /// All tests use an in-memory that tracks call - /// counts and optionally injects latency, so no real Key Vault calls are made. - /// - [TestClass] - public class EncryptionKeyStoreProviderImplTests - { - private const string TestKeyId = "test-key-1"; - private const string TestProviderName = "TEST_PROVIDER"; - - // A well-known plaintext key and its "encrypted" form (trivial shift cipher). - private static readonly byte[] PlainKey = new byte[] { 10, 20, 30, 40, 50 }; - private static readonly byte[] EncryptedKey = PlainKey.Select(b => (byte)(b + 1)).ToArray(); - - #region PrefetchUnwrapKeyAsync Tests - - [TestMethod] - public async Task PrefetchUnwrapKeyAsync_WarmsCacheForSubsequentUnwrapKey() - { - // Arrange - InMemoryKeyResolver resolver = new InMemoryKeyResolver(); - resolver.RegisterKey(TestKeyId, shift: 1); - EncryptionKeyStoreProviderImpl provider = new CachingEncryptionKeyStoreProviderImpl(resolver, TestProviderName); - - // Act — prefetch the key asynchronously - await provider.PrefetchUnwrapKeyAsync(TestKeyId, EncryptedKey, CancellationToken.None); - - // Assert — UnwrapKey should hit the prefetch cache, NOT call Resolve again. - // Reset counters after prefetch so we can assert on the sync path. - int resolveCountAfterPrefetch = resolver.ResolveAsyncCallCount; - byte[] result = provider.UnwrapKey(TestKeyId, KeyEncryptionKeyAlgorithm.RSA_OAEP, EncryptedKey); - - CollectionAssert.AreEqual(PlainKey, result, "Unwrapped key should match original plaintext."); - Assert.AreEqual(resolveCountAfterPrefetch, resolver.ResolveAsyncCallCount, - "ResolveAsync should NOT be called again — UnwrapKey should use the prefetch cache."); - Assert.AreEqual(0, resolver.ResolveSyncCallCount, - "Sync Resolve should not be called when prefetch cache is warm."); - } - - [TestMethod] - public async Task PrefetchUnwrapKeyAsync_SkipsRefreshWhenCacheIsWarm() - { - // Arrange - InMemoryKeyResolver resolver = new InMemoryKeyResolver(); - resolver.RegisterKey(TestKeyId, shift: 1); - EncryptionKeyStoreProviderImpl provider = new CachingEncryptionKeyStoreProviderImpl(resolver, TestProviderName); - - // Act — prefetch twice in quick succession - await provider.PrefetchUnwrapKeyAsync(TestKeyId, EncryptedKey, CancellationToken.None); - int countAfterFirst = resolver.ResolveAsyncCallCount; - - await provider.PrefetchUnwrapKeyAsync(TestKeyId, EncryptedKey, CancellationToken.None); - int countAfterSecond = resolver.ResolveAsyncCallCount; - - // Assert — second call should be a no-op (cache is well within TTL) - Assert.AreEqual(countAfterFirst, countAfterSecond, - "Second PrefetchUnwrapKeyAsync should skip refresh when the cache is still warm."); - } - - [TestMethod] - public async Task PrefetchUnwrapKeyAsync_PropagatesCancellation() - { - // Arrange - InMemoryKeyResolver resolver = new InMemoryKeyResolver(); - resolver.RegisterKey(TestKeyId, shift: 1); - resolver.ResolveAsyncDelay = TimeSpan.FromSeconds(10); // simulate slow resolve - EncryptionKeyStoreProviderImpl provider = new CachingEncryptionKeyStoreProviderImpl(resolver, TestProviderName); - - using CancellationTokenSource cts = new CancellationTokenSource(); - cts.Cancel(); // pre-cancel - - // Act & Assert - await Assert.ThrowsExceptionAsync( - () => provider.PrefetchUnwrapKeyAsync(TestKeyId, EncryptedKey, cts.Token)); - } - - [TestMethod] - public async Task PrefetchUnwrapKeyAsync_UsesResolveAsyncNotResolve() - { - // Arrange - InMemoryKeyResolver resolver = new InMemoryKeyResolver(); - resolver.RegisterKey(TestKeyId, shift: 1); - EncryptionKeyStoreProviderImpl provider = new CachingEncryptionKeyStoreProviderImpl(resolver, TestProviderName); - - // Act - await provider.PrefetchUnwrapKeyAsync(TestKeyId, EncryptedKey, CancellationToken.None); - - // Assert — the async path should call ResolveAsync, not Resolve - Assert.AreEqual(1, resolver.ResolveAsyncCallCount, "PrefetchUnwrapKeyAsync should call ResolveAsync."); - Assert.AreEqual(0, resolver.ResolveSyncCallCount, "PrefetchUnwrapKeyAsync should NOT call sync Resolve."); - } - - [TestMethod] - public async Task PrefetchUnwrapKeyAsync_IndependentKeysAreCachedSeparately() - { - // Arrange - InMemoryKeyResolver resolver = new InMemoryKeyResolver(); - resolver.RegisterKey("key-A", shift: 1); - resolver.RegisterKey("key-B", shift: 2); - - byte[] encKeyA = PlainKey.Select(b => (byte)(b + 1)).ToArray(); - byte[] encKeyB = PlainKey.Select(b => (byte)(b + 2)).ToArray(); - - EncryptionKeyStoreProviderImpl provider = new CachingEncryptionKeyStoreProviderImpl(resolver, TestProviderName); - - // Act - await provider.PrefetchUnwrapKeyAsync("key-A", encKeyA, CancellationToken.None); - await provider.PrefetchUnwrapKeyAsync("key-B", encKeyB, CancellationToken.None); - - byte[] resultA = provider.UnwrapKey("key-A", KeyEncryptionKeyAlgorithm.RSA_OAEP, encKeyA); - byte[] resultB = provider.UnwrapKey("key-B", KeyEncryptionKeyAlgorithm.RSA_OAEP, encKeyB); - - // Assert - CollectionAssert.AreEqual(PlainKey, resultA, "Key A unwrap should return original plaintext."); - CollectionAssert.AreEqual(PlainKey, resultB, "Key B unwrap should return original plaintext."); - Assert.AreEqual(2, resolver.ResolveAsyncCallCount, "Each key should trigger exactly one ResolveAsync."); - } - - #endregion - - #region UnwrapKey Sync Fallback Tests - - [TestMethod] - public void UnwrapKey_FallsBackToSyncWhenPrefetchCacheIsCold() - { - // Arrange — no prefetch call; cache is empty - InMemoryKeyResolver resolver = new InMemoryKeyResolver(); - resolver.RegisterKey(TestKeyId, shift: 1); - EncryptionKeyStoreProviderImpl provider = new CachingEncryptionKeyStoreProviderImpl(resolver, TestProviderName); - - // Act - byte[] result = provider.UnwrapKey(TestKeyId, KeyEncryptionKeyAlgorithm.RSA_OAEP, EncryptedKey); - - // Assert - CollectionAssert.AreEqual(PlainKey, result, "Sync fallback should still unwrap correctly."); - Assert.AreEqual(1, resolver.ResolveSyncCallCount, - "Cold cache should trigger sync Resolve fallback."); - Assert.AreEqual(0, resolver.ResolveAsyncCallCount, - "Sync fallback should not call ResolveAsync."); - } - - [TestMethod] - public void UnwrapKey_SyncFallbackPopulatesPrefetchCache() - { - // Arrange — cold cache, sync fallback will populate it - InMemoryKeyResolver resolver = new InMemoryKeyResolver(); - resolver.RegisterKey(TestKeyId, shift: 1); - EncryptionKeyStoreProviderImpl provider = new CachingEncryptionKeyStoreProviderImpl(resolver, TestProviderName); - - // Act — first call goes through sync path - provider.UnwrapKey(TestKeyId, KeyEncryptionKeyAlgorithm.RSA_OAEP, EncryptedKey); - int syncCountAfterFirst = resolver.ResolveSyncCallCount; - - // Second call should hit prefetch cache (populated by first call's sync path) - byte[] result = provider.UnwrapKey(TestKeyId, KeyEncryptionKeyAlgorithm.RSA_OAEP, EncryptedKey); - - // Assert - CollectionAssert.AreEqual(PlainKey, result); - Assert.AreEqual(syncCountAfterFirst, resolver.ResolveSyncCallCount, - "Second UnwrapKey should hit the prefetch cache, not call Resolve again."); - } - - [TestMethod] - public void UnwrapKey_ReturnsCorrectResultForDifferentKeys() - { - // Arrange - InMemoryKeyResolver resolver = new InMemoryKeyResolver(); - resolver.RegisterKey("key1", shift: 3); - resolver.RegisterKey("key2", shift: 7); - - byte[] plain = new byte[] { 100, 110, 120 }; - byte[] enc1 = plain.Select(b => (byte)(b + 3)).ToArray(); - byte[] enc2 = plain.Select(b => (byte)(b + 7)).ToArray(); - - EncryptionKeyStoreProviderImpl provider = new CachingEncryptionKeyStoreProviderImpl(resolver, TestProviderName); - - // Act - byte[] result1 = provider.UnwrapKey("key1", KeyEncryptionKeyAlgorithm.RSA_OAEP, enc1); - byte[] result2 = provider.UnwrapKey("key2", KeyEncryptionKeyAlgorithm.RSA_OAEP, enc2); - - // Assert - CollectionAssert.AreEqual(plain, result1); - CollectionAssert.AreEqual(plain, result2); - } - - #endregion - - #region Concurrency Tests - - [TestMethod] - public async Task ConcurrentUnwrapKey_WithPrefetchCache_NoContention() - { - // Arrange — prefetch the key, then hammer UnwrapKey from many threads - InMemoryKeyResolver resolver = new InMemoryKeyResolver(); - resolver.RegisterKey(TestKeyId, shift: 1); - EncryptionKeyStoreProviderImpl provider = new CachingEncryptionKeyStoreProviderImpl(resolver, TestProviderName); - - await provider.PrefetchUnwrapKeyAsync(TestKeyId, EncryptedKey, CancellationToken.None); - int resolveCountAfterPrefetch = resolver.ResolveAsyncCallCount; - - int concurrency = 50; - ConcurrentBag results = new ConcurrentBag(); - List tasks = new List(); - - // Act — simulate concurrent feed iterator decryption - for (int i = 0; i < concurrency; i++) - { - tasks.Add(Task.Run(() => - { - byte[] result = provider.UnwrapKey(TestKeyId, KeyEncryptionKeyAlgorithm.RSA_OAEP, EncryptedKey); - results.Add(result); - })); - } - - await Task.WhenAll(tasks); - - // Assert — all results correct, no additional Resolve calls - Assert.AreEqual(concurrency, results.Count); - foreach (byte[] result in results) - { - CollectionAssert.AreEqual(PlainKey, result, "Every concurrent unwrap should return correct bytes."); - } - - Assert.AreEqual(resolveCountAfterPrefetch, resolver.ResolveAsyncCallCount, - "No additional ResolveAsync calls should happen — all served from prefetch cache."); - Assert.AreEqual(0, resolver.ResolveSyncCallCount, - "No sync Resolve calls should happen when prefetch cache is warm."); - } - - [TestMethod] - public async Task ConcurrentUnwrapKey_ColdCache_AllSucceed() - { - // Arrange — no prefetch, all threads hit the sync fallback - InMemoryKeyResolver resolver = new InMemoryKeyResolver(); - resolver.RegisterKey(TestKeyId, shift: 1); - EncryptionKeyStoreProviderImpl provider = new CachingEncryptionKeyStoreProviderImpl(resolver, TestProviderName); - - int concurrency = 20; - ConcurrentBag results = new ConcurrentBag(); - List tasks = new List(); - - // Act - for (int i = 0; i < concurrency; i++) - { - tasks.Add(Task.Run(() => - { - byte[] result = provider.UnwrapKey(TestKeyId, KeyEncryptionKeyAlgorithm.RSA_OAEP, EncryptedKey); - results.Add(result); - })); - } - - await Task.WhenAll(tasks); - - // Assert — all succeed despite contention on the sync fallback - Assert.AreEqual(concurrency, results.Count); - foreach (byte[] result in results) - { - CollectionAssert.AreEqual(PlainKey, result); - } - } - - [TestMethod] - public async Task ConcurrentPrefetchAndUnwrap_MixedPattern() - { - // Simulate the real pattern: some threads prefetch while others are already unwrapping. - InMemoryKeyResolver resolver = new InMemoryKeyResolver(); - resolver.RegisterKey(TestKeyId, shift: 1); - EncryptionKeyStoreProviderImpl provider = new CachingEncryptionKeyStoreProviderImpl(resolver, TestProviderName); - - int totalOps = 40; - ConcurrentBag results = new ConcurrentBag(); - List tasks = new List(); - - for (int i = 0; i < totalOps; i++) - { - if (i % 5 == 0) - { - // Every 5th thread does a prefetch first - tasks.Add(Task.Run(async () => - { - await provider.PrefetchUnwrapKeyAsync(TestKeyId, EncryptedKey, CancellationToken.None); - byte[] result = provider.UnwrapKey(TestKeyId, KeyEncryptionKeyAlgorithm.RSA_OAEP, EncryptedKey); - results.Add(result); - })); - } - else - { - // Other threads just call UnwrapKey directly - tasks.Add(Task.Run(() => - { - byte[] result = provider.UnwrapKey(TestKeyId, KeyEncryptionKeyAlgorithm.RSA_OAEP, EncryptedKey); - results.Add(result); - })); - } - } - - await Task.WhenAll(tasks); - - Assert.AreEqual(totalOps, results.Count); - foreach (byte[] result in results) - { - CollectionAssert.AreEqual(PlainKey, result, "All concurrent operations should return correct bytes."); - } - } - - [TestMethod] - public async Task ConcurrentUnwrapKey_MultipleKeys_NoInterference() - { - // Verify that concurrent operations on different keys don't interfere. - InMemoryKeyResolver resolver = new InMemoryKeyResolver(); - resolver.RegisterKey("keyA", shift: 1); - resolver.RegisterKey("keyB", shift: 2); - resolver.RegisterKey("keyC", shift: 3); - - byte[] encA = PlainKey.Select(b => (byte)(b + 1)).ToArray(); - byte[] encB = PlainKey.Select(b => (byte)(b + 2)).ToArray(); - byte[] encC = PlainKey.Select(b => (byte)(b + 3)).ToArray(); - - EncryptionKeyStoreProviderImpl provider = new CachingEncryptionKeyStoreProviderImpl(resolver, TestProviderName); - - // Prefetch all keys - await provider.PrefetchUnwrapKeyAsync("keyA", encA, CancellationToken.None); - await provider.PrefetchUnwrapKeyAsync("keyB", encB, CancellationToken.None); - await provider.PrefetchUnwrapKeyAsync("keyC", encC, CancellationToken.None); - - int concurrencyPerKey = 20; - ConcurrentBag<(string Key, byte[] Result)> results = new ConcurrentBag<(string, byte[])>(); - List tasks = new List(); - - foreach ((string keyId, byte[] enc) in new[] { ("keyA", encA), ("keyB", encB), ("keyC", encC) }) - { - for (int i = 0; i < concurrencyPerKey; i++) - { - string capturedKeyId = keyId; - byte[] capturedEnc = enc; - tasks.Add(Task.Run(() => - { - byte[] result = provider.UnwrapKey(capturedKeyId, KeyEncryptionKeyAlgorithm.RSA_OAEP, capturedEnc); - results.Add((capturedKeyId, result)); - })); - } - } - - await Task.WhenAll(tasks); - - Assert.AreEqual(concurrencyPerKey * 3, results.Count); - foreach ((string key, byte[] result) in results) - { - CollectionAssert.AreEqual(PlainKey, result, $"Key '{key}' should unwrap to original plaintext."); - } - } - - #endregion - - #region WrapKey Tests - - [TestMethod] - public void WrapKey_PassesThroughToResolver() - { - // Arrange - InMemoryKeyResolver resolver = new InMemoryKeyResolver(); - resolver.RegisterKey(TestKeyId, shift: 1); - EncryptionKeyStoreProviderImpl provider = new CachingEncryptionKeyStoreProviderImpl(resolver, TestProviderName); - - // Act - byte[] wrapped = provider.WrapKey(TestKeyId, KeyEncryptionKeyAlgorithm.RSA_OAEP, PlainKey); - - // Assert - CollectionAssert.AreEqual(EncryptedKey, wrapped); - } - - #endregion - - #region Prefetch + Latency Simulation Tests - - [TestMethod] - public async Task PrefetchUnwrapKeyAsync_WithSlowResolver_UnwrapKeyIsFast() - { - // Arrange — resolver has 200ms latency - InMemoryKeyResolver resolver = new InMemoryKeyResolver(); - resolver.RegisterKey(TestKeyId, shift: 1); - resolver.ResolveAsyncDelay = TimeSpan.FromMilliseconds(200); - EncryptionKeyStoreProviderImpl provider = new CachingEncryptionKeyStoreProviderImpl(resolver, TestProviderName); - - // Act — prefetch absorbs the latency - await provider.PrefetchUnwrapKeyAsync(TestKeyId, EncryptedKey, CancellationToken.None); - - // Time the sync UnwrapKey — it should be near-instant from cache - System.Diagnostics.Stopwatch sw = System.Diagnostics.Stopwatch.StartNew(); - byte[] result = provider.UnwrapKey(TestKeyId, KeyEncryptionKeyAlgorithm.RSA_OAEP, EncryptedKey); - sw.Stop(); - - // Assert - CollectionAssert.AreEqual(PlainKey, result); - Assert.IsTrue(sw.ElapsedMilliseconds < 50, - $"UnwrapKey from prefetch cache should be near-instant but took {sw.ElapsedMilliseconds}ms."); - } - - [TestMethod] - public async Task ConcurrentUnwrapKey_WithSlowResolver_PrefetchEliminatesContention() - { - // Simulate the real scenario: slow Key Vault, high concurrency. - // With prefetch, all threads should return instantly. - InMemoryKeyResolver resolver = new InMemoryKeyResolver(); - resolver.RegisterKey(TestKeyId, shift: 1); - resolver.ResolveAsyncDelay = TimeSpan.FromMilliseconds(500); - EncryptionKeyStoreProviderImpl provider = new CachingEncryptionKeyStoreProviderImpl(resolver, TestProviderName); - - // Prefetch outside the hot path (absorbs the 500ms) - await provider.PrefetchUnwrapKeyAsync(TestKeyId, EncryptedKey, CancellationToken.None); - - int concurrency = 100; - ConcurrentBag timings = new ConcurrentBag(); - List tasks = new List(); - - for (int i = 0; i < concurrency; i++) - { - tasks.Add(Task.Run(() => - { - System.Diagnostics.Stopwatch sw = System.Diagnostics.Stopwatch.StartNew(); - byte[] result = provider.UnwrapKey(TestKeyId, KeyEncryptionKeyAlgorithm.RSA_OAEP, EncryptedKey); - sw.Stop(); - timings.Add(sw.ElapsedMilliseconds); - })); - } - - await Task.WhenAll(tasks); - - long maxTiming = timings.Max(); - Assert.IsTrue(maxTiming < 100, - $"With warm prefetch cache, worst-case UnwrapKey should be <100ms, was {maxTiming}ms."); - Assert.AreEqual(0, resolver.ResolveSyncCallCount, - "No sync Resolve calls when prefetch cache is warm."); - } - - #endregion - - #region Edge Case Tests - - [TestMethod] - public void UnwrapKey_ThrowsOnInvalidAlgorithm() - { - InMemoryKeyResolver resolver = new InMemoryKeyResolver(); - resolver.RegisterKey(TestKeyId, shift: 1); - EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); - - Assert.ThrowsException( - () => provider.UnwrapKey(TestKeyId, (KeyEncryptionKeyAlgorithm)999, EncryptedKey)); - } - - [TestMethod] - public void Sign_ThrowsNotSupported() - { - InMemoryKeyResolver resolver = new InMemoryKeyResolver(); - EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); - - Assert.ThrowsException( - () => provider.Sign("key", true)); - } - - [TestMethod] - public void Verify_ThrowsNotSupported() - { - InMemoryKeyResolver resolver = new InMemoryKeyResolver(); - EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); - - Assert.ThrowsException( - () => provider.Verify("key", true, new byte[] { 1 })); - } - - [TestMethod] - public void ProviderName_ReturnsConfiguredName() - { - InMemoryKeyResolver resolver = new InMemoryKeyResolver(); - EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); - - Assert.AreEqual(TestProviderName, provider.ProviderName); - } - - [TestMethod] - public void DataEncryptionKeyCacheTimeToLive_IsZeroByDefault() - { - InMemoryKeyResolver resolver = new InMemoryKeyResolver(); - EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); - - Assert.AreEqual(TimeSpan.Zero, provider.DataEncryptionKeyCacheTimeToLive); - } - - #endregion - - #region Base Class No-Op Tests - - [TestMethod] - public async Task BaseClass_PrefetchUnwrapKeyAsync_IsNoOp() - { - // Arrange — use the base class directly (no caching) - InMemoryKeyResolver resolver = new InMemoryKeyResolver(); - resolver.RegisterKey(TestKeyId, shift: 1); - EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); - - // Act — prefetch should be a no-op on the base class - await provider.PrefetchUnwrapKeyAsync(TestKeyId, EncryptedKey, CancellationToken.None); - - // Assert — no resolver calls made (base class returns Task.CompletedTask) - Assert.AreEqual(0, resolver.ResolveAsyncCallCount, - "Base class PrefetchUnwrapKeyAsync should be a no-op."); - - // UnwrapKey should use the sync Resolve path - byte[] result = provider.UnwrapKey(TestKeyId, KeyEncryptionKeyAlgorithm.RSA_OAEP, EncryptedKey); - CollectionAssert.AreEqual(PlainKey, result); - Assert.AreEqual(1, resolver.ResolveSyncCallCount, - "Base class UnwrapKey should use the sync Resolve path."); - } - - [TestMethod] - public void BaseClass_Cleanup_IsNoOp() - { - // Arrange - InMemoryKeyResolver resolver = new InMemoryKeyResolver(); - EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(resolver, TestProviderName); - - // Act & Assert — should not throw - provider.Cleanup(); - provider.Cleanup(); // double-call safe - } - - #endregion - - #region Cleanup / Lifecycle Tests - - [TestMethod] - public void Cleanup_CanBeCalledMultipleTimes() - { - // Arrange - InMemoryKeyResolver resolver = new InMemoryKeyResolver(); - CachingEncryptionKeyStoreProviderImpl provider = new CachingEncryptionKeyStoreProviderImpl(resolver, TestProviderName); - - // Act & Assert — should not throw on double-cleanup - provider.Cleanup(); - provider.Cleanup(); - } - - [TestMethod] - public async Task Cleanup_CancelsInFlightBackgroundRefresh() - { - // Arrange — use a very slow resolver so the background refresh is still in-flight when we clean up - InMemoryKeyResolver resolver = new InMemoryKeyResolver(); - resolver.RegisterKey(TestKeyId, shift: 1); - resolver.ResolveAsyncDelay = TimeSpan.FromSeconds(30); // very slow - - CachingEncryptionKeyStoreProviderImpl provider = new CachingEncryptionKeyStoreProviderImpl(resolver, TestProviderName); - - // Pre-warm with a fast resolve first (bypass the slow delay for initial population) - TimeSpan originalDelay = resolver.ResolveAsyncDelay; - resolver.ResolveAsyncDelay = TimeSpan.Zero; - await provider.PrefetchUnwrapKeyAsync(TestKeyId, EncryptedKey, CancellationToken.None); - resolver.ResolveAsyncDelay = originalDelay; - - // Act — cleanup should return quickly, not block on the 30s resolve - System.Diagnostics.Stopwatch sw = System.Diagnostics.Stopwatch.StartNew(); - provider.Cleanup(); - sw.Stop(); - - Assert.IsTrue(sw.ElapsedMilliseconds < 1000, - $"Cleanup should complete promptly (not block on background I/O), took {sw.ElapsedMilliseconds}ms."); - } - - [TestMethod] - public async Task Cleanup_ClearsPrefetchCache() - { - // Arrange - InMemoryKeyResolver resolver = new InMemoryKeyResolver(); - resolver.RegisterKey(TestKeyId, shift: 1); - CachingEncryptionKeyStoreProviderImpl provider = new CachingEncryptionKeyStoreProviderImpl(resolver, TestProviderName); - - await provider.PrefetchUnwrapKeyAsync(TestKeyId, EncryptedKey, CancellationToken.None); - - // Verify cache is warm - byte[] warmResult = provider.UnwrapKey(TestKeyId, KeyEncryptionKeyAlgorithm.RSA_OAEP, EncryptedKey); - CollectionAssert.AreEqual(PlainKey, warmResult); - int resolveCountBeforeCleanup = resolver.ResolveAsyncCallCount + resolver.ResolveSyncCallCount; - - // Act - provider.Cleanup(); - - // After cleanup, UnwrapKey should go through the sync fallback - // (prefetch cache was cleared). - byte[] postCleanupResult = provider.UnwrapKey(TestKeyId, KeyEncryptionKeyAlgorithm.RSA_OAEP, EncryptedKey); - CollectionAssert.AreEqual(PlainKey, postCleanupResult); - - int resolveCountAfterCleanup = resolver.ResolveAsyncCallCount + resolver.ResolveSyncCallCount; - Assert.IsTrue(resolveCountAfterCleanup > resolveCountBeforeCleanup, - "After Cleanup clears prefetch cache, UnwrapKey should require a new Resolve call."); - } - - #endregion - - #region InMemoryKeyResolver — test double - - /// - /// An in-memory implementation of for unit testing. - /// Each registered key uses a simple shift cipher (add/subtract a byte offset). - /// Tracks call counts for assertions and supports configurable latency. - /// - internal sealed class InMemoryKeyResolver : IKeyEncryptionKeyResolver - { - private readonly Dictionary keyShifts = new Dictionary(); - - private int resolveAsyncCallCount; - private int resolveSyncCallCount; - - /// - /// Optional delay injected into ResolveAsync to simulate Key Vault latency. - /// - public TimeSpan ResolveAsyncDelay { get; set; } = TimeSpan.Zero; - - /// - /// Optional delay injected into sync Resolve to simulate Key Vault latency. - /// - public TimeSpan ResolveSyncDelay { get; set; } = TimeSpan.Zero; - - public int ResolveAsyncCallCount => this.resolveAsyncCallCount; - - public int ResolveSyncCallCount => this.resolveSyncCallCount; - - public void RegisterKey(string keyId, int shift) - { - this.keyShifts[keyId] = shift; - } - - public IKeyEncryptionKey Resolve(string keyId, CancellationToken cancellationToken = default) - { - cancellationToken.ThrowIfCancellationRequested(); - Interlocked.Increment(ref this.resolveSyncCallCount); - - if (this.ResolveSyncDelay > TimeSpan.Zero) - { - Thread.Sleep(this.ResolveSyncDelay); - } - - return this.CreateKey(keyId); - } - - public async Task ResolveAsync(string keyId, CancellationToken cancellationToken = default) - { - cancellationToken.ThrowIfCancellationRequested(); - Interlocked.Increment(ref this.resolveAsyncCallCount); - - if (this.ResolveAsyncDelay > TimeSpan.Zero) - { - await Task.Delay(this.ResolveAsyncDelay, cancellationToken); - } - - return this.CreateKey(keyId); - } - - private IKeyEncryptionKey CreateKey(string keyId) - { - if (!this.keyShifts.TryGetValue(keyId, out int shift)) - { - throw new InvalidOperationException($"Key '{keyId}' not registered in InMemoryKeyResolver."); - } - - return new InMemoryKeyEncryptionKey(keyId, shift); - } - } - - /// - /// A trivial in-memory using a byte-shift cipher. - /// - internal sealed class InMemoryKeyEncryptionKey : IKeyEncryptionKey - { - private readonly int shift; - - public InMemoryKeyEncryptionKey(string keyId, int shift) - { - this.KeyId = keyId; - this.shift = shift; - } - - public string KeyId { get; } - - public byte[] WrapKey(string algorithm, ReadOnlyMemory key, CancellationToken cancellationToken = default) - { - return key.ToArray().Select(b => (byte)(b + this.shift)).ToArray(); - } - - public Task WrapKeyAsync(string algorithm, ReadOnlyMemory key, CancellationToken cancellationToken = default) - { - return Task.FromResult(this.WrapKey(algorithm, key, cancellationToken)); - } - - public byte[] UnwrapKey(string algorithm, ReadOnlyMemory encryptedKey, CancellationToken cancellationToken = default) - { - return encryptedKey.ToArray().Select(b => (byte)(b - this.shift)).ToArray(); - } - - public Task UnwrapKeyAsync(string algorithm, ReadOnlyMemory encryptedKey, CancellationToken cancellationToken = default) - { - return Task.FromResult(this.UnwrapKey(algorithm, encryptedKey, cancellationToken)); - } - } - - #endregion - } -} From 9debedfea35318ea011e4bdfa2d7ee7bcc35e7bb Mon Sep 17 00:00:00 2001 From: Abhijeet Mohanty Date: Thu, 5 Mar 2026 17:06:43 -0500 Subject: [PATCH 05/10] Modify tasks.md --- openspec/changes/reduce-encryption-contention/tasks.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/openspec/changes/reduce-encryption-contention/tasks.md b/openspec/changes/reduce-encryption-contention/tasks.md index 46e7a46b3f..011febe0bb 100644 --- a/openspec/changes/reduce-encryption-contention/tasks.md +++ b/openspec/changes/reduce-encryption-contention/tasks.md @@ -6,7 +6,8 @@ - [ ] 1.4 **Key rotation scenario.** Confirm that CEK rewrap (the only runtime rotation) does not change plaintext DEK bytes, so all caches remain valid. CEK replacement is out of scope (requires data migration, offline operation). Document this assumption. - [ ] 1.5 **Stale key material window.** Quantify the maximum time plaintext DEK bytes from a revoked CMK can remain in memory (prefetch cache TTL + PDEK TTL). CEK replacement is out of scope. Confirm the window is acceptable per Azure Key Vault best practices. - [ ] 1.6 **Explore: can `AeadAes256CbcHmac256EncryptionAlgorithm.GetOrCreate` replace `new`?** MDE has a built-in static `LocalCache` for algorithm instances keyed by `(DataEncryptionKey, EncryptionType)`. The SDK bypasses it by calling `new` directly. Evaluate: (a) does `ProtectedDataEncryptionKey.Equals`/`GetHashCode` produce correct cache hits for the same logical key? (b) is there any thread-safety concern with MDE's `LocalCache`? (c) could this be a standalone one-line cleanup independent of this change? Document findings. -- [ ] 1.7 **Review with implementing engineer.** Walk through findings from 1.1–1.6 before proceeding to implementation. Resolve any open questions. Update design.md Risks section if new risks are identified. +- [ ] 1.7 **Understand the wrapped CEK fetch path from Cosmos DB.** Read `GetClientEncryptionKeyPropertiesAsync` in `EncryptionCosmosClient` and `EncryptionSettingForProperty`. Understand: (a) how the wrapped CEK is fetched (gateway-cached read via `AsyncCache`), (b) when it refreshes (force refresh on Forbidden retry, ETag-based staleness check), (c) whether anything in its caching design (AsyncCache pattern, ETag invalidation, gateway cache headers) can inform or be reused for the prefetch cache design. Document any applicable patterns. +- [ ] 1.8 **Review with implementing engineer.** Walk through findings from 1.1–1.7 before proceeding to implementation. Resolve any open questions. Update design.md Risks section if new risks are identified. ## 2. Environment Variable Feature Gate From 0aed54e888e8ea65cc1f10392e5573445e6f51f2 Mon Sep 17 00:00:00 2001 From: Abhijeet Mohanty Date: Wed, 11 Mar 2026 17:41:24 -0400 Subject: [PATCH 06/10] Address review comments. --- .../reduce-encryption-contention/design.md | 91 ++++++++++--------- .../reduce-encryption-contention/proposal.md | 18 ++-- .../specs/async-dek-prefetch/spec.md | 28 +++--- .../specs/env-var-feature-gate/spec.md | 16 ++-- .../specs/resolved-client-cache/spec.md | 24 ++--- .../reduce-encryption-contention/tasks.md | 59 ++++++------ 6 files changed, 121 insertions(+), 115 deletions(-) diff --git a/openspec/changes/reduce-encryption-contention/design.md b/openspec/changes/reduce-encryption-contention/design.md index 36bf35a482..a71b3bc5e2 100644 --- a/openspec/changes/reduce-encryption-contention/design.md +++ b/openspec/changes/reduce-encryption-contention/design.md @@ -1,72 +1,72 @@ ## Context -Client-side encryption in the Cosmos DB .NET SDK uses a global `SemaphoreSlim(1,1)` that serializes all encryption key operations. The semaphore guards MDE’s `ProtectedDataEncryptionKey.GetOrCreate` and `KeyEncryptionKey.GetOrCreate` — both of which are cache lookups on the hot path but trigger synchronous Key Vault HTTP calls on cache miss. PDEK resolution is sync, on the hot path, and makes two blocking HTTP calls to Key Vault (Resolve + UnwrapKey) under the semaphore. +Client-side encryption in the Cosmos DB .NET SDK uses a global `SemaphoreSlim(1,1)` that serializes all encryption key resolution operations — specifically, the construction or cache lookup of `ProtectedDataEncryptionKey` and `KeyEncryptionKey` objects via the Microsoft Data Encryption library's `GetOrCreate` methods. These are the only operations guarded by the semaphore; encrypt/decrypt of document payloads happens outside the semaphore. The semaphore guards the Microsoft Data Encryption library's `ProtectedDataEncryptionKey.GetOrCreate` and `KeyEncryptionKey.GetOrCreate` — both of which are cache lookups on the hot path but trigger synchronous Key Vault HTTP calls on cache miss. `ProtectedDataEncryptionKey` resolution is sync, on the hot path, and makes two blocking HTTP calls to Key Vault (Resolve + UnwrapKey) under the semaphore. -The call amplification is severe: `DecryptObjectAsync` calls `BuildEncryptionAlgorithmForSettingAsync` per encrypted leaf value per document per page. With concurrent feed iterators, this produces thousands of semaphore acquisitions per second, all serialized to a single permit. +The call amplification is severe: `DecryptObjectAsync` calls `BuildEncryptionAlgorithmForSettingAsync` per encrypted leaf value per document per page. With concurrent feed iterators, this produces thousands of semaphore acquisitions per second, all serialized to a single permit. Note: both read (decrypt) and write (encrypt) paths acquire the semaphore for key resolution; however, the customer workload triggering this issue is read-heavy (Change Feed Processor with concurrent `FeedIterator`). A dedicated investigation task (1.9) covers whether specific operation types trigger disproportionate contention. -**Structural constraint**: MDE's `ProtectedDataEncryptionKey` constructor chain is sync (C# cannot make base-ctor calls async). `EncryptionKeyStoreProvider.UnwrapKey` returns `byte[]` (sync). MDE's `LocalCache.GetOrCreate` takes `Func` not `Func>`. The sync chain from `ProtectedDataEncryptionKey` → `KeyEncryptionKey.DecryptEncryptionKey` → `UnwrapKey` → `Resolve` + `UnwrapKey` on Key Vault cannot be made async at any point in MDE's type hierarchy. However, `IKeyEncryptionKeyResolver` exposes `ResolveAsync` and `IKeyEncryptionKey` exposes `UnwrapKeyAsync` — these async variants are reachable from our code, just not through MDE's call chain. +**Structural constraint**: The Microsoft Data Encryption library's `ProtectedDataEncryptionKey` constructor chain is sync (C# cannot make base-ctor calls async). `EncryptionKeyStoreProvider.UnwrapKey` returns `byte[]` (sync). The Microsoft Data Encryption library's `LocalCache.GetOrCreate` takes `Func` not `Func>`. The sync chain from `ProtectedDataEncryptionKey` → `KeyEncryptionKey.DecryptEncryptionKey` → `UnwrapKey` → `Resolve` + `UnwrapKey` on Key Vault cannot be made async at any point in the Microsoft Data Encryption library's type hierarchy. However, `IKeyEncryptionKeyResolver` exposes `ResolveAsync` and `IKeyEncryptionKey` exposes `UnwrapKeyAsync` — these async variants are reachable from our code, just not through the Microsoft Data Encryption library's call chain. ## Goals / Non-Goals **Goals:** -- Move PDEK resolution (Key Vault I/O) off the hot path via async prefetch outside the semaphore -- Prevent AKV overload: proactive background refresh, deduplicated to one call per key per interval -- Reduce Key Vault calls per refresh: cache resolved `IKeyEncryptionKey` so only `UnwrapKey` hits AKV, not `Resolve` + `UnwrapKey` -- Gate all changes behind an opt-in env var for safe rollout — zero behavior change when disabled +- Move `ProtectedDataEncryptionKey` resolution (Key Vault I/O) off the hot path via async prefetch outside the semaphore +- Prevent Azure Key Vault overload: proactive background refresh, deduplicated to one call per key per interval +- Reduce Key Vault calls per refresh: cache resolved `IKeyEncryptionKey` so only `UnwrapKey` hits Azure Key Vault, not `Resolve` + `UnwrapKey` +- Gate all changes behind an opt-in environment variable for safe rollout — zero behavior change when disabled - No public API changes. No breaking changes. **Non-Goals:** -- Changing MDE's type hierarchy or making the `ProtectedDataEncryptionKey` constructor async -- Removing the semaphore entirely (it guards MDE internal state mutation — still needed) +- Changing the Microsoft Data Encryption library's type hierarchy or making the `ProtectedDataEncryptionKey` constructor async +- Removing the semaphore entirely (it guards the Microsoft Data Encryption library internal state mutation — still needed) - Per-key semaphores (reduces cross-key contention but same-key still serializes; added complexity for marginal gain) -- Caching the `AeadAes256CbcHmac256EncryptionAlgorithm` at the `EncryptionSettingForProperty` level (would need to be hoisted above the semaphore to reduce contention, which reintroduces ETag validation and Forbidden retry path complexity; however, MDE has a built-in `GetOrCreate` cache that the SDK currently bypasses — see Future Considerations)- Enabling the MDE DEK byte cache (`DataEncryptionKeyCacheTimeToLive`) as a standalone layer (redundant when async prefetch is working — the sync `UnwrapKey` path reads from the prefetch cache, never reaching MDE’s built-in cache) -- CEK rotation handling (CEK replacement requires data migration — re-encrypting every document — and is an offline operation; not a runtime concern for caching)- Supporting key rotation detection at the cache level (CEK rewrap doesn't change plaintext bytes; new CEK policy creates new `EncryptionSettingForProperty` objects) +- Caching the `AeadAes256CbcHmac256EncryptionAlgorithm` at the `EncryptionSettingForProperty` level (would need to be hoisted above the semaphore to reduce contention, which reintroduces ETag validation and Forbidden retry path complexity; however, the Microsoft Data Encryption library has a built-in `GetOrCreate` cache that the SDK currently bypasses — see Future Considerations)- Enabling the Microsoft Data Encryption library's Data Encryption Key byte cache (`DataEncryptionKeyCacheTimeToLive`) as a standalone layer (redundant when async prefetch is working — the sync `UnwrapKey` path reads from the prefetch cache, never reaching the Microsoft Data Encryption library’s built-in cache) +- Client Encryption Key rotation handling (Client Encryption Key replacement requires data migration — re-encrypting every document — and is an offline operation; not a runtime concern for caching)- Supporting key rotation detection at the cache level (Client Encryption Key rewrap doesn't change plaintext bytes; new Client Encryption Key policy creates new `EncryptionSettingForProperty` objects) ## Decisions ### Decision 1: Async prefetch in a sealed subclass, not in the base `EncryptionKeyStoreProviderImpl` -**Chosen**: New `CachingEncryptionKeyStoreProviderImpl` (sealed, internal) extends `EncryptionKeyStoreProviderImpl`. Base class stays clean sync-only. `EncryptionCosmosClient` instantiates the subclass when env var is on. +**Chosen**: New `CachingEncryptionKeyStoreProviderImpl` (sealed, internal) extends `EncryptionKeyStoreProviderImpl`. Base class stays clean sync-only. `EncryptionCosmosClient` instantiates the subclass when environment variable is on. -**Rationale**: The base class serves all existing customers with zero behavior change. The subclass adds: prefetch `ConcurrentDictionary`, resolved-client cache, `CancellationTokenSource`, `Cleanup()` method, and overridden `UnwrapKey` that checks prefetch cache first. Separation keeps the risk surface isolated. +**Rationale**: The base class serves all existing customers with zero behavior change. The subclass adds: prefetch `ConcurrentDictionary`, resolved-client cache (a `ConcurrentDictionary` that caches the `CryptographyClient` returned by `IKeyEncryptionKeyResolver.Resolve()` per Customer Master Key URL — called "resolved-client" because it caches the resolved `IKeyEncryptionKey` client object, not the unwrapped key bytes), `CancellationTokenSource`, `Cleanup()` method, and overridden `UnwrapKey` that checks prefetch cache first. Separation keeps the risk surface isolated. -**Rejected alternative**: Adding prefetch directly to base class with conditional branches — mixes concerns, harder to reason about lifecycle, risk of unintended behavior when env var is off. +**Rejected alternative**: Adding prefetch directly to base class with conditional branches — mixes concerns, harder to reason about lifecycle, risk of unintended behavior when environment variable is off. -### Decision 2: `ConcurrentDictionary` for prefetch cache (not `MemoryCache` or MDE's `LocalCache`) +### Decision 2: `ConcurrentDictionary` for prefetch cache (not `MemoryCache` or the Microsoft Data Encryption library's `LocalCache`) -**Chosen**: `ConcurrentDictionary` with manual TTL tracking (expiry stored alongside value). +**Chosen**: `ConcurrentDictionary` with manual time-to-live tracking (expiry stored alongside value). -**Rationale**: MDE's `LocalCache.GetOrCreate` is get-or-create only — no `Set`, `Remove`, or `Evict` API. `MemoryCache` adds a dependency and has its own concurrency model. `ConcurrentDictionary` is simple, lock-free reads, and we control TTL ourselves. +**Rationale**: The Microsoft Data Encryption library's `LocalCache.GetOrCreate` is get-or-create only — no `Set`, `Remove`, or `Evict` API. `MemoryCache` adds a dependency and has its own concurrency model. `ConcurrentDictionary` is simple, lock-free reads, and we control time-to-live ourselves. ### Decision 3: Proactive refresh via `Task.Run`, not `Timer` -**Chosen**: On cache access, if within 5 minutes of expiry, fire `Task.Run` to refresh in background. Deduplicate via `ConcurrentDictionary`. +**Chosen**: On cache access, if within a configurable refresh window of expiry, fire `Task.Run` to refresh in background. The refresh window SHALL be **20% of the cache time-to-live, capped at a maximum of 5 minutes**. For example: time-to-live = 1 hour → refresh at 12 minutes before expiry (20% × 60 min = 12 min, but capped to 5 min → 5 minutes); time-to-live = 10 minutes → refresh at 2 minutes before expiry (20% × 10 min = 2 min). Deduplicate via `ConcurrentDictionary`. -**Rationale**: `Timer` requires managing timer lifecycle per key, disposal ordering, and thread-pool callbacks. `Task.Run` on access is simpler — only fires when the key is actually being used (no wasted refresh for unused keys). Tied to `CancellationTokenSource` for cleanup on dispose. +**Rationale**: `Timer` requires managing timer lifecycle per key, disposal ordering, and thread-pool callbacks. `Task.Run` on access is simpler — only fires when the key is actually being used (no wasted refresh for unused keys). Tied to `CancellationTokenSource` for cleanup on dispose. The refresh window is percentage-based rather than a fixed duration because the `keyCacheTimeToLive` is customer-configurable (defaults to 1 hour, max 2 hours from `ProtectedDataEncryptionKey.TimeToLive`, but can be set shorter). A fixed 5-minute window would be disproportionate for short time-to-live values. **Rejected alternative**: `System.Threading.Timer` per key — more deterministic timing but complex lifecycle management (dispose timers on cache eviction, handle timer callbacks after disposal). ## Risks / Trade-offs -- **[Non-risk] Cache coherence on key rotation**: CEK rewrap (same plaintext DEK, new CMK wrapper) is the only runtime rotation — all caches remain valid. CEK replacement is out of scope (requires data migration, offline operation). +- **[Non-risk] Cache coherence on key rotation**: Client Encryption Key rewrap (same plaintext Data Encryption Key, new Customer Master Key wrapper) is the only runtime rotation — all caches remain valid. Client Encryption Key replacement is out of scope (requires data migration, offline operation). - **[Risk] Background refresh lifecycle]: `Task.Run` captures `this`, keeping the provider and all its dependencies (resolver, credential chain) GC-rooted until the task completes. → **Mitigation**: `CancellationTokenSource` cancelled on `Cleanup()`/`Dispose()`. `Interlocked.Exchange` for idempotent dispose. Background tasks observe cancellation token. -- **[Risk] Two copies of plaintext DEK in memory**: PDEK internal field + prefetch `ConcurrentDictionary`. → **Mitigation**: Same process, same threat boundary, same TTL. No new attack surface. Industry standard (Azure Key Vault SDK recommends caching). +- **[Risk] Two copies of plaintext Data Encryption Key in memory**: `ProtectedDataEncryptionKey` internal field + prefetch `ConcurrentDictionary`. → **Mitigation**: Same process, same threat boundary, same time-to-live. No new attack surface. Industry standard (Azure Key Vault SDK recommends caching). -- **[Risk] Env var read at construction only**: If an operator enables the env var and restarts the app, old client instances still run without optimization. → **Mitigation**: This is by design — matches SDK `ConfigurationManager` pattern. Client must be reconstructed to pick up changes. +- **[Risk] Environment variable read at construction only**: If an operator enables the environment variable and restarts the app, old client instances still run without optimization. → **Mitigation**: This is by design — matches SDK `ConfigurationManager` pattern. Client must be reconstructed to pick up changes. -- **[Risk] AKV burst on multi-CEK proactive refresh**: If multiple Client Encryption Keys have similar TTLs (common at startup — all populated in a short window), their proactive refreshes fire simultaneously. Each refresh is deduplicated per key (1 call per CEK), but N CEKs = N concurrent AKV calls. AKV throttles at 4000 ops/vault/10s — unlikely to hit for typical customers (1–10 CEKs), but possible for large deployments. → **Mitigation (recommended)**: Add jitter to the proactive refresh window (random offset within 2–5 minutes before expiry) so CEKs don’t all refresh at the same instant. Consider a concurrency limiter on background AKV calls. +- **[Risk] Azure Key Vault burst on multi-Client Encryption Key proactive refresh**: If multiple Client Encryption Keys have similar time-to-live values (common at startup — all populated in a short window), their proactive refreshes fire simultaneously. Each refresh is deduplicated per key (1 call per Client Encryption Key), but N Client Encryption Keys = N concurrent Azure Key Vault calls. Azure Key Vault throttles at 4000 ops/vault/10s — unlikely to hit for typical customers (1–10 Client Encryption Keys), but possible for large deployments. → **Mitigation (recommended)**: Add jitter to the proactive refresh window (random offset within the percentage-based refresh window, per key) so Client Encryption Keys don’t all refresh at the same instant. Consider a concurrency limiter on background Azure Key Vault calls. -- **[Trade-off] Prefetch adds complexity for correctness**: Async prefetch + background refresh + disposal lifecycle is more complex than a one-liner (`TimeSpan.Zero` → `TimeSpan.FromHours(1)`). But it solves the actual problem — PDEK resolution is async, off the hot path, and doesn’t overload AKV. → **Accept**: Complexity is isolated in a sealed subclass, gated by env var. +- **[Trade-off] Prefetch adds complexity for correctness**: Async prefetch + background refresh + disposal lifecycle is more complex than a one-liner (`TimeSpan.Zero` → `TimeSpan.FromHours(1)`). But it solves the actual problem — `ProtectedDataEncryptionKey` resolution is async, off the hot path, and doesn’t overload Azure Key Vault. → **Accept**: Complexity is isolated in a sealed subclass, gated by environment variable. ## Migration Plan 1. All changes are gated by `AZURE_COSMOS_ENCRYPTION_OPTIMISTIC_DECRYPTION_ENABLED` (off by default). -2. Ship in next SDK release with release notes documenting the env var. -3. Customers experiencing contention enable the env var. No code changes to their app. +2. Ship in next SDK release with release notes documenting the environment variable. +3. Customers experiencing contention enable the environment variable. No code changes to their app. 4. After bake time (1–2 release cycles), consider enabling by default. -5. **Rollback**: Set env var to false (or unset it). Immediate revert to current behavior on next client construction. +5. **Rollback**: Set environment variable to false (or unset it). Immediate revert to current behavior on next client construction. ## Performance Benchmarking Strategy @@ -75,52 +75,53 @@ The fix must be validated with quantitative evidence, not just passing tests. Th **Customer workload profile (from incident investigation):** - Service running high-concurrency `CosmosFeedIterator` reads with client-side encryption enabled - Multiple encrypted properties per document (nested objects/arrays → per-leaf-value semaphore acquisition) -- AKV-backed key resolver (`KeyResolver(DefaultTokenCredential)`) with `DefaultAzureCredential` (Managed Identity in production) +- Azure Key Vault-backed key resolver (`KeyResolver(DefaultTokenCredential)`) with `DefaultAzureCredential` (Managed Identity in production) - Sustained concurrent load — multiple feed iterators running in parallel -- Failure point: PDEK cache TTL expires (every 1–2 hours) → first thread blocks on sync Key Vault I/O inside semaphore → queued threads' cancellation tokens fire → `OperationCanceledException` +- Failure point: `ProtectedDataEncryptionKey` cache time-to-live expires (every 1–2 hours) → first thread blocks on sync Key Vault I/O inside semaphore → queued threads' cancellation tokens fire → `OperationCanceledException` - Cancellation timeout: ~5s (inferred from call stack — Change Feed Processor lease rebalance or request timeout) -**The benchmark must reproduce this pattern**: concurrent feed iterator reads decrypting multi-property documents, crossing a PDEK cache TTL boundary, with realistic Key Vault latency. +**The benchmark must reproduce this pattern**: concurrent feed iterator reads decrypting multi-property documents, crossing a `ProtectedDataEncryptionKey` cache time-to-live boundary, with realistic Key Vault latency. **What to measure:** - **Semaphore hold time** (p50, p95, p99): How long each thread holds the semaphore during `BuildProtectedDataEncryptionKeyAsync`. Baseline (without fix) should show 200ms–2.4s on cache miss. With fix, should drop to microseconds. - **End-to-end decrypt latency** (p50, p95, p99): Time from `ReadNextAsync` call to decrypted response, under concurrent load. - **Throughput**: Decrypted documents per second at N concurrent feed iterators. -- **AKV call count**: Total `Resolve` + `UnwrapKey` calls to Key Vault over a fixed workload. Should drop from O(PDEK misses × 2) to O(CEKs × 1 per TTL interval). +- **Azure Key Vault call count**: Total `Resolve` + `UnwrapKey` calls to Key Vault over a fixed workload. Should drop from O(`ProtectedDataEncryptionKey` cache misses × 2) to O(Client Encryption Keys × 1 per time-to-live interval). - **`OperationCanceledException` rate**: With a 5s cancellation timeout (matching customer pattern), count how many operations are cancelled. Baseline should reproduce the customer's issue; with fix, should be zero. **Benchmark configuration:** - Concurrent feed iterators: 1, 10, 50, 100 - Encrypted properties per document: 1, 5, 10 - Documents per page: 10, 50 -- CEK count: 1, 5 (to test multi-CEK refresh burst) -- Key Vault latency: simulated via `InMemoryKeyResolver` with configurable delay (0ms, 100ms, 500ms, 2000ms) — 500ms represents typical AKV latency, 2000ms represents cold token + AKV worst case -- PDEK cache state: warm (cache hit) and cold (force TTL expiry to simulate the 2-hour boundary) -- **Key scenario (must reproduce customer failure)**: Start with warm cache under sustained concurrent reads (50+ feed iterators), then force PDEK TTL expiry mid-workload with 500ms simulated AKV latency and 5s cancellation timeout. Baseline should show `OperationCanceledException` cascade; with fix, should show zero cancellations. +- Client Encryption Key count: 1, 5 (to test multi-Client Encryption Key refresh burst) +- Key Vault latency: simulated via `InMemoryKeyResolver` with configurable delay (0ms, 100ms, 500ms, 2000ms) — 500ms represents typical Azure Key Vault latency, 2000ms represents cold token + Azure Key Vault worst case +- `ProtectedDataEncryptionKey` cache state: warm (cache hit) and cold (force time-to-live expiry to simulate the 2-hour boundary) +- **Key scenario (must reproduce customer failure)**: Start with warm cache under sustained concurrent reads (50+ feed iterators), then force `ProtectedDataEncryptionKey` cache time-to-live expiry mid-workload with 500ms simulated Azure Key Vault latency and 5s cancellation timeout. Baseline should show `OperationCanceledException` cascade; with fix, should show zero cancellations. **Comparison:** -- Env var off (baseline — current behavior) vs. env var on (with fix) +- Environment variable off (baseline — current behavior) vs. environment variable on (with fix) - Same workload, same concurrency, same simulated Key Vault latency -- Output: CSV with latency percentiles, throughput, AKV call counts, cancellation counts +- Output: CSV with latency percentiles, throughput, Azure Key Vault call counts, cancellation counts **Where to run:** - Unit-level microbenchmarks (in-process, `InMemoryKeyResolver` with configurable delay) for semaphore hold time and throughput -- Emulator E2E benchmarks for end-to-end latency with real Cosmos operations (requires emulator) +- Emulator end-to-end benchmarks for end-to-end latency with real Cosmos operations (requires emulator) ## Open Questions -1. Should the resolved-client cache (`IKeyEncryptionKey`) be invalidated on a timer, or only on CMK URL mismatch? The `CryptographyClient` is stateless and long-lived, so indefinite caching until URL change seems safe. -2. What should the prefetch cache TTL be? Should it match PDEK TTL (1–2 hours) or be shorter? -3. **AKV burst on multi-CEK refresh**: If a customer has N distinct Client Encryption Keys all created around the same time, their prefetch caches expire roughly together, causing N concurrent `ResolveAsync` + `UnwrapKeyAsync` calls to AKV in one burst. Should the proactive refresh window include jitter/stagger (e.g., random offset within 2–5 minutes before expiry) to spread calls over time? Should there be a rate limiter (max M concurrent background AKV calls)? -4. **Single vs. dual `ConcurrentDictionary` for prefetch**: The current design uses two dictionaries — `ConcurrentDictionary` (result cache) and `ConcurrentDictionary` (inflight deduplication). The result cache serves the sync `UnwrapKey` path (needs `byte[]` immediately, can't `await`). The inflight dictionary is ephemeral (populated only during active AKV calls, removed on completion). An alternative is a single `ConcurrentDictionary>` where the sync path calls `.Result` (safe when the task is already completed from a prior prefetch). Trade-off: fewer data structures vs. less explicit intent. Should the implementer choose based on readability? +1. Should the resolved-client cache (`IKeyEncryptionKey`) be invalidated on a timer, or only on Customer Master Key URL mismatch? The `CryptographyClient` is stateless and long-lived, so indefinite caching until URL change seems safe. +2. What should the prefetch cache time-to-live be? Should it match `ProtectedDataEncryptionKey` cache time-to-live (1–2 hours) or be shorter? +3. **Azure Key Vault burst on multi-Client Encryption Key refresh**: If a customer has N distinct Client Encryption Keys all created around the same time, their prefetch caches expire roughly together, causing N concurrent `ResolveAsync` + `UnwrapKeyAsync` calls to Azure Key Vault in one burst. Should the proactive refresh window include jitter/stagger (e.g., random offset within the percentage-based refresh window) to spread calls over time? Should there be a rate limiter (max M concurrent background Azure Key Vault calls)? +4. **Single vs. dual `ConcurrentDictionary` for prefetch**: The current design uses two dictionaries — `ConcurrentDictionary` (result cache) and `ConcurrentDictionary` (inflight deduplication). The result cache serves the sync `UnwrapKey` path (needs `byte[]` immediately, can't `await`). The inflight dictionary is ephemeral (populated only during active Azure Key Vault calls, removed on completion). An alternative is a single `ConcurrentDictionary>` where the sync path calls `.Result` (safe when the task is already completed from a prior prefetch). Trade-off: fewer data structures vs. less explicit intent. Should the implementer choose based on readability? +5. **Stale or missing prefetch cache entry at sync `UnwrapKey` time**: If the prefetch cache does not have a refreshed `ProtectedDataEncryptionKey` entry when the sync `UnwrapKey` is called inside the semaphore (e.g. prefetch failed, prefetch was slow, or the entry expired between prefetch and semaphore acquisition), the sync path falls through to the existing `Resolve()` + `UnwrapKey()` Key Vault calls — identical to current behavior. The semaphore hold time in this case is the same as today (200ms–2.4s on cache miss). This is the expected fallback; the optimization is best-effort and the worst case is unchanged from the status quo. ## Future Considerations -### Use MDE's built-in algorithm cache (`AeadAes256CbcHmac256EncryptionAlgorithm.GetOrCreate`) +### Use the Microsoft Data Encryption library's built-in algorithm cache (`AeadAes256CbcHmac256EncryptionAlgorithm.GetOrCreate`) -MDE already has a static `LocalCache, AeadAes256CbcHmac256EncryptionAlgorithm>` with a `GetOrCreate` method. The SDK currently bypasses it by calling `new AeadAes256CbcHmac256EncryptionAlgorithm(pdek, encType)` directly at `EncryptionSettingForProperty.cs:114`. +The Microsoft Data Encryption library already has a static `LocalCache, AeadAes256CbcHmac256EncryptionAlgorithm>` with a `GetOrCreate` method. The SDK currently bypasses it by calling `new AeadAes256CbcHmac256EncryptionAlgorithm(pdek, encType)` directly at `EncryptionSettingForProperty.cs:114`. -Switching to `GetOrCreate` would eliminate a redundant allocation per encrypted leaf value on the hot path (when the PDEK cache hits, the same `ProtectedDataEncryptionKey` reference produces equal `Tuple` keys → algorithm cache hit). `ProtectedDataEncryptionKey` overrides `Equals`/`GetHashCode` with structural equality (`Name` + `KeyEncryptionKey` + `rootKeyHexString`), so the cache key works correctly. +Switching to `GetOrCreate` would eliminate a redundant allocation per encrypted leaf value on the hot path (when the `ProtectedDataEncryptionKey` cache hits, the same `ProtectedDataEncryptionKey` reference produces equal `Tuple` keys → algorithm cache hit). `ProtectedDataEncryptionKey` overrides `Equals`/`GetHashCode` with structural equality (`Name` + `KeyEncryptionKey` + `rootKeyHexString`), so the cache key works correctly. **Why it's not in this change**: The `GetOrCreate` call occurs *after* the semaphore is released — it doesn't reduce semaphore contention. To skip the semaphore entirely, the cache check would need to be hoisted above `BuildProtectedDataEncryptionKeyAsync`, which reintroduces the ETag validation complexity and interaction with the Forbidden retry path. The async prefetch already makes the semaphore hold time microseconds, so the marginal benefit of skipping it entirely is small. diff --git a/openspec/changes/reduce-encryption-contention/proposal.md b/openspec/changes/reduce-encryption-contention/proposal.md index 3950438c79..3e8dd32a9c 100644 --- a/openspec/changes/reduce-encryption-contention/proposal.md +++ b/openspec/changes/reduce-encryption-contention/proposal.md @@ -1,19 +1,19 @@ ## Why -Customer-side encryption operations throw `OperationCanceledException` under concurrent load because a global `SemaphoreSlim(1,1)` in `BuildProtectedDataEncryptionKeyAsync` serializes construction of `ProtectedDataEncryptionKey` objects — the resolved, unwrapped encryption keys needed for every encrypt/decrypt operation. The semaphore guards `KeyEncryptionKey.GetOrCreate` and `ProtectedDataEncryptionKey.GetOrCreate` — MDE's static get-or-create cache operations that, on cache miss, invoke the `createItem` delegate which triggers synchronous Key Vault HTTP calls (Resolve + UnwrapKey). The semaphore ensures only one thread at a time performs the expensive key creation, preventing duplicate Key Vault calls for the same key. However, this means every encrypted leaf value in every document on every page contends on this single-permit lock — even when the cache is warm and the hold time would be microseconds. The root cause: `ProtectedDataEncryptionKey` resolution is synchronous, happens on the hot path under the semaphore, and makes two blocking HTTP calls to Key Vault on cache miss. An internal customer is actively impacted. +Customer-side encryption operations throw `OperationCanceledException` under concurrent load because a global `SemaphoreSlim(1,1)` in `BuildProtectedDataEncryptionKeyAsync` serializes construction of `ProtectedDataEncryptionKey` objects — the resolved, unwrapped encryption keys needed for every encrypt/decrypt operation. The semaphore guards `KeyEncryptionKey.GetOrCreate` and `ProtectedDataEncryptionKey.GetOrCreate` — the Microsoft Data Encryption library's static get-or-create cache operations that, on cache miss, invoke the `createItem` delegate which triggers synchronous Key Vault HTTP calls (Resolve + UnwrapKey). The semaphore ensures only one thread at a time performs the expensive key creation, preventing duplicate Key Vault calls for the same key. However, this means every encrypted leaf value in every document on every page contends on this single-permit lock — even when the cache is warm and the hold time would be microseconds. The root cause: `ProtectedDataEncryptionKey` resolution is synchronous, happens on the hot path under the semaphore, and makes two blocking HTTP calls to Key Vault on cache miss. An internal customer is actively impacted. ## What Changes -- **Async prefetch of `ProtectedDataEncryptionKey` resolution outside the semaphore** via `ResolveAsync()` + `UnwrapKeyAsync()` before semaphore acquisition. The sync `UnwrapKey` inside the semaphore cannot be removed — MDE's `ProtectedDataEncryptionKey.GetOrCreate` constructor chain calls it unconditionally. Instead, the prefetch populates a cache so that when MDE's sync `UnwrapKey` fires, it returns cached bytes instantly instead of making HTTP calls to Key Vault. The semaphore is still acquired, but held for microseconds (cache read) instead of 200ms–2.4s (HTTP I/O). -- **Proactive background refresh** of the prefetched unwrapped Data Encryption Key bytes (the plaintext AES-256 key material returned by Key Vault's `UnwrapKey`) ~5min before TTL expiry, deduplicated to one AKV call per key per interval. Prevents thundering herd at TTL boundary. -- **Cache the resolved `IKeyEncryptionKey` (CryptographyClient)** per CMK URL so each refresh makes one AKV call (`UnwrapKey`) instead of two (`Resolve` + `UnwrapKey`). +- **Async prefetch of `ProtectedDataEncryptionKey` resolution outside the semaphore** via `ResolveAsync()` + `UnwrapKeyAsync()` before semaphore acquisition. The sync `UnwrapKey` inside the semaphore cannot be removed — the Microsoft Data Encryption library's `ProtectedDataEncryptionKey.GetOrCreate` constructor chain calls it unconditionally. Instead, the prefetch populates a cache so that when the Microsoft Data Encryption library's sync `UnwrapKey` fires, it returns cached bytes instantly instead of making HTTP calls to Key Vault. The semaphore is still acquired, but held for microseconds (cache read) instead of 200ms–2.4s (HTTP I/O). +- **Proactive background refresh** of the prefetched unwrapped Data Encryption Key bytes (the plaintext AES-256 key material returned by Key Vault's `UnwrapKey`) approximately 5 minutes before time-to-live expiry, deduplicated to one Azure Key Vault call per key per interval. Prevents thundering herd at time-to-live boundary. +- **Cache the resolved `IKeyEncryptionKey` (CryptographyClient)** per Customer Master Key URL so each refresh makes one Azure Key Vault call (`UnwrapKey`) instead of two (`Resolve` + `UnwrapKey`). - All changes gated behind an opt-in environment variable (`AZURE_COSMOS_ENCRYPTION_OPTIMISTIC_DECRYPTION_ENABLED`), off by default. No public API changes. No breaking changes. ## Capabilities ### New Capabilities -- `async-dek-prefetch`: Async prefetch of `ProtectedDataEncryptionKey` resolution outside the semaphore using `ResolveAsync()` + `UnwrapKeyAsync()`, with a `ConcurrentDictionary` prefetch cache that the sync `UnwrapKey` reads from. Includes proactive background refresh before TTL expiry and lifecycle management via `CancellationTokenSource`. -- `resolved-client-cache`: Cache the `IKeyEncryptionKey` (CryptographyClient) returned by `Resolve()` per CMK URL to eliminate redundant Key Vault HTTP GETs on each refresh. +- `async-dek-prefetch`: Async prefetch of `ProtectedDataEncryptionKey` resolution outside the semaphore using `ResolveAsync()` + `UnwrapKeyAsync()`, with a `ConcurrentDictionary` prefetch cache that the sync `UnwrapKey` reads from. Includes proactive background refresh before time-to-live expiry and lifecycle management via `CancellationTokenSource`. +- `resolved-client-cache`: Cache the `IKeyEncryptionKey` (CryptographyClient) returned by `Resolve()` per Customer Master Key URL to eliminate redundant Key Vault HTTP GETs on each refresh. - `env-var-feature-gate`: Environment variable gate (`AZURE_COSMOS_ENCRYPTION_OPTIMISTIC_DECRYPTION_ENABLED`) to opt in to all caching/prefetch layers. Off by default. Follows existing SDK `ConfigurationManager` pattern. ### Modified Capabilities @@ -24,6 +24,6 @@ Customer-side encryption operations throw `OperationCanceledException` under con - **Files**: `EncryptionKeyStoreProviderImpl.cs` (or new subclass), `EncryptionCosmosClient.cs`, `EncryptionSettingForProperty.cs` (prefetch wiring only) - **Dependencies**: No new packages. Uses existing `IKeyEncryptionKeyResolver.ResolveAsync()` / `IKeyEncryptionKey.UnwrapKeyAsync()` from `Azure.Core.Cryptography`. - **APIs**: No public API changes. Internal-only. -- **Security**: Plaintext Data Encryption Key bytes are already cached in-process by MDE's `ProtectedDataEncryptionKey`. New caches hold the same bytes with the same TTL in the same process — no new attack surface. -- **Risk**: New caching layers introduce lifecycle complexity (background refresh, disposal, cache coherence on key rotation). Gated by env var for safe rollout. -- **Testing**: Unit tests for each cache layer + concurrency. Emulator E2E tests with env var enabled. +- **Security**: Plaintext Data Encryption Key bytes are already cached in-process by the Microsoft Data Encryption library's `ProtectedDataEncryptionKey`. New caches hold the same bytes with the same time-to-live in the same process — no new attack surface. +- **Risk**: New caching layers introduce lifecycle complexity (background refresh, disposal, cache coherence on key rotation). Gated by environment variable for safe rollout. +- **Testing**: Unit tests for each cache layer + concurrency. Emulator end-to-end tests with environment variable enabled. diff --git a/openspec/changes/reduce-encryption-contention/specs/async-dek-prefetch/spec.md b/openspec/changes/reduce-encryption-contention/specs/async-dek-prefetch/spec.md index 90ccb8fdf9..e349ecda72 100644 --- a/openspec/changes/reduce-encryption-contention/specs/async-dek-prefetch/spec.md +++ b/openspec/changes/reduce-encryption-contention/specs/async-dek-prefetch/spec.md @@ -1,6 +1,6 @@ ## ADDED Requirements -### Requirement: Async prefetch of unwrapped DEK bytes outside the semaphore +### Requirement: Async prefetch of unwrapped Data Encryption Key bytes outside the semaphore The system SHALL provide a `PrefetchUnwrapKeyAsync` method that calls `ResolveAsync()` + `UnwrapKeyAsync()` asynchronously and stores the result in a `ConcurrentDictionary` prefetch cache. This method SHALL be called before semaphore acquisition in `BuildProtectedDataEncryptionKeyAsync`. #### Scenario: Prefetch warms cache before semaphore @@ -8,36 +8,38 @@ The system SHALL provide a `PrefetchUnwrapKeyAsync` method that calls `ResolveAs - **THEN** `ResolveAsync` and `UnwrapKeyAsync` SHALL execute asynchronously (yielding the thread), and the result SHALL be stored in the prefetch cache #### Scenario: Sync UnwrapKey reads from prefetch cache -- **WHEN** MDE's sync `UnwrapKey` is called inside the semaphore and the prefetch cache has a valid entry for the wrapped key +- **WHEN** the Microsoft Data Encryption library's sync `UnwrapKey` is called inside the semaphore and the prefetch cache has a valid entry for the wrapped key - **THEN** `UnwrapKey` SHALL return the cached bytes immediately without calling `Resolve()` or `UnwrapKey()` on Key Vault #### Scenario: Sync fallback on prefetch cache miss -- **WHEN** MDE's sync `UnwrapKey` is called inside the semaphore and the prefetch cache does NOT have an entry (race condition, prefetch failed, or prefetch not called) +- **WHEN** the Microsoft Data Encryption library's sync `UnwrapKey` is called inside the semaphore and the prefetch cache does NOT have an entry (race condition, prefetch failed, or prefetch not called) - **THEN** `UnwrapKey` SHALL fall through to the existing sync `Resolve()` + `UnwrapKey()` path (identical to current behavior) ### Requirement: Concurrent prefetch deduplication The system SHALL deduplicate concurrent prefetch calls for the same wrapped key so that only one async Key Vault call flies per key at a time. #### Scenario: Multiple threads prefetch same key -- **WHEN** 50 threads simultaneously call `PrefetchUnwrapKeyAsync` for the same wrapped key -- **THEN** only one `ResolveAsync` + `UnwrapKeyAsync` call SHALL be made to Key Vault; all threads SHALL await the same `Task` +- **WHEN** N threads simultaneously call `PrefetchUnwrapKeyAsync` for the same wrapped key (N can be any number of concurrent callers) +- **THEN** only one `ResolveAsync` + `UnwrapKeyAsync` call SHALL be made to Key Vault; all N threads SHALL await the same `Task` +- **NOTE**: The deduplication guarantee is independent of the number of concurrent callers. Test scenarios SHOULD use a representative concurrency level (e.g. 50) but the invariant holds for any N ≥ 2. -### Requirement: Proactive background refresh before TTL expiry -The system SHALL schedule a background refresh of the prefetch cache entry approximately 5 minutes before the entry's TTL expires, so that the next consumer finds a warm cache. +### Requirement: Proactive background refresh before time-to-live expiry +The system SHALL schedule a background refresh of the prefetch cache entry when the entry is within the refresh window of its time-to-live expiry (20% of cache time-to-live, capped at 5 minutes maximum), so that the next consumer finds a warm cache. #### Scenario: Background refresh fires before expiry -- **WHEN** a prefetch cache entry is within 5 minutes of expiry and is accessed +- **WHEN** a prefetch cache entry is within the refresh window (20% of time-to-live, max 5 minutes) of expiry and is accessed - **THEN** the system SHALL initiate a background `Task.Run` that calls `ResolveAsync` + `UnwrapKeyAsync` and updates the cache entry #### Scenario: Background refresh failure does not crash - **WHEN** the background refresh call fails (Key Vault down, 429 throttle, network error) -- **THEN** the failure SHALL be caught and logged; the existing cache entry SHALL remain until its TTL expires; the sync fallback path SHALL handle the next call +- **THEN** the failure SHALL be caught and logged; the existing cache entry SHALL remain until its time-to-live expires; the sync fallback path SHALL handle the next call +- **NOTE**: A backoff-retry mechanism SHOULD be applied to the background refresh before giving up (e.g. exponential backoff with 1–3 retries). If all retries are exhausted, the existing cache entry continues to serve reads until time-to-live expiry. Once the entry expires (removed from the prefetch cache), the next `UnwrapKey` call inside the semaphore will find no cache hit and fall through to the sync `Resolve()` + `UnwrapKey()` path — no explicit invalidation is needed because time-to-live expiry naturally clears the entry. -### Requirement: Prefetch cache TTL matches PDEK TTL -The prefetch cache entry TTL SHALL match the `ProtectedDataEncryptionKey.TimeToLive` value. +### Requirement: Prefetch cache time-to-live matches `ProtectedDataEncryptionKey` cache time-to-live +The prefetch cache entry time-to-live SHALL match the `ProtectedDataEncryptionKey.TimeToLive` value. -#### Scenario: Cache entry expires with PDEK -- **WHEN** the PDEK cache TTL (1–2 hours) elapses +#### Scenario: Cache entry expires with `ProtectedDataEncryptionKey` +- **WHEN** the `ProtectedDataEncryptionKey` cache time-to-live (1–2 hours) elapses - **THEN** the prefetch cache entry for the same key SHALL also be expired, ensuring a fresh Key Vault call on the next cold path ### Requirement: Lifecycle management via CancellationTokenSource diff --git a/openspec/changes/reduce-encryption-contention/specs/env-var-feature-gate/spec.md b/openspec/changes/reduce-encryption-contention/specs/env-var-feature-gate/spec.md index 2c1f46c17f..240888252c 100644 --- a/openspec/changes/reduce-encryption-contention/specs/env-var-feature-gate/spec.md +++ b/openspec/changes/reduce-encryption-contention/specs/env-var-feature-gate/spec.md @@ -1,28 +1,28 @@ ## ADDED Requirements ### Requirement: Single environment variable gates all optimization layers -The system SHALL use a single environment variable `AZURE_COSMOS_ENCRYPTION_OPTIMISTIC_DECRYPTION_ENABLED` to enable or disable all caching and prefetch layers (resolved-client cache, async prefetch, proactive background refresh). +The system SHALL use a single environment variable `AZURE_COSMOS_ENCRYPTION_OPTIMISTIC_DECRYPTION_ENABLED` to enable or disable all caching and prefetch layers for Data Encryption Key resolution (resolved-client cache, async Data Encryption Key prefetch, proactive background Data Encryption Key refresh). -#### Scenario: Env var not set — all layers disabled +#### Scenario: Environment variable not set — all layers disabled - **WHEN** `AZURE_COSMOS_ENCRYPTION_OPTIMISTIC_DECRYPTION_ENABLED` is not set in the environment - **THEN** all behavior SHALL be identical to the current codebase: no resolved-client cache, no async prefetch, no proactive background refresh -#### Scenario: Env var set to true — all layers enabled +#### Scenario: Environment variable set to true — all layers enabled - **WHEN** `AZURE_COSMOS_ENCRYPTION_OPTIMISTIC_DECRYPTION_ENABLED` is set to `true` (case-insensitive) - **THEN** all caching and prefetch layers SHALL be active -#### Scenario: Env var set to false or invalid — all layers disabled +#### Scenario: Environment variable set to false or invalid — all layers disabled - **WHEN** `AZURE_COSMOS_ENCRYPTION_OPTIMISTIC_DECRYPTION_ENABLED` is set to `false`, empty, or any value that does not parse as `true` - **THEN** all behavior SHALL be identical to the env-var-not-set case -### Requirement: Env var read at EncryptionCosmosClient construction time +### Requirement: Environment variable read at EncryptionCosmosClient construction time The environment variable SHALL be read once during `EncryptionCosmosClient` construction and the result cached for the client's lifetime. Subsequent changes to the environment variable SHALL NOT affect an already-constructed client. -#### Scenario: Env var read once at startup +#### Scenario: Environment variable read once at startup - **WHEN** `EncryptionCosmosClient` is constructed -- **THEN** the env var SHALL be read via the SDK's `ConfigurationManager` pattern (or `Environment.GetEnvironmentVariable`) and the boolean result stored as a readonly field +- **THEN** the environment variable SHALL be read via the SDK's `ConfigurationManager` pattern (or `Environment.GetEnvironmentVariable`) and the boolean result stored as a readonly field -#### Scenario: Env var change after construction has no effect +#### Scenario: Environment variable change after construction has no effect - **WHEN** the environment variable is changed after `EncryptionCosmosClient` is constructed - **THEN** the existing client instance SHALL continue using the value read at construction time diff --git a/openspec/changes/reduce-encryption-contention/specs/resolved-client-cache/spec.md b/openspec/changes/reduce-encryption-contention/specs/resolved-client-cache/spec.md index e943deaa1b..7588ff514f 100644 --- a/openspec/changes/reduce-encryption-contention/specs/resolved-client-cache/spec.md +++ b/openspec/changes/reduce-encryption-contention/specs/resolved-client-cache/spec.md @@ -1,18 +1,18 @@ ## ADDED Requirements -### Requirement: Resolved IKeyEncryptionKey cached per CMK URL -The system SHALL maintain a `ConcurrentDictionary` keyed by `encryptionKeyId` (the CMK URL) that caches the `CryptographyClient` returned by `IKeyEncryptionKeyResolver.Resolve()`. +### Requirement: Resolved IKeyEncryptionKey cached per Customer Master Key URL +The system SHALL maintain a `ConcurrentDictionary` keyed by `encryptionKeyId` (the Customer Master Key URL) that caches the `CryptographyClient` returned by `IKeyEncryptionKeyResolver.Resolve()`. -#### Scenario: First resolve for a CMK URL -- **WHEN** `UnwrapKey` (or prefetch) is called for a CMK URL not yet in the resolved-client cache +#### Scenario: First resolve for a Customer Master Key URL +- **WHEN** `UnwrapKey` (or prefetch) is called for a Customer Master Key URL not yet in the resolved-client cache - **THEN** the system SHALL call `Resolve(keyId)` (or `ResolveAsync(keyId)` on the async path), store the returned `IKeyEncryptionKey` in the cache, and use it for the `UnwrapKey` call -#### Scenario: Subsequent resolve for the same CMK URL -- **WHEN** `UnwrapKey` is called for a CMK URL that is already in the resolved-client cache +#### Scenario: Subsequent resolve for the same Customer Master Key URL +- **WHEN** `UnwrapKey` is called for a Customer Master Key URL that is already in the resolved-client cache - **THEN** the system SHALL skip the `Resolve()` call and use the cached `IKeyEncryptionKey` directly for the `UnwrapKey` call #### Scenario: Key Vault calls halved on true cache miss -- **WHEN** a PDEK miss triggers `UnwrapKey` with a warm resolved-client cache +- **WHEN** a `ProtectedDataEncryptionKey` cache miss triggers `UnwrapKey` with a warm resolved-client cache - **THEN** only one Key Vault HTTP call SHALL be made (`UnwrapKey` POST) instead of two (`Resolve` GET + `UnwrapKey` POST) ### Requirement: No secret material in resolved-client cache @@ -20,11 +20,11 @@ The `IKeyEncryptionKey` object SHALL contain only the Key Vault URL, key name, k #### Scenario: Cache contents are non-secret - **WHEN** the resolved-client cache is inspected -- **THEN** each entry SHALL be a `CryptographyClient` (or equivalent) containing only the CMK URL and auth pipeline reference — no RSA private key bytes +- **THEN** each entry SHALL be a `CryptographyClient` (or equivalent) containing only the Customer Master Key URL and auth pipeline reference — no RSA private key bytes -### Requirement: Cache invalidation on CMK URL change -The resolved-client cache entry SHALL be invalidated when the CMK URL changes (key rotation where the CEK is rewrapped to a different CMK). +### Requirement: Cache invalidation on Customer Master Key URL change +The resolved-client cache entry SHALL be invalidated when the Customer Master Key URL changes (key rotation where the Client Encryption Key is rewrapped to a different Customer Master Key). -#### Scenario: CMK URL changes after rewrap -- **WHEN** `ClientEncryptionKeyProperties.EncryptionKeyWrapMetadata.Value` returns a different CMK URL than the one cached +#### Scenario: Customer Master Key URL changes after rewrap +- **WHEN** `ClientEncryptionKeyProperties.EncryptionKeyWrapMetadata.Value` returns a different Customer Master Key URL than the one cached - **THEN** the system SHALL call `Resolve()` with the new URL and update the cache entry diff --git a/openspec/changes/reduce-encryption-contention/tasks.md b/openspec/changes/reduce-encryption-contention/tasks.md index 011febe0bb..9e949d921c 100644 --- a/openspec/changes/reduce-encryption-contention/tasks.md +++ b/openspec/changes/reduce-encryption-contention/tasks.md @@ -1,62 +1,64 @@ -## 1. Security Validation — Background DEK Refresh & Indefinite IKeyEncryptionKey Caching +## 1. Security Validation — Background Data Encryption Key Refresh & Indefinite IKeyEncryptionKey Caching -- [ ] 1.1 **Threat model: plaintext DEK in prefetch cache.** Document what is cached (plaintext AES-256 bytes), where it lives (in-process `ConcurrentDictionary`), and confirm it is the same key material already held by MDE's `ProtectedDataEncryptionKey`. Identify any net-new exposure vs. current state. Deliverable: written assessment in this task item or design.md addendum. +- [ ] 1.1 **Threat model: plaintext Data Encryption Key in prefetch cache.** Document what is cached (plaintext AES-256 bytes), where it lives (in-process `ConcurrentDictionary`), and confirm it is the same key material already held by the Microsoft Data Encryption library's `ProtectedDataEncryptionKey`. Identify any net-new exposure vs. current state. Deliverable: written assessment in this task item or design.md addendum. - [ ] 1.2 **Threat model: background `Task.Run` refresh.** Identify risks of a background task calling `ResolveAsync` + `UnwrapKeyAsync` on a timer-like pattern: (a) GC rooting of provider/credential chain while task is in-flight, (b) token credential refresh happening on a ThreadPool thread outside the caller's context, (c) plaintext bytes being written to cache from a background thread after `Dispose()` is called. For each risk, document mitigation (CancellationTokenSource, Interlocked dispose guard, cancellation token observation). -- [ ] 1.3 **Threat model: indefinite `IKeyEncryptionKey` caching.** Confirm the cached `CryptographyClient` contains no secret material (only vault URL + key name + version + HTTP pipeline reference). Identify the invalidation trigger (CMK URL change on key rotation). Document whether an indefinite cache (no TTL, invalidate only on URL mismatch) is safe, or whether a bounded TTL is needed. Check: does the `CryptographyClient` hold a reference to a specific key version, and what happens if the key is disabled/deleted in Key Vault? -- [ ] 1.4 **Key rotation scenario.** Confirm that CEK rewrap (the only runtime rotation) does not change plaintext DEK bytes, so all caches remain valid. CEK replacement is out of scope (requires data migration, offline operation). Document this assumption. -- [ ] 1.5 **Stale key material window.** Quantify the maximum time plaintext DEK bytes from a revoked CMK can remain in memory (prefetch cache TTL + PDEK TTL). CEK replacement is out of scope. Confirm the window is acceptable per Azure Key Vault best practices. -- [ ] 1.6 **Explore: can `AeadAes256CbcHmac256EncryptionAlgorithm.GetOrCreate` replace `new`?** MDE has a built-in static `LocalCache` for algorithm instances keyed by `(DataEncryptionKey, EncryptionType)`. The SDK bypasses it by calling `new` directly. Evaluate: (a) does `ProtectedDataEncryptionKey.Equals`/`GetHashCode` produce correct cache hits for the same logical key? (b) is there any thread-safety concern with MDE's `LocalCache`? (c) could this be a standalone one-line cleanup independent of this change? Document findings. -- [ ] 1.7 **Understand the wrapped CEK fetch path from Cosmos DB.** Read `GetClientEncryptionKeyPropertiesAsync` in `EncryptionCosmosClient` and `EncryptionSettingForProperty`. Understand: (a) how the wrapped CEK is fetched (gateway-cached read via `AsyncCache`), (b) when it refreshes (force refresh on Forbidden retry, ETag-based staleness check), (c) whether anything in its caching design (AsyncCache pattern, ETag invalidation, gateway cache headers) can inform or be reused for the prefetch cache design. Document any applicable patterns. -- [ ] 1.8 **Review with implementing engineer.** Walk through findings from 1.1–1.7 before proceeding to implementation. Resolve any open questions. Update design.md Risks section if new risks are identified. +- [ ] 1.3 **Threat model: indefinite `IKeyEncryptionKey` caching.** Confirm the cached `CryptographyClient` contains no secret material (only vault URL + key name + version + HTTP pipeline reference). Identify the invalidation trigger (Customer Master Key URL change on key rotation). Document whether an indefinite cache (no time-to-live, invalidate only on URL mismatch) is safe, or whether a bounded time-to-live is needed. Check: does the `CryptographyClient` hold a reference to a specific key version, and what happens if the key is disabled/deleted in Key Vault? +- [ ] 1.4 **Key rotation scenario.** Confirm that Client Encryption Key rewrap (the only runtime rotation) does not change plaintext Data Encryption Key bytes, so all caches remain valid. Client Encryption Key replacement is out of scope (requires data migration, offline operation). Document this assumption. +- [ ] 1.5 **Stale key material window.** Quantify the maximum time plaintext Data Encryption Key bytes from a revoked Customer Master Key can remain in memory (prefetch cache time-to-live + `ProtectedDataEncryptionKey` cache time-to-live). Client Encryption Key replacement is out of scope. Confirm the window is acceptable per Azure Key Vault best practices. +- [ ] 1.6 **Explore: can `AeadAes256CbcHmac256EncryptionAlgorithm.GetOrCreate` replace `new`?** The Microsoft Data Encryption library has a built-in static `LocalCache` for algorithm instances keyed by `(DataEncryptionKey, EncryptionType)`. The SDK bypasses it by calling `new` directly. Evaluate: (a) does `ProtectedDataEncryptionKey.Equals`/`GetHashCode` produce correct cache hits for the same logical key? (b) is there any thread-safety concern with the Microsoft Data Encryption library's `LocalCache`? (c) could this be a standalone one-line cleanup independent of this change? Document findings. +- [ ] 1.7 **Understand the wrapped Client Encryption Key fetch path from Cosmos DB.** Read `GetClientEncryptionKeyPropertiesAsync` in `EncryptionCosmosClient` and `EncryptionSettingForProperty`. Understand: (a) how the wrapped Client Encryption Key is fetched (gateway-cached read via `AsyncCache`), (b) when it refreshes (force refresh on Forbidden retry, ETag-based staleness check), (c) whether anything in its caching design (AsyncCache pattern, ETag invalidation, gateway cache headers) can inform or be reused for the prefetch cache design. Document any applicable patterns. +- [ ] 1.8 **Review with implementing engineer.** Walk through findings from 1.1–1.9 before proceeding to implementation. Resolve any open questions. Update design.md Risks section if new risks are identified. +- [ ] 1.9 **Operation-type contention analysis.** Investigate whether specific operation types (read/decrypt vs. write/encrypt, point reads vs. feed iterators, Change Feed Processor vs. manual iteration) trigger disproportionate semaphore contention. Map the call path from each operation type to `BuildEncryptionAlgorithmForSettingAsync` and document the semaphore acquisition frequency per operation type. Identify if any operation type can bypass the semaphore (e.g. if the algorithm is already cached at a higher level). ## 2. Environment Variable Feature Gate - [ ] 2.1 Read `AZURE_COSMOS_ENCRYPTION_OPTIMISTIC_DECRYPTION_ENABLED` in `EncryptionCosmosClient` constructor (via `Environment.GetEnvironmentVariable`, case-insensitive `true` check). Store as `internal readonly bool IsOptimisticDecryptionEnabled`. -- [ ] 2.2 Verify env var is read once at construction and cached — subsequent env var changes do not affect existing client instances. -- [ ] 2.3 Add unit test: env var not set → `IsOptimisticDecryptionEnabled` is false. -- [ ] 2.4 Add unit test: env var set to `true` / `True` / `TRUE` → `IsOptimisticDecryptionEnabled` is true. -- [ ] 2.5 Add unit test: env var set to `false` / empty / garbage → `IsOptimisticDecryptionEnabled` is false. +- [ ] 2.2 Verify environment variable is read once at construction and cached — subsequent environment variable changes do not affect existing client instances. +- [ ] 2.3 Add unit test: environment variable not set → `IsOptimisticDecryptionEnabled` is false. +- [ ] 2.4 Add unit test: environment variable set to `true` / `True` / `TRUE` → `IsOptimisticDecryptionEnabled` is true. +- [ ] 2.5 Add unit test: environment variable set to `false` / empty / garbage → `IsOptimisticDecryptionEnabled` is false. ## 3. Resolved Client Cache (`CachingEncryptionKeyStoreProviderImpl`) - [ ] 4.1 Create `CachingEncryptionKeyStoreProviderImpl` as a sealed internal class extending `EncryptionKeyStoreProviderImpl`. - [ ] 4.2 Add `ConcurrentDictionary resolvedClientCache` field. - [ ] 3.3 Override `UnwrapKey`: before calling `Resolve(keyId)`, check `resolvedClientCache.TryGetValue(keyId, out client)`. On hit, use cached client directly for `UnwrapKey`. On miss, call `Resolve`, store result, then call `UnwrapKey`. -- [ ] 3.4 Add unit test: first call for a CMK URL calls `Resolve`; second call skips `Resolve` and calls only `UnwrapKey` (verify via mock). -- [ ] 3.5 Add unit test: different CMK URLs get independent cache entries; resolving one does not affect the other. +- [ ] 3.4 Add unit test: first call for a Customer Master Key URL calls `Resolve`; second call skips `Resolve` and calls only `UnwrapKey` (verify via mock). +- [ ] 3.5 Add unit test: different Customer Master Key URLs get independent cache entries; resolving one does not affect the other. -## 4. Async DEK Prefetch (`CachingEncryptionKeyStoreProviderImpl`) +## 4. Async Data Encryption Key Prefetch (`CachingEncryptionKeyStoreProviderImpl`) - [ ] 4.1 Add `ConcurrentDictionary prefetchCache` field to `CachingEncryptionKeyStoreProviderImpl`. - [ ] 4.2 Add `ConcurrentDictionary inflightPrefetches` field for deduplication. -- [ ] 4.3 Implement `internal async Task PrefetchUnwrapKeyAsync(string keyId, string algorithm, byte[] wrappedKey, CancellationToken ct)`: check prefetch cache → if miss, check inflight → if miss, call `ResolveAsync` + `UnwrapKeyAsync`, store result in prefetch cache with TTL, remove from inflight. +- [ ] 4.3 Implement `internal async Task PrefetchUnwrapKeyAsync(string keyId, string algorithm, byte[] wrappedKey, CancellationToken ct)`: check prefetch cache → if miss, check inflight → if miss, call `ResolveAsync` + `UnwrapKeyAsync`, store result in prefetch cache with time-to-live, remove from inflight. - [ ] 4.4 Update `UnwrapKey` override in `CachingEncryptionKeyStoreProviderImpl`: check prefetch cache first (by `wrappedKey.ToHexString()`). On hit, return immediately. On miss, fall through to resolved-client-cache path then to sync fallback. - [ ] 4.5 Wire `PrefetchUnwrapKeyAsync` call into `EncryptionSettingForProperty.BuildEncryptionAlgorithmForSettingAsync` — call it BEFORE `SemaphoreSlim.WaitAsync`, wrapped in try/catch (swallow non-cancellation exceptions, propagate `OperationCanceledException`). - [ ] 4.6 Add unit test: prefetch populates cache → sync `UnwrapKey` returns immediately without Key Vault call. - [ ] 4.7 Add unit test: prefetch cache miss → sync `UnwrapKey` falls through to sync Resolve + UnwrapKey (existing behavior). - [ ] 4.8 Add unit test: 50 concurrent threads calling `PrefetchUnwrapKeyAsync` for the same key → only one `ResolveAsync` + `UnwrapKeyAsync` call (deduplication). -- [ ] 4.9 Add unit test: prefetch throws (simulated KV failure) → caller proceeds to semaphore and sync path without error. +- [ ] 4.9 Add unit test: prefetch throws (simulated Key Vault failure) → caller proceeds to semaphore and sync path without error. - [ ] 4.10 Add unit test: prefetch throws `OperationCanceledException` → exception propagates to caller. ## 5. Proactive Background Refresh - [ ] 5.1 In `CachingEncryptionKeyStoreProviderImpl`: add `CancellationTokenSource backgroundCts` field. -- [ ] 5.2 On prefetch cache access, if entry is within a jittered window before TTL expiry (random offset within 2–5 minutes before expiry, per key, to stagger multi-CEK refreshes), schedule `Task.Run` to call `ResolveAsync` + `UnwrapKeyAsync` and update the cache entry. Use `backgroundCts.Token`. Deduplicate via `inflightPrefetches`. +- [ ] 5.2 On prefetch cache access, if entry is within the refresh window before time-to-live expiry (20% of cache time-to-live, capped at 5 minutes maximum, with per-key jitter to stagger multi-Client Encryption Key refreshes), schedule `Task.Run` to call `ResolveAsync` + `UnwrapKeyAsync` and update the cache entry. Use `backgroundCts.Token`. Deduplicate via `inflightPrefetches`. Enforce a concurrency limiter (`SemaphoreSlim`, e.g. max 3 concurrent background refresh calls) to prevent unbounded fan-out with many Client Encryption Keys. - [ ] 5.3 Add `Cleanup()` method: cancel `backgroundCts`, clear `prefetchCache`, clear `resolvedClientCache`, clear `inflightPrefetches`. Use `Interlocked.Exchange(ref disposed, 1)` for idempotent disposal. - [ ] 5.4 Wire `Cleanup()` into `EncryptionCosmosClient.Dispose(bool disposing)`. -- [ ] 5.5 Add unit test: access a cache entry near TTL expiry → background refresh fires (verify via mock that `ResolveAsync` is called). -- [ ] 5.6 Add unit test: background refresh fails → existing cache entry persists until TTL; no exception propagated. +- [ ] 5.5 Add unit test: access a cache entry near time-to-live expiry → background refresh fires (verify via mock that `ResolveAsync` is called). +- [ ] 5.6 Add unit test: background refresh fails → existing cache entry persists until time-to-live; no exception propagated. - [ ] 5.7 Add unit test: `Cleanup()` cancels in-flight background tasks (verify `OperationCanceledException` observed by background task). - [ ] 5.8 Add unit test: double `Cleanup()` is safe (no `ObjectDisposedException`). +- [ ] 5.9 Add unit test: concurrency limiter caps background refresh tasks (e.g. 10 Client Encryption Keys trigger refresh simultaneously, verify only max 3 concurrent Azure Key Vault calls via mock). ## 6. Integration & Wiring -- [ ] 6.1 In `EncryptionCosmosClient` constructor: if env var is on, instantiate `CachingEncryptionKeyStoreProviderImpl` instead of `EncryptionKeyStoreProviderImpl`. +- [ ] 6.1 In `EncryptionCosmosClient` constructor: if environment variable is on, instantiate `CachingEncryptionKeyStoreProviderImpl` instead of `EncryptionKeyStoreProviderImpl`. - [ ] 6.2 Expose `PrefetchUnwrapKeyAsync` on base class as a virtual no-op (`return Task.CompletedTask`) so `EncryptionSettingForProperty` can call it without knowing the concrete type. - [ ] 6.3 Verify build: `dotnet build Microsoft.Azure.Cosmos.Encryption/src/ -c Release` — 0 errors, 0 warnings. - [ ] 6.4 Run existing unit tests: `dotnet test Microsoft.Azure.Cosmos.Encryption/tests/Microsoft.Azure.Cosmos.Encryption.Tests/ --no-build` — all pass. -- [ ] 6.5 Run emulator E2E tests with `AZURE_COSMOS_ENCRYPTION_OPTIMISTIC_DECRYPTION_ENABLED=true`: all existing `MdeEncryptionTests` pass. -- [ ] 6.6 Run emulator E2E tests without env var: all existing `MdeEncryptionTests` pass (verify zero behavior change). +- [ ] 6.5 Run emulator end-to-end tests with `AZURE_COSMOS_ENCRYPTION_OPTIMISTIC_DECRYPTION_ENABLED=true`: all existing `MdeEncryptionTests` pass. +- [ ] 6.6 Run emulator end-to-end tests without environment variable: all existing `MdeEncryptionTests` pass (verify zero behavior change). ## 7. Contract & API Validation @@ -67,9 +69,10 @@ ## 8. Performance Benchmarking - [ ] 8.1 Create `InMemoryKeyResolver` test harness with configurable delay per `Resolve`/`UnwrapKey` call and call-count tracking. -- [ ] 8.2 Benchmark: semaphore hold time (p50/p95/p99) at 50 concurrent threads, simulated KV latency 500ms, env var off vs. on. Expect: off = hundreds of ms, on = microseconds. -- [ ] 8.3 Benchmark: end-to-end decrypt throughput (docs/sec) at 10, 50, 100 concurrent feed iterators with 5 encrypted properties per document. Env var off vs. on. -- [ ] 8.4 Benchmark: AKV call count over 1000 decrypt operations with 1 CEK. Env var off (expect 2 per PDEK miss) vs. on (expect 1 per TTL refresh, resolved client cached). -- [ ] 8.5 Benchmark: `OperationCanceledException` count at 50 concurrent threads with 5s cancellation timeout and simulated 2s KV latency. Env var off (expect failures) vs. on (expect zero). -- [ ] 8.6 Benchmark: multi-CEK refresh burst — 5 CEKs with aligned TTLs, verify jitter staggers AKV calls (measure peak concurrent AKV calls). +- [ ] 8.2 Benchmark: semaphore hold time (p50/p95/p99) at 50 concurrent threads, simulated Key Vault latency 500ms, environment variable off vs. on. Expect: off = hundreds of ms, on = microseconds. +- [ ] 8.3 Benchmark: end-to-end decrypt throughput (docs/sec) at 10, 50, 100 concurrent feed iterators with 5 encrypted properties per document. Environment variable off vs. on. +- [ ] 8.4 Benchmark: Azure Key Vault call count over 1000 decrypt operations with 1 Client Encryption Key. Environment variable off (expect 2 per `ProtectedDataEncryptionKey` cache miss) vs. on (expect 1 per time-to-live refresh, resolved client cached). +- [ ] 8.5 Benchmark: `OperationCanceledException` count at 50 concurrent threads with 5s cancellation timeout and simulated 2s Key Vault latency. Environment variable off (expect failures) vs. on (expect zero). +- [ ] 8.6 Benchmark: multi-Client Encryption Key refresh burst — 5 Client Encryption Keys with aligned time-to-live values, verify jitter + concurrency limiter staggers Azure Key Vault calls (measure peak concurrent Azure Key Vault calls; should not exceed concurrency limit). - [ ] 8.7 Document results in a comparison table (baseline vs. optimized) with all metrics. +- [ ] 8.8 **Customer thread pool investigation.** Gather information about the impacted customer's thread pool configuration (min/max threads, thread pool starvation indicators) to inform benchmark concurrency levels and validate that the fix is effective under their actual thread pool constraints. From 7015f57664c560a7c5e39eb5eba5e79eabdac66e Mon Sep 17 00:00:00 2001 From: Abhijeet Mohanty Date: Wed, 11 Mar 2026 17:51:07 -0400 Subject: [PATCH 07/10] Address review comments. --- .../specs/async-dek-prefetch/spec.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openspec/changes/reduce-encryption-contention/specs/async-dek-prefetch/spec.md b/openspec/changes/reduce-encryption-contention/specs/async-dek-prefetch/spec.md index e349ecda72..d0313ddd51 100644 --- a/openspec/changes/reduce-encryption-contention/specs/async-dek-prefetch/spec.md +++ b/openspec/changes/reduce-encryption-contention/specs/async-dek-prefetch/spec.md @@ -33,7 +33,7 @@ The system SHALL schedule a background refresh of the prefetch cache entry when #### Scenario: Background refresh failure does not crash - **WHEN** the background refresh call fails (Key Vault down, 429 throttle, network error) - **THEN** the failure SHALL be caught and logged; the existing cache entry SHALL remain until its time-to-live expires; the sync fallback path SHALL handle the next call -- **NOTE**: A backoff-retry mechanism SHOULD be applied to the background refresh before giving up (e.g. exponential backoff with 1–3 retries). If all retries are exhausted, the existing cache entry continues to serve reads until time-to-live expiry. Once the entry expires (removed from the prefetch cache), the next `UnwrapKey` call inside the semaphore will find no cache hit and fall through to the sync `Resolve()` + `UnwrapKey()` path — no explicit invalidation is needed because time-to-live expiry naturally clears the entry. +- **NOTE**: The background refresh SHALL NOT retry on failure. Retrying with backoff risks spanning past the cache entry's time-to-live expiry — at which point the entry is gone, concurrent threads find no cache hit, and all fall through to the sync `Resolve()` + `UnwrapKey()` path under the semaphore, recreating the thundering herd problem this design prevents. Instead, fail fast: log the failure, keep serving the existing entry until time-to-live expiry, and rely on the next natural prefetch call (on the next cache access) to retry organically. The prefetch path already deduplicates concurrent calls, so retry coordination is built in. No explicit cache invalidation is needed — time-to-live expiry naturally clears stale entries. ### Requirement: Prefetch cache time-to-live matches `ProtectedDataEncryptionKey` cache time-to-live The prefetch cache entry time-to-live SHALL match the `ProtectedDataEncryptionKey.TimeToLive` value. From f595bcc56dfbd6c6910b0ecbab01bb2e8fc23b0f Mon Sep 17 00:00:00 2001 From: Abhijeet Mohanty Date: Wed, 11 Mar 2026 18:05:32 -0400 Subject: [PATCH 08/10] Address review comments. --- .../reduce-encryption-contention/design.md | 123 +++++++++++++++++- 1 file changed, 116 insertions(+), 7 deletions(-) diff --git a/openspec/changes/reduce-encryption-contention/design.md b/openspec/changes/reduce-encryption-contention/design.md index a71b3bc5e2..490902c718 100644 --- a/openspec/changes/reduce-encryption-contention/design.md +++ b/openspec/changes/reduce-encryption-contention/design.md @@ -9,18 +9,127 @@ The call amplification is severe: `DecryptObjectAsync` calls `BuildEncryptionAlg ## Goals / Non-Goals **Goals:** -- Move `ProtectedDataEncryptionKey` resolution (Key Vault I/O) off the hot path via async prefetch outside the semaphore -- Prevent Azure Key Vault overload: proactive background refresh, deduplicated to one call per key per interval -- Reduce Key Vault calls per refresh: cache resolved `IKeyEncryptionKey` so only `UnwrapKey` hits Azure Key Vault, not `Resolve` + `UnwrapKey` -- Gate all changes behind an opt-in environment variable for safe rollout — zero behavior change when disabled +- Eliminate `OperationCanceledException` cascade when `ProtectedDataEncryptionKey` cache expires under concurrent load +- Reduce semaphore hold time on `ProtectedDataEncryptionKey` cache miss from 200ms–2.4s to microseconds - No public API changes. No breaking changes. +- Minimal complexity: prefer the simplest change that solves the customer’s problem **Non-Goals:** -- Changing the Microsoft Data Encryption library's type hierarchy or making the `ProtectedDataEncryptionKey` constructor async +- Changing the Microsoft Data Encryption library’s type hierarchy or making the `ProtectedDataEncryptionKey` constructor async - Removing the semaphore entirely (it guards the Microsoft Data Encryption library internal state mutation — still needed) - Per-key semaphores (reduces cross-key contention but same-key still serializes; added complexity for marginal gain) -- Caching the `AeadAes256CbcHmac256EncryptionAlgorithm` at the `EncryptionSettingForProperty` level (would need to be hoisted above the semaphore to reduce contention, which reintroduces ETag validation and Forbidden retry path complexity; however, the Microsoft Data Encryption library has a built-in `GetOrCreate` cache that the SDK currently bypasses — see Future Considerations)- Enabling the Microsoft Data Encryption library's Data Encryption Key byte cache (`DataEncryptionKeyCacheTimeToLive`) as a standalone layer (redundant when async prefetch is working — the sync `UnwrapKey` path reads from the prefetch cache, never reaching the Microsoft Data Encryption library’s built-in cache) -- Client Encryption Key rotation handling (Client Encryption Key replacement requires data migration — re-encrypting every document — and is an offline operation; not a runtime concern for caching)- Supporting key rotation detection at the cache level (Client Encryption Key rewrap doesn't change plaintext bytes; new Client Encryption Key policy creates new `EncryptionSettingForProperty` objects) +- Client Encryption Key rotation handling (Client Encryption Key replacement requires data migration — re-encrypting every document — and is an offline operation; not a runtime concern for caching) +- Supporting key rotation detection at the cache level (Client Encryption Key rewrap doesn’t change plaintext bytes; new Client Encryption Key policy creates new `EncryptionSettingForProperty` objects) + +## Approach Comparison + +Two approaches were evaluated. Approach A (enable the Microsoft Data Encryption library’s built-in Data Encryption Key byte cache) is the recommended starting point due to its simplicity and minimal risk surface. + +### Approach A: Enable Data Encryption Key Byte Cache (Low-Risk Baseline) + +#### Why this was not considered earlier + +The original design dismissed the Data Encryption Key byte cache with circular reasoning: + +> “Enabling the Microsoft Data Encryption library Data Encryption Key byte cache (`DataEncryptionKeyCacheTimeToLive`) as a standalone layer (redundant when async prefetch is working)” + +This is backward — the question is whether the prefetch is needed at all if enabling the built-in cache solves the problem. The oversight occurred because investigation focused on the `ProtectedDataEncryptionKey` cache layer and the semaphore, without tracing the full call chain through `UnwrapKey` → `GetOrCreateDataEncryptionKey` → `LocalCache.GetOrCreate` to discover that the SDK **explicitly disables** a cache that would prevent the Key Vault calls entirely. + +#### How the built-in cache works + +The Microsoft Data Encryption library’s `EncryptionKeyStoreProvider` base class has a `LocalCache` keyed by the hex-encoded wrapped key bytes. It is checked inside `UnwrapKey` → `GetOrCreateDataEncryptionKey` before the `UnwrapKeyCore` delegate (which calls `Resolve()` + `UnwrapKey()` on Key Vault) is invoked. + +The Cosmos SDK’s `EncryptionKeyStoreProviderImpl` constructor **explicitly disables** this cache: + +```csharp +this.DataEncryptionKeyCacheTimeToLive = TimeSpan.Zero; +``` + +When `TimeToLive <= TimeSpan.Zero`, `LocalCache.GetOrCreate` skips the cache entirely and always invokes the delegate — meaning every `ProtectedDataEncryptionKey` cache miss triggers two Key Vault HTTP calls. + +#### What enabling it solves + +The customer’s problem is the **steady-state `ProtectedDataEncryptionKey` cache time-to-live expiry** (every 1–2 hours) under sustained concurrent load: + +1. `ProtectedDataEncryptionKey` cache time-to-live expires → cache miss → constructor runs under semaphore +2. Constructor → `KeyEncryptionKey.DecryptEncryptionKey` → `UnwrapKey` → **Data Encryption Key byte cache check** +3. **Today (cache disabled):** `LocalCache.GetOrCreate` skips cache → `UnwrapKeyCore` → `Resolve()` + `UnwrapKey()` on Key Vault (200ms–2.4s) +4. **With cache enabled:** `LocalCache.GetOrCreate` finds a warm entry (populated 1 hour ago when the previous `ProtectedDataEncryptionKey` was constructed) → returns instantly → **zero Key Vault calls** +5. Semaphore hold time drops to **microseconds** — identical to the prefetch approach’s goal + +This works because the Data Encryption Key byte cache time-to-live (set to 2 hours or longer) outlives the `ProtectedDataEncryptionKey` cache time-to-live (1–2 hours, defaults to 1 hour via `keyCacheTimeToLive`). When the `ProtectedDataEncryptionKey` expires and is reconstructed, the wrapped key bytes haven’t changed — the Data Encryption Key byte cache key is `encryptedKey.ToHexString()`, which is the same wrapped blob — so the cache hits. + +#### Critical limitation: periodic time-to-live alignment causes co-expiry + +However, the Data Encryption Key byte cache uses `AbsoluteExpirationRelativeToNow` — the time-to-live is set once at creation and **not renewed on cache hit**. This means the Data Encryption Key byte cache entry created at T=0 expires at T=2h regardless of how many `ProtectedDataEncryptionKey` reconstructions hit it in between. + +**With Data Encryption Key byte cache time-to-live = 2h, `ProtectedDataEncryptionKey` time-to-live = 1h (defaults):** + +| Time | `ProtectedDataEncryptionKey` cache | Data Encryption Key byte cache | Semaphore hold time | +|------|-----------|----------------|-----| +| T=0h | Populated (expires T=1h) | Populated (expires T=2h) | Cold start — Key Vault I/O | +| T=1h | **MISS** → reconstruct | HIT (not renewed, still expires T=2h) | **Microseconds** | +| T=2h | **MISS** → reconstruct | **MISS** (co-expired) → Key Vault I/O, repopulated (expires T=4h) | **200ms–2.4s — thundering herd possible** | +| T=3h | **MISS** → reconstruct | HIT (not renewed, still expires T=4h) | **Microseconds** | +| T=4h | **MISS** → reconstruct | **MISS** (co-expired) → Key Vault I/O | **200ms–2.4s** | + +**Pattern: every `DataEncryptionKeyCacheTimeToLive` interval, both caches co-expire and the full Key Vault call path is hit under the semaphore.** The thundering herd still occurs at these alignment boundaries — just less frequently (every 2h instead of every 1h with defaults). + +Setting a much longer Data Encryption Key byte cache time-to-live (e.g. 24h) would reduce the frequency further (1 slow miss per 24h), but: +- The thundering herd at the 24h alignment boundary is identical to today’s problem — just rare +- A 24h stale key material window may raise security review concerns +- If `keyCacheTimeToLive` is configured shorter (e.g. 10 minutes), the alignment frequency increases proportionally + +**Verdict: Approach A reduces frequency of the problem but does not eliminate it.** It’s a low-risk improvement but not a complete fix for customers with sustained high-concurrency workloads and strict cancellation timeouts. + +#### The change + +```csharp +// EncryptionKeyStoreProviderImpl constructor — change ONE line: +this.DataEncryptionKeyCacheTimeToLive = TimeSpan.Zero; +// becomes: +this.DataEncryptionKeyCacheTimeToLive = TimeSpan.FromHours(2); +// (or: keyCacheTimeToLive * 2, ensuring it outlives the ProtectedDataEncryptionKey cache) +``` + +#### Key rotation correctness + +Client Encryption Key rewrap (the only runtime rotation) changes the wrapped key bytes → different `encryptedKey.ToHexString()` → different cache key → cache miss → fresh Key Vault call. The plaintext Data Encryption Key bytes are unchanged, so the result is correct. No cache coherence issue. + +#### Summary of Approach A limitations + +| Scenario | Approach A behavior | +|---|---| +| **Steady-state time-to-live expiry (PDEK miss, DEK hit)** | Fast — no Key Vault calls. Covers most PDEK reconstructions. | +| **Periodic co-expiry (PDEK miss AND DEK miss)** | Slow — full Key Vault I/O under semaphore. Thundering herd still possible at alignment boundaries. | +| **Cold start** (first-ever call, no prior cache entry) | Key Vault I/O under semaphore — but at cold start there’s no concurrent load, so no thundering herd | +| **Reducing Resolve calls** | On a true cache miss, still calls `Resolve()` + `UnwrapKey()` (2 Key Vault calls). Does not cache the resolved `IKeyEncryptionKey` | +| **Proactive refresh before expiry** | No proactive refresh — relies on the next access to populate | + +### Approach B: Async Prefetch with Background Refresh (Recommended) + +This is the full approach originally proposed. It adds a sealed subclass `CachingEncryptionKeyStoreProviderImpl` with async prefetch outside the semaphore, inflight deduplication, a resolved-client cache (caching the `IKeyEncryptionKey` / `CryptographyClient` per Customer Master Key URL), proactive background refresh with jitter, a concurrency limiter, disposal lifecycle with `CancellationTokenSource`, and environment variable gating. + +Approach B eliminates the co-expiry problem because the prefetch cache is refreshed **proactively before time-to-live expiry** — the sync `UnwrapKey` inside the semaphore always finds a warm prefetch entry. It also covers cold start (prefetch runs outside the semaphore), reduces Key Vault calls from 2 to 1 (via resolved-client cache), and handles the background refresh lifecycle cleanly. + +**However**, Approach B is significantly more complex. Consider whether the complexity is justified for the customer’s actual workload. + +#### Can the approaches be combined? + +**Yes — and this is recommended.** Enabling the Data Encryption Key byte cache (Approach A) as a baseline improvement is low-risk and benefits all customers immediately. Approach B layers on top for customers who need zero-contention guarantees: + +- Approach A: default-on, no environment variable needed, reduces slow-miss frequency from every `keyCacheTimeToLive` to every `DataEncryptionKeyCacheTimeToLive` +- Approach B: opt-in via environment variable, eliminates slow misses entirely via proactive prefetch + +The full Approach B design (decisions, risks, migration plan) is retained below. + +## Recommendation + +1. **Ship Approach A (enable Data Encryption Key byte cache) as a default-on baseline.** One-line change, benefits all customers, reduces slow-miss frequency. Low risk. +2. **Ship Approach B (async prefetch) gated behind `AZURE_COSMOS_ENCRYPTION_OPTIMISTIC_DECRYPTION_ENABLED` environment variable** for customers who need zero-contention guarantees under sustained concurrent load. +3. **Benchmark both** against the customer workload to validate: (a) Approach A reduces frequency as expected, (b) Approach B eliminates contention entirely. +4. The `AeadAes256CbcHmac256EncryptionAlgorithm.GetOrCreate` cleanup (see Future Considerations) can ship independently with either approach. + ## Decisions From 5e7b1bf2a47e3ec8e327664fa4f2f84c66d0469b Mon Sep 17 00:00:00 2001 From: Yash Trivedi Date: Mon, 30 Mar 2026 15:05:39 -0700 Subject: [PATCH 09/10] Update cache TTL --- .../src/EncryptionKeyStoreProviderImpl.cs | 7 +++++- .../EncryptionKeyStoreProviderImplTests.cs | 24 +++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) create mode 100644 Microsoft.Azure.Cosmos.Encryption/tests/Microsoft.Azure.Cosmos.Encryption.Tests/EncryptionKeyStoreProviderImplTests.cs diff --git a/Microsoft.Azure.Cosmos.Encryption/src/EncryptionKeyStoreProviderImpl.cs b/Microsoft.Azure.Cosmos.Encryption/src/EncryptionKeyStoreProviderImpl.cs index 086e9855d8..b19b577bbe 100644 --- a/Microsoft.Azure.Cosmos.Encryption/src/EncryptionKeyStoreProviderImpl.cs +++ b/Microsoft.Azure.Cosmos.Encryption/src/EncryptionKeyStoreProviderImpl.cs @@ -31,7 +31,12 @@ public EncryptionKeyStoreProviderImpl(IKeyEncryptionKeyResolver keyEncryptionKey { this.keyEncryptionKeyResolver = keyEncryptionKeyResolver; this.ProviderName = providerName; - this.DataEncryptionKeyCacheTimeToLive = TimeSpan.Zero; + + // Enable the MDE library's built-in DEK byte cache. When ProtectedDataEncryptionKey cache + // expires (every 1–2 hours), the DEK byte cache still holds the unwrapped key bytes, so + // reconstruction avoids Key Vault HTTP calls. The 2-hour TTL outlives the default + // ProtectedDataEncryptionKey TTL (1 hour), covering most steady-state cache misses. + this.DataEncryptionKeyCacheTimeToLive = TimeSpan.FromHours(2); } public override string ProviderName { get; } diff --git a/Microsoft.Azure.Cosmos.Encryption/tests/Microsoft.Azure.Cosmos.Encryption.Tests/EncryptionKeyStoreProviderImplTests.cs b/Microsoft.Azure.Cosmos.Encryption/tests/Microsoft.Azure.Cosmos.Encryption.Tests/EncryptionKeyStoreProviderImplTests.cs new file mode 100644 index 0000000000..39927acfe9 --- /dev/null +++ b/Microsoft.Azure.Cosmos.Encryption/tests/Microsoft.Azure.Cosmos.Encryption.Tests/EncryptionKeyStoreProviderImplTests.cs @@ -0,0 +1,24 @@ +//------------------------------------------------------------ +// Copyright (c) Microsoft Corporation. All rights reserved. +//------------------------------------------------------------ + +namespace Microsoft.Azure.Cosmos.Encryption.Tests +{ + using System; + using global::Azure.Core.Cryptography; + using Microsoft.VisualStudio.TestTools.UnitTesting; + using Moq; + + [TestClass] + public class EncryptionKeyStoreProviderImplTests + { + [TestMethod] + public void Constructor_DekByteCacheEnabled_TwoHours() + { + Mock mockResolver = new Mock(); + EncryptionKeyStoreProviderImpl provider = new EncryptionKeyStoreProviderImpl(mockResolver.Object, "testProvider"); + + Assert.AreEqual(TimeSpan.FromHours(2), provider.DataEncryptionKeyCacheTimeToLive); + } + } +} From 8659a22d9244b50a779a69fc8add946b6a684d44 Mon Sep 17 00:00:00 2001 From: Yash Trivedi Date: Tue, 28 Apr 2026 11:39:25 -0700 Subject: [PATCH 10/10] Add KEK revocation details to spec --- .../reduce-encryption-contention/design.md | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/openspec/changes/reduce-encryption-contention/design.md b/openspec/changes/reduce-encryption-contention/design.md index 490902c718..487304c072 100644 --- a/openspec/changes/reduce-encryption-contention/design.md +++ b/openspec/changes/reduce-encryption-contention/design.md @@ -96,6 +96,16 @@ this.DataEncryptionKeyCacheTimeToLive = TimeSpan.FromHours(2); Client Encryption Key rewrap (the only runtime rotation) changes the wrapped key bytes → different `encryptedKey.ToHexString()` → different cache key → cache miss → fresh Key Vault call. The plaintext Data Encryption Key bytes are unchanged, so the result is correct. No cache coherence issue. +#### KEK revocation detection + +Enabling the Data Encryption Key byte cache introduces a conflict with KEK (Key Encryption Key) revocation detection. The existing error-handling path in `BuildEncryptionAlgorithmForSettingAsync` catches `RequestFailedException(403)` from `UnwrapKey` and triggers a force-refresh flow that fetches the latest Client Encryption Key properties from the backend. With the byte cache enabled, `UnwrapKey` → `GetOrCreateDataEncryptionKey` returns cached unwrapped bytes on a cache hit **without ever calling the key resolver**. The 403 is never thrown. KEK revocation becomes undetectable for the lifetime of the cache entry. + +This is not a theoretical concern — the `VerifyKekRevokeHandling` emulator test explicitly validates this flow: cache a DEK, revoke the KEK, verify that subsequent operations fail with the "needs to be rewrapped" error. With a naïve cache enable (`TimeSpan.Zero` → `TimeSpan.FromHours(2)`), this test fails because the cached bytes mask the revocation. + +Simple eviction doesn't work either — evicting the byte cache entry before every `BuildProtectedDataEncryptionKeyAsync` call renders the cache useless, since that's the only code path that reads it. This is functionally equivalent to `DataEncryptionKeyCacheTimeToLive = TimeSpan.Zero`. + +**This is resolved by the custom cache in Approach B** — see Decision 2 for the `revalidationInterval` design that separates the performance and security time scales. + #### Summary of Approach A limitations | Scenario | Approach A behavior | @@ -105,6 +115,7 @@ Client Encryption Key rewrap (the only runtime rotation) changes the wrapped key | **Cold start** (first-ever call, no prior cache entry) | Key Vault I/O under semaphore — but at cold start there’s no concurrent load, so no thundering herd | | **Reducing Resolve calls** | On a true cache miss, still calls `Resolve()` + `UnwrapKey()` (2 Key Vault calls). Does not cache the resolved `IKeyEncryptionKey` | | **Proactive refresh before expiry** | No proactive refresh — relies on the next access to populate | +| **KEK revocation detection** | Not supported by simple byte cache — cached bytes mask revocation. Resolved by Approach B’s `revalidationInterval` (see Decision 2). | ### Approach B: Async Prefetch with Background Refresh (Recommended) @@ -147,6 +158,43 @@ The full Approach B design (decisions, risks, migration plan) is retained below. **Rationale**: The Microsoft Data Encryption library's `LocalCache.GetOrCreate` is get-or-create only — no `Set`, `Remove`, or `Evict` API. `MemoryCache` adds a dependency and has its own concurrency model. `ConcurrentDictionary` is simple, lock-free reads, and we control time-to-live ourselves. +#### KEK revocation via revalidation interval + +The custom cache also resolves the KEK revocation detection problem described in Approach A's "KEK revocation detection" section. The fundamental tension is between two goals that both go through the same `UnwrapKey` → resolver path: +1. **Performance**: skip the resolver call (avoid Key Vault HTTP I/O under the semaphore) +2. **Security**: call the resolver (detect KEK revocation) + +These are contradictory on any single call. The custom cache resolves this by tracking a `lastValidated` timestamp per entry and introducing a `revalidationInterval`: + +``` +CacheEntry { + byte[] UnwrappedBytes // the cached unwrapped DEK bytes + DateTime LastValidated // when we last confirmed KEK access via the resolver +} +``` + +Lookup behavior: + +| Cache state | Action | +|---|---| +| **Cache miss** | Call resolver (`UnwrapKeyCore`), populate entry with `LastValidated = UtcNow` | +| **Cache hit, within `revalidationInterval`** | Return cached bytes immediately — no resolver call | +| **Cache hit, past `revalidationInterval`** | Call resolver to re-validate KEK access. **Succeeds**: update `LastValidated`, return bytes. **Fails (403)**: evict entry, rethrow — existing force-refresh path handles recovery | + +**Key parameters**: + +- `cacheTtl` (overall entry lifetime): 2 hours — same as the current `DataEncryptionKeyCacheTimeToLive` intent. Controls how long unwrapped bytes are retained at all. +- `revalidationInterval` (KEK access re-check frequency): recommended default matching `ProtectedDataEncryptionKey` time-to-live (1 hour). This is the maximum window during which KEK revocation goes undetected. + +This **separates the time scales**: ~99% of `UnwrapKey` calls (within `revalidationInterval`) are pure cache hits for performance, while periodic calls (at revalidation boundaries) contact the resolver for security validation. KEK revocation is detected within `revalidationInterval` — not immediately, but within a bounded and configurable window. The existing `BuildEncryptionAlgorithmForSettingAsync` 403 → force-refresh flow is unchanged. + +**Test compatibility**: The `VerifyKekRevokeHandling` test sets `ProtectedDataEncryptionKey.TimeToLive = TimeSpan.Zero`, meaning every operation rebuilds the PDEK and calls `UnwrapKey`. With `revalidationInterval` derived from the PDEK time-to-live (zero → every call validates), the test passes — every `UnwrapKey` call contacts the resolver and detects revocation immediately. + +**Implementation notes**: +- Override `GetOrCreateDataEncryptionKey` in `EncryptionKeyStoreProviderImpl` (or the `CachingEncryptionKeyStoreProviderImpl` subclass) — the base class method is `protected virtual` +- Set `DataEncryptionKeyCacheTimeToLive = TimeSpan.Zero` on the base class to disable its simple cache; the override handles all caching with revalidation +- Derive `revalidationInterval` from `ProtectedDataEncryptionKey.TimeToLive`: if PDEK time-to-live is zero (no caching), revalidation is zero (every call validates); if PDEK time-to-live is 1 hour, revalidation is 1 hour (one validation per PDEK rebuild cycle) + ### Decision 3: Proactive refresh via `Task.Run`, not `Timer` **Chosen**: On cache access, if within a configurable refresh window of expiry, fire `Task.Run` to refresh in background. The refresh window SHALL be **20% of the cache time-to-live, capped at a maximum of 5 minutes**. For example: time-to-live = 1 hour → refresh at 12 minutes before expiry (20% × 60 min = 12 min, but capped to 5 min → 5 minutes); time-to-live = 10 minutes → refresh at 2 minutes before expiry (20% × 10 min = 2 min). Deduplicate via `ConcurrentDictionary`.