@@ -89,10 +89,12 @@ var _ Index = &InMemoryIndex{}
8989// PodCache represents a cache for pod entries.
9090type PodCache struct {
9191 // cache is an LRU cache that maps PodEntry to their last access time.
92- // thread-safe.
9392 cache * lru.Cache [PodEntry , struct {}]
9493 // mu protects the cache from concurrent access during check-and-set operations.
9594 mu sync.Mutex
95+ // removed indicates this PodCache has been evicted from the parent map.
96+ // Checked by Add after acquiring mu to avoid writing into an orphaned cache.
97+ removed bool
9698}
9799
98100// Lookup receives a list of requestKeys and a set of pod identifiers,
@@ -165,47 +167,27 @@ func (m *InMemoryIndex) Add(ctx context.Context, engineKeys, requestKeys []Block
165167 m .engineToRequestKeys .Add (engineKeys [i ], requestKey )
166168 }
167169
168- // 2. Store requestKey -> PodCache mapping
169- var podCache * PodCache
170- var found bool
171-
172- // Try to get existing cache first
173- podCache , found = m .data .Get (requestKey )
174- //nolint:nestif // double-checked locking pattern
175- if ! found {
176- // Create new cache
177- cache , err := lru.New [PodEntry , struct {}](m .podCacheSize )
178- if err != nil {
179- return fmt .Errorf ("failed to create pod cache for key %s: %w" , requestKey .String (), err )
180- }
181-
182- newPodCache := & PodCache {
183- cache : cache ,
170+ // 2. Store requestKey -> PodCache mapping with retry on stale cache.
171+ // A retry is needed only when a concurrent Evict marks the PodCache as
172+ // removed between getOrCreatePodCache and Lock. The window is tiny, so
173+ // this loop almost never iterates more than once.
174+ for {
175+ podCache := m .getOrCreatePodCache (requestKey )
176+
177+ podCache .mu .Lock ()
178+ if podCache .removed {
179+ podCache .mu .Unlock ()
180+ continue // retry — this cache was evicted
184181 }
185182
186- // Try to add, but use existing if another thread added it first
187- // This is a bounded retry (1) - not perfectly safe but for practical use-cases and scenarios
188- // this should be sufficient
189- contains , _ := m .data .ContainsOrAdd (requestKey , newPodCache )
190- if contains {
191- podCache , found = m .data .Get (requestKey )
192- if ! found { // Extremely irregular workload pattern - key evicted
193- m .data .Add (requestKey , newPodCache )
194- podCache = newPodCache
195- }
196- } else {
197- // We successfully added our cache
198- podCache = newPodCache
183+ for _ , entry := range entries {
184+ podCache .cache .Add (entry , struct {}{})
199185 }
200- }
186+ podCache . mu . Unlock ()
201187
202- podCache .mu .Lock ()
203- for _ , entry := range entries {
204- podCache .cache .Add (entry , struct {}{})
188+ traceLogger .Info ("added pods to key" , "requestKey" , requestKey , "pods" , entries )
189+ break
205190 }
206- podCache .mu .Unlock ()
207-
208- traceLogger .Info ("added pods to key" , "requestKey" , requestKey , "pods" , entries )
209191 }
210192
211193 return nil
@@ -251,41 +233,36 @@ func (m *InMemoryIndex) Evict(ctx context.Context, key BlockHash, keyType KeyTyp
251233 }
252234
253235 podCache .mu .Lock ()
236+ prevLen := podCache .cache .Len ()
254237 for _ , entry := range entries {
255238 podCache .cache .Remove (entry )
256239 }
257240
258- isEmpty := podCache .cache .Len () == 0
259- podCache .mu .Unlock ()
260-
261- traceLogger .Info ("evicted pods from key" , "requestKey" , requestKey , "key" , key , "keyType" , keyType , "pods" , entries )
262-
263- // Remove key from main cache if empty.
264- // Re-fetch and hold the lock through removal to prevent racing with Add.
265- if ! isEmpty {
266- return nil
267- }
268-
269- currentCache , stillExists := m .data .Get (requestKey )
270- if ! stillExists || currentCache == nil {
271- return nil
272- }
273-
274- currentCache .mu .Lock ()
275- if currentCache .cache .Len () == 0 {
276- m .data .Remove (requestKey )
241+ // Only mark as removed if this Evict actually emptied the cache.
242+ // If the cache was already empty (prevLen == 0), a concurrent Add may have
243+ // just created it — marking it removed would cause Add to spin.
244+ if podCache .cache .Len () == 0 && prevLen > 0 {
245+ podCache .removed = true
246+ // Use Peek + pointer equality to avoid removing a replacement PodCache
247+ // that a concurrent Add may have inserted.
248+ if cur , ok := m .data .Peek (requestKey ); ok && cur == podCache {
249+ m .data .Remove (requestKey )
250+ }
277251 if hasEngineKeyMapping {
278252 m .engineToRequestKeys .Remove (key )
279253 }
280254 traceLogger .Info ("removed requestKey from index as no pods remain" , "requestKey" , requestKey , "key" , key )
281255 }
282- currentCache .mu .Unlock ()
256+ podCache .mu .Unlock ()
257+
258+ traceLogger .Info ("evicted pods from key" , "requestKey" , requestKey , "key" , key , "keyType" , keyType , "pods" , entries )
283259
284260 return nil
285261}
286262
287263// GetRequestKey returns the requestKey associated with the given engineKey.
288264// Returns an error if the engineKey mapping is missing (e.g., already evicted).
265+ // No external lock needed — lru.Cache is internally thread-safe.
289266func (m * InMemoryIndex ) GetRequestKey (ctx context.Context , engineKey BlockHash ) (BlockHash , error ) {
290267 requestKey , found := m .engineToRequestKeys .Get (engineKey )
291268 if ! found {
@@ -294,6 +271,28 @@ func (m *InMemoryIndex) GetRequestKey(ctx context.Context, engineKey BlockHash)
294271 return requestKey , nil
295272}
296273
274+ // getOrCreatePodCache returns the existing PodCache for requestKey,
275+ // or creates and inserts a new one if none exists.
276+ func (m * InMemoryIndex ) getOrCreatePodCache (requestKey BlockHash ) * PodCache {
277+ if podCache , found := m .data .Get (requestKey ); found {
278+ return podCache
279+ }
280+
281+ cache , _ := lru.New [PodEntry , struct {}](m .podCacheSize )
282+ newPodCache := & PodCache {cache : cache }
283+
284+ // Try to add atomically; if another goroutine beat us, use theirs.
285+ if contains , _ := m .data .ContainsOrAdd (requestKey , newPodCache ); contains {
286+ if existing , ok := m .data .Get (requestKey ); ok {
287+ return existing
288+ }
289+ // Key was evicted between ContainsOrAdd and Get — use ours.
290+ m .data .Add (requestKey , newPodCache )
291+ }
292+
293+ return newPodCache
294+ }
295+
297296// podsPerKeyPrintHelper formats a map of keys to pod names for printing.
298297func podsPerKeyPrintHelper (ks map [BlockHash ][]PodEntry ) string {
299298 var b strings.Builder
0 commit comments