Skip to content

Commit 75bfc28

Browse files
feat(memory): add scheduled memory pruning with two-path strategy (#1350)
Implement MemoryBank-style retention scoring R=exp(-t/S) with two complementary pruning paths:
- Path 1 (event-driven): async cap enforcement on Store() when a user exceeds max_memories_per_user
- Path 2 (background sweep): a periodic time.Ticker goroutine prunes decayed memories for inactive users in batches

Includes Prometheus metrics, graceful shutdown, multi-replica support via the prune_sweep_enabled flag, config template, and documentation.

Signed-off-by: Abdallah Samara <abdallahsamabd@gmail.com>
1 parent 8c91297 commit 75bfc28

11 files changed

Lines changed: 688 additions & 16 deletions

File tree

e2e/testing/09-memory-features-test.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -605,10 +605,10 @@ def test_02_pronoun_resolution(self):
605605
self.milvus.flush()
606606
time.sleep(3)
607607

608-
# Ask using pronoun in NEW SESSION (no previous_response_id)
609-
# Memory context from Milvus should help resolve the pronoun
608+
# Ask about the person by name in NEW SESSION (no previous_response_id)
609+
# Memory context from Milvus should provide the answer
610610
result = self.send_memory_request(
611-
message="Where does she live?",
611+
message="Where does Sarah live?",
612612
auto_store=False,
613613
# NO previous_response_id - this is a new session!
614614
)

src/semantic-router/pkg/config/config.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1080,6 +1080,21 @@ type MemoryQualityScoringConfig struct {
10801080

10811081
// MaxMemoriesPerUser caps memories per user; if over, lowest-R memories are deleted first (0 = no cap).
10821082
MaxMemoriesPerUser int `yaml:"max_memories_per_user,omitempty"`
1083+
1084+
// PruneInterval is the interval between background sweep runs (e.g. "6h", "24h").
1085+
// Empty or "0" disables the background sweep. Requires PruneSweepEnabled to be true.
1086+
PruneInterval string `yaml:"prune_interval,omitempty"`
1087+
1088+
// PruneBatchSize is the number of users to prune per batch during the background sweep (default: 50).
1089+
PruneBatchSize int `yaml:"prune_batch_size,omitempty"`
1090+
1091+
// PruneSweepEnabled enables the background prune sweep on this replica.
1092+
// In multi-replica deployments, set this to true on only one replica to avoid duplicate work.
1093+
PruneSweepEnabled bool `yaml:"prune_sweep_enabled,omitempty"`
1094+
1095+
// MaxConcurrentPrunes limits how many pruneIfOverCap goroutines can run at once
1096+
// to avoid overwhelming Milvus during high-throughput Store() bursts (default: 10).
1097+
MaxConcurrentPrunes int `yaml:"max_concurrent_prunes,omitempty"`
10831098
}
10841099

10851100
// MemoryMilvusConfig contains Milvus-specific configuration for memory storage.

src/semantic-router/pkg/extproc/router.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,9 @@ type OpenAIRouter struct {
5454
// RateLimiter enforces per-user/model rate limits from multiple sources
5555
// (Envoy RLS → local limiter). Initialized in NewOpenAIRouter.
5656
RateLimiter *ratelimit.RateLimitResolver
57+
58+
// StopPruneSweep stops the background memory prune sweep goroutine (nil if not started).
59+
StopPruneSweep func()
5760
}
5861

5962
// Ensure OpenAIRouter implements the ext_proc calls
@@ -397,6 +400,13 @@ func NewOpenAIRouter(configPath string) (*OpenAIRouter, error) {
397400
}
398401
}
399402

403+
// Start background prune sweep if memory store is available and configured
404+
var stopPruneSweep func()
405+
if memoryStore != nil && cfg.Memory.QualityScoring.PruneSweepEnabled && cfg.Memory.QualityScoring.PruneInterval != "" {
406+
stopPruneSweep = memory.StartPruneSweep(cfg.Memory.QualityScoring, memoryStore)
407+
logging.Infof("Memory prune sweep enabled (interval=%s)", cfg.Memory.QualityScoring.PruneInterval)
408+
}
409+
400410
// Create memory extractor if memory_extraction external model is configured
401411
var memoryExtractor *memory.MemoryExtractor
402412
if memoryEnabled && cfg.FindExternalModelByRole(config.ModelRoleMemoryExtraction) != nil {
@@ -435,6 +445,7 @@ func NewOpenAIRouter(configPath string) (*OpenAIRouter, error) {
435445
MemoryExtractor: memoryExtractor,
436446
CredentialResolver: credResolver,
437447
RateLimiter: rateLimiter,
448+
StopPruneSweep: stopPruneSweep,
438449
}
439450

440451
return router, nil

src/semantic-router/pkg/extproc/server.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,8 +134,13 @@ func (s *Server) Start() error {
134134
return nil
135135
}
136136

137-
// Stop stops the gRPC server
137+
// Stop stops the gRPC server and background goroutines
138138
func (s *Server) Stop() {
139+
if s.service != nil {
140+
if r := s.service.current.Load(); r != nil && r.StopPruneSweep != nil {
141+
r.StopPruneSweep()
142+
}
143+
}
139144
if s.server != nil {
140145
s.server.GracefulStop()
141146
logging.Infof("Server stopped")

src/semantic-router/pkg/memory/milvus_store.go

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"fmt"
77
"sort"
88
"strings"
9+
"sync"
910
"time"
1011

1112
"github.com/milvus-io/milvus-sdk-go/v2/client"
@@ -22,6 +23,9 @@ const (
2223
DefaultRetryBaseDelay = 100
2324
)
2425

26+
// DefaultMaxConcurrentPrunes is the default limit for concurrent pruneIfOverCap goroutines.
// NewMilvusStore falls back to it when QualityScoring.MaxConcurrentPrunes is zero or negative.
27+
const DefaultMaxConcurrentPrunes = 10
28+
2529
// MilvusStore provides memory retrieval from Milvus with similarity threshold filtering
2630
type MilvusStore struct {
2731
client client.Client
@@ -31,6 +35,8 @@ type MilvusStore struct {
3135
maxRetries int
3236
retryBaseDelay time.Duration
3337
embeddingConfig EmbeddingConfig // Unified embedding configuration
38+
pruneSem chan struct{} // bounds concurrent prune goroutines
39+
pruneInFlight sync.Map // tracks userIDs with an active prune goroutine (dedup)
3440
}
3541

3642
// MilvusStoreOptions contains configuration for creating a MilvusStore
@@ -79,6 +85,11 @@ func NewMilvusStore(options MilvusStoreOptions) (*MilvusStore, error) {
7985
embeddingCfg = EmbeddingConfig{Model: EmbeddingModelBERT}
8086
}
8187

88+
maxPrunes := cfg.QualityScoring.MaxConcurrentPrunes
89+
if maxPrunes <= 0 {
90+
maxPrunes = DefaultMaxConcurrentPrunes
91+
}
92+
8293
store := &MilvusStore{
8394
client: options.Client,
8495
collectionName: options.CollectionName,
@@ -87,6 +98,7 @@ func NewMilvusStore(options MilvusStoreOptions) (*MilvusStore, error) {
8798
maxRetries: DefaultMaxRetries,
8899
retryBaseDelay: DefaultRetryBaseDelay * time.Millisecond,
89100
embeddingConfig: embeddingCfg,
101+
pruneSem: make(chan struct{}, maxPrunes),
90102
}
91103

92104
// Auto-create collection if it doesn't exist
@@ -684,9 +696,93 @@ func (m *MilvusStore) Store(ctx context.Context, memory *Memory) error {
684696
}
685697

686698
logging.Debugf("MilvusStore.Store: successfully stored memory id=%s", memory.ID)
699+
700+
// Path 1: event-driven cap enforcement — async prune if user exceeds max_memories_per_user.
701+
// Uses context.Background() intentionally: the goroutine must outlive the request ctx.
702+
// Two layers of protection against Milvus pressure:
703+
// 1. pruneInFlight (sync.Map): dedup — at most one goroutine per user at any time
704+
// 2. pruneSem (channel): semaphore — at most maxConcurrentPrunes goroutines globally
705+
if m.config.QualityScoring.MaxMemoriesPerUser > 0 {
706+
if _, alreadyRunning := m.pruneInFlight.LoadOrStore(memory.UserID, struct{}{}); !alreadyRunning {
707+
select {
708+
case m.pruneSem <- struct{}{}:
709+
go func(userID string) {
710+
defer func() {
711+
<-m.pruneSem
712+
m.pruneInFlight.Delete(userID)
713+
}()
714+
m.pruneIfOverCap(context.Background(), userID)
715+
}(memory.UserID)
716+
default:
717+
m.pruneInFlight.Delete(memory.UserID)
718+
logging.Debugf("MilvusStore.Store: prune semaphore full, skipping cap check for user_id=%s", memory.UserID)
719+
}
720+
}
721+
}
722+
687723
return nil
688724
}
689725

726+
// pruneIfOverCap counts the user's memories and calls PruneUser if over MaxMemoriesPerUser.
727+
// Designed to run in a goroutine triggered by Store().
728+
func (m *MilvusStore) pruneIfOverCap(ctx context.Context, userID string) {
729+
cap := m.config.QualityScoring.MaxMemoriesPerUser
730+
if cap <= 0 {
731+
return
732+
}
733+
734+
count, err := m.countUserMemories(ctx, userID)
735+
if err != nil {
736+
logging.Warnf("MilvusStore.pruneIfOverCap: count failed for user_id=%s: %v", userID, err)
737+
return
738+
}
739+
740+
if count <= cap {
741+
return
742+
}
743+
744+
PruneCapTriggeredTotal.Inc()
745+
logging.Infof("MilvusStore.pruneIfOverCap: user_id=%s has %d memories (cap=%d), pruning", userID, count, cap)
746+
747+
deleted, err := m.PruneUser(ctx, userID)
748+
if err != nil {
749+
logging.Warnf("MilvusStore.pruneIfOverCap: PruneUser failed for user_id=%s: %v", userID, err)
750+
return
751+
}
752+
if deleted > 0 {
753+
PruneDeletedTotal.WithLabelValues("cap").Add(float64(deleted))
754+
logging.Infof("MilvusStore.pruneIfOverCap: user_id=%s pruned %d memories", userID, deleted)
755+
}
756+
}
757+
758+
// countUserMemories returns the number of memories stored for a given user.
759+
func (m *MilvusStore) countUserMemories(ctx context.Context, userID string) (int, error) {
760+
filterExpr := fmt.Sprintf("user_id == \"%s\"", userID)
761+
762+
var queryResult []entity.Column
763+
err := m.retryWithBackoff(ctx, func() error {
764+
var retryErr error
765+
queryResult, retryErr = m.client.Query(
766+
ctx,
767+
m.collectionName,
768+
[]string{},
769+
filterExpr,
770+
[]string{"id"},
771+
)
772+
return retryErr
773+
})
774+
if err != nil {
775+
return 0, fmt.Errorf("milvus query failed: %w", err)
776+
}
777+
778+
for _, col := range queryResult {
779+
if col.Name() == "id" {
780+
return col.Len(), nil
781+
}
782+
}
783+
return 0, nil
784+
}
785+
690786
// upsert atomically replaces a row in Milvus by primary key.
691787
// The memory must be fully populated (including Embedding, timestamps, etc.).
692788
// Used by Update to avoid the delete+insert data-loss window.
@@ -1485,6 +1581,57 @@ func (m *MilvusStore) PruneUser(ctx context.Context, userID string) (deleted int
14851581
return deleted, nil
14861582
}
14871583

1584+
// ListStaleUserIDs queries Milvus for memories with created_at older than cutoffUnix
1585+
// and returns the deduplicated set of user_id values. This targets users whose oldest
1586+
// memories may have decayed below the prune threshold, without iterating all users.
1587+
func (m *MilvusStore) ListStaleUserIDs(ctx context.Context, cutoffUnix int64) ([]string, error) {
1588+
if !m.enabled {
1589+
return nil, fmt.Errorf("milvus store is not enabled")
1590+
}
1591+
1592+
filterExpr := fmt.Sprintf("created_at < %d", cutoffUnix)
1593+
outputFields := []string{"user_id"}
1594+
1595+
var queryResult []entity.Column
1596+
err := m.retryWithBackoff(ctx, func() error {
1597+
var retryErr error
1598+
queryResult, retryErr = m.client.Query(
1599+
ctx,
1600+
m.collectionName,
1601+
[]string{},
1602+
filterExpr,
1603+
outputFields,
1604+
)
1605+
return retryErr
1606+
})
1607+
if err != nil {
1608+
return nil, fmt.Errorf("milvus query for stale users failed: %w", err)
1609+
}
1610+
1611+
seen := make(map[string]struct{})
1612+
for _, col := range queryResult {
1613+
if col.Name() == "user_id" {
1614+
vc, ok := col.(*entity.ColumnVarChar)
1615+
if !ok {
1616+
continue
1617+
}
1618+
for i := 0; i < vc.Len(); i++ {
1619+
uid, _ := vc.ValueByIdx(i)
1620+
if uid != "" {
1621+
seen[uid] = struct{}{}
1622+
}
1623+
}
1624+
}
1625+
}
1626+
1627+
userIDs := make([]string, 0, len(seen))
1628+
for uid := range seen {
1629+
userIDs = append(userIDs, uid)
1630+
}
1631+
logging.Debugf("MilvusStore.ListStaleUserIDs: found %d users with memories older than %d", len(userIDs), cutoffUnix)
1632+
return userIDs, nil
1633+
}
1634+
14881635
// isTransientError checks if an error is transient and should be retried
14891636
func isTransientError(err error) bool {
14901637
if err == nil {
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
package memory
2+
3+
import (
4+
"github.com/prometheus/client_golang/prometheus"
5+
"github.com/prometheus/client_golang/prometheus/promauto"
6+
)
7+
8+
var (
9+
// PruneDeletedTotal tracks the total number of memories pruned, labeled by trigger source.
10+
PruneDeletedTotal = promauto.NewCounterVec(
11+
prometheus.CounterOpts{
12+
Name: "memory_prune_deleted_total",
13+
Help: "Total number of memories deleted by pruning",
14+
},
15+
[]string{"trigger"}, // "cap" or "sweep"
16+
)
17+
18+
// PruneCapTriggeredTotal tracks how many times a Store() call triggered cap enforcement.
19+
PruneCapTriggeredTotal = promauto.NewCounter(
20+
prometheus.CounterOpts{
21+
Name: "memory_prune_cap_triggered_total",
22+
Help: "Total number of times cap enforcement was triggered on Store()",
23+
},
24+
)
25+
26+
// PruneSweepRunsTotal tracks how many background sweep cycles have completed.
27+
PruneSweepRunsTotal = promauto.NewCounter(
28+
prometheus.CounterOpts{
29+
Name: "memory_prune_sweep_runs_total",
30+
Help: "Total number of background prune sweep cycles completed",
31+
},
32+
)
33+
34+
// PruneSweepDuration tracks how long each background sweep cycle takes.
35+
PruneSweepDuration = promauto.NewHistogram(
36+
prometheus.HistogramOpts{
37+
Name: "memory_prune_sweep_duration_seconds",
38+
Help: "Duration of background prune sweep cycles in seconds",
39+
Buckets: prometheus.DefBuckets,
40+
},
41+
)
42+
43+
// PruneSweepUsersProcessedTotal tracks how many users were evaluated during sweeps.
44+
PruneSweepUsersProcessedTotal = promauto.NewCounter(
45+
prometheus.CounterOpts{
46+
Name: "memory_prune_sweep_users_processed_total",
47+
Help: "Total number of users processed during background prune sweeps",
48+
},
49+
)
50+
51+
// PruneSweepErrorsTotal tracks errors encountered during background sweeps.
52+
PruneSweepErrorsTotal = promauto.NewCounter(
53+
prometheus.CounterOpts{
54+
Name: "memory_prune_sweep_errors_total",
55+
Help: "Total number of errors encountered during background prune sweeps",
56+
},
57+
)
58+
)

0 commit comments

Comments
 (0)