66 "flag"
77 "fmt"
88 "os"
9-
10- "strconv"
119 "time"
1210
1311 "github.com/llm-d-incubation/llm-d-async/pkg/async/api"
4139 queuesConfigFile = flag .String ("redis.queues-config-file" , "" , "Queues Configuration file. Mutually exclusive with redis.igw-base-url, redis.request-queue-name, redis.request-path-url and redis.inference-objective flags. See documentation about syntax" )
4240)
4341
// retryPopBatchSize caps how many due retry entries are removed from the
// retry sorted set per script invocation; the retry worker's drain loop keeps
// popping batches of this size until a call returns no entries.
const retryPopBatchSize = 100
43+
// popDueRetryMessagesScript atomically fetches due retry entries (score <= now)
// and removes them from the sorted set. Because Redis runs the whole script as
// a single atomic operation, concurrent workers cannot pop the same entry twice.
//
//	KEYS[1] — retry sorted set, members are message payloads scored by due time
//	ARGV[1] — cutoff time (unix seconds); entries with score <= cutoff are due
//	ARGV[2] — maximum number of entries to pop in this call
//
// Returns the popped payloads (possibly empty).
var popDueRetryMessagesScript = redis.NewScript(`
local key = KEYS[1]
local now = tonumber(ARGV[1])
local limit = tonumber(ARGV[2])

local items = redis.call("ZRANGEBYSCORE", key, "-inf", now, "LIMIT", 0, limit)
if #items > 0 then
	-- Chunk ZREM arguments to avoid Lua unpack stack limits if
	-- limit is increased significantly in the future.
	local chunk_size = 1000
	for i = 1, #items, chunk_size do
		local last = math.min(i + chunk_size - 1, #items)
		local chunk = {}
		for j = i, last do
			chunk[#chunk + 1] = items[j]
		end
		redis.call("ZREM", key, unpack(chunk))
	end
end
return items
`)
66+
4467type QueueConfig struct {
4568 QueueName string `json:"queue_name"`
4669 InferenceObjective string `json:"inference_objective"`
@@ -243,6 +266,7 @@ func addMsgToRetryWorker(ctx context.Context, rdb *redis.Client, retryChannel ch
243266
244267// Every second polls the sorted set and publishes the messages that need to be retried into the request queue
245268func (r * RedisMQFlow ) retryWorker (ctx context.Context , rdb * redis.Client ) {
269+ logger := log .FromContext (ctx )
246270 // create a map of queuename to channel based on requestchannels
247271 msgChannels := make (map [string ]chan api.RequestMessage )
248272 for _ , channelData := range r .requestChannels {
@@ -255,38 +279,70 @@ func (r *RedisMQFlow) retryWorker(ctx context.Context, rdb *redis.Client) {
255279 return
256280
257281 default :
258- currentTimeSec := float64 (time .Now ().Unix ())
259-
260- results , err := rdb .ZRangeArgs (ctx , redis.ZRangeArgs {
261- Key : * retryQueueName ,
262- Start : "0" ,
263- Stop : strconv .FormatFloat (currentTimeSec , 'f' , - 1 , 64 ),
264- ByScore : true ,
265- }).Result ()
266- if err != nil {
267- panic (err )
268- }
269- for _ , msg := range results {
270- var message api.RequestMessage
271- err := json .Unmarshal ([]byte (msg ), & message )
272- if err != nil {
273- fmt .Println (err )
282+ // Keep one fixed cutoff for this drain cycle so we only process
283+ // messages due at cycle start, avoiding an ever-expanding window.
284+ currentTimeSec := time .Now ().Unix ()
274285
275- }
276- err = rdb . ZRem (ctx , * retryQueueName , msg ). Err ( )
286+ for {
287+ results , err := popDueRetryMessages (ctx , rdb , * retryQueueName , currentTimeSec , retryPopBatchSize )
277288 if err != nil {
278- fmt .Println (err )
279-
289+ logger .V (logutil .DEFAULT ).Error (err , "Failed to atomically pop due retry messages" )
290+ break
291+ }
292+ if len (results ) == 0 {
293+ break
280294 }
281- queueName := message .Metadata [QUEUE_NAME_KEY ]
282295
283- // TODO: We probably want to write here back to the request queue/channel in Redis. Adding the msg to the
284- // golang channel directly is not that wise as this might be blocking.
285- msgChannels [queueName ] <- message
296+ for _ , msg := range results {
297+ var message api.RequestMessage
298+ err := json .Unmarshal ([]byte (msg ), & message )
299+ if err != nil {
300+ logger .V (logutil .DEFAULT ).Error (err , "Failed to unmarshal retry message" )
301+ continue
302+ }
303+ queueName := message .Metadata [QUEUE_NAME_KEY ]
304+ msgChannel , ok := msgChannels [queueName ]
305+ if ! ok {
306+ logger .V (logutil .DEFAULT ).Info ("Unknown retry queue, dropping message" , "queueName" , queueName , "messageId" , message .Id )
307+ continue
308+ }
309+
310+ // TODO: We probably want to write here back to the request queue/channel in Redis. Adding the msg to the
311+ // golang channel directly is not that wise as this might be blocking.
312+ select {
313+ case msgChannel <- message :
314+ case <- ctx .Done ():
315+ return
316+ }
317+ }
286318 }
287319 time .Sleep (time .Second )
288320 }
289321 }
290322
291323}
292324
325+ // popDueRetryMessages atomically pops up to limit retry messages whose score is <= nowUnixSec.
326+ // It returns the raw message payloads removed from the sorted set.
327+ func popDueRetryMessages (ctx context.Context , rdb * redis.Client , key string , nowUnixSec int64 , limit int ) ([]string , error ) {
328+ raw , err := popDueRetryMessagesScript .Run (ctx , rdb , []string {key }, nowUnixSec , limit ).Result ()
329+ if err != nil {
330+ return nil , err
331+ }
332+
333+ entries , ok := raw .([]interface {})
334+ if ! ok {
335+ return nil , fmt .Errorf ("unexpected script result type: %T" , raw )
336+ }
337+
338+ messages := make ([]string , 0 , len (entries ))
339+ for _ , entry := range entries {
340+ msg , ok := entry .(string )
341+ if ! ok {
342+ return nil , fmt .Errorf ("unexpected script entry type: %T" , entry )
343+ }
344+ messages = append (messages , msg )
345+ }
346+
347+ return messages , nil
348+ }
0 commit comments