@@ -31,11 +31,11 @@ type LlmRequest struct {
 
 type Scheduler struct {
 	pendingReqCh  chan *LlmRequest
-	finishedReqCh chan *LlmRequest
+	finishedReqCh chan *runnerRef
 	expiredCh     chan *runnerRef
 	unloadedCh    chan interface{}
 
-	loaded   map[string]*runnerRef
+	loaded   map[string][]*runnerRef
 	loadedMu sync.Mutex
 
 	loadFn func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList)
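
Note: this hunk is the core of the change. `loaded` goes from one runner per model path to a slice of runners per path, and `finishedReqCh` now carries the runner itself rather than the request, so completion handling needs no map lookup. The scheduling loop further down also calls a runnerRef.isAtCapacity() method that is not defined anywhere in these hunks; a minimal sketch of what it might look like, where activeRequests and maxConcurrency are assumed fields, not taken from the diff:

    // Sketch only: isAtCapacity is referenced by this diff but not shown here.
    // activeRequests and maxConcurrency are hypothetical runnerRef fields.
    func (r *runnerRef) isAtCapacity() bool {
        r.refMu.Lock()
        defer r.refMu.Unlock()
        return r.activeRequests >= r.maxConcurrency
    }
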
@@ -48,10 +48,10 @@ var ErrMaxQueue = fmt.Errorf("server busy, please try again. maximum pending re
 func InitScheduler(ctx context.Context) *Scheduler {
 	sched := &Scheduler{
 		pendingReqCh:  make(chan *LlmRequest, envconfig.MaxQueuedRequests),
-		finishedReqCh: make(chan *LlmRequest, envconfig.MaxQueuedRequests),
+		finishedReqCh: make(chan *runnerRef, envconfig.MaxQueuedRequests),
 		expiredCh:     make(chan *runnerRef, envconfig.MaxQueuedRequests),
 		unloadedCh:    make(chan interface{}, envconfig.MaxQueuedRequests),
-		loaded:        make(map[string]*runnerRef),
+		loaded:        make(map[string][]*runnerRef),
 		newServerFn:   llm.NewLlamaServer,
 		getGpuFn:      gpu.GetGPUInfo,
 	}
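
The send sites for finishedReqCh are outside these hunks. Since the element type changes from *LlmRequest to *runnerRef, whatever previously sent the request on completion presumably now sends the runner that served it, roughly along these lines (hypothetical, not shown in this diff):

    // Hypothetical completion signal: when the request's context ends,
    // tell the scheduler which runner went idle instead of which request.
    go func() {
        <-req.ctx.Done()
        s.finishedReqCh <- runner
    }()
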
@@ -114,9 +114,21 @@ func (s *Scheduler) processPending(ctx context.Context) {
 		for {
 			var runnerToExpire *runnerRef
 			s.loadedMu.Lock()
-			runner := s.loaded[pending.model.ModelPath]
-			loadedCount := len(s.loaded)
+			runners := s.loaded[pending.model.ModelPath]
+			loadedCount := 0
+			for _, runnerList := range s.loaded {
+				loadedCount += len(runnerList)
+			}
 			s.loadedMu.Unlock()
+			var runner *runnerRef = nil
+			if len(runners) > 0 {
+				for _, r := range runners {
+					if !r.isAtCapacity() {
+						runner = r
+						break
+					}
+				}
+			}
 			if runner != nil {
 				if runner.needsReload(ctx, pending) {
 					runnerToExpire = runner
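
Two observations on the selection loop above: indexing a map with a missing key yields a nil slice and ranging over a nil slice is a no-op, so the len(runners) > 0 guard is redundant (though harmless). Also, the slice is read under loadedMu but iterated after the unlock, so another goroutine could mutate it concurrently; copying under the lock sidesteps that. A sketch of the same logic as a helper, reusing the hypothetical isAtCapacity() from above:

    // findAvailableRunner returns the first runner for modelPath with spare
    // capacity, or nil. The slice is copied under loadedMu so the iteration
    // cannot race with concurrent appends or removals.
    func (s *Scheduler) findAvailableRunner(modelPath string) *runnerRef {
        s.loadedMu.Lock()
        runners := append([]*runnerRef(nil), s.loaded[modelPath]...)
        s.loadedMu.Unlock()
        for _, r := range runners {
            if !r.isAtCapacity() {
                return r
            }
        }
        return nil
    }
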
@@ -215,12 +227,12 @@ func (s *Scheduler) processCompleted(ctx context.Context) {
 		case <-ctx.Done():
 			slog.Debug("shutting down scheduler completed loop")
 			return
-		case finished := <-s.finishedReqCh:
+		case finishedRunner := <-s.finishedReqCh:
 			s.loadedMu.Lock()
-			runner := s.loaded[finished.model.ModelPath]
+			runner := finishedRunner
 			s.loadedMu.Unlock()
 			if runner == nil {
-				slog.Error("finished request signal received after model unloaded", "modelPath", finished.model.ModelPath)
+				slog.Error("finished request signal received after model unloaded", "modelPath", finishedRunner.model.ModelPath)
 				continue
 			}
 			runner.refMu.Lock()
@@ -274,7 +286,21 @@ func (s *Scheduler) processCompleted(ctx context.Context) {
 			slog.Debug("got lock to unload", "modelPath", runner.modelPath)
 			finished := runner.waitForVRAMRecovery()
 			runner.unload()
-			delete(s.loaded, runner.modelPath)
+
+			modelPath := runner.modelPath
+			// Find the index of the runner in the slice
+			for i, r := range s.loaded[modelPath] {
+				if r == runner {
+					// Remove the runner from the slice
+					s.loaded[modelPath] = append(s.loaded[modelPath][:i], s.loaded[modelPath][i+1:]...)
+					break
+				}
+			}
+
+			// If the slice is now empty, delete the entry from the map
+			if len(s.loaded[modelPath]) == 0 {
+				delete(s.loaded, modelPath)
+			}
 			s.loadedMu.Unlock()
 			slog.Debug("runner released", "modelPath", runner.modelPath)
 			runner.refMu.Unlock()
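
The removal above is the standard append(s[:i], s[i+1:]...) idiom, matching by pointer identity. Since the file already imports slices (used in updateFreeSpace below), the same removal plus cleanup could be written with slices.DeleteFunc from the Go 1.21 standard library; a sketch, assuming loadedMu is held as it is at this point:

    // Equivalent removal via the standard library; caller holds loadedMu.
    s.loaded[modelPath] = slices.DeleteFunc(s.loaded[modelPath], func(r *runnerRef) bool {
        return r == runner
    })
    if len(s.loaded[modelPath]) == 0 {
        delete(s.loaded, modelPath)
    }
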
@@ -334,8 +360,16 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList)
 	runner.refMu.Lock()
 
 	s.loadedMu.Lock()
-	s.loaded[req.model.ModelPath] = runner
-	slog.Info("loaded runners", "count", len(s.loaded))
+	if _, ok := s.loaded[req.model.ModelPath]; !ok {
+		s.loaded[req.model.ModelPath] = make([]*runnerRef, 0)
+	}
+	s.loaded[req.model.ModelPath] = append(s.loaded[req.model.ModelPath], runner)
+
+	runnerCount := 0
+	for _, runners := range s.loaded {
+		runnerCount += len(runners)
+	}
+	slog.Info("loaded runners", "count", runnerCount)
 	s.loadedMu.Unlock()
 
 	go func() {
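
Two possible simplifications here, offered as sketches rather than required changes: the make guard is unnecessary, because indexing a missing key yields a nil slice and append accepts nil; and the total-count loop now appears in both processPending and load, so a shared helper would keep the two in sync:

    // totalRunners counts loaded runners across all models.
    // Caller must hold loadedMu. Hypothetical helper, not part of the diff.
    func (s *Scheduler) totalRunners() int {
        n := 0
        for _, runners := range s.loaded {
            n += len(runners)
        }
        return n
    }

    // Usage at this point in load; the make guard can be dropped since
    // append accepts the nil slice returned for a missing key.
    s.loaded[req.model.ModelPath] = append(s.loaded[req.model.ModelPath], runner)
    slog.Info("loaded runners", "count", s.totalRunners())
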
@@ -366,26 +400,29 @@ func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
 	}
 	predMap := map[predKey]uint64{} // Sum up the total predicted usage per GPU for all runners
 	s.loadedMu.Lock()
-	for _, r := range s.loaded {
-		r.refMu.Lock()
-		gpuIDs := make([]string, 0, len(r.gpus))
-		if r.llama != nil {
-
-			// TODO this should be broken down by GPU instead of assuming uniform spread
-			estimatedVRAMPerGPU := r.llama.EstimatedVRAM() / uint64(len(r.gpus))
-			for _, gpu := range r.gpus {
-				gpuIDs = append(gpuIDs, gpu.ID)
-			}
-			for _, gpu := range allGpus {
-				if slices.Contains(gpuIDs, gpu.ID) {
-					predMap[predKey{gpu.Library, gpu.ID}] += estimatedVRAMPerGPU
+	for _, runners := range s.loaded {
+		for _, r := range runners {
+			r.refMu.Lock()
+			gpuIDs := make([]string, 0, len(r.gpus))
+			if r.llama != nil {
+
+				// TODO this should be broken down by GPU instead of assuming uniform spread
+				estimatedVRAMPerGPU := r.llama.EstimatedVRAM() / uint64(len(r.gpus))
+				for _, gpu := range r.gpus {
+					gpuIDs = append(gpuIDs, gpu.ID)
+				}
+				for _, gpu := range allGpus {
+					if slices.Contains(gpuIDs, gpu.ID) {
+						predMap[predKey{gpu.Library, gpu.ID}] += estimatedVRAMPerGPU
+					}
 				}
+			} else {
+				slog.Warn("unexpected nil runner reference, memory prediction may be incorrect")
 			}
-		} else {
-			slog.Warn("unexpected nil runner reference, memory prediction may be incorrect")
+			r.refMu.Unlock()
 		}
-		r.refMu.Unlock()
 	}
+
 	s.loadedMu.Unlock()
 
 	// Now that we've summed up all the GPU usage predictions across all the loaded runners, update the gpu list
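
To make the accumulation concrete: each runner's VRAM estimate is split uniformly across its GPUs (the TODO notes this is an approximation) and summed into predMap keyed by library and GPU ID. A self-contained sketch with simplified stand-in data (the real gpu.GpuInfo and llm.LlamaServer types carry much more):

    package main

    import "fmt"

    type predKey struct {
        Library string
        ID      string
    }

    func main() {
        // Two runners: 5 GiB on GPU-0, and 8 GiB split across GPU-0 and GPU-1.
        runners := []struct {
            gpuIDs        []string
            estimatedVRAM uint64
        }{
            {[]string{"GPU-0"}, 5 << 30},
            {[]string{"GPU-0", "GPU-1"}, 8 << 30},
        }
        predMap := map[predKey]uint64{}
        for _, r := range runners {
            perGPU := r.estimatedVRAM / uint64(len(r.gpuIDs)) // uniform spread
            for _, id := range r.gpuIDs {
                predMap[predKey{"cuda", id}] += perGPU
            }
        }
        fmt.Println(predMap) // GPU-0 is charged 9 GiB, GPU-1 4 GiB
    }
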
@@ -583,9 +620,9 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) gpu.
 // findRunnerToUnload finds a runner to unload to make room for a new model
 func (s *Scheduler) findRunnerToUnload() *runnerRef {
 	s.loadedMu.Lock()
-	runnerList := make([]*runnerRef, 0, len(s.loaded))
-	for _, r := range s.loaded {
-		runnerList = append(runnerList, r)
+	runnerList := make([]*runnerRef, 0)
+	for _, runners := range s.loaded {
+		runnerList = append(runnerList, runners...)
 	}
 	s.loadedMu.Unlock()
 
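
Minor: the rewrite drops the capacity hint from make. If allocation churn matters, the flattened list can be pre-sized with the same total-count loop (or the hypothetical totalRunners helper sketched earlier):

    // Pre-size the flattened list; caller holds loadedMu.
    runnerList := make([]*runnerRef, 0, s.totalRunners())
    for _, runners := range s.loaded {
        runnerList = append(runnerList, runners...)
    }
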
@@ -611,10 +648,12 @@ func (s *Scheduler) findRunnerToUnload() *runnerRef {
 func (s *Scheduler) unloadAllRunners() {
 	s.loadedMu.Lock()
 	defer s.loadedMu.Unlock()
-	for model, runner := range s.loaded {
-		if runner.llama != nil {
-			slog.Debug("shutting down runner", "model", model)
-			runner.llama.Close()
+	for model, runners := range s.loaded {
+		for _, runner := range runners {
+			if runner.llama != nil {
+				slog.Debug("shutting down runner", "model", model)
+				runner.llama.Close()
+			}
 		}
 	}
 }