/*
╔════════════════════════════════════════╗
║ ThemisDB - Hybrid Database System      ║
╠════════════════════════════════════════╣
║ File: multi_lora_manager.h             ║
║ Version: 0.0.47                        ║
║ Last Modified: 2026-04-15 18:45:33     ║
║ Author: unknown                        ║
╠════════════════════════════════════════╣
║ Quality Metrics:                       ║
║   • Maturity Level: 🟢 PRODUCTION-READY ║
║   • Quality Score: 100.0/100           ║
║   • Total Lines: 945                   ║
║   • Open Issues: TODOs: 0, Stubs: 0    ║
╠════════════════════════════════════════╣
║ Status: ✅ Production Ready            ║
╚════════════════════════════════════════╝
*/
#pragma once
#include "llm/llm_plugin_interface.h"
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <cstdint>
#include <functional>
#include <map>
#include <memory>
#include <mutex>
#include <optional>
#include <string>
#include <thread>
#include <unordered_map>
#include <utility>
#include <vector>
// Forward declaration from llama.cpp
struct llama_context;
/**
* @file multi_lora_manager.h
* @brief vLLM-inspired multi-LoRA management for ThemisDB
*
* This component implements efficient multi-LoRA adapter management similar to vLLM:
* - Multiple LoRA adapters can be loaded simultaneously
* - Efficient adapter switching during inference (minimal overhead)
* - Batched inference with different LoRAs for different requests
* - Memory-efficient adapter storage and composition
*
* Key features from vLLM:
* - Multi-LoRA inference: Different requests can use different adapters
* - Adapter batching: Multiple LoRAs in a single batch
* - Dynamic loading/unloading: Adapters load/unload on demand
* - Memory pooling: Efficient VRAM usage for multiple adapters
*/
namespace themis {
namespace llm {
/**
* @brief Multi-GPU placement strategy for LoRA adapters (v1.4.0)
*/
enum class MultiGPUStrategy {
NONE = 0, // Single GPU (default)
ROUND_ROBIN = 1, // Distribute LoRAs evenly across GPUs
DATA_PARALLEL = 2, // Replicate adapter on all GPUs
MODEL_PARALLEL = 3 // Split large adapter across GPUs
};
/**
* @brief GPU placement configuration for a LoRA adapter (v1.4.0)
*/
enum class GPUPlacement {
SINGLE_GPU = 0, // LoRA on single GPU
MULTI_GPU = 1 // LoRA spans multiple GPUs
};
/**
* @brief Multi-GPU configuration for LoRA adapters (v1.4.0)
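*
* A minimal configuration sketch (device IDs and limits are illustrative,
* not defaults):
* @code
* MultiGPUConfig mgpu;
* mgpu.enabled = true;
* mgpu.devices = {0, 1};                         // spread LoRAs over GPUs 0 and 1
* mgpu.strategy = MultiGPUStrategy::ROUND_ROBIN;
* mgpu.max_vram_per_gpu_mb = 16 * 1024;          // cap each GPU at 16 GB
* @endcode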
*/
struct MultiGPUConfig {
bool enabled = false;
std::vector<int> devices; // GPU device IDs to use (e.g., {0, 1, 2, 3})
MultiGPUStrategy strategy = MultiGPUStrategy::ROUND_ROBIN;
bool enable_peer_transfer = false; // GPUDirect P2P
// FIND-015: Use named constant for default max VRAM per GPU
static constexpr size_t DEFAULT_MAX_VRAM_PER_GPU_MB = 24 * 1024; // 24GB default
size_t max_vram_per_gpu_mb = DEFAULT_MAX_VRAM_PER_GPU_MB; // Max VRAM per GPU
// Load balancing
bool enable_load_balancing = true;
float load_balance_threshold = 0.8f; // Rebalance when GPU usage > 80%
// Fault tolerance
bool enable_fault_tolerance = true;
int health_check_interval_sec = 30;
};
/**
* @brief Quantization mode for LoRA adapters
*/
enum class QuantizationMode {
NONE = 0, // No quantization (FP32/FP16)
INT8 = 1, // 8-bit integer quantization (4× compression)
INT4 = 2 // 4-bit integer quantization (8× compression)
};
/**
* @brief LoRA quantization configuration
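*
* Example sketch, assuming a MultiLoRAManager instance `mgr`: INT8 with
* per-channel scaling. Per the QuantizationMode notes above, INT8 gives
* roughly 4x compression over FP32 (4 bytes -> 1 byte per weight):
* @code
* LoRAQuantizationConfig qcfg;
* qcfg.enabled = true;
* qcfg.mode = QuantizationMode::INT8;
* qcfg.per_channel = true;            // one scale factor per channel
* mgr.setQuantizationConfig(qcfg);    // affects subsequently loaded LoRAs
* @endcode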
*/
struct LoRAQuantizationConfig {
bool enabled = false;
QuantizationMode mode = QuantizationMode::INT8;
// Calibration parameters
int calibration_samples = 100; // Number of samples for scale calibration
// Quantization strategy
bool per_channel = true; // Per-channel vs per-tensor scaling
int group_size = 128; // For INT4 grouping (0 = per-channel)
};
/**
* @brief Quantization statistics for a LoRA adapter
*/
struct QuantizationStats {
std::string lora_id;
QuantizationMode mode = QuantizationMode::NONE;
size_t original_bytes = 0; // Original FP32 size
size_t quantized_bytes = 0; // Quantized size
float compression_ratio = 1.0f; // original_bytes / quantized_bytes
float quantization_time_ms = 0.0f; // Time to quantize
float calibration_time_ms = 0.0f; // Time for calibration
// Per-channel statistics
size_t num_channels = 0;
float min_scale = 0.0f; // Minimum scale factor
float max_scale = 0.0f; // Maximum scale factor
float avg_scale = 0.0f; // Average scale factor
};
/**
* @brief Fusion strategy for combining multiple LoRA adapters
*/
enum class FusionStrategy {
STATIC = 0, // Fixed weights, cached permanently
DYNAMIC = 1, // Runtime adjustable weights
SCHEDULED = 2 // Time-varying weights (A/B testing, smooth transitions)
};
/**
* @brief Scheduling strategy for SCHEDULED fusion mode
*/
enum class SchedulingStrategy {
LINEAR = 0, // Linear interpolation between weights
EXPONENTIAL = 1, // Exponential decay/growth between weights
STEP_WISE = 2, // Step-wise discrete transitions
CUSTOM = 3 // User-defined custom schedule function
};
/**
* @brief Alpha scheduling function for dynamic fusion
*
* Allows runtime computation of blend weights based on various factors:
* - Time-based: gradual transition between adapters
* - Performance-based: A/B testing with feedback
* - Context-based: different weights per request type
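*
* Sketch of a linear 10-minute transition from adapter A to adapter B,
* assuming a MultiLoRAManager instance `mgr` and an existing fused adapter
* "a-b-blend" (IDs, weights, and duration are illustrative):
* @code
* AlphaSchedule sched;
* sched.strategy = FusionStrategy::SCHEDULED;
* sched.scheduling_strategy = SchedulingStrategy::LINEAR;
* sched.static_weights = {1.0f, 0.0f};           // starting blend: all A
* sched.target_weights = {0.0f, 1.0f};           // final blend: all B
* sched.start_time = std::chrono::system_clock::now();
* sched.transition_duration = std::chrono::seconds{600};
* mgr.setAlphaSchedule("a-b-blend", sched);
* @endcode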
*/
struct AlphaSchedule {
std::string schedule_id;
FusionStrategy strategy = FusionStrategy::STATIC;
SchedulingStrategy scheduling_strategy = SchedulingStrategy::LINEAR;
// Static weights (for STATIC strategy)
std::vector<float> static_weights;
// Target weights (for SCHEDULED strategy transitions)
std::vector<float> target_weights;
// Dynamic scheduling parameters
std::chrono::system_clock::time_point start_time;
std::chrono::seconds transition_duration{0}; // For smooth transitions
// Exponential scheduling parameters
float exponential_base = 2.0f; // Curve steepness (higher = steeper transition)
bool exponential_decay = true; // true for decay, false for growth
// Step-wise scheduling parameters
std::vector<double> step_times; // Time points (seconds) for step transitions
std::vector<std::vector<float>> step_weights; // Weight vectors at each step
// Callback for custom scheduling logic (SchedulingStrategy::CUSTOM).
// Returns a weights vector for the given time offset (seconds since start_time).
using ScheduleFunc = std::function<std::vector<float>(double time_offset)>;
ScheduleFunc schedule_func;
// A/B testing parameters (for backward compatibility)
float a_weight = 0.5f; // Weight for adapter A in A/B test
float b_weight = 0.5f; // Weight for adapter B in A/B test
// Performance tracking for adaptive scheduling
size_t total_requests = 0;
std::vector<double> performance_scores; // Per-adapter performance
};
/**
* @brief Configuration for LoRA fusion operation
*/
struct FusionConfig {
FusionStrategy strategy = FusionStrategy::STATIC;
std::vector<std::string> source_lora_ids;
std::vector<float> weights;
// Caching behavior
bool enable_cache = true;
std::chrono::seconds cache_ttl{3600}; // 1 hour default
// Compatibility checks
bool enforce_quantization_match = true;
bool enforce_gpu_placement_match = false;
bool enforce_rank_match = false;
// Alpha scheduling (for DYNAMIC/SCHEDULED strategies)
AlphaSchedule alpha_schedule;
};
/**
* @brief Fusion cache entry metadata
*/
struct FusionCacheEntry {
std::string fusion_id;
std::vector<std::string> source_lora_ids;
std::vector<float> weights;
std::chrono::system_clock::time_point created_at;
std::chrono::system_clock::time_point last_used;
size_t use_count = 0;
FusionStrategy strategy = FusionStrategy::STATIC;
// Performance metrics
double avg_inference_time_ms = 0.0;
size_t inference_count = 0;
};
/**
* @brief Fusion performance metrics
*/
struct FusionMetrics {
std::string fusion_id;
FusionStrategy strategy;
size_t total_fusions = 0;
size_t cache_hits = 0;
size_t cache_misses = 0;
size_t invalidations = 0;
double avg_fusion_time_ms = 0.0;
double avg_inference_time_ms = 0.0;
// Per-strategy breakdown
std::map<FusionStrategy, size_t> fusions_by_strategy;
std::map<FusionStrategy, double> avg_time_by_strategy;
};
/**
* @brief LoRA adapter slot
*
* Represents a loaded LoRA adapter with its metadata and handle.
*/
struct LoRASlot {
std::string lora_id;
std::string path;
std::string base_model_id;
void* adapter_handle = nullptr; // Opaque LoRA handle
float scale = 1.0f;
size_t vram_bytes = 0;
size_t rank = 0; // LoRA rank (r)
size_t alpha = 0; // LoRA alpha
std::chrono::system_clock::time_point loaded_at;
std::chrono::system_clock::time_point last_used;
size_t use_count = 0;
bool is_active = false; // Currently applied to model
bool keep_loaded = false; // Pin in memory
// Quantization support (v1.4.0)
bool is_quantized = false;
QuantizationMode quantization_mode = QuantizationMode::NONE;
size_t original_vram_bytes = 0; // Original size before quantization
std::vector<float> scale_factors; // Per-channel scale factors
std::vector<uint8_t> quantized_weights; // Quantized weight data
// Multi-GPU support (v1.4.0)
GPUPlacement gpu_placement = GPUPlacement::SINGLE_GPU;
std::vector<int> assigned_gpus; // GPU device IDs where this LoRA is loaded
int primary_gpu = 0; // Primary GPU for single-GPU or coordinator for multi-GPU
// Security and audit (v1.5.0)
std::string tenant_id; // Tenant identifier for isolation
bool is_replicated = false; // True if replicated across multiple nodes/GPUs for HA
};
/**
* @brief Multi-LoRA Manager (vLLM-inspired)
*
* Manages multiple LoRA adapters for a single base model.
* Supports efficient switching between adapters and even batched
* inference with different adapters per request.
*
* Example workflow:
* 1. Load base model (mistral-7b)
* 2. Load multiple LoRAs: legal-qa, medical-diagnosis, code-assistant
* 3. Request 1: Use legal-qa adapter
* 4. Request 2: Use medical-diagnosis adapter
* 5. Both can be in the same inference batch (if the backend supports it)
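*
* A minimal sketch of that workflow (paths, IDs, and the `ctx` handle are
* illustrative):
* @code
* MultiLoRAManager::Config cfg;
* cfg.lora_base_dir = "/var/lib/themis/loras";   // trusted LoRA directory
* MultiLoRAManager mgr(cfg);
* mgr.loadLoRA("legal-qa", "/var/lib/themis/loras/legal-qa.bin", "mistral-7b");
* mgr.loadLoRA("medical-diagnosis", "/var/lib/themis/loras/medical.bin", "mistral-7b");
* mgr.applyLoRA("legal-qa", ctx);                // ctx: llama_context* from the backend
* // ... run inference for request 1 ...
* mgr.removeLoRA("legal-qa", ctx);
* mgr.applyLoRA("medical-diagnosis", ctx);       // switch adapters for request 2
* @endcode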
*/
class MultiLoRAManager {
public:
struct Config {
// Memory limits
size_t max_lora_vram_mb = 2048; // 2 GB for all LoRAs
size_t max_lora_slots = 16; // Max concurrent LoRAs
// Trusted directory: LoRA files must reside under this path (F1-1/F1-2 fix).
// Defaults to an empty string which disables the check (legacy behaviour);
// production deployments must set this to the managed LoRA storage directory.
std::string lora_base_dir;
// Cache policy
std::chrono::seconds lora_ttl{1800}; // 30 min TTL
bool enable_lazy_load = true;
// Batching (vLLM-style)
bool enable_multi_lora_batch = false; // Multiple LoRAs in one batch
size_t max_loras_per_batch = 4;
// Adapter fusion
bool enable_adapter_fusion = false; // Merge multiple LoRAs
// Quantization (v1.4.0)
LoRAQuantizationConfig quantization;
// Multi-GPU support (v1.4.0)
MultiGPUConfig multi_gpu;
};
explicit MultiLoRAManager(const Config& config);
~MultiLoRAManager();
/**
* @brief Set quantization configuration
*
* Configures quantization parameters for subsequently loaded LoRAs.
* Does not affect already-loaded LoRAs.
*
* @param config Quantization configuration
*/
void setQuantizationConfig(const LoRAQuantizationConfig& config);
/**
* @brief Get quantization configuration
*
* @return Current quantization configuration
*/
LoRAQuantizationConfig getQuantizationConfig() const;
/**
* @brief Load a LoRA adapter (lazy loading)
*
* If the adapter is already loaded, the call returns immediately;
* otherwise the adapter is loaded on demand.
*
* Thread-safe.
*
* @param lora_id Unique LoRA identifier
* @param lora_path Path to LoRA weights file
* @param base_model_id Compatible base model
* @param scale LoRA scaling factor (default: 1.0)
* @return true if loaded successfully
*/
bool loadLoRA(
const std::string& lora_id,
const std::string& lora_path,
const std::string& base_model_id,
float scale = 1.0f
);
/**
* @brief Load a LoRA adapter with optional quantization
*
* Loads a LoRA adapter and optionally applies quantization.
*
* @param lora_id Unique LoRA identifier
* @param lora_path Path to LoRA weights file
* @param base_model_id Compatible base model
* @param quantize Whether to apply quantization (uses current config)
* @param scale LoRA scaling factor (default: 1.0)
* @return true if loaded successfully
*/
bool loadLoRA(
const std::string& lora_id,
const std::string& lora_path,
const std::string& base_model_id,
bool quantize,
float scale = 1.0f
);
/**
* @brief Load a LoRA adapter with multi-GPU placement (v1.4.0)
*
* Loads a LoRA adapter with explicit GPU placement control.
*
* @param lora_id Unique LoRA identifier
* @param lora_path Path to LoRA weights file
* @param base_model_id Compatible base model
* @param quantize Whether to apply quantization
* @param placement GPU placement strategy (SINGLE_GPU or MULTI_GPU)
* @param scale LoRA scaling factor (default: 1.0)
* @return true if loaded successfully
*/
bool loadLoRA(
const std::string& lora_id,
const std::string& lora_path,
const std::string& base_model_id,
bool quantize,
GPUPlacement placement,
float scale = 1.0f
);
/**
* @brief Initialize LoRA adapter with llama.cpp model handle
*
* This method actually loads the LoRA adapter weights using llama.cpp's
* llama_lora_adapter_init() API. Must be called with a valid model handle
* before the LoRA can be applied to contexts.
*
* @param lora_id LoRA identifier (must already exist in loras_ map)
* @param model llama_model handle for loading the adapter
* @return true if initialization successful
*/
bool initializeLoRAWithModel(const std::string& lora_id, void* model);
/**
* @brief Unload a LoRA adapter
*
* @param lora_id LoRA to unload
* @param force If true, unload even if pinned
*/
bool unloadLoRA(const std::string& lora_id, bool force = false);
/**
* @brief Get LoRA slot (if loaded)
*
* Returns pointer to loaded LoRA slot, or nullptr if not loaded.
* Updates last_used timestamp.
*/
LoRASlot* getLoRA(const std::string& lora_id);
/// Bridge callback for applying a LoRA adapter when no llama_context is available.
using ApplyAdapterFn = std::function<bool(const LoRASlot& slot)>;
/// Bridge callback for removing a LoRA adapter when no llama_context is available.
using RemoveAdapterFn = std::function<bool(const LoRASlot& slot)>;
void setApplyAdapterFn(ApplyAdapterFn fn);
void setRemoveAdapterFn(RemoveAdapterFn fn);
/**
* @brief Apply LoRA to model context
*
* Activates a specific LoRA adapter for the next inference.
* Multiple LoRAs can be active simultaneously if the backend supports it.
*
* @param lora_id LoRA to activate
* @param context Model context to apply to (llama_context*)
* @return true if applied successfully
*/
bool applyLoRA(const std::string& lora_id, llama_context* context);
/**
* @brief Remove LoRA from model context
*
* Deactivates a LoRA adapter.
*/
bool removeLoRA(const std::string& lora_id, llama_context* context);
/**
* @brief Batch inference with multiple LoRAs (vLLM-style)
*
* Processes multiple inference requests, each with its own LoRA adapter,
* in a single batch for efficiency.
*
* Requires backend support for multi-LoRA batching.
*
* @param requests Vector of inference requests with LoRA IDs
* @return Vector of responses (same order as requests)
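*
* Sketch, assuming a MultiLoRAManager instance `mgr`; the requests and the
* `ctx` handle are illustrative:
* @code
* std::vector<std::pair<InferenceRequest, std::string>> reqs;
* reqs.push_back({legal_request, "legal-qa"});            // request 1 -> legal adapter
* reqs.push_back({medical_request, "medical-diagnosis"}); // request 2 -> medical adapter
* auto responses = mgr.batchInferenceMultiLoRA(reqs, ctx);
* @endcode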
*/
std::vector<InferenceResponse> batchInferenceMultiLoRA(
const std::vector<std::pair<InferenceRequest, std::string>>& requests,
llama_context* model_context
);
/**
* @brief Fuse multiple LoRAs into a single adapter
*
* Combines multiple LoRA adapters into one for efficiency.
* Useful when always using the same combination of adapters.
*
* @param lora_ids LoRAs to fuse
* @param fused_id New identifier for fused adapter
* @param weights Weights for each LoRA (default: equal)
* @return true if fusion successful
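*
* One-line sketch, assuming adapters "legal-qa" and "contracts" are loaded
* (IDs and weights are illustrative):
* @code
* mgr.fuseLoRAs({"legal-qa", "contracts"}, "legal-combined", {0.6f, 0.4f});
* @endcode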
*/
bool fuseLoRAs(
const std::vector<std::string>& lora_ids,
const std::string& fused_id,
const std::vector<float>& weights = {}
);
/**
* @brief Fuse multiple LoRAs with advanced configuration
*
* Extended fusion API with dynamic composition, alpha scheduling,
* and fine-grained control over caching and compatibility checks.
*
* @param fused_id New identifier for fused adapter
* @param config Fusion configuration including strategy and scheduling
* @return true if fusion successful
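*
* Sketch of a runtime-adjustable blend (IDs and weights are illustrative):
* @code
* FusionConfig fc;
* fc.strategy = FusionStrategy::DYNAMIC;
* fc.source_lora_ids = {"legal-qa", "medical-diagnosis"};
* fc.weights = {0.7f, 0.3f};
* mgr.fuseLoRAsAdvanced("legal-medical-blend", fc);
* @endcode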
*/
bool fuseLoRAsAdvanced(
const std::string& fused_id,
const FusionConfig& config
);
/**
* @brief Update weights for a dynamically fused adapter
*
* Allows runtime adjustment of blend weights for DYNAMIC fusion strategy.
* The fused adapter must have been created with FusionStrategy::DYNAMIC.
*
* @param fusion_id ID of the fused adapter
* @param new_weights New blend weights (must match number of source LoRAs)
* @return true if weights updated successfully
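*
* Sketch, assuming a DYNAMIC fusion "legal-medical-blend" with two source
* LoRAs (IDs and weights are illustrative):
* @code
* mgr.updateFusionWeights("legal-medical-blend", {0.3f, 0.7f});
* @endcode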
*/
bool updateFusionWeights(
const std::string& fusion_id,
const std::vector<float>& new_weights
);
/**
* @brief Set alpha schedule for scheduled fusion
*
* Configures time-varying weights for SCHEDULED fusion strategy.
* Used for A/B testing, gradual transitions, and adaptive blending.
*
* @param fusion_id ID of the fused adapter
* @param schedule Alpha scheduling configuration
* @return true if schedule set successfully
*/
bool setAlphaSchedule(
const std::string& fusion_id,
const AlphaSchedule& schedule
);
/**
* @brief Get current fusion weights (resolves scheduled weights)
*
* Returns the current effective weights for a fused adapter,
* accounting for any active alpha scheduling.
*
* @param fusion_id ID of the fused adapter
* @return Current weights, or empty vector if fusion not found
*/
std::vector<float> getCurrentFusionWeights(const std::string& fusion_id) const;
/**
* @brief Invalidate fusion cache entry
*
* Forces re-computation of a fused adapter on next use.
* Useful when source LoRAs have been modified or reloaded.
*
* @param fusion_id ID of the fused adapter to invalidate
* @return true if cache entry was invalidated
*/
bool invalidateFusionCache(const std::string& fusion_id);
/**
* @brief Clear all fusion cache entries
*
* Removes all cached fused adapters. Source LoRAs remain loaded.
*
* @return Number of cache entries cleared
*/
size_t clearFusionCache();
/**
* @brief Get fusion cache statistics
*
* Returns metrics about fusion caching and performance.
*
* @return Fusion metrics including cache hit rate and timing
*/
FusionMetrics getFusionMetrics() const;
/**
* @brief List all cached fusion entries
*
* @return Vector of fusion cache entry metadata
*/
std::vector<FusionCacheEntry> listFusionCache() const;
/**
* @brief Check compatibility of LoRAs for fusion
*
* Validates that a set of LoRAs can be safely fused together
* based on quantization mode, base model, rank, and GPU placement.
*
* @param lora_ids LoRAs to check for compatibility
* @param config Fusion configuration with compatibility requirements
* @return true if all LoRAs are compatible for fusion
*/
bool checkFusionCompatibility(
const std::vector<std::string>& lora_ids,
const FusionConfig& config
) const;
/**
* @brief Pin a LoRA in memory (prevent eviction)
*/
void pinLoRA(const std::string& lora_id);
/**
* @brief Unpin a LoRA (allow eviction)
*/
void unpinLoRA(const std::string& lora_id);
/**
* @brief Check if LoRA is loaded
*/
bool isLoRALoaded(const std::string& lora_id) const;
/**
* @brief Get quantization statistics for a LoRA adapter
*
* Returns statistics about quantization for a specific LoRA,
* including compression ratio and memory savings.
*
* @param lora_id LoRA identifier
* @return Quantization statistics, or nullopt if LoRA not loaded or not quantized
*/
std::optional<QuantizationStats> getQuantizationStats(const std::string& lora_id) const;
/**
* @brief Get multi-GPU configuration (v1.4.0)
*
* @return Current multi-GPU configuration
*/
MultiGPUConfig getMultiGPUConfig() const;
/**
* @brief Set multi-GPU configuration (v1.4.0)
*
* Updates multi-GPU configuration. Affects subsequently loaded LoRAs.
*
* @param config Multi-GPU configuration
*/
void setMultiGPUConfig(const MultiGPUConfig& config);
/**
* @brief Get GPU placement for a LoRA adapter (v1.4.0)
*
* @param lora_id LoRA identifier
* @return GPU device IDs where the LoRA is placed, empty if not loaded
*/
std::vector<int> getLoRAGPUPlacement(const std::string& lora_id) const;
/**
* @brief Get per-GPU memory statistics (v1.4.0)
*
* @return Map of GPU ID to VRAM usage in bytes
*/
std::unordered_map<int, size_t> getPerGPUMemoryUsage() const;
/**
* @brief Balance LoRA load across GPUs (v1.4.0)
*
* Redistributes LoRAs across GPUs for better load balancing.
* Only effective when multi-GPU is enabled with ROUND_ROBIN strategy.
*
* @return Number of LoRAs moved
*/
size_t balanceGPULoad();
/**
* @brief Get usage heatmap for all LoRAs (v1.5.0)
*
* Returns a heatmap showing access patterns and usage statistics
* for resource-aware eviction decisions.
*
* @return Map of LoRA ID to usage metrics (access count, last used, etc.)
*/
json getUsageHeatmap() const;
/**
* @brief Resource-aware eviction based on GPU VRAM pressure (v1.5.0)
*
* Evicts LoRAs based on GPU-specific resource constraints, usage patterns,
* and priority. More intelligent than simple LRU eviction.
*
* @param gpu_id GPU device to free memory on (-1 for global)
* @param target_vram_mb Target VRAM to free
* @return Amount of VRAM freed (MB)
*/
size_t evictResourceAware(int gpu_id = -1, size_t target_vram_mb = 0);
/**
* @brief Get scheduling recommendations for LoRA placement (v1.5.0)
*
* Provides intelligent placement recommendations based on available
* slots, VRAM, expected latency, and current GPU loads.
*
* @param lora_vram_bytes Expected VRAM usage of LoRA
* @param priority Priority level (0-10)
* @return Recommended GPU ID and placement metrics
*/
json getSchedulingRecommendation(size_t lora_vram_bytes, int priority = 5) const;
/**
* @brief Migrate LoRA adapter to another GPU (v1.5.0)
*
* Performs warm migration of a LoRA adapter from current GPU to target GPU
* with minimal service interruption.
*
* @param lora_id LoRA to migrate
* @param target_gpu Target GPU device ID
* @return true if migration successful
*/
bool migrateLoRAToGPU(const std::string& lora_id, int target_gpu);
/**
* @brief Check GPU health and trigger auto-migration on failure (v1.5.0)
*
* Monitors GPU health and automatically migrates adapters from
* unhealthy GPUs to healthy ones.
*
* @return Number of adapters migrated due to GPU failures
*/
size_t checkGPUHealthAndMigrate();
/**
* @brief List all loaded LoRAs
*/
std::vector<LoRAInfo> listLoRAs() const;
/**
* @brief Set tenant ID for a LoRA adapter (v1.5.0 - Security)
*
* Associates a LoRA adapter with a specific tenant for
* GPU memory isolation and audit logging.
*
* @param lora_id LoRA identifier
* @param tenant_id Tenant identifier
*/
void setLoRATenant(const std::string& lora_id, const std::string& tenant_id);
/**
* @brief Get audit log for GPU transfer events (v1.5.0 - Security)
*
* Returns audit log of all GPU transfer events including LoRA
* migrations, load/unload operations with timestamps and tenant info.
*
* @param limit Maximum number of recent events to return (0 = all)
* @return JSON array of audit events
*/
json getGPUTransferAuditLog(size_t limit = 100) const;
/**
* @brief List loaded LoRAs filtered by base model id
*/
std::vector<LoRAInfo> listLoRAs(const std::string& base_model_id) const;
/**
* @brief Get LoRA info by id
*/
std::optional<LoRAInfo> getLoRAInfo(const std::string& lora_id) const;
/**
* @brief Evict least recently used LoRA(s)
*
* @param target_vram_mb Target VRAM to free
* @return Amount of VRAM freed (MB)
*/
size_t evictLRU(size_t target_vram_mb = 0);
/**
* @brief Evict LoRAs that exceeded their TTL
*
* @return Number of LoRAs evicted
*/
size_t evictExpired();
/**
* @brief Get memory usage statistics
*/
json getMemoryStats() const;
/**
* @brief Get LoRA cache statistics
*/
json getCacheStats() const;
// Compact typed statistics API for tests
struct Stats {
size_t total_loras_loaded = 0;
size_t cache_hits = 0;
size_t cache_misses = 0;
size_t evictions = 0;
size_t switches = 0;
};
Stats getStatistics() const;
// Backward-compat: legacy tests expect getStats()
Stats getStats() const { return getStatistics(); }
/**
* @brief Export LoRA for cross-shard transfer
*
* Serializes a LoRA adapter for transfer to another shard.
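*
* Sketch of a shard-to-shard transfer, assuming manager instances `src_mgr`
* and `dst_mgr` on different shards:
* @code
* auto blob = src_mgr.exportLoRA("legal-qa");
* dst_mgr.importLoRA("legal-qa", blob, "mistral-7b");
* @endcode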
*/
std::vector<uint8_t> exportLoRA(const std::string& lora_id);
/**
* @brief Import LoRA from another shard
*
* Deserializes and loads a LoRA adapter received from another shard.
*/
bool importLoRA(
const std::string& lora_id,
const std::vector<uint8_t>& data,
const std::string& base_model_id
);
private:
Config config_;
std::unordered_map<std::string, std::unique_ptr<LoRASlot>> loras_;
mutable std::mutex mutex_;
// Statistics
size_t total_vram_bytes_ = 0;
size_t cache_hits_ = 0;
size_t cache_misses_ = 0;
size_t evictions_ = 0;
size_t switches_ = 0; // LoRA switch count
// Multi-GPU state (v1.4.0)
std::unordered_map<int, size_t> gpu_vram_usage_; // Per-GPU VRAM tracking
int next_round_robin_gpu_ = 0; // Round-robin counter
// Enhanced tracking for v1.5.0
std::unordered_map<std::string, std::string> lora_tenants_; // LoRA -> Tenant mapping
// Audit log structure (v1.5.0)
struct AuditEvent {
std::chrono::system_clock::time_point timestamp;
std::string event_type; // "load", "unload", "migrate", "evict"
std::string lora_id;
std::string tenant_id;
int source_gpu;
int target_gpu;
size_t vram_bytes;
std::string details;
};
std::vector<AuditEvent> audit_log_;
size_t max_audit_log_size_ = 1000;
// GPU health tracking (v1.5.0)
std::unordered_map<int, bool> gpu_health_status_; // GPU ID -> healthy status
std::unordered_map<int, std::chrono::system_clock::time_point> gpu_last_health_check_;
void logGPUTransferEvent(const std::string& event_type, const std::string& lora_id,
int source_gpu, int target_gpu, size_t vram_bytes,
const std::string& details = "");
// Helper for access frequency calculation
double calculateAccessFrequency(const LoRASlot* lora,
const std::chrono::system_clock::time_point& now) const;
// Fusion cache and metrics (v1.5.0)
std::unordered_map<std::string, FusionCacheEntry> fusion_cache_;
std::unordered_map<std::string, FusionConfig> fusion_configs_;
std::unordered_map<std::string, AlphaSchedule> fusion_schedules_;
FusionMetrics fusion_metrics_;
size_t total_fusions_ = 0;
size_t fusion_cache_hits_ = 0;
size_t fusion_cache_misses_ = 0;
size_t fusion_invalidations_ = 0;
// Background eviction thread
std::unique_ptr<std::thread> eviction_thread_;
std::atomic<bool> eviction_thread_running_{false};
std::condition_variable eviction_cv_;
ApplyAdapterFn apply_adapter_fn_;
RemoveAdapterFn remove_adapter_fn_;
// Internal helpers
LoRASlot* loadLoRAInternal(
const std::string& lora_id,
const std::string& lora_path,
const std::string& base_model_id,
float scale,
bool quantize = false,
GPUPlacement placement = GPUPlacement::SINGLE_GPU
);
/**
* @brief Verify that @p lora_path is contained within the trusted
* @c config_.lora_base_dir directory (F1-1/F1-2 fix).
*
* Returns true when the check passes or when @c config_.lora_base_dir is
* empty (legacy/unconfigured deployments). Returns false when the path
* escapes the base directory; callers must reject the request in that case.
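*
* One possible containment check (a sketch of the intended semantics, not
* necessarily the actual implementation; requires <filesystem> and
* <algorithm>):
* @code
* namespace fs = std::filesystem;
* const auto base = fs::weakly_canonical(config_.lora_base_dir);
* const auto path = fs::weakly_canonical(lora_path);
* auto mism = std::mismatch(base.begin(), base.end(), path.begin(), path.end());
* bool trusted = (mism.first == base.end()); // base is a component-wise prefix of path
* @endcode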
*/
bool isLoRAPathTrusted(const std::string& lora_path) const;
// Background eviction worker
void evictionWorker();
void startEvictionThread();
void stopEvictionThread();
// Multi-GPU helpers (v1.4.0)
int selectGPUForLoRA(size_t vram_bytes); // Select best GPU for new LoRA
bool loadLoRAOnGPU(LoRASlot* lora, int gpu_id); // Load LoRA on specific GPU
bool loadLoRAMultiGPU(LoRASlot* lora); // Load LoRA across multiple GPUs
void updateGPUMemoryTracking(); // Recalculate per-GPU memory usage
bool isGPUHealthy(int gpu_id) const; // Check GPU health status
std::vector<int> getAvailableGPUs() const; // Get list of available GPUs
// Quantization helpers
bool quantizeLoRA(LoRASlot* lora);
void quantizeINT8(LoRASlot* lora, const std::vector<float>& weights);
void quantizeINT4(LoRASlot* lora, const std::vector<float>& weights);
void calibrateScales(const std::vector<float>& weights, std::vector<float>& scales);
std::vector<float> simulateWeights(size_t count); // For testing without real weights
// Fusion helpers (v1.5.0)
bool fuseLoRAsInternal(const std::string& fused_id, const FusionConfig& config);
std::vector<float> computeScheduledWeights(const std::string& fusion_id) const;
std::vector<float> computeLinearSchedule(const AlphaSchedule& schedule, double time_offset) const;
std::vector<float> computeExponentialSchedule(const AlphaSchedule& schedule, double time_offset) const;
std::vector<float> computeStepWiseSchedule(const AlphaSchedule& schedule, double time_offset) const;
bool validateFusionCompatibility(
const std::vector<LoRASlot*>& source_loras,
const FusionConfig& config
) const;
void updateFusionMetrics(const std::string& fusion_id, double fusion_time_ms);
void updateInferenceMetrics(const std::string& fusion_id, double inference_time_ms);
bool hasCapacity(size_t vram_bytes) const;
void updateMemoryUsage();
};
} // namespace llm
} // namespace themis