Skip to content

Commit 31c0334

Browse files
authored
fix: stop double-counting partial offload in ranking (#108)
1 parent 66d4332 commit 31c0334

4 files changed

Lines changed: 168 additions & 33 deletions

File tree

docs/how-it-works.md

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -168,9 +168,10 @@ For each candidate variant:
168168
5. Compute a quality score.
169169
6. Keep the best variant for the model family.
170170

171-
The final sorting key includes the quality score, a fit bonus, and a small
172-
direct-benchmark bonus. Full-GPU candidates are preferred over comparable
173-
partial-offload candidates because they are usually more responsive in practice.
171+
The final sorting key stays close to the displayed quality score, with a small
172+
direct-benchmark bonus, a CPU-only penalty, and a larger penalty for candidates whose context does not fit. Full-GPU candidates are already
173+
favored inside the score through the runtime-fit and speed adjustments, so the
174+
sort key does not add a second full-GPU bonus.
174175

175176
See [Scoring](scoring.md) for the score details.
176177

docs/scoring.md

Lines changed: 8 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -99,20 +99,17 @@ The candidate's runtime form matters:
9999
| Fit | Multiplier |
100100
| --- | ---: |
101101
| Full GPU | `1.00` |
102-
| Partial offload | `0.72` |
102+
| Partial offload | `0.42`-`0.88`, based on spill ratio |
103103
| CPU-only | `0.50` |
104104

105-
The final family selection key also adds a fit bonus:
105+
Light partial offload is penalized less than heavy offload. MoE models receive
106+
a milder penalty when the active parameter working set can plausibly stay on
107+
GPU while inactive experts spill to CPU RAM.
106108

107-
| Fit | Bonus |
108-
| --- | ---: |
109-
| Full GPU | `+15` |
110-
| Partial offload | `0` |
111-
| CPU-only | `-15` |
112-
113-
This keeps a responsive full-GPU result ahead of a similar partial-offload
114-
result, without letting a very weak full-GPU model beat a much stronger model
115-
that only needs modest offload.
109+
The final family selection key does not add a separate full-GPU bonus. Runtime
110+
fit is already reflected in the quality score through the multiplier above and
111+
the speed adjustment below. CPU-only results always receive a small extra sort
112+
penalty (`-6`); it only changes the order when they compete with GPU-backed candidates.
116113

117114
## Speed adjustment
118115

src/whichllm/engine/ranker.py

Lines changed: 32 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -41,28 +41,20 @@ def _family_selection_key(
4141
) -> tuple[float]:
4242
"""Family-level selection key — single composite score.
4343
44-
Combines quality, fit type, and evidence tier into one number so the
45-
sort is fully transitive and edge cases resolve sensibly:
46-
47-
- ``fit_bonus`` (+15 / 0 / -15) is large enough that "estimated,
48-
full-GPU" still beats "direct, partial-offload" of comparable
49-
quality (users on small VRAM prefer the responsive option),
50-
but small enough that a quality-17 Q1_0 full-GPU model loses to
51-
a quality-57 partial-offload 27B model
44+
``quality_score`` already includes the runtime fit penalty and speed
45+
adjustment. Keep final selection close to that displayed score so strong
46+
partial-offload candidates do not get discounted again while sorting.
47+
5248
- ``direct_bonus`` (+5) gives independent leaderboard evidence a
5349
small edge at the same fit; cannot overturn a 6+ point quality gap
5450
"""
55-
fit_bonus = {
56-
"full_gpu": 15.0,
57-
"partial_offload": 0.0,
58-
"cpu_only": -15.0,
59-
}.get(result.fit_type, -15.0)
6051
if require_direct_top and result.benchmark_status == "direct":
6152
direct_bonus = 5.0
6253
else:
6354
direct_bonus = 0.0
55+
cpu_penalty = -6.0 if result.fit_type == "cpu_only" else 0.0
6456
ctx_penalty = -20.0 if not result.context_fits else 0.0
65-
return (result.quality_score + fit_bonus + direct_bonus + ctx_penalty,)
57+
return (result.quality_score + direct_bonus + cpu_penalty + ctx_penalty,)
6658

6759

6860
def _partial_offload_quality_factor(model: ModelInfo, offload_ratio: float) -> float:
@@ -74,14 +66,36 @@ def _partial_offload_quality_factor(model: ModelInfo, offload_ratio: float) -> f
7466
factor = 0.52
7567
elif ratio >= 0.40:
7668
factor = 0.62
69+
elif ratio >= 0.25:
70+
factor = 0.76
7771
else:
78-
factor = 0.72
72+
factor = 0.86
7973

8074
# MoE offload is more nuanced: inactive experts and router/runtime
81-
# placement do not hurt equally. Keep the penalty, but do not treat it
82-
# as badly as dense-layer offload.
75+
# placement do not hurt equally. If the GPU can plausibly hold the
76+
# active expert working set, do not treat inactive-expert spill like
77+
# dense-layer spill.
8378
if model.is_moe and model.parameter_count_active:
84-
factor = min(0.72, factor + 0.08)
79+
active_ratio = (
80+
model.parameter_count_active / model.parameter_count
81+
if model.parameter_count > 0
82+
else 1.0
83+
)
84+
active_ratio = max(0.0, min(1.0, active_ratio))
85+
active_set_fits = ratio <= max(0.0, 1.0 - active_ratio)
86+
if active_set_fits:
87+
if ratio >= 0.75:
88+
factor = max(factor, 0.66)
89+
elif ratio >= 0.60:
90+
factor = max(factor, 0.70)
91+
elif ratio >= 0.40:
92+
factor = max(factor, 0.76)
93+
elif ratio >= 0.25:
94+
factor = max(factor, 0.82)
95+
else:
96+
factor = max(factor, 0.88)
97+
else:
98+
factor = min(0.76, factor + 0.08)
8599

86100
return factor
87101

tests/test_ranker.py

Lines changed: 124 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
"""Tests for ranking behavior."""
22

33
from whichllm.engine.quantization import effective_quant_type
4-
from whichllm.engine.ranker import rank_models
4+
from whichllm.engine.ranker import _partial_offload_quality_factor, rank_models
55
from whichllm.hardware.types import GPUInfo, HardwareInfo
66
from whichllm.models.types import GGUFVariant, ModelInfo
77

@@ -493,6 +493,129 @@ def test_full_gpu_estimated_ranks_above_partial_direct():
493493
assert results[0].model.id == "Qwen/Qwen3-8B-AWQ"
494494

495495

496+
def test_strong_partial_offload_not_buried_below_weaker_full_gpu():
497+
strong_partial = ModelInfo(
498+
id="Qwen/Qwen3.6-27B",
499+
family_id="qwen3.6-27b",
500+
name="Qwen3.6-27B",
501+
parameter_count=27_800_000_000,
502+
downloads=5_300_000,
503+
likes=10_000,
504+
gguf_variants=[
505+
GGUFVariant(
506+
filename="qwen3.6-27b-q4_k_m.gguf",
507+
quant_type="Q4_K_M",
508+
file_size_bytes=15 * 1024**3,
509+
)
510+
],
511+
)
512+
full_gpu_14b = ModelInfo(
513+
id="Qwen/Qwen3-14B",
514+
family_id="qwen3-14b",
515+
name="Qwen3-14B",
516+
parameter_count=14_800_000_000,
517+
downloads=1_600_000,
518+
likes=5_000,
519+
gguf_variants=[
520+
GGUFVariant(
521+
filename="qwen3-14b-q5_k_m.gguf",
522+
quant_type="Q5_K_M",
523+
file_size_bytes=9 * 1024**3,
524+
)
525+
],
526+
)
527+
full_gpu_8b = ModelInfo(
528+
id="Qwen/Qwen3-8B",
529+
family_id="qwen3-8b",
530+
name="Qwen3-8B",
531+
parameter_count=8_200_000_000,
532+
downloads=11_000_000,
533+
likes=5_000,
534+
gguf_variants=[
535+
GGUFVariant(
536+
filename="qwen3-8b-q5_k_m.gguf",
537+
quant_type="Q5_K_M",
538+
file_size_bytes=5 * 1024**3,
539+
)
540+
],
541+
)
542+
old_full_gpu = ModelInfo(
543+
id="google/gemma-2-9b-it",
544+
family_id="gemma-2-9b-it",
545+
name="gemma-2-9b-it",
546+
parameter_count=9_200_000_000,
547+
downloads=400_000,
548+
likes=1_000,
549+
gguf_variants=[
550+
GGUFVariant(
551+
filename="gemma-2-9b-q5_k_m.gguf",
552+
quant_type="Q5_K_M",
553+
file_size_bytes=5_500_000_000,
554+
)
555+
],
556+
)
557+
hardware = HardwareInfo(
558+
gpus=[
559+
GPUInfo(
560+
name="RTX 3060",
561+
vendor="nvidia",
562+
vram_bytes=12 * 1024**3,
563+
compute_capability=(8, 6),
564+
memory_bandwidth_gbps=360.0,
565+
)
566+
],
567+
cpu_name="Test CPU",
568+
cpu_cores=6,
569+
has_avx2=True,
570+
ram_bytes=32 * 1024**3,
571+
disk_free_bytes=500 * 1024**3,
572+
os="windows",
573+
)
574+
575+
results = rank_models(
576+
[strong_partial, full_gpu_14b, full_gpu_8b, old_full_gpu],
577+
hardware,
578+
top_n=10,
579+
benchmark_scores={
580+
"Qwen/Qwen3.6-27B": 83.5,
581+
"Qwen/Qwen3-14B": 66.7,
582+
"Qwen/Qwen3-8B": 56.1,
583+
"google/gemma-2-9b-it": 35.1,
584+
},
585+
task_profile="any",
586+
)
587+
588+
ids = [r.model.id for r in results]
589+
assert ids.index("Qwen/Qwen3.6-27B") < ids.index("Qwen/Qwen3-8B")
590+
assert ids.index("Qwen/Qwen3.6-27B") < ids.index("google/gemma-2-9b-it")
591+
strong = next(r for r in results if r.model.id == "Qwen/Qwen3.6-27B")
592+
assert strong.fit_type == "partial_offload"
593+
assert (
594+
strong.quality_score
595+
> next(r for r in results if r.model.id == "Qwen/Qwen3-8B").quality_score
596+
)
597+
598+
599+
def test_moe_partial_offload_penalty_uses_active_working_set():
600+
dense = ModelInfo(
601+
id="example/Dense-30B",
602+
family_id="dense-30b",
603+
name="Dense-30B",
604+
parameter_count=30_000_000_000,
605+
)
606+
moe = ModelInfo(
607+
id="example/MoE-30B-A3B",
608+
family_id="moe-30b-a3b",
609+
name="MoE-30B-A3B",
610+
parameter_count=30_000_000_000,
611+
parameter_count_active=3_000_000_000,
612+
is_moe=True,
613+
)
614+
615+
assert _partial_offload_quality_factor(dense, 0.80) == 0.42
616+
assert _partial_offload_quality_factor(moe, 0.80) >= 0.66
617+
618+
496619
def test_evidence_strict_filters_out_estimated_models():
497620
direct_model = ModelInfo(
498621
id="Qwen/Qwen2.5-7B-Instruct",

0 commit comments

Comments
 (0)