fix: stop double-counting partial offload in ranking (#108)

Andyyyy64 · web-flow · commit 31c0334af3b4 · 2026-06-11T16:46:24.000+09:00
diff --git a/docs/how-it-works.md b/docs/how-it-works.md
@@ -168,9 +168,10 @@ For each candidate variant:
 5. Compute a quality score.
 6. Keep the best variant for the model family.
 
-The final sorting key includes the quality score, a fit bonus, and a small
-direct-benchmark bonus. Full-GPU candidates are preferred over comparable
-partial-offload candidates because they are usually more responsive in practice.
+The final sorting key stays close to the displayed quality score, adjusted by a
+small direct-benchmark bonus, a CPU-only penalty, and a context-fit penalty.
+Full-GPU candidates are already favored inside the score through the runtime-fit
+and speed adjustments, so the sort key does not add a second full-GPU bonus.
 
 See [Scoring](scoring.md) for the score details.
 
diff --git a/docs/scoring.md b/docs/scoring.md
@@ -99,20 +99,17 @@ The candidate's runtime form matters:
 | Fit | Multiplier |
 | --- | ---: |
 | Full GPU | `1.00` |
-| Partial offload | `0.72` |
+| Partial offload | `0.42`-`0.88`, based on spill ratio |
 | CPU-only | `0.50` |
 
-The final family selection key also adds a fit bonus:
+Light partial offload is penalized less than heavy offload. MoE models receive
+a milder penalty when the active parameter working set can plausibly stay on
+GPU while inactive experts spill to CPU RAM.
 
-| Fit | Bonus |
-| --- | ---: |
-| Full GPU | `+15` |
-| Partial offload | `0` |
-| CPU-only | `-15` |
-
-This keeps a responsive full-GPU result ahead of a similar partial-offload
-result, without letting a very weak full-GPU model beat a much stronger model
-that only needs modest offload.
+The final family selection key does not add a separate full-GPU bonus. Runtime
+fit is already reflected in the quality score through the multiplier above and
+the speed adjustment below. CPU-only results always take a small extra sort
+penalty (`-6`), which only matters when ranked against GPU-backed candidates.
 
 ## Speed adjustment
 
diff --git a/src/whichllm/engine/ranker.py b/src/whichllm/engine/ranker.py
@@ -41,28 +41,20 @@ def _family_selection_key(
 ) -> tuple[float]:
     """Family-level selection key — single composite score.
 
-    Combines quality, fit type, and evidence tier into one number so the
-    sort is fully transitive and edge cases resolve sensibly:
-
-    - ``fit_bonus`` (+15 / 0 / -15) is large enough that "estimated,
-      full-GPU" still beats "direct, partial-offload" of comparable
-      quality (users on small VRAM prefer the responsive option),
-      but small enough that a quality-17 Q1_0 full-GPU model loses to
-      a quality-57 partial-offload 27B model
+    ``quality_score`` already includes the runtime fit penalty and speed
+    adjustment, so the key stays close to that displayed score and strong
+    partial-offload candidates are not discounted again. Sort adjustments include:
+
     - ``direct_bonus`` (+5) gives independent leaderboard evidence a
       small edge at the same fit; cannot overturn a 6+ point quality gap
     """
-    fit_bonus = {
-        "full_gpu": 15.0,
-        "partial_offload": 0.0,
-        "cpu_only": -15.0,
-    }.get(result.fit_type, -15.0)
     if require_direct_top and result.benchmark_status == "direct":
         direct_bonus = 5.0
     else:
         direct_bonus = 0.0
+    cpu_penalty = -6.0 if result.fit_type == "cpu_only" else 0.0
     ctx_penalty = -20.0 if not result.context_fits else 0.0
-    return (result.quality_score + fit_bonus + direct_bonus + ctx_penalty,)
+    return (result.quality_score + direct_bonus + cpu_penalty + ctx_penalty,)
 
 
 def _partial_offload_quality_factor(model: ModelInfo, offload_ratio: float) -> float:
@@ -74,14 +66,36 @@ def _partial_offload_quality_factor(model: ModelInfo, offload_ratio: float) -> f
         factor = 0.52
     elif ratio >= 0.40:
         factor = 0.62
+    elif ratio >= 0.25:
+        factor = 0.76
     else:
-        factor = 0.72
+        factor = 0.86
 
     # MoE offload is more nuanced: inactive experts and router/runtime
-    # placement do not hurt equally. Keep the penalty, but do not treat it
-    # as badly as dense-layer offload.
+    # placement do not hurt equally. If the GPU can plausibly hold the
+    # active expert working set, do not treat inactive-expert spill like
+    # dense-layer spill.
     if model.is_moe and model.parameter_count_active:
-        factor = min(0.72, factor + 0.08)
+        active_ratio = (
+            model.parameter_count_active / model.parameter_count
+            if model.parameter_count > 0
+            else 1.0
+        )
+        active_ratio = max(0.0, min(1.0, active_ratio))
+        active_set_fits = ratio <= max(0.0, 1.0 - active_ratio)
+        if active_set_fits:
+            if ratio >= 0.75:
+                factor = max(factor, 0.66)
+            elif ratio >= 0.60:
+                factor = max(factor, 0.70)
+            elif ratio >= 0.40:
+                factor = max(factor, 0.76)
+            elif ratio >= 0.25:
+                factor = max(factor, 0.82)
+            else:
+                factor = max(factor, 0.88)
+        else:
+            factor = min(0.76, factor + 0.08)
 
     return factor
 
diff --git a/tests/test_ranker.py b/tests/test_ranker.py
@@ -1,7 +1,7 @@
 """Tests for ranking behavior."""
 
 from whichllm.engine.quantization import effective_quant_type
-from whichllm.engine.ranker import rank_models
+from whichllm.engine.ranker import _partial_offload_quality_factor, rank_models
 from whichllm.hardware.types import GPUInfo, HardwareInfo
 from whichllm.models.types import GGUFVariant, ModelInfo
 
@@ -493,6 +493,129 @@ def test_full_gpu_estimated_ranks_above_partial_direct():
     assert results[0].model.id == "Qwen/Qwen3-8B-AWQ"
 
 
+def test_strong_partial_offload_not_buried_below_weaker_full_gpu():
+    strong_partial = ModelInfo(
+        id="Qwen/Qwen3.6-27B",
+        family_id="qwen3.6-27b",
+        name="Qwen3.6-27B",
+        parameter_count=27_800_000_000,
+        downloads=5_300_000,
+        likes=10_000,
+        gguf_variants=[
+            GGUFVariant(
+                filename="qwen3.6-27b-q4_k_m.gguf",
+                quant_type="Q4_K_M",
+                file_size_bytes=15 * 1024**3,
+            )
+        ],
+    )
+    full_gpu_14b = ModelInfo(
+        id="Qwen/Qwen3-14B",
+        family_id="qwen3-14b",
+        name="Qwen3-14B",
+        parameter_count=14_800_000_000,
+        downloads=1_600_000,
+        likes=5_000,
+        gguf_variants=[
+            GGUFVariant(
+                filename="qwen3-14b-q5_k_m.gguf",
+                quant_type="Q5_K_M",
+                file_size_bytes=9 * 1024**3,
+            )
+        ],
+    )
+    full_gpu_8b = ModelInfo(
+        id="Qwen/Qwen3-8B",
+        family_id="qwen3-8b",
+        name="Qwen3-8B",
+        parameter_count=8_200_000_000,
+        downloads=11_000_000,
+        likes=5_000,
+        gguf_variants=[
+            GGUFVariant(
+                filename="qwen3-8b-q5_k_m.gguf",
+                quant_type="Q5_K_M",
+                file_size_bytes=5 * 1024**3,
+            )
+        ],
+    )
+    old_full_gpu = ModelInfo(
+        id="google/gemma-2-9b-it",
+        family_id="gemma-2-9b-it",
+        name="gemma-2-9b-it",
+        parameter_count=9_200_000_000,
+        downloads=400_000,
+        likes=1_000,
+        gguf_variants=[
+            GGUFVariant(
+                filename="gemma-2-9b-q5_k_m.gguf",
+                quant_type="Q5_K_M",
+                file_size_bytes=5_500_000_000,
+            )
+        ],
+    )
+    hardware = HardwareInfo(
+        gpus=[
+            GPUInfo(
+                name="RTX 3060",
+                vendor="nvidia",
+                vram_bytes=12 * 1024**3,
+                compute_capability=(8, 6),
+                memory_bandwidth_gbps=360.0,
+            )
+        ],
+        cpu_name="Test CPU",
+        cpu_cores=6,
+        has_avx2=True,
+        ram_bytes=32 * 1024**3,
+        disk_free_bytes=500 * 1024**3,
+        os="windows",
+    )
+
+    results = rank_models(
+        [strong_partial, full_gpu_14b, full_gpu_8b, old_full_gpu],
+        hardware,
+        top_n=10,
+        benchmark_scores={
+            "Qwen/Qwen3.6-27B": 83.5,
+            "Qwen/Qwen3-14B": 66.7,
+            "Qwen/Qwen3-8B": 56.1,
+            "google/gemma-2-9b-it": 35.1,
+        },
+        task_profile="any",
+    )
+
+    ids = [r.model.id for r in results]
+    assert ids.index("Qwen/Qwen3.6-27B") < ids.index("Qwen/Qwen3-8B")
+    assert ids.index("Qwen/Qwen3.6-27B") < ids.index("google/gemma-2-9b-it")
+    strong = next(r for r in results if r.model.id == "Qwen/Qwen3.6-27B")
+    assert strong.fit_type == "partial_offload"
+    assert (
+        strong.quality_score
+        > next(r for r in results if r.model.id == "Qwen/Qwen3-8B").quality_score
+    )
+
+
+def test_moe_partial_offload_penalty_uses_active_working_set():
+    dense = ModelInfo(
+        id="example/Dense-30B",
+        family_id="dense-30b",
+        name="Dense-30B",
+        parameter_count=30_000_000_000,
+    )
+    moe = ModelInfo(
+        id="example/MoE-30B-A3B",
+        family_id="moe-30b-a3b",
+        name="MoE-30B-A3B",
+        parameter_count=30_000_000_000,
+        parameter_count_active=3_000_000_000,
+        is_moe=True,
+    )
+
+    assert _partial_offload_quality_factor(dense, 0.80) == 0.42
+    assert _partial_offload_quality_factor(moe, 0.80) >= 0.66
+
+
 def test_evidence_strict_filters_out_estimated_models():
     direct_model = ModelInfo(
         id="Qwen/Qwen2.5-7B-Instruct",