|
1 | 1 | """Tests for ranking behavior.""" |
2 | 2 |
|
3 | 3 | from whichllm.engine.quantization import effective_quant_type |
4 | | -from whichllm.engine.ranker import rank_models |
| 4 | +from whichllm.engine.ranker import _partial_offload_quality_factor, rank_models |
5 | 5 | from whichllm.hardware.types import GPUInfo, HardwareInfo |
6 | 6 | from whichllm.models.types import GGUFVariant, ModelInfo |
7 | 7 |
|
@@ -493,6 +493,129 @@ def test_full_gpu_estimated_ranks_above_partial_direct(): |
493 | 493 | assert results[0].model.id == "Qwen/Qwen3-8B-AWQ" |
494 | 494 |
|
495 | 495 |
|
def test_strong_partial_offload_not_buried_below_weaker_full_gpu():
    """A high-benchmark model needing partial offload must outrank weaker ones.

    The hardware is a single GPU with 12 GiB of VRAM. The 27B candidate ships
    a 15 GiB GGUF file (larger than that VRAM) and carries the best benchmark
    score; the remaining candidates have files smaller than the VRAM size and
    lower benchmark scores. The test requires the 27B model to be ranked ahead
    of the 8B and gemma candidates, to be reported as ``partial_offload``, and
    to have a higher ``quality_score`` than the 8B model.
    """
    gib = 1024**3

    strong_id = "Qwen/Qwen3.6-27B"
    qwen14_id = "Qwen/Qwen3-14B"
    qwen8_id = "Qwen/Qwen3-8B"
    gemma_id = "google/gemma-2-9b-it"

    # Candidate whose only GGUF file (15 GiB) exceeds the 12 GiB of VRAM.
    candidate_27b = ModelInfo(
        id=strong_id,
        family_id="qwen3.6-27b",
        name="Qwen3.6-27B",
        parameter_count=27_800_000_000,
        downloads=5_300_000,
        likes=10_000,
        gguf_variants=[
            GGUFVariant(
                filename="qwen3.6-27b-q4_k_m.gguf",
                quant_type="Q4_K_M",
                file_size_bytes=15 * gib,
            )
        ],
    )
    # The three candidates below all have files smaller than the VRAM size.
    candidate_14b = ModelInfo(
        id=qwen14_id,
        family_id="qwen3-14b",
        name="Qwen3-14B",
        parameter_count=14_800_000_000,
        downloads=1_600_000,
        likes=5_000,
        gguf_variants=[
            GGUFVariant(
                filename="qwen3-14b-q5_k_m.gguf",
                quant_type="Q5_K_M",
                file_size_bytes=9 * gib,
            )
        ],
    )
    candidate_8b = ModelInfo(
        id=qwen8_id,
        family_id="qwen3-8b",
        name="Qwen3-8B",
        parameter_count=8_200_000_000,
        downloads=11_000_000,
        likes=5_000,
        gguf_variants=[
            GGUFVariant(
                filename="qwen3-8b-q5_k_m.gguf",
                quant_type="Q5_K_M",
                file_size_bytes=5 * gib,
            )
        ],
    )
    candidate_gemma = ModelInfo(
        id=gemma_id,
        family_id="gemma-2-9b-it",
        name="gemma-2-9b-it",
        parameter_count=9_200_000_000,
        downloads=400_000,
        likes=1_000,
        gguf_variants=[
            GGUFVariant(
                filename="gemma-2-9b-q5_k_m.gguf",
                quant_type="Q5_K_M",
                file_size_bytes=5_500_000_000,
            )
        ],
    )

    rtx_3060 = GPUInfo(
        name="RTX 3060",
        vendor="nvidia",
        vram_bytes=12 * gib,
        compute_capability=(8, 6),
        memory_bandwidth_gbps=360.0,
    )
    machine = HardwareInfo(
        gpus=[rtx_3060],
        cpu_name="Test CPU",
        cpu_cores=6,
        has_avx2=True,
        ram_bytes=32 * gib,
        disk_free_bytes=500 * gib,
        os="windows",
    )

    scores = {
        strong_id: 83.5,
        qwen14_id: 66.7,
        qwen8_id: 56.1,
        gemma_id: 35.1,
    }
    ranked = rank_models(
        [candidate_27b, candidate_14b, candidate_8b, candidate_gemma],
        machine,
        top_n=10,
        benchmark_scores=scores,
        task_profile="any",
    )

    def result_for(model_id):
        # First ranked entry for the given model id.
        return next(entry for entry in ranked if entry.model.id == model_id)

    ranked_ids = [entry.model.id for entry in ranked]
    assert ranked_ids.index(strong_id) < ranked_ids.index(qwen8_id)
    assert ranked_ids.index(strong_id) < ranked_ids.index(gemma_id)

    strong_result = result_for(strong_id)
    assert strong_result.fit_type == "partial_offload"
    assert strong_result.quality_score > result_for(qwen8_id).quality_score
| 597 | + |
| 598 | + |
def test_moe_partial_offload_penalty_uses_active_working_set():
    """An MoE model must receive a milder partial-offload factor than a dense one.

    Both models declare 30B total parameters; the MoE model additionally
    declares 3B active parameters and ``is_moe=True``. Called with the same
    second argument (0.80), the dense model must yield a factor of 0.42 and
    the MoE model a factor of at least 0.66.
    """
    import math

    total_params = 30_000_000_000
    # NOTE(review): passed positionally; presumably a VRAM/offload ratio --
    # confirm against the signature of _partial_offload_quality_factor.
    ratio = 0.80

    dense = ModelInfo(
        id="example/Dense-30B",
        family_id="dense-30b",
        name="Dense-30B",
        parameter_count=total_params,
    )
    moe = ModelInfo(
        id="example/MoE-30B-A3B",
        family_id="moe-30b-a3b",
        name="MoE-30B-A3B",
        parameter_count=total_params,
        parameter_count_active=3_000_000_000,
        is_moe=True,
    )

    dense_factor = _partial_offload_quality_factor(dense, ratio)
    moe_factor = _partial_offload_quality_factor(moe, ratio)

    # Compare with a tight tolerance instead of ``==``: exact float equality
    # would break as soon as the factor is produced by arithmetic rather than
    # returned as a literal constant.
    assert math.isclose(dense_factor, 0.42, rel_tol=0.0, abs_tol=1e-9)
    assert moe_factor >= 0.66
| 617 | + |
| 618 | + |
496 | 619 | def test_evidence_strict_filters_out_estimated_models(): |
497 | 620 | direct_model = ModelInfo( |
498 | 621 | id="Qwen/Qwen2.5-7B-Instruct", |
|
0 commit comments