Skip to content

Commit 0aaf257

Browse files
committed
fix: harden GPU classifier matching for shared device_ids, fix awk injection and pcie_width parsing in amd-topo
1 parent d8ddf7a commit 0aaf257

File tree

3 files changed

+114
-15
lines changed

3 files changed

+114
-15
lines changed

dream-server/installers/lib/amd-topo.sh

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -376,7 +376,7 @@ detect_amd_topo() {
376376

377377
# Read the PCI device ID from sysfs and strip the leading "0x".
# sed opens the file itself (instead of `cat | sed`) so that a missing or
# unreadable file produces a non-zero status and the "0000" fallback really
# fires; with the pipe, sed's exit status of 0 masked cat's failure unless
# pipefail was enabled.
device_id=$(sed 's/^0x//' "$card_dir/device" 2>/dev/null) || device_id="0000"
# An empty file would otherwise leave device_id blank; use the same fallback.
[[ -n "$device_id" ]] || device_id="0000"
378378
vram_bytes=$(cat "$card_dir/mem_info_vram_total" 2>/dev/null) || vram_bytes=0
379-
vram_gb=$(awk "BEGIN { printf \"%.1f\", $vram_bytes / 1073741824 }")
379+
vram_gb=$(awk -v bytes="$vram_bytes" 'BEGIN { printf "%.1f", bytes / 1073741824 }')
380380

381381
uuid=$(amd_gpu_id "$card_dir" "$idx")
382382
gfx_ver=$(amd_gfx_version "$card_dir" "$idx")
@@ -387,10 +387,10 @@ detect_amd_topo() {
387387
pci_bdf=$(readlink -f "$card_dir" | grep -oP '[0-9a-f]{4}:[0-9a-f]{2}:[0-9a-f]{2}\.[0-9]' | tail -1) || pci_bdf="unknown"
388388
pcie_gen=$(cat "$card_dir/current_link_speed" 2>/dev/null | grep -oP '^\d+' || \
389389
cat "$card_dir/max_link_speed" 2>/dev/null | grep -oP '^\d+' || echo "unknown")
390-
pcie_width=$(cat "$card_dir/current_link_width" 2>/dev/null || \
391-
cat "$card_dir/max_link_width" 2>/dev/null || echo "unknown")
392-
[[ "$pcie_width" == "0" || "$pcie_width" == "Unknown" ]] && \
393-
pcie_width=$(cat "$card_dir/max_link_width" 2>/dev/null || echo "unknown")
390+
pcie_width=$(cat "$card_dir/current_link_width" 2>/dev/null | grep -oP '^\d+' || \
391+
cat "$card_dir/max_link_width" 2>/dev/null | grep -oP '^\d+' || echo "unknown")
392+
[[ "$pcie_width" == "0" ]] && \
393+
pcie_width=$(cat "$card_dir/max_link_width" 2>/dev/null | grep -oP '^\d+' || echo "unknown")
394394

395395
# Detect memory type per card
396396
local gtt_bytes mem_type

dream-server/scripts/classify-hardware.sh

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,8 @@ OVERLAY_MAP = {
8383
8484
# --- Pass 1: Match known_gpus by device_id then name_patterns ---
8585
selected = None
86+
best_name_len = 0 # longest matching pattern wins (prevents "XT" matching "XTX")
87+
best_id_vram_diff = None # closest VRAM wins for device_id-only fallback
8688
combined_name = f"{gpu_name} {cpu_name}".strip().lower()
8789
8890
for entry in db.get("known_gpus", []):
@@ -94,20 +96,30 @@ for entry in db.get("known_gpus", []):
9496
9597
# Try name_patterns match (case-insensitive substring against gpu_name + cpu_name)
9698
patterns = match.get("name_patterns", [])
97-
name_matched = any(p.lower() in combined_name for p in patterns) if combined_name and patterns else False
99+
matched_patterns = [p for p in patterns if p.lower() in combined_name] if combined_name and patterns else []
100+
name_matched = len(matched_patterns) > 0
101+
match_len = max((len(p) for p in matched_patterns), default=0)
98102
99103
if id_matched and name_matched:
100-
# Best match: both device_id and name match
101-
selected = entry
102-
break
103-
elif id_matched and not selected:
104-
# Device ID matched but name didn't — remember as fallback
105-
selected = entry
106-
# Keep looking for a better match with same device_id
107-
continue
104+
# Both match — prefer longest pattern to avoid "XT" matching "XTX"
105+
if match_len > best_name_len:
106+
selected = entry
107+
best_name_len = match_len
108+
elif id_matched and best_name_len == 0:
109+
# Device ID matched but name didn't — use VRAM proximity as tiebreaker
110+
entry_vram = entry.get("specs", {}).get("memory_mb", 0)
111+
if vram_mb > 0:
112+
diff = abs(entry_vram - vram_mb)
113+
else:
114+
# No VRAM info: prefer smallest card (under-provision is safe,
115+
# over-provision crashes the model loader)
116+
diff = entry_vram if entry_vram > 0 else float("inf")
117+
if best_id_vram_diff is None or diff < best_id_vram_diff:
118+
selected = entry
119+
best_id_vram_diff = diff
108120
elif name_matched and not selected:
109121
selected = entry
110-
break
122+
best_name_len = match_len
111123
112124
# --- Pass 2: Heuristic fallback (threshold-based, top-down) ---
113125
if not selected:

dream-server/tests/contracts/test-installer-contracts.sh

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,4 +52,91 @@ grep -q 'MINIO_TELEMETRY_DISABLED.*1' extensions/services/langfuse/compose.yaml.
5252
grep -q 'MINIO_TELEMETRY_DISABLED.*1' extensions/services/langfuse/compose.yaml 2>/dev/null || \
5353
{ echo "[FAIL] MinIO telemetry not disabled"; exit 1; }
5454

55+
# --- classify-hardware: shared device_id disambiguation ---
56+
echo "[contract] classify-hardware shared device_id"
57+
_classify() {
58+
bash scripts/classify-hardware.sh --device-id "$1" --gpu-name "$2" --gpu-vendor "${3:-amd}" --vram-mb "${4:-0}" 2>/dev/null
59+
}
60+
_classify_id() { _classify "$@" | jq -r '.id'; }
61+
_classify_tier() { _classify "$@" | jq -r '.recommended.tier'; }
62+
_classify_bw() { _classify "$@" | jq -r '.bandwidth_gbps'; }
63+
64+
# --- 0x744c: XTX / XT / GRE (same die, different SKUs) ---
65+
66+
# Happy path: device_id + name → exact match
67+
[[ "$(_classify_id 0x744c "AMD Radeon RX 7900 XTX" amd 24576)" == "rx_7900_xtx" ]] \
68+
|| { echo "[FAIL] XTX with name"; exit 1; }
69+
[[ "$(_classify_id 0x744c "AMD Radeon RX 7900 XT" amd 20480)" == "rx_7900_xt" ]] \
70+
|| { echo "[FAIL] XT with name"; exit 1; }
71+
[[ "$(_classify_id 0x744c "AMD Radeon RX 7900 GRE" amd 16384)" == "rx_7900_gre" ]] \
72+
|| { echo "[FAIL] GRE with name"; exit 1; }
73+
74+
# Substring safety: "RX 7900 XT" is a substring of "RX 7900 XTX"
75+
# XT name must NOT match XTX entry (longest pattern wins)
76+
[[ "$(_classify_id 0x744c "AMD Radeon RX 7900 XT" amd 20480)" != "rx_7900_xtx" ]] \
77+
|| { echo "[FAIL] XT matched XTX (substring collision)"; exit 1; }
78+
# XTX name must NOT match XT entry
79+
[[ "$(_classify_id 0x744c "AMD Radeon RX 7900 XTX" amd 24576)" != "rx_7900_xt" ]] \
80+
|| { echo "[FAIL] XTX matched XT"; exit 1; }
81+
82+
# Tier correctness: GRE is T2, the others are T3
83+
[[ "$(_classify_tier 0x744c "AMD Radeon RX 7900 XTX" amd 24576)" == "T3" ]] \
84+
|| { echo "[FAIL] XTX tier"; exit 1; }
85+
[[ "$(_classify_tier 0x744c "AMD Radeon RX 7900 GRE" amd 16384)" == "T2" ]] \
86+
|| { echo "[FAIL] GRE tier"; exit 1; }
87+
88+
# Bandwidth correctness: each SKU has a different value
89+
[[ "$(_classify_bw 0x744c "AMD Radeon RX 7900 XTX" amd 24576)" == "960" ]] \
90+
|| { echo "[FAIL] XTX bandwidth"; exit 1; }
91+
[[ "$(_classify_bw 0x744c "AMD Radeon RX 7900 XT" amd 20480)" == "800" ]] \
92+
|| { echo "[FAIL] XT bandwidth"; exit 1; }
93+
[[ "$(_classify_bw 0x744c "AMD Radeon RX 7900 GRE" amd 16384)" == "576" ]] \
94+
|| { echo "[FAIL] GRE bandwidth"; exit 1; }
95+
96+
# Empty name: VRAM tiebreaker picks closest match
97+
[[ "$(_classify_id 0x744c "" amd 24576)" == "rx_7900_xtx" ]] \
98+
|| { echo "[FAIL] empty name + 24GB → XTX"; exit 1; }
99+
[[ "$(_classify_id 0x744c "" amd 20480)" == "rx_7900_xt" ]] \
100+
|| { echo "[FAIL] empty name + 20GB → XT"; exit 1; }
101+
[[ "$(_classify_id 0x744c "" amd 16384)" == "rx_7900_gre" ]] \
102+
|| { echo "[FAIL] empty name + 16GB → GRE"; exit 1; }
103+
104+
# Empty name + zero VRAM: picks smallest card (under-provision is safe,
105+
# over-provision would crash the model loader)
106+
[[ "$(_classify_id 0x744c "" amd 0)" == "rx_7900_gre" ]] \
107+
|| { echo "[FAIL] empty name + 0 VRAM → should be GRE (smallest)"; exit 1; }
108+
109+
# Empty name + close-but-not-exact VRAM: picks nearest
110+
# 22000 MB is closer to XT (20480, diff=1520) than XTX (24576, diff=2576)
111+
[[ "$(_classify_id 0x744c "" amd 22000)" == "rx_7900_xt" ]] \
112+
|| { echo "[FAIL] empty name + 22GB → should be XT (nearest)"; exit 1; }
113+
# 18000 MB is closer to GRE (16384, diff=1616) than XT (20480, diff=2480)
114+
[[ "$(_classify_id 0x744c "" amd 18000)" == "rx_7900_gre" ]] \
115+
|| { echo "[FAIL] empty name + 18GB → should be GRE (nearest)"; exit 1; }
116+
117+
# --- 0x7480: RX 7800 XT / RX 7700 XT (second shared device_id pair) ---
118+
119+
[[ "$(_classify_id 0x7480 "AMD Radeon RX 7800 XT" amd 16384)" == "rx_7800_xt" ]] \
120+
|| { echo "[FAIL] 7800 XT with name"; exit 1; }
121+
[[ "$(_classify_id 0x7480 "AMD Radeon RX 7700 XT" amd 12288)" == "rx_7700_xt" ]] \
122+
|| { echo "[FAIL] 7700 XT with name"; exit 1; }
123+
[[ "$(_classify_id 0x7480 "" amd 16384)" == "rx_7800_xt" ]] \
124+
|| { echo "[FAIL] 0x7480 empty name + 16GB → 7800 XT"; exit 1; }
125+
[[ "$(_classify_id 0x7480 "" amd 12288)" == "rx_7700_xt" ]] \
126+
|| { echo "[FAIL] 0x7480 empty name + 12GB → 7700 XT"; exit 1; }
127+
128+
# --- Name-only match (no device_id) ---
129+
130+
[[ "$(_classify_id "" "RYZEN AI MAX+ 395" amd 0)" == "strix_halo_395" ]] \
131+
|| { echo "[FAIL] Strix Halo name-only match"; exit 1; }
132+
[[ "$(_classify_id "" "RX 9070 XT" amd 16384)" == "rx_9070_xt" ]] \
133+
|| { echo "[FAIL] RX 9070 XT name-only match"; exit 1; }
134+
135+
# --- No match → heuristic fallback (should not crash) ---
136+
137+
result=$(_classify_id "0xFFFF" "Unknown GPU" amd 8192)
138+
[[ -n "$result" && "$result" != "null" ]] \
139+
|| { echo "[FAIL] unknown GPU crashed"; exit 1; }
140+
141+
55142
echo "[PASS] installer contracts"

0 commit comments

Comments
 (0)