Skip to content

Commit 4325784

Browse files
committed
feat(throughput): warn on no-fit deploys and refine TP decode scaling
- surface a non-blocking "model does not fit" warning on the deploy page when the high-confidence estimate leaves no room for the KV cache; `Deploy` stays enabled since the user may pick more GPUs per replica than the estimate assumed, and the warning is hidden when `fp8Blocked` already explains a blocking reason
- step `tpDecodeEfficiency` down by TP group size (1.0 for TP1, 0.85 for TP2-4, 0.75 for TP>4) instead of a flat 0.85, so large tensor-parallel groups crossing NVLink domains are not over-estimated
- add tests for the no-fit warning and the new efficiency tiers

Signed-off-by: Suraj Deshmukh <suraj.deshmukh@microsoft.com>
1 parent 6c7b73f commit 4325784

5 files changed

Lines changed: 168 additions & 11 deletions

File tree

backend/src/services/gpuPerformance.test.ts

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,9 @@ import {
55
deriveTpSizeToFitWeights,
66
estimatePerChatTokensPerSec,
77
estimateConcurrentCapacity,
8+
tpDecodeEfficiency,
89
TP_DECODE_EFFICIENCY,
10+
TP_DECODE_EFFICIENCY_LARGE,
911
} from './gpuPerformance';
1012
import { resolveModelParamCount } from '@airunway/shared';
1113
import { gpuSupportsFp8 } from './costEstimation';
@@ -252,6 +254,36 @@ describe('estimatePerChatTokensPerSec', () => {
252254
});
253255
expect(explicit).toBe(omitted);
254256
});
257+
258+
test('tpDecodeEfficiency steps down with TP group size', () => {
259+
// TP1: no cross-GPU all-reduce on the decode path.
260+
expect(tpDecodeEfficiency(1)).toBe(1);
261+
// TP2–4: single NVLink-domain mid tier.
262+
expect(tpDecodeEfficiency(2)).toBe(TP_DECODE_EFFICIENCY);
263+
expect(tpDecodeEfficiency(4)).toBe(TP_DECODE_EFFICIENCY);
264+
// TP>4: crosses domains / nodes → larger haircut.
265+
expect(tpDecodeEfficiency(8)).toBe(TP_DECODE_EFFICIENCY_LARGE);
266+
expect(tpDecodeEfficiency(16)).toBe(TP_DECODE_EFFICIENCY_LARGE);
267+
// Lower tier is the optimistic bound.
268+
expect(TP_DECODE_EFFICIENCY).toBeGreaterThan(TP_DECODE_EFFICIENCY_LARGE);
269+
});
270+
271+
test('large TP groups (>4) use the reduced decode efficiency tier', () => {
272+
const single = estimatePerChatTokensPerSec({
273+
paramCount: 70e9,
274+
bytesPerWeight: 2,
275+
memBandwidthGBs: 3350,
276+
tpSize: 1,
277+
});
278+
const octa = estimatePerChatTokensPerSec({
279+
paramCount: 70e9,
280+
bytesPerWeight: 2,
281+
memBandwidthGBs: 3350,
282+
tpSize: 8,
283+
});
284+
// TP8 scales by 8 × TP_DECODE_EFFICIENCY_LARGE, not the mid-tier 0.85.
285+
expect(octa / single).toBeCloseTo(8 * TP_DECODE_EFFICIENCY_LARGE, 5);
286+
});
255287
});
256288

257289
describe('estimateConcurrentCapacity', () => {

backend/src/services/gpuPerformance.ts

Lines changed: 41 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -28,16 +28,46 @@ export const MEM_BW_EFFICIENCY = 0.8;
2828
* Tensor-parallel decode scaling efficiency. Under TP the weights are sharded
2929
* across `tpSize` GPUs whose HBM bandwidth aggregates, so single-stream decode
3030
* speeds up ~`tpSize×` — minus a per-GPU haircut for all-reduce / interconnect
31-
* overhead. Aggregate effective bandwidth ≈ `tpSize × perGpuBW × this`.
31+
* overhead. Aggregate effective bandwidth ≈ `tpSize × perGpuBW × efficiency`.
3232
*
33-
* Caveat: this is a flat factor, independent of `tpSize` and interconnect. Real
34-
* single-stream TP decode is latency/communication bound, so the per-GPU haircut
35-
* grows with larger TP groups (and is worse across PCIe / multi-node than NVLink).
36-
* 0.85 is therefore optimistic for big TP groups; treat it as a rough upper bound,
37-
* consistent with the estimator's overall heuristic, disclaimer-backed nature.
33+
* Real single-stream TP decode is latency/communication bound, so the per-GPU
34+
* haircut grows with larger TP groups (and is worse across PCIe / multi-node
35+
* than NVLink). We therefore step the factor down with group size rather than
36+
* applying a single flat value (see `tpDecodeEfficiency`): small groups stay on
37+
* one NVLink domain, while big groups cross domains / nodes and lose more to
38+
* communication. These remain coarse heuristics, consistent with the
39+
* estimator's overall disclaimer-backed nature.
40+
*
41+
* `TP_DECODE_EFFICIENCY` is the mid-tier value (TP 2–4, a typical single NVLink
42+
* domain). Larger groups use `TP_DECODE_EFFICIENCY_LARGE`.
3843
*/
3944
export const TP_DECODE_EFFICIENCY = 0.85;
4045

46+
/**
47+
* Decode efficiency for large TP groups (more than 4 GPUs per replica). Beyond a
48+
* typical 4-GPU NVLink domain the all-reduce traffic increasingly crosses slower
49+
* links (multi-domain NVSwitch, PCIe, or multi-node fabric), so the realised
50+
* per-GPU bandwidth fraction drops further. 0.75 is a deliberately rough
51+
* step-down applied to every TP > 4 group (e.g. TP 8 or 16).
52+
*/
53+
export const TP_DECODE_EFFICIENCY_LARGE = 0.75;
54+
55+
/**
56+
* Per-GPU decode efficiency fraction for a given tensor-parallel size. Stepped
57+
* by group size to approximate the growing communication haircut:
58+
* - TP 1 → 1.0 (no cross-GPU all-reduce on the decode path)
59+
* - TP 2–4 → TP_DECODE_EFFICIENCY (0.85; typically one NVLink domain)
60+
* - TP > 4 → TP_DECODE_EFFICIENCY_LARGE (0.75; crosses domains / nodes)
61+
*
62+
* The cutover at 4 mirrors common 4-GPU NVLink partitioning; the values are
63+
* heuristic upper bounds, not measured constants.
64+
*/
65+
export function tpDecodeEfficiency(tpSize: number): number {
66+
if (tpSize <= 1) return 1;
67+
if (tpSize <= 4) return TP_DECODE_EFFICIENCY;
68+
return TP_DECODE_EFFICIENCY_LARGE;
69+
}
70+
4171
/** Per-GPU activation + workspace reserve (GiB) held back from the KV budget. */
4272
export const DECODE_HEADROOM_GIB = 5;
4373

@@ -87,7 +117,7 @@ export interface PerChatInput {
87117
/**
88118
* Tensor-parallel size (GPUs per replica). Defaults to 1. With TP > 1 the
89119
* weights shard across `tpSize` GPUs whose HBM bandwidth aggregates, so the
90-
* effective decode bandwidth scales by `tpSize × TP_DECODE_EFFICIENCY`.
120+
* effective decode bandwidth scales by `tpSize × tpDecodeEfficiency(tpSize)`.
91121
*/
92122
tpSize?: number;
93123
efficiency?: number;
@@ -98,8 +128,8 @@ export interface PerChatInput {
98128
* streaming the full set of model weights from HBM, so speed ≈ bandwidth /
99129
* model_bytes. Under tensor parallelism the weights shard across `tpSize` GPUs
100130
* whose HBM bandwidth aggregates, so single-stream decode scales ~`tpSize×`
101-
* (minus TP_DECODE_EFFICIENCY for interconnect overhead); tpSize=1 reduces to
102-
* the exact single-GPU figure.
131+
* (minus the per-group `tpDecodeEfficiency` haircut for interconnect overhead);
132+
* tpSize=1 reduces to the exact single-GPU figure.
103133
*
104134
* Note the one decimal/binary boundary: memory bandwidth is decimal GB/s
105135
* (vendor spec) and model bytes are decimal (paramCount × bytesPerWeight), so
@@ -115,7 +145,8 @@ export function estimatePerChatTokensPerSec(input: PerChatInput): number {
115145
} = input;
116146
const modelBytesDecimal = paramCount * bytesPerWeight; // decimal bytes
117147
// TP aggregates per-GPU bandwidth; tpSize=1 keeps the exact single-GPU number.
118-
const tpScale = tpSize > 1 ? tpSize * TP_DECODE_EFFICIENCY : 1;
148+
// The per-GPU efficiency steps down with group size (see tpDecodeEfficiency).
149+
const tpScale = tpSize > 1 ? tpSize * tpDecodeEfficiency(tpSize) : 1;
119150
const bandwidthBytesPerSec = memBandwidthGBs * 1e9 * tpScale; // decimal GB/s -> bytes/s
120151
if (modelBytesDecimal <= 0) return 0;
121152
return (bandwidthBytesPerSec / modelBytesDecimal) * efficiency;

frontend/src/components/deployments/DeploymentForm.test.tsx

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,69 @@ describe('DeploymentForm', () => {
149149
expect(screen.getByRole('button', { name: /Deploy Model/i })).toBeEnabled()
150150
})
151151

152+
it('warns but does not block deploying when the throughput estimate says the model does not fit', () => {
153+
render(
154+
<MemoryRouter>
155+
<DeploymentForm
156+
model={createModel({ supportedEngines: ['vllm'] })}
157+
detailedCapacity={createCapacity()}
158+
runtimes={[
159+
createRuntime({
160+
id: 'vllm',
161+
name: 'vLLM',
162+
installed: true,
163+
healthy: true,
164+
requiresCRD: false,
165+
}),
166+
]}
167+
doesNotFit
168+
doesNotFitReason="This model is estimated not to fit on this cluster's GPU (A10) at 1 GPU per replica."
169+
/>
170+
</MemoryRouter>
171+
)
172+
173+
const vllmCard = screen
174+
.getByText('High-throughput inference with the native vLLM provider')
175+
.closest('[role="radio"]') as HTMLElement
176+
fireEvent.click(vllmCard)
177+
178+
// The warning is surfaced...
179+
expect(
180+
screen.getByText(/estimated not to fit on this cluster's GPU \(A10\)/i)
181+
).toBeInTheDocument()
182+
// ...but Deploy stays enabled (the user may pick more GPUs per replica).
183+
expect(screen.getByRole('button', { name: /Deploy Model/i })).toBeEnabled()
184+
})
185+
186+
it('hides the does-not-fit warning when FP8 is already blocking deployment', () => {
187+
render(
188+
<MemoryRouter>
189+
<DeploymentForm
190+
model={createModel({ supportedEngines: ['vllm'] })}
191+
detailedCapacity={createCapacity()}
192+
runtimes={[
193+
createRuntime({
194+
id: 'vllm',
195+
name: 'vLLM',
196+
installed: true,
197+
healthy: true,
198+
requiresCRD: false,
199+
}),
200+
]}
201+
doesNotFit
202+
doesNotFitReason="This model is estimated not to fit."
203+
fp8Blocked
204+
fp8BlockReason="FP8 is only supported on H100/H200 GPUs."
205+
/>
206+
</MemoryRouter>
207+
)
208+
209+
// The blocking FP8 message wins; the does-not-fit warning is suppressed to
210+
// avoid stacking two conflicting messages.
211+
expect(screen.getByText(/FP8 is only supported on H100\/H200 GPUs/i)).toBeInTheDocument()
212+
expect(screen.queryByText(/estimated not to fit/i)).not.toBeInTheDocument()
213+
})
214+
152215
it('treats a CRD-less vLLM provider that is not ready as registered but unavailable', async () => {
153216
render(
154217
<MemoryRouter>

frontend/src/components/deployments/DeploymentForm.tsx

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,16 @@ interface DeploymentFormProps {
9696
fp8Blocked?: boolean
9797
/** Human-readable reason shown when fp8Blocked is true. */
9898
fp8BlockReason?: string
99+
/**
100+
* True when the throughput estimate determined (with high confidence) that the
101+
* model does not fit on the cluster's GPU at the estimated topology. Surfaced
102+
* as a non-blocking warning near the Deploy button — it does NOT disable
103+
* deploying, since the user may select more GPUs per replica than the estimate
104+
* assumed.
105+
*/
106+
doesNotFit?: boolean
107+
/** Human-readable reason shown when doesNotFit is true. */
108+
doesNotFitReason?: string
99109
}
100110

101111
// Subset of Engine type for traditional GPU inference engines (excludes llamacpp which is KAITO-only)
@@ -262,7 +272,7 @@ export function setFp8PrecisionEngineArgs(
262272
return Object.keys(nextEngineArgs).length > 0 ? nextEngineArgs : undefined;
263273
}
264274

265-
export function DeploymentForm({ model, detailedCapacity, autoscaler, runtimes, weightQuant = 'fp16', kvCacheDtype = 'fp16', fp8Blocked = false, fp8BlockReason }: DeploymentFormProps) {
275+
export function DeploymentForm({ model, detailedCapacity, autoscaler, runtimes, weightQuant = 'fp16', kvCacheDtype = 'fp16', fp8Blocked = false, fp8BlockReason, doesNotFit = false, doesNotFitReason }: DeploymentFormProps) {
266276
const navigate = useNavigate()
267277
const { toast } = useToast()
268278
const createDeployment = useCreateDeployment()
@@ -2002,6 +2012,15 @@ export function DeploymentForm({ model, detailedCapacity, autoscaler, runtimes,
20022012
{fp8BlockReason || 'FP8 is only supported on H100/H200 GPUs. Choose FP16/BF16 to deploy.'}
20032013
</p>
20042014
)}
2015+
{/* Non-blocking "does not fit" warning. Deploy stays enabled: the estimate
2016+
assumes a fixed GPUs-per-replica, but the user may select more here, so
2017+
we caution rather than block. Hidden when fp8Blocked already explains a
2018+
blocking reason. */}
2019+
{doesNotFit && !fp8Blocked && (
2020+
<p className="text-sm text-yellow-500/90 text-center">
2021+
{doesNotFitReason || "This model is estimated not to fit on this cluster's GPUs at the selected precision. Try more GPUs per replica, a smaller model, or FP8 precision."}
2022+
</p>
2023+
)}
20052024
</form>
20062025
</>
20072026
)

frontend/src/pages/DeployPage.tsx

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,16 @@ export function DeployPage() {
109109
const fp8CapabilityUnknown =
110110
fp8Selected && !throughputLoading && throughput?.fp8Supported === undefined
111111

112+
// High-confidence "model does not fit": the backend had real architecture
113+
// details and the KV budget left no room for even one sequence on the
114+
// estimate's assumed topology. We surface a non-blocking warning (the user may
115+
// still pick more GPUs-per-replica in the form than the estimate assumed, which
116+
// can make it fit) rather than disabling Deploy outright.
117+
const doesNotFit = !!throughput?.doesNotFit && !throughput?.lowConfidence
118+
const doesNotFitReason = doesNotFit
119+
? `This model is estimated not to fit on this cluster's GPU${throughput?.gpuModel ? ` (${throughput.gpuModel})` : ''} at ${throughput?.tpSize ?? 1} GPU${(throughput?.tpSize ?? 1) > 1 ? 's' : ''} per replica — the model weights plus reserved memory leave no room for the conversation cache. Increasing GPUs per replica below, choosing a smaller model, or using FP8 precision may help.`
120+
: undefined
121+
112122
// Wait for both model and runtimes to load before showing the form
113123
// This ensures the runtime selector is visible when the form renders
114124
if (modelLoading || runtimesLoading) {
@@ -306,6 +316,8 @@ export function DeployPage() {
306316
kvCacheDtype={kvCacheDtype}
307317
fp8Blocked={fp8Blocked}
308318
fp8BlockReason={fp8BlockReason}
319+
doesNotFit={doesNotFit}
320+
doesNotFitReason={doesNotFitReason}
309321
/>
310322
</div>
311323
</div>

0 commit comments

Comments
 (0)