feat(throughput): warn on no-fit deploys and refine TP decode scaling

surajssd · surajssd · commit 43257849ee29 · 2026-06-05T10:06:48.000-07:00
- surface a non-blocking "model does not fit" warning on the deploy
  page when the high-confidence estimate leaves no room for the KV
  cache; `Deploy` stays enabled since the user may pick more GPUs per
  replica than the estimate assumed, and it is hidden when `fp8Blocked`
  already explains a blocking reason
- step `tpDecodeEfficiency` down by TP group size (1.0 for TP1, 0.85
  for TP2-4, 0.75 for TP>4) instead of a flat 0.85, so large
  tensor-parallel groups crossing NVLink domains are not over-estimated
- add tests for the no-fit warning and the new efficiency tiers

Signed-off-by: Suraj Deshmukh <suraj.deshmukh@microsoft.com>
diff --git a/backend/src/services/gpuPerformance.test.ts b/backend/src/services/gpuPerformance.test.ts
@@ -5,7 +5,9 @@ import {
   deriveTpSizeToFitWeights,
   estimatePerChatTokensPerSec,
   estimateConcurrentCapacity,
+  tpDecodeEfficiency,
   TP_DECODE_EFFICIENCY,
+  TP_DECODE_EFFICIENCY_LARGE,
 } from './gpuPerformance';
 import { resolveModelParamCount } from '@airunway/shared';
 import { gpuSupportsFp8 } from './costEstimation';
@@ -252,6 +254,36 @@ describe('estimatePerChatTokensPerSec', () => {
     });
     expect(explicit).toBe(omitted);
   });
+
+  test('tpDecodeEfficiency steps down with TP group size', () => {
+    // TP1: no cross-GPU all-reduce on the decode path.
+    expect(tpDecodeEfficiency(1)).toBe(1);
+    // TP2–4: single NVLink-domain mid tier.
+    expect(tpDecodeEfficiency(2)).toBe(TP_DECODE_EFFICIENCY);
+    expect(tpDecodeEfficiency(4)).toBe(TP_DECODE_EFFICIENCY);
+    // TP>4: crosses domains / nodes → larger haircut.
+    expect(tpDecodeEfficiency(8)).toBe(TP_DECODE_EFFICIENCY_LARGE);
+    expect(tpDecodeEfficiency(16)).toBe(TP_DECODE_EFFICIENCY_LARGE);
+    // The mid tier (TP 2–4) must stay more optimistic than the large tier (TP > 4).
+    expect(TP_DECODE_EFFICIENCY).toBeGreaterThan(TP_DECODE_EFFICIENCY_LARGE);
+  });
+
+  test('large TP groups (>4) use the reduced decode efficiency tier', () => {
+    const single = estimatePerChatTokensPerSec({
+      paramCount: 70e9,
+      bytesPerWeight: 2,
+      memBandwidthGBs: 3350,
+      tpSize: 1,
+    });
+    const octa = estimatePerChatTokensPerSec({
+      paramCount: 70e9,
+      bytesPerWeight: 2,
+      memBandwidthGBs: 3350,
+      tpSize: 8,
+    });
+    // TP8 scales by 8 × TP_DECODE_EFFICIENCY_LARGE, not the mid-tier 0.85.
+    expect(octa / single).toBeCloseTo(8 * TP_DECODE_EFFICIENCY_LARGE, 5);
+  });
 });
 
 describe('estimateConcurrentCapacity', () => {
diff --git a/backend/src/services/gpuPerformance.ts b/backend/src/services/gpuPerformance.ts
@@ -28,16 +28,46 @@ export const MEM_BW_EFFICIENCY = 0.8;
  * Tensor-parallel decode scaling efficiency. Under TP the weights are sharded
  * across `tpSize` GPUs whose HBM bandwidth aggregates, so single-stream decode
  * speeds up ~`tpSize×` — minus a per-GPU haircut for all-reduce / interconnect
- * overhead. Aggregate effective bandwidth ≈ `tpSize × perGpuBW × this`.
+ * overhead. Aggregate effective bandwidth ≈ `tpSize × perGpuBW × efficiency`.
  *
- * Caveat: this is a flat factor, independent of `tpSize` and interconnect. Real
- * single-stream TP decode is latency/communication bound, so the per-GPU haircut
- * grows with larger TP groups (and is worse across PCIe / multi-node than NVLink).
- * 0.85 is therefore optimistic for big TP groups; treat it as a rough upper bound,
- * consistent with the estimator's overall heuristic, disclaimer-backed nature.
+ * Real single-stream TP decode is latency/communication bound, so the per-GPU
+ * haircut grows with larger TP groups (and is worse across PCIe / multi-node
+ * than NVLink). We therefore step the factor down with group size rather than
+ * applying a single flat value (see `tpDecodeEfficiency`): small groups stay on
+ * one NVLink domain, while big groups cross domains / nodes and lose more to
+ * communication. These remain coarse heuristics, consistent with the
+ * estimator's overall disclaimer-backed nature.
+ *
+ * `TP_DECODE_EFFICIENCY` is the mid-tier value (TP 2–4, a typical single NVLink
+ * domain). Larger groups use `TP_DECODE_EFFICIENCY_LARGE`.
  */
 export const TP_DECODE_EFFICIENCY = 0.85;
 
+/**
+ * Decode efficiency for large TP groups (more than 4 GPUs per replica). Beyond a
+ * typical 4-GPU NVLink domain the all-reduce traffic increasingly crosses slower
+ * links (multi-domain NVSwitch, PCIe, or multi-node fabric), so the realised
+ * per-GPU bandwidth fraction drops further. 0.75 is a deliberately rough
+ * step-down applied to every TP size above 4 (not only TP 8 and up).
+ */
+export const TP_DECODE_EFFICIENCY_LARGE = 0.75;
+
+/**
+ * Per-GPU decode efficiency fraction for a given tensor-parallel size. Stepped
+ * by group size to approximate the growing communication haircut:
+ *   - TP 1     → 1.0  (no cross-GPU all-reduce on the decode path)
+ *   - TP 2–4   → TP_DECODE_EFFICIENCY (0.85; typically one NVLink domain)
+ *   - TP > 4   → TP_DECODE_EFFICIENCY_LARGE (0.75; crosses domains / nodes)
+ *
+ * The cutover at 4 mirrors common 4-GPU NVLink partitioning; the values are
+ * heuristic upper bounds, not measured constants.
+ */
+export function tpDecodeEfficiency(tpSize: number): number {
+  if (tpSize <= 1) return 1;
+  if (tpSize <= 4) return TP_DECODE_EFFICIENCY;
+  return TP_DECODE_EFFICIENCY_LARGE;
+}
+
 /** Per-GPU activation + workspace reserve (GiB) held back from the KV budget. */
 export const DECODE_HEADROOM_GIB = 5;
 
@@ -87,7 +117,7 @@ export interface PerChatInput {
   /**
    * Tensor-parallel size (GPUs per replica). Defaults to 1. With TP > 1 the
    * weights shard across `tpSize` GPUs whose HBM bandwidth aggregates, so the
-   * effective decode bandwidth scales by `tpSize × TP_DECODE_EFFICIENCY`.
+   * effective decode bandwidth scales by `tpSize × tpDecodeEfficiency(tpSize)`.
    */
   tpSize?: number;
   efficiency?: number;
@@ -98,8 +128,8 @@ export interface PerChatInput {
  * streaming the full set of model weights from HBM, so speed ≈ bandwidth /
  * model_bytes. Under tensor parallelism the weights shard across `tpSize` GPUs
  * whose HBM bandwidth aggregates, so single-stream decode scales ~`tpSize×`
- * (minus TP_DECODE_EFFICIENCY for interconnect overhead); tpSize=1 reduces to
- * the exact single-GPU figure.
+ * (minus the per-group `tpDecodeEfficiency` haircut for interconnect overhead);
+ * tpSize=1 reduces to the exact single-GPU figure.
  *
  * Note the one decimal/binary boundary: memory bandwidth is decimal GB/s
  * (vendor spec) and model bytes are decimal (paramCount × bytesPerWeight), so
@@ -115,7 +145,8 @@ export function estimatePerChatTokensPerSec(input: PerChatInput): number {
   } = input;
   const modelBytesDecimal = paramCount * bytesPerWeight; // decimal bytes
   // TP aggregates per-GPU bandwidth; tpSize=1 keeps the exact single-GPU number.
-  const tpScale = tpSize > 1 ? tpSize * TP_DECODE_EFFICIENCY : 1;
+  // The per-GPU efficiency steps down with group size (see tpDecodeEfficiency).
+  const tpScale = tpSize > 1 ? tpSize * tpDecodeEfficiency(tpSize) : 1;
   const bandwidthBytesPerSec = memBandwidthGBs * 1e9 * tpScale; // decimal GB/s -> bytes/s
   if (modelBytesDecimal <= 0) return 0;
   return (bandwidthBytesPerSec / modelBytesDecimal) * efficiency;
diff --git a/frontend/src/components/deployments/DeploymentForm.test.tsx b/frontend/src/components/deployments/DeploymentForm.test.tsx
@@ -149,6 +149,69 @@ describe('DeploymentForm', () => {
     expect(screen.getByRole('button', { name: /Deploy Model/i })).toBeEnabled()
   })
 
+  it('warns but does not block deploying when the throughput estimate says the model does not fit', () => {
+    render(
+      <MemoryRouter>
+        <DeploymentForm
+          model={createModel({ supportedEngines: ['vllm'] })}
+          detailedCapacity={createCapacity()}
+          runtimes={[
+            createRuntime({
+              id: 'vllm',
+              name: 'vLLM',
+              installed: true,
+              healthy: true,
+              requiresCRD: false,
+            }),
+          ]}
+          doesNotFit
+          doesNotFitReason="This model is estimated not to fit on this cluster's GPU (A10) at 1 GPU per replica."
+        />
+      </MemoryRouter>
+    )
+
+    const vllmCard = screen
+      .getByText('High-throughput inference with the native vLLM provider')
+      .closest('[role="radio"]') as HTMLElement
+    fireEvent.click(vllmCard)
+
+    // The warning is surfaced...
+    expect(
+      screen.getByText(/estimated not to fit on this cluster's GPU \(A10\)/i)
+    ).toBeInTheDocument()
+    // ...but Deploy stays enabled (the user may pick more GPUs per replica).
+    expect(screen.getByRole('button', { name: /Deploy Model/i })).toBeEnabled()
+  })
+
+  it('hides the does-not-fit warning when FP8 is already blocking deployment', () => {
+    render(
+      <MemoryRouter>
+        <DeploymentForm
+          model={createModel({ supportedEngines: ['vllm'] })}
+          detailedCapacity={createCapacity()}
+          runtimes={[
+            createRuntime({
+              id: 'vllm',
+              name: 'vLLM',
+              installed: true,
+              healthy: true,
+              requiresCRD: false,
+            }),
+          ]}
+          doesNotFit
+          doesNotFitReason="This model is estimated not to fit."
+          fp8Blocked
+          fp8BlockReason="FP8 is only supported on H100/H200 GPUs."
+        />
+      </MemoryRouter>
+    )
+
+    // The blocking FP8 message wins; the does-not-fit warning is suppressed to
+    // avoid stacking two conflicting messages.
+    expect(screen.getByText(/FP8 is only supported on H100\/H200 GPUs/i)).toBeInTheDocument()
+    expect(screen.queryByText(/estimated not to fit/i)).not.toBeInTheDocument()
+  })
+
   it('treats a CRD-less vLLM provider that is not ready as registered but unavailable', async () => {
     render(
       <MemoryRouter>
diff --git a/frontend/src/components/deployments/DeploymentForm.tsx b/frontend/src/components/deployments/DeploymentForm.tsx
@@ -96,6 +96,16 @@ interface DeploymentFormProps {
   fp8Blocked?: boolean
   /** Human-readable reason shown when fp8Blocked is true. */
   fp8BlockReason?: string
+  /**
+   * True when the throughput estimate determined (with high confidence) that the
+   * model does not fit on the cluster's GPU at the estimated topology. Surfaced
+   * as a non-blocking warning near the Deploy button — it does NOT disable
+   * deploying, since the user may select more GPUs per replica than the estimate
+   * assumed.
+   */
+  doesNotFit?: boolean
+  /** Human-readable reason shown when doesNotFit is true. */
+  doesNotFitReason?: string
 }
 
 // Subset of Engine type for traditional GPU inference engines (excludes llamacpp which is KAITO-only)
@@ -262,7 +272,7 @@ export function setFp8PrecisionEngineArgs(
   return Object.keys(nextEngineArgs).length > 0 ? nextEngineArgs : undefined;
 }
 
-export function DeploymentForm({ model, detailedCapacity, autoscaler, runtimes, weightQuant = 'fp16', kvCacheDtype = 'fp16', fp8Blocked = false, fp8BlockReason }: DeploymentFormProps) {
+export function DeploymentForm({ model, detailedCapacity, autoscaler, runtimes, weightQuant = 'fp16', kvCacheDtype = 'fp16', fp8Blocked = false, fp8BlockReason, doesNotFit = false, doesNotFitReason }: DeploymentFormProps) {
   const navigate = useNavigate()
   const { toast } = useToast()
   const createDeployment = useCreateDeployment()
@@ -2002,6 +2012,15 @@ export function DeploymentForm({ model, detailedCapacity, autoscaler, runtimes,
           {fp8BlockReason || 'FP8 is only supported on H100/H200 GPUs. Choose FP16/BF16 to deploy.'}
         </p>
       )}
+      {/* Non-blocking "does not fit" warning. Deploy stays enabled: the estimate
+          assumes a fixed GPUs-per-replica, but the user may select more here, so
+          we caution rather than block. Hidden when fp8Blocked already explains a
+          blocking reason. */}
+      {doesNotFit && !fp8Blocked && (
+        <p className="text-sm text-yellow-500/90 text-center">
+          {doesNotFitReason || "This model is estimated not to fit on this cluster's GPUs at the selected precision. Try more GPUs per replica, a smaller model, or FP8 precision."}
+        </p>
+      )}
     </form>
     </>
   )
diff --git a/frontend/src/pages/DeployPage.tsx b/frontend/src/pages/DeployPage.tsx
@@ -109,6 +109,16 @@ export function DeployPage() {
   const fp8CapabilityUnknown =
     fp8Selected && !throughputLoading && throughput?.fp8Supported === undefined
 
+  // High-confidence "model does not fit": the backend had real architecture
+  // details and the KV budget left no room for even one sequence on the
+  // estimate's assumed topology. We surface a non-blocking warning (the user may
+  // still pick more GPUs-per-replica in the form than the estimate assumed, which
+  // can make it fit) rather than disabling Deploy outright.
+  const doesNotFit = !!throughput?.doesNotFit && !throughput?.lowConfidence
+  const doesNotFitReason = doesNotFit
+    ? `This model is estimated not to fit on this cluster's GPU${throughput?.gpuModel ? ` (${throughput.gpuModel})` : ''} at ${throughput?.tpSize ?? 1} GPU${(throughput?.tpSize ?? 1) > 1 ? 's' : ''} per replica — the model weights plus reserved memory leave no room for the conversation cache. Increasing GPUs per replica below, choosing a smaller model, or using FP8 precision may help.`
+    : undefined
+
   // Wait for both model and runtimes to load before showing the form
   // This ensures the runtime selector is visible when the form renders
   if (modelLoading || runtimesLoading) {
@@ -306,6 +316,8 @@ export function DeployPage() {
           kvCacheDtype={kvCacheDtype}
           fp8Blocked={fp8Blocked}
           fp8BlockReason={fp8BlockReason}
+          doesNotFit={doesNotFit}
+          doesNotFitReason={doesNotFitReason}
         />
       </div>
     </div>