Skip to content

Commit 6043d17

Browse files
Merge pull request #4 from samuellimabraz/dev
Dev
2 parents e634392 + f3cc55f commit 6043d17

File tree

11 files changed

+934
-22
lines changed

11 files changed

+934
-22
lines changed

src/demo/pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ dependencies = [
3131
"imbalanced-learn>=0.14.0",
3232
"gem-suite>=0.1.6",
3333
"quimb>=1.11.2",
34+
"numba>=0.57.0",
3435
"yfinance>=0.2.66",
3536
"plotly>=6.5.0",
3637
"kaleido>=1.2.0",
Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
import { NextResponse } from 'next/server';
2+
3+
/**
 * Response shape of RunPod's serverless `/v2/{endpoint_id}/health` endpoint,
 * as consumed by the GET handler below.
 *
 * NOTE(review): field semantics are inferred from the field names and how
 * this route reads them — confirm against RunPod's endpoint-health API docs.
 */
export interface RunPodHealth {
  // Job counters for the endpoint's request queue.
  jobs: {
    completed: number;
    failed: number;
    inProgress: number;
    inQueue: number;
    retried: number;
  };
  // Worker counts grouped by lifecycle state.
  workers: {
    idle: number;
    initializing: number;
    running: number;
    throttled: number;
  };
}
18+
19+
/**
 * Payload returned to the frontend by this status route. Summarizes the raw
 * RunPod health counts into a single coarse `status` plus worker/queue detail.
 */
export interface StatusResponse {
  // Coarse state for UI feedback; see the GET handler for how it is derived.
  status: 'ready' | 'cold_start' | 'initializing' | 'processing' | 'unavailable';
  // Human-readable description of the current state.
  message: string;
  // Worker counts passed through from RunPod health (throttled is omitted).
  workers: {
    idle: number;
    running: number;
    initializing: number;
  };
  // Queue depth passed through from RunPod health.
  queue: {
    inProgress: number;
    inQueue: number;
  };
  estimatedWait?: number; // seconds
}
33+
34+
/**
35+
* Check RunPod endpoint health to provide user feedback during cold starts
36+
*/
37+
export async function GET(): Promise<NextResponse<StatusResponse>> {
38+
const baseUrl = process.env.DEMO_MODEL_URL || 'http://localhost:8000/v1';
39+
const apiKey = process.env.DEMO_API_KEY || '';
40+
41+
// Extract RunPod endpoint URL from the vLLM base URL
42+
// vLLM URL format: https://api.runpod.ai/v2/{endpoint_id}/openai/v1
43+
// Health URL format: https://api.runpod.ai/v2/{endpoint_id}/health
44+
const runpodMatch = baseUrl.match(/https:\/\/api\.runpod\.ai\/v2\/([^/]+)/);
45+
46+
if (!runpodMatch) {
47+
// Not a RunPod endpoint, assume it's always ready (local/other provider)
48+
return NextResponse.json({
49+
status: 'ready',
50+
message: 'Model server ready',
51+
workers: { idle: 1, running: 0, initializing: 0 },
52+
queue: { inProgress: 0, inQueue: 0 },
53+
});
54+
}
55+
56+
const endpointId = runpodMatch[1];
57+
const healthUrl = `https://api.runpod.ai/v2/${endpointId}/health`;
58+
59+
try {
60+
const response = await fetch(healthUrl, {
61+
method: 'GET',
62+
headers: {
63+
'Authorization': `Bearer ${apiKey}`,
64+
'Content-Type': 'application/json',
65+
},
66+
// Short timeout for health check
67+
signal: AbortSignal.timeout(5000),
68+
});
69+
70+
if (!response.ok) {
71+
return NextResponse.json({
72+
status: 'unavailable',
73+
message: 'Unable to check model status',
74+
workers: { idle: 0, running: 0, initializing: 0 },
75+
queue: { inProgress: 0, inQueue: 0 },
76+
});
77+
}
78+
79+
const health: RunPodHealth = await response.json();
80+
81+
const totalWorkers = health.workers.idle + health.workers.running + (health.workers.initializing || 0);
82+
const hasActiveWorkers = totalWorkers > 0;
83+
const hasIdleWorkers = health.workers.idle > 0;
84+
const isInitializing = (health.workers.initializing || 0) > 0;
85+
const hasQueuedJobs = health.jobs.inQueue > 0;
86+
const hasRunningJobs = health.jobs.inProgress > 0;
87+
88+
let status: StatusResponse['status'];
89+
let message: string;
90+
let estimatedWait: number | undefined;
91+
92+
if (hasIdleWorkers) {
93+
status = 'ready';
94+
message = 'Model ready';
95+
} else if (isInitializing) {
96+
status = 'initializing';
97+
message = 'Model loading...';
98+
estimatedWait = 30; // Typical vLLM model load time
99+
} else if (health.workers.running > 0) {
100+
status = 'processing';
101+
message = hasQueuedJobs
102+
? `Processing (${health.jobs.inQueue} in queue)`
103+
: 'Processing request...';
104+
estimatedWait = hasQueuedJobs ? health.jobs.inQueue * 15 : undefined;
105+
} else if (!hasActiveWorkers && (hasQueuedJobs || hasRunningJobs)) {
106+
status = 'cold_start';
107+
message = 'Starting worker...';
108+
estimatedWait = 45; // Cold start + model load
109+
} else if (!hasActiveWorkers) {
110+
status = 'cold_start';
111+
message = 'Workers scaled to zero, will start on request';
112+
estimatedWait = 45;
113+
} else {
114+
status = 'ready';
115+
message = 'Model ready';
116+
}
117+
118+
return NextResponse.json({
119+
status,
120+
message,
121+
workers: {
122+
idle: health.workers.idle,
123+
running: health.workers.running,
124+
initializing: health.workers.initializing || 0,
125+
},
126+
queue: {
127+
inProgress: health.jobs.inProgress,
128+
inQueue: health.jobs.inQueue,
129+
},
130+
estimatedWait,
131+
});
132+
} catch (error) {
133+
console.error('Health check error:', error);
134+
135+
// Network error might indicate cold start
136+
return NextResponse.json({
137+
status: 'cold_start',
138+
message: 'Connecting to model server...',
139+
workers: { idle: 0, running: 0, initializing: 0 },
140+
queue: { inProgress: 0, inQueue: 0 },
141+
estimatedWait: 45,
142+
});
143+
}
144+
}
145+
Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,192 @@
1+
import { NextResponse } from 'next/server';
2+
3+
export async function POST(): Promise<NextResponse> {
4+
const baseUrl = process.env.DEMO_MODEL_URL || 'http://localhost:8000/v1';
5+
const apiKey = process.env.DEMO_API_KEY || '';
6+
const modelName = process.env.DEMO_MODEL_NAME || 'default';
7+
8+
console.log('[Warmup] Starting warmup...');
9+
console.log('[Warmup] Base URL:', baseUrl);
10+
11+
const runpodMatch = baseUrl.match(/https:\/\/api\.runpod\.ai\/v2\/([^/]+)/);
12+
13+
if (!runpodMatch) {
14+
console.log('[Warmup] Not a RunPod endpoint, skipping');
15+
return NextResponse.json({
16+
status: 'skipped',
17+
message: 'Not a RunPod endpoint',
18+
});
19+
}
20+
21+
const endpointId = runpodMatch[1];
22+
console.log('[Warmup] Endpoint ID:', endpointId);
23+
24+
try {
25+
const healthUrl = `https://api.runpod.ai/v2/${endpointId}/health`;
26+
let healthData = null;
27+
28+
try {
29+
const healthResponse = await fetch(healthUrl, {
30+
method: 'GET',
31+
headers: {
32+
'Authorization': `Bearer ${apiKey}`,
33+
},
34+
signal: AbortSignal.timeout(5000),
35+
});
36+
37+
if (healthResponse.ok) {
38+
healthData = await healthResponse.json();
39+
console.log('[Warmup] Health:', JSON.stringify(healthData));
40+
41+
if (healthData.workers?.idle > 0) {
42+
console.log('[Warmup] Idle workers available');
43+
return NextResponse.json({
44+
status: 'ready',
45+
message: 'Workers already available',
46+
workers: healthData.workers,
47+
});
48+
}
49+
50+
if (healthData.workers?.initializing > 0) {
51+
console.log('[Warmup] Workers already initializing');
52+
return NextResponse.json({
53+
status: 'warming',
54+
message: 'Workers already starting',
55+
workers: healthData.workers,
56+
});
57+
}
58+
}
59+
} catch (e) {
60+
console.log('[Warmup] Health check error:', e);
61+
}
62+
63+
const openaiUrl = `${baseUrl}/chat/completions`;
64+
console.log('[Warmup] Sending to OpenAI endpoint:', openaiUrl);
65+
66+
const abortController = new AbortController();
67+
const timeoutId = setTimeout(() => abortController.abort(), 5000);
68+
69+
try {
70+
const warmupResponse = await fetch(openaiUrl, {
71+
method: 'POST',
72+
headers: {
73+
'Authorization': `Bearer ${apiKey}`,
74+
'Content-Type': 'application/json',
75+
},
76+
body: JSON.stringify({
77+
model: modelName,
78+
messages: [{ role: 'user', content: 'hi' }],
79+
max_tokens: 1,
80+
stream: false,
81+
}),
82+
signal: abortController.signal,
83+
});
84+
85+
clearTimeout(timeoutId);
86+
87+
console.log('[Warmup] Response status:', warmupResponse.status);
88+
89+
return NextResponse.json({
90+
status: warmupResponse.status === 200 ? 'ready' : 'warming',
91+
message: warmupResponse.status === 200
92+
? 'Model responded (was ready)'
93+
: 'Request queued, worker starting',
94+
httpStatus: warmupResponse.status,
95+
workers: healthData?.workers,
96+
});
97+
98+
} catch (fetchError) {
99+
clearTimeout(timeoutId);
100+
101+
if ((fetchError as Error).name === 'AbortError') {
102+
console.log('[Warmup] Request sent (aborted wait - worker starting)');
103+
return NextResponse.json({
104+
status: 'warming',
105+
message: 'Request sent, worker starting',
106+
workers: healthData?.workers,
107+
});
108+
}
109+
110+
throw fetchError;
111+
}
112+
113+
} catch (error) {
114+
console.error('[Warmup] Error:', error);
115+
return NextResponse.json({
116+
status: 'error',
117+
message: error instanceof Error ? error.message : 'Warmup failed',
118+
}, { status: 500 });
119+
}
120+
}
121+
122+
export async function GET(): Promise<NextResponse> {
123+
const baseUrl = process.env.DEMO_MODEL_URL || 'http://localhost:8000/v1';
124+
const apiKey = process.env.DEMO_API_KEY || '';
125+
126+
const runpodMatch = baseUrl.match(/https:\/\/api\.runpod\.ai\/v2\/([^/]+)/);
127+
128+
if (!runpodMatch) {
129+
return NextResponse.json({
130+
ready: true,
131+
message: 'Not a RunPod endpoint'
132+
});
133+
}
134+
135+
const endpointId = runpodMatch[1];
136+
const healthUrl = `https://api.runpod.ai/v2/${endpointId}/health`;
137+
138+
try {
139+
const response = await fetch(healthUrl, {
140+
method: 'GET',
141+
headers: {
142+
'Authorization': `Bearer ${apiKey}`,
143+
},
144+
signal: AbortSignal.timeout(10000),
145+
});
146+
147+
if (!response.ok) {
148+
console.log('[Warmup GET] Health check failed:', response.status);
149+
return NextResponse.json({ ready: false, message: 'Health check failed' });
150+
}
151+
152+
const health = await response.json();
153+
console.log('[Warmup GET] Health:', JSON.stringify(health));
154+
155+
const idleWorkers = health.workers?.idle || 0;
156+
const readyWorkers = health.workers?.ready || 0;
157+
const runningWorkers = health.workers?.running || 0;
158+
const initializingWorkers = health.workers?.initializing || 0;
159+
const throttledWorkers = health.workers?.throttled || 0;
160+
161+
const isReady = idleWorkers > 0 || readyWorkers > 0;
162+
const isWarming = initializingWorkers > 0;
163+
const isBusy = runningWorkers > 0 && !isReady;
164+
const jobsInQueue = health.jobs?.inQueue || 0;
165+
const jobsInProgress = health.jobs?.inProgress || 0;
166+
167+
return NextResponse.json({
168+
ready: isReady,
169+
warming: isWarming,
170+
busy: isBusy,
171+
jobsInQueue,
172+
jobsInProgress,
173+
workers: {
174+
idle: idleWorkers,
175+
ready: readyWorkers,
176+
running: runningWorkers,
177+
initializing: initializingWorkers,
178+
throttled: throttledWorkers,
179+
},
180+
});
181+
} catch (error) {
182+
const isTimeout = error instanceof Error && error.name === 'TimeoutError';
183+
if (!isTimeout) {
184+
console.error('[Warmup GET] Error:', error);
185+
}
186+
return NextResponse.json({
187+
ready: false,
188+
warming: true,
189+
message: isTimeout ? 'Health check timed out' : 'Check failed'
190+
});
191+
}
192+
}

0 commit comments

Comments
 (0)