Commit 575b8ba

feat(api): added Groq Llama 3.3 70B as cross-provider fallback
- replaced Google Flash/Flash-Lite fallbacks with Groq (independent quota, 14,400 RPD)
- moved JSON validation inside retry loop so bad responses trigger next provider
- per-provider timeouts [35s, 20s] fit within Vercel's 60s maxDuration
- removed all Cerebras references from docs and .env.example
- updated about page, README, and all Starlight docs with new provider chain
- added Google/Groq rate limit doc links where RPM/RPD/TPM tables appear
- added scripts/test-providers.mjs for dry-run provider testing
1 parent 89a6d62 commit 575b8ba

13 files changed: +399 −69 lines

.env.example

Lines changed: 2 additions & 2 deletions
```diff
@@ -2,9 +2,9 @@
 # Get yours at https://aistudio.google.com/apikey
 GEMINI_API_KEY=
 
-# Optional fallback providers (for self-hosting)
+# Groq API key (recommended fallback, Llama 3.3 70B)
+# Get yours at https://console.groq.com/keys
 # GROQ_API_KEY=
-# CEREBRAS_API_KEY=
 
 # Firebase configuration (required)
 # Get these from Firebase Console → Project Settings → Your Apps
```

.prettierignore

Lines changed: 2 additions & 0 deletions
```diff
@@ -6,3 +6,5 @@ coverage/
 playwright-report/
 test-results/
 pnpm-lock.yaml
+docs/.astro/
+docs/pnpm-lock.yaml
```

README.md

Lines changed: 12 additions & 12 deletions
```diff
@@ -74,18 +74,18 @@ Each profile is based on research into the platform's documented parsing and mat
 
 ## Tech Stack
 
-| Layer | Choice | Why |
-| --- | --- | --- |
-| **Framework** | SvelteKit 5 (Svelte 5 runes) | Compiled to vanilla JS, ~15KB runtime. No VDOM overhead. |
-| **Styling** | Scoped CSS + CSS custom properties | Dark glassmorphic design. No Tailwind. Component-scoped. |
-| **PDF Parsing** | pdfjs-dist (Web Worker) | Mozilla-maintained, fully client-side. |
-| **DOCX Parsing** | mammoth | Client-side Word to text extraction. |
-| **NLP** | Custom TF-IDF + tokenizer + skills taxonomy | Lightweight, browser-native, supports 8+ industries. |
-| **LLM** | Gemma 3 27B (primary), Gemini 2.5 Flash (fallback) | 14,400 RPD free tier via Google Generative Language API. Groq + Cerebras available for self-host. |
-| **Auth** | Firebase Authentication | Google + email/password sign-in. Free Spark plan. |
-| **Storage** | Cloud Firestore | Scan history per user. Free Spark plan. |
-| **Hosting** | Vercel | Free hobby tier. Edge functions for API. |
-| **Testing** | Vitest + Playwright + @testing-library/svelte | Unit, integration, and E2E coverage. |
+| Layer | Choice | Why |
+| --- | --- | --- |
+| **Framework** | SvelteKit 5 (Svelte 5 runes) | Compiled to vanilla JS, ~15KB runtime. No VDOM overhead. |
+| **Styling** | Scoped CSS + CSS custom properties | Dark glassmorphic design. No Tailwind. Component-scoped. |
+| **PDF Parsing** | pdfjs-dist (Web Worker) | Mozilla-maintained, fully client-side. |
+| **DOCX Parsing** | mammoth | Client-side Word to text extraction. |
+| **NLP** | Custom TF-IDF + tokenizer + skills taxonomy | Lightweight, browser-native, supports 8+ industries. |
+| **LLM** | Gemma 3 27B (primary), Llama 3.3 70B via Groq (fallback) | Cross-provider fallback: Google (14,400 RPD) + Groq (14,400 RPD) on independent free tiers. |
+| **Auth** | Firebase Authentication | Google + email/password sign-in. Free Spark plan. |
+| **Storage** | Cloud Firestore | Scan history per user. Free Spark plan. |
+| **Hosting** | Vercel | Free hobby tier. Edge functions for API. |
+| **Testing** | Vitest + Playwright + @testing-library/svelte | Unit, integration, and E2E coverage. |
 
 **Total infrastructure cost: $0.** Everything runs on free tiers.
```

docs/src/content/docs/api/rate-limits.md

Lines changed: 10 additions & 8 deletions
```diff
@@ -50,14 +50,16 @@ When you receive a `429` response:
 
 When self-hosting, rate limits are configurable. The actual bottleneck becomes your LLM provider's free tier:
 
-| Provider | Model | RPM | RPD |
-| -------- | --------------- | --- | ------ |
-| Gemma | 3 27B (primary) | 30 | 14,400 |
-| Gemini | 2.5 Flash | 5 | 20 |
-| Gemini | 2.5 Flash Lite | 10 | 20 |
-| Groq | Llama 3.3 70B | 30 | 14,400 |
-| Cerebras | Llama 3.3 70B | 30 | 1,000 |
+| Provider | Model | RPM | RPD | TPM |
+| -------- | ------------- | ---- | ------ | --- |
+| Google | Gemma 3 27B | 30 | 14,400 | 15K |
+| Groq | Llama 3.3 70B | 1000 | 14,400 | 12K |
+
+For the latest limits, see the official documentation:
+
+- [Google AI rate limits](https://ai.google.dev/gemini-api/docs/rate-limits)
+- [Groq rate limits](https://console.groq.com/docs/rate-limits)
 
 :::tip
-The hosted version uses Gemma 3 27B as the primary model (14,400 RPD), giving roughly 14,000+ scans per day on the free tier. Groq and Cerebras are available as optional fallbacks for self-hosted instances.
+The hosted version uses Gemma 3 27B as the primary model with Llama 3.3 70B via Groq as fallback. Both run on independent free tiers. The binding constraint is TPM (tokens per minute), not RPD. Each scan uses ~8,000 tokens total (prompt + response), giving a realistic combined throughput of roughly 4,500 scans per day under sustained load.
 :::
```
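The throughput claim in the tip can be sanity-checked with a back-of-envelope calculation. This is a hypothetical sketch (not code from the repo), assuming ~8,000 tokens per scan and the RPM/RPD/TPM values from the table above:

```javascript
// Back-of-envelope check: with ~8,000 tokens per scan, the per-minute token
// budget (TPM) caps throughput long before the daily request cap (RPD) does.
const TOKENS_PER_SCAN = 8_000;

const providers = [
  { name: 'Google', tpm: 15_000, rpd: 14_400 },
  { name: 'Groq', tpm: 12_000, rpd: 14_400 }
];

const scansPerDay = providers.map((p) => {
  // sustained scans/minute allowed by TPM, extrapolated to a full day
  const tpmCeiling = Math.floor((p.tpm / TOKENS_PER_SCAN) * 60 * 24);
  return Math.min(tpmCeiling, p.rpd); // whichever limit binds first
});

const combined = scansPerDay.reduce((a, b) => a + b, 0);
console.log(scansPerDay, combined); // → [ 2700, 2160 ] 4860
```

Under these assumptions both providers saturate TPM at a fraction of their 14,400 RPD, and the combined ceiling of ~4,860 scans/day is consistent with the "roughly 4,500" figure quoted above.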

docs/src/content/docs/getting-started/introduction.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -56,7 +56,7 @@ Built with performance and privacy in mind:
 - **SvelteKit 5** with Svelte 5 runes for the frontend
 - **pdfjs-dist** (Web Worker) for client-side PDF parsing
 - **mammoth** for client-side DOCX parsing
-- **Gemma 3 27B** (primary) with Gemini fallbacks for AI-powered analysis
+- **Gemma 3 27B** (primary) with **Llama 3.3 70B** via Groq as fallback for AI-powered analysis
 - **Firebase** for authentication and scan history
 - **Vercel** for hosting (free tier)
```

docs/src/content/docs/self-hosting/configuration.md

Lines changed: 22 additions & 23 deletions
````diff
@@ -7,39 +7,37 @@ description: Environment variables and configuration options for self-hosted ins
 
 All configuration is done through environment variables in the `.env` file.
 
-| Variable | Required | Description |
-| --- | --- | --- |
-| `GEMINI_API_KEY` | Yes | Google AI API key (used for Gemma 3 + Gemini models) |
-| `GROQ_API_KEY` | Optional | Groq API key (optional fallback) |
-| `CEREBRAS_API_KEY` | Optional | Cerebras API key (optional fallback) |
+| Variable | Required | Description |
+| --- | --- | --- |
+| `GEMINI_API_KEY` | Yes | Google AI API key (powers Gemma 3 27B primary) |
+| `GROQ_API_KEY` | Recommended | Groq API key (Llama 3.3 70B fallback) |
 
 :::caution
 Never commit your `.env` file to version control. It's already in `.gitignore`, but double-check before pushing.
 :::
 
 ## Provider Priority
 
-The LLM fallback chain follows this order:
+The LLM fallback chain uses cross-provider redundancy so quota limits on one provider don't cascade:
 
-1. **Gemma 3 27B** (primary, 14,400 RPD via `GEMINI_API_KEY`)
-2. **Gemini 2.5 Flash** (fallback, 20 RPD via `GEMINI_API_KEY`)
-3. **Gemini 2.5 Flash Lite** (fallback, 20 RPD via `GEMINI_API_KEY`)
-4. **Groq Llama 3.3 70B** (if `GROQ_API_KEY` is set)
-5. **Cerebras Llama 3.3 70B** (if `CEREBRAS_API_KEY` is set)
+1. **Gemma 3 27B** via Google (primary, `GEMINI_API_KEY`)
+2. **Llama 3.3 70B** via Groq (fallback, `GROQ_API_KEY`)
 
-If a provider fails (timeout, rate limit, malformed response), the system automatically tries the next one. All Google models (Gemma + Gemini) use the same API key.
+If a provider fails (timeout, rate limit, malformed response), the system automatically tries the next one. Because each provider uses a separate API key, their quotas are completely independent.
 
 ## Free Tier Limits
 
-| Provider | Model | RPM | RPD | Cost |
-| --- | --- | --- | --- | --- |
-| Gemma | 3 27B (primary) | 30 | 14,400 | Free (blocks at limit) |
-| Gemini | 2.5 Flash | 5 | 20 | Free (blocks at limit) |
-| Gemini | 2.5 Flash Lite | 10 | 20 | Free (blocks at limit) |
-| Groq | Llama 3.3 70B | 30 | 14,400 | Free |
-| Cerebras | Llama 3.3 70B | 30 | 1,000 | Free |
+| Provider | Model | RPM | RPD | TPM | Cost |
+| --- | --- | --- | --- | --- | --- |
+| Google | Gemma 3 27B | 30 | 14,400 | 15K | Free |
+| Groq | Llama 3.3 70B | 1000 | 14,400 | 12K | Free |
 
-**Key detail about Google AI:** The free tier will **block** requests at the limit, never auto-charge. You cannot accidentally incur costs.
+Both providers block at their limits and never auto-charge. You cannot accidentally incur costs.
+
+For the latest limits, see the official documentation:
+
+- [Google AI rate limits](https://ai.google.dev/gemini-api/docs/rate-limits)
+- [Groq rate limits](https://console.groq.com/docs/rate-limits)
 
 ## Rate Limiting
 
@@ -56,10 +54,11 @@ Adjust these values based on your expected traffic and API key limits.
 
 ## Timeouts
 
-The default timeout for LLM requests is 60 seconds:
+Each provider has its own timeout. [Vercel Fluid Compute](https://vercel.com/docs/fluid-compute) is enabled by default and allows up to 300 seconds on the Hobby plan:
 
 ```typescript
-const PROVIDER_TIMEOUT_MS = 60_000;
+// Gemma: 90s, Groq: 30s → worst case total: 120s
+const PROVIDER_TIMEOUTS_MS = [90_000, 30_000];
 ```
 
-Increase this if you're experiencing timeouts with longer resumes.
+Gemma 3 27B typically takes 30-45 seconds for the full scoring prompt but can spike under load. The 90s timeout gives generous headroom. Groq responds in under 1 second but gets 30s for safety. If both providers fail, the system falls back to rule-based scoring on the client side.
````
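The per-provider-timeout plus validate-inside-the-loop behavior described by this commit can be sketched as follows. This is a hypothetical illustration (names like `callWithFallback` are invented here, not the repo's actual implementation): each provider races its own timeout, and JSON parsing happens inside the loop so a malformed response triggers fallback exactly like a timeout or rate-limit error would.

```javascript
// Hypothetical sketch of the fallback loop described in the docs above.
const PROVIDER_TIMEOUTS_MS = [90_000, 30_000];

async function callWithFallback(providers, prompt) {
  for (let i = 0; i < providers.length; i++) {
    try {
      const raw = await Promise.race([
        providers[i](prompt),
        new Promise((_, reject) => {
          const t = setTimeout(
            () => reject(new Error('timeout')),
            PROVIDER_TIMEOUTS_MS[i]
          );
          t.unref?.(); // don't keep Node alive for a losing race (Node-only)
        })
      ]);
      // validation inside the retry loop: malformed JSON throws,
      // which falls through to the next provider
      return JSON.parse(raw);
    } catch (err) {
      console.warn(`provider ${i} failed: ${err.message}`);
    }
  }
  throw new Error('all providers failed');
}
```

Called with a Google fetcher first and a Groq fetcher second, a bad Gemma response (invalid JSON) silently triggers the Groq call instead of surfacing an error to the user.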

docs/src/content/docs/self-hosting/deployment.md

Lines changed: 1 addition & 2 deletions
```diff
@@ -24,8 +24,7 @@ In the Vercel dashboard:
 1. Go to your project > **Settings** > **Environment Variables**
 2. Add your API keys:
    - `GEMINI_API_KEY` (required)
-   - `GROQ_API_KEY` (optional fallback)
-   - `CEREBRAS_API_KEY` (optional fallback)
+   - `GROQ_API_KEY` (recommended fallback)
 3. Add your Firebase config (all `PUBLIC_FIREBASE_*` variables from `.env.example`)
 
 :::tip
```

docs/src/content/docs/self-hosting/setup.md

Lines changed: 3 additions & 9 deletions
```diff
@@ -9,7 +9,7 @@ ATS Screener can be self-hosted for free. You'll need at least one LLM API key.
 
 - **Node.js** 18+ (20 recommended)
 - **pnpm** 8+ (package manager)
-- A free API key from [Google AI Studio](https://aistudio.google.com/apikey) (required for Gemma/Gemini models)
+- A free API key from [Google AI Studio](https://aistudio.google.com/apikey) (required for Gemma 3 27B)
 
 ## Installation
 
@@ -33,20 +33,14 @@ cp .env.example .env
 2. Click "Create API Key"
 3. Add to `.env`: `GEMINI_API_KEY=your_key_here`
 
-### Groq (Optional Fallback)
+### Groq (Recommended Fallback)
 
 1. Go to [Groq Console](https://console.groq.com/keys)
 2. Create a new API key
 3. Add to `.env`: `GROQ_API_KEY=your_key_here`
 
-### Cerebras (Optional Fallback)
-
-1. Go to [Cerebras Cloud](https://cloud.cerebras.ai/)
-2. Generate an API key
-3. Add to `.env`: `CEREBRAS_API_KEY=your_key_here`
-
 :::tip
-You only need the **Google AI (Gemini) API key** to run the app. It powers Gemma 3 27B (14,400 RPD) as the primary model plus Gemini models as fallbacks. Groq and Cerebras are optional for additional availability.
+You need the **Google AI API key** to run the app (Gemma 3 27B primary, 14,400 RPD). Adding a **Groq API key** is strongly recommended as it provides a completely independent fallback (Llama 3.3 70B, 14,400 RPD) so users never see failures during peak traffic.
 :::
 
 ## Run Locally
```
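Putting the two key steps together, a minimal `.env` for self-hosting after this change might look like this (placeholder values, following the doc's own `your_key_here` convention):

```shell
# Minimal .env for self-hosting — placeholder values, not real keys
GEMINI_API_KEY=your_key_here   # required: powers Gemma 3 27B (primary)
GROQ_API_KEY=your_key_here     # recommended: Llama 3.3 70B fallback
```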

eslint.config.js

Lines changed: 2 additions & 1 deletion
```diff
@@ -41,7 +41,8 @@ export default ts.config(
 			'playwright-report/',
 			'test-results/',
 			'docs/',
-			'static/docs/'
+			'static/docs/',
+			'scripts/'
 		]
 	}
 );
```

scripts/test-gemma-json.mjs

Lines changed: 128 additions & 0 deletions
New file (`@@ -0,0 +1,128 @@`):

````javascript
import { readFileSync } from 'fs';

const envContent = readFileSync('.env', 'utf-8');
const GEMINI_KEY = envContent.match(/GEMINI_API_KEY=(.+)/)?.[1]?.trim();

if (!GEMINI_KEY) {
	console.error('no GEMINI_API_KEY in .env');
	process.exit(1);
}

// simulate the real scoring prompt (similar size to buildFullScoringPrompt)
const resumeText =
	'Sunny Patel. (437) 216-1611. Software Engineer Intern at IBM. IT Technician at Canadas Wonderland. Supported tenant-to-tenant migration during Six Flags acquisition for 3000+ directory objects. Authored 10+ PowerShell/ConnectWise scripts automating workstation imaging. Built and deployed MDT task sequences. Managed Active Directory accounts, security groups, and GPOs. System Support Specialist at Mackenzie Health. Migrated 400+ Surface tablets to bedside iPads. Skills: Java, Python, Go, Scala, PowerShell, C++, C#, YAML, Kotlin, Assembly, Django, Ruby on Rails, MongoDB, PostgreSQL, MySQL, Express.js, ASP.NET Core, Spring Boot, Kafka, React.js, JavaScript, Flutter, TypeScript, WebGL, GraphQL, Tailwind CSS, Three.js, Vue.js, Git, Docker, Kubernetes, Azure, GCP, AWS, Jamf Pro, Datadog. Education: Ontario Tech University, Honours BSc Computer Science. Projects: Axelot collaborative document platform with Next.js 16 and WebRTC, Netdash Electron networking toolkit with 15+ tools, SecureBank CTF banking app for SQL injection training, Sunnify Spotify downloader with PyQt5. Certifications: Microsoft GH-300 GitHub Copilot Intermediate, MongoDB Python Developer Path, GitHub Foundations, ConnectWise Automate Certified Enterprise Scripting Architect, Google IT Automation with Python.';

const scoringPrompt = `You are a senior talent acquisition technology analyst. Analyze this resume from the perspective of 6 enterprise ATS platforms.

<RESUME>
${resumeText}
</RESUME>

MODE: general ATS readiness. Evaluate formatting, structure, and keyword density.

## PLATFORM SPECIFICATIONS
### 1. WORKDAY RECRUITING - strict parser, skips headers/footers, penalizes creative formats
### 2. ORACLE TALEO - literal exact keyword match, strictest matching
### 3. iCIMS - semantic ML-based matching, most forgiving parser
### 4. GREENHOUSE - LLM-based parser, no auto-scoring, human review focused
### 5. LEVER - stemming-based matching, no ranking system
### 6. SAP SUCCESSFACTORS - Textkernel parser, taxonomy normalization

Score each platform on: formatting (0-100), keywordMatch (0-100), sections (0-100), experience (0-100), education (0-100), overallScore (0-100).

Respond ONLY with valid JSON matching this structure:
{
  "results": [
    {
      "system": "Workday",
      "vendor": "Workday Inc.",
      "overallScore": 75,
      "passesFilter": true,
      "breakdown": {
        "formatting": { "score": 80, "issues": [], "details": [] },
        "keywordMatch": { "score": 70, "matched": [], "missing": [], "synonymMatched": [] },
        "sections": { "score": 85, "present": [], "missing": [] },
        "experience": { "score": 75, "quantifiedBullets": 5, "totalBullets": 10, "actionVerbCount": 7, "highlights": [] },
        "education": { "score": 90, "notes": [] }
      },
      "suggestions": []
    }
  ]
}

Return exactly 6 results: Workday, Taleo, iCIMS, Greenhouse, Lever, SuccessFactors.`;

console.log('prompt length:', scoringPrompt.length, 'chars');
console.log('estimated tokens:', Math.ceil(scoringPrompt.length / 3.5));
console.log('');

async function test() {
	const start = Date.now();
	const res = await fetch(
		`https://generativelanguage.googleapis.com/v1beta/models/gemma-3-27b-it:generateContent?key=${GEMINI_KEY}`,
		{
			method: 'POST',
			headers: { 'Content-Type': 'application/json' },
			body: JSON.stringify({
				contents: [{ parts: [{ text: scoringPrompt }] }],
				generationConfig: { temperature: 0.3, topP: 0.85, maxOutputTokens: 16384 }
			})
		}
	);

	const elapsed = Date.now() - start;
	console.log('status:', res.status, `(${elapsed}ms)`);

	if (!res.ok) {
		const err = await res.text();
		console.log('ERROR:', err.slice(0, 500));
		return;
	}

	const data = await res.json();
	const text = data.candidates?.[0]?.content?.parts?.[0]?.text ?? '';
	console.log('response length:', text.length, 'chars');

	// try JSON parse (same logic as extractJSON in +server.ts)
	const trimmed = text.trim();

	// attempt 1: direct parse
	try {
		JSON.parse(trimmed);
		console.log('JSON parse: DIRECT SUCCESS');
		return;
	} catch {
		/* continue */
	}

	// attempt 2: strip markdown fences
	const cleaned = trimmed.replace(/```json\n?|\n?```/g, '').trim();
	try {
		JSON.parse(cleaned);
		console.log('JSON parse: SUCCESS (after fence strip)');
		return;
	} catch {
		/* continue */
	}

	// attempt 3: find { ... } block
	const s = cleaned.indexOf('{');
	const e = cleaned.lastIndexOf('}');
	if (s !== -1 && e > s) {
		try {
			JSON.parse(cleaned.slice(s, e + 1));
			console.log('JSON parse: SUCCESS (extracted { } block)');
			return;
		} catch {
			/* continue */
		}
	}

	console.log('JSON parse: FAILED - this is why Gemma falls through to Groq');
	console.log('--- first 500 chars of response ---');
	console.log(text.slice(0, 500));
	console.log('--- last 200 chars of response ---');
	console.log(text.slice(-200));
}

test().catch(console.error);
````
