quilrai
diff --git a/docs/llm-gateway/architecture.md b/docs/llm-gateway/architecture.md
Lines changed: 1 addition & 1 deletion
diff --git a/docs/llm-gateway/bedrock-boto3.md b/docs/llm-gateway/bedrock-boto3.md
Lines changed: 3 additions & 3 deletions
diff --git a/docs/llm-gateway/features/copilot-studio.md b/docs/llm-gateway/features/copilot-studio.md
Lines changed: 8 additions & 7 deletions
diff --git a/docs/llm-gateway/features/prompt-store.md b/docs/llm-gateway/features/prompt-store.md
Lines changed: 2 additions & 2 deletions
diff --git a/docs/llm-gateway/features/request-routing.md b/docs/llm-gateway/features/request-routing.md
Lines changed: 2 additions & 2 deletions
diff --git a/docs/llm-gateway/features/sdk-mode.md b/docs/llm-gateway/features/sdk-mode.md
Lines changed: 6 additions & 6 deletions
diff --git a/docs/llm-gateway/ha-and-sla.md b/docs/llm-gateway/ha-and-sla.md
Lines changed: 30 additions & 23 deletions
@@ -12,7 +12,7 @@ How the QuilrAI LLM Gateway processes every request - from your application to t
   source={{
     label: "Your Application",
     code: `client = OpenAI(
-    base_url='https://guardrails.quilr.ai/openai_compatible/',
+    base_url='https://guardrails-usa-2.quilr.ai/openai_compatible/',
     api_key='sk-quilr-xxx'
 )
 client.chat.completions.create(
 
@@ -41,7 +41,7 @@ For assume-role setup, see [AWS Bedrock - Assume Role Setup](./bedrock-assume-ro
 
 ## Configure boto3
 
-Set `endpoint_url` to the QuilrAI Bedrock Runtime endpoint. Use the same QuilrAI key for both `aws_access_key_id` and `aws_secret_access_key`; the gateway uses SigV4 to authenticate the request.
+Set `endpoint_url` to the closest regional QuilrAI Bedrock Runtime endpoint. Use the same QuilrAI key for both `aws_access_key_id` and `aws_secret_access_key`; the gateway uses SigV4 to authenticate the request.
 
 ```python
 import boto3
@@ -52,7 +52,7 @@ QUILR_KEY = "sk-quilr-xxx"
 bedrock = boto3.client(
     "bedrock-runtime",
     region_name="us-east-1",
-    endpoint_url="https://guardrails.quilr.ai/bedrock-runtime",
+    endpoint_url="https://guardrails-usa-2.quilr.ai/bedrock-runtime",
     aws_access_key_id=QUILR_KEY,
     aws_secret_access_key=QUILR_KEY,
     config=Config(read_timeout=300),
@@ -75,7 +75,7 @@ print(response["output"]["message"]["content"][0]["text"])
 You can also set `endpoint_url` to the service root:
 
 ```python
-endpoint_url="https://guardrails.quilr.ai"
+endpoint_url="https://guardrails-usa-2.quilr.ai"
 ```
 
 Both endpoint styles are accepted.
 
@@ -24,18 +24,19 @@ Microsoft external threat detection is called for generative agents that use gen
 
 ## Endpoint
 
-Create a QuilrAI key with provider `copilot_studio`, then use this endpoint as the external threat detection base URL:
+Create a QuilrAI key with provider `copilot_studio`, then use the closest regional endpoint as the external threat detection base URL:
 
 ```text
-https://guardrails.quilr.ai/copilot_studio/sk-quilr-xxx
+https://guardrails-usa-2.quilr.ai/copilot_studio/sk-quilr-xxx
 ```
 
-Use the regional base URL if your tenant uses a regional QuilrAI deployment:
+The example uses US East. Choose the nearest regional base URL for your tenant:
 
 | Endpoint | Region | Endpoint base |
 |----------|--------|---------------|
 | Global (auto-routed) | Nearest | `https://guardrails.quilr.ai/copilot_studio/sk-quilr-xxx` |
-| USA | US East | `https://guardrails-usa-1.quilr.ai/copilot_studio/sk-quilr-xxx` |
+| USA 1 | US Central West | `https://guardrails-usa-1.quilr.ai/copilot_studio/sk-quilr-xxx` |
+| USA 2 | US East | `https://guardrails-usa-2.quilr.ai/copilot_studio/sk-quilr-xxx` |
 | India | Mumbai | `https://guardrails-india-1.quilr.ai/copilot_studio/sk-quilr-xxx` |
 
 Treat this URL as a secret. The QuilrAI key is part of the path because Copilot Studio owns the webhook call shape.
@@ -49,11 +50,11 @@ Copilot Studio appends these paths to the endpoint base:
 | `POST /validate` | Checks that the QuilrAI endpoint is reachable and ready. |
 | `POST /analyze-tool-execution` | Sends proposed tool execution context for allow/block evaluation. |
 
-For example, if the endpoint base is `https://guardrails.quilr.ai/copilot_studio/sk-quilr-xxx`, Copilot Studio calls:
+For example, if the endpoint base is `https://guardrails-usa-2.quilr.ai/copilot_studio/sk-quilr-xxx`, Copilot Studio calls:
 
 ```text
-https://guardrails.quilr.ai/copilot_studio/sk-quilr-xxx/validate
-https://guardrails.quilr.ai/copilot_studio/sk-quilr-xxx/analyze-tool-execution
+https://guardrails-usa-2.quilr.ai/copilot_studio/sk-quilr-xxx/validate
+https://guardrails-usa-2.quilr.ai/copilot_studio/sk-quilr-xxx/analyze-tool-execution
 ```
 
 Copilot Studio may also include an `api-version` query parameter. QuilrAI ignores unknown query parameters.
 
@@ -79,7 +79,7 @@ This applies uniformly across Chat Completions, Anthropic Messages (both the top
 from openai import OpenAI
 
 client = OpenAI(
-    base_url='https://guardrails.quilr.ai/openai_compatible/',
+    base_url='https://guardrails-usa-2.quilr.ai/openai_compatible/',
     api_key='sk-quilr-xxx'
 )
 
@@ -101,7 +101,7 @@ response = client.chat.completions.create(
 import anthropic
 
 client = anthropic.Anthropic(
-    base_url='https://guardrails.quilr.ai/anthropic_messages/',
+    base_url='https://guardrails-usa-2.quilr.ai/anthropic_messages/',
     api_key='sk-quilr-xxx'
 )
 
 
@@ -135,7 +135,7 @@ Your code still sends `model="gpt-4.1"` - zero code changes, but requests get ro
 from openai import OpenAI
 
 client = OpenAI(
-    base_url='https://guardrails.quilr.ai/openai_compatible/',
+    base_url='https://guardrails-usa-2.quilr.ai/openai_compatible/',
     api_key='sk-quilr-xxx'
 )
 
@@ -150,7 +150,7 @@ print(response.choices[0].message.content)
 ### cURL
 
 ```bash
-curl https://guardrails.quilr.ai/openai_compatible/v1/chat/completions \
+curl https://guardrails-usa-2.quilr.ai/openai_compatible/v1/chat/completions \
   -H "Content-Type: application/json" \
   -H "Authorization: Bearer sk-quilr-xxx" \
   -d '{
 
@@ -118,7 +118,7 @@ A typical pattern: check the user message before sending it to your LLM, then ch
 ```python
 import httpx
 
-QUILR_BASE = "https://guardrails.quilr.ai"
+QUILR_BASE = "https://guardrails-usa-2.quilr.ai"
 QUILR_SDK_KEY = "sk-quilr-xxx"
 
 async def check_messages(messages: list[dict]) -> dict:
@@ -184,7 +184,7 @@ asyncio.run(safe_chat("What is my SSN?"))
 ```python
 import requests
 
-QUILR_BASE = "https://guardrails.quilr.ai"
+QUILR_BASE = "https://guardrails-usa-2.quilr.ai"
 QUILR_SDK_KEY = "sk-quilr-xxx"
 
 def check_text(text: str, type_: str = "response") -> dict:
@@ -212,7 +212,7 @@ match result["status"]:
 ### JavaScript / TypeScript - `fetch`
 
 ```typescript
-const QUILR_BASE = "https://guardrails.quilr.ai";
+const QUILR_BASE = "https://guardrails-usa-2.quilr.ai";
 const QUILR_SDK_KEY = "sk-quilr-xxx";
 
 async function checkMessages(messages: Array<{ role: string; content: string }>) {
@@ -269,13 +269,13 @@ async function safeChat(userMessage: string): Promise<string> {
 
 ```bash
 # Check raw text
-curl -X POST https://guardrails.quilr.ai/sdk/v1/check \
+curl -X POST https://guardrails-usa-2.quilr.ai/sdk/v1/check \
   -H "Authorization: Bearer sk-quilr-xxx" \
   -H "Content-Type: application/json" \
   -d '{"text": "Call me at 555-867-5309", "type": "request"}'
 
 # Check a conversation
-curl -X POST https://guardrails.quilr.ai/sdk/v1/check \
+curl -X POST https://guardrails-usa-2.quilr.ai/sdk/v1/check \
   -H "Authorization: Bearer sk-quilr-xxx" \
   -H "Content-Type: application/json" \
   -d '{
@@ -305,7 +305,7 @@ Or copy `quilr_litellm_guardrails.py` into your project.
 | Variable | Required | Default | Description |
 |----------|----------|---------|-------------|
 | `QUILR_GUARDRAILS_KEY` | Yes | - | Your `quilr_sdk` API key |
-| `QUILR_GUARDRAILS_BASE_URL` | No | `https://guardrails.quilr.ai` | Override for self-hosted deployments |
+| `QUILR_GUARDRAILS_BASE_URL` | No | `https://guardrails.quilr.ai` | Override with the closest regional endpoint for production or with a self-hosted deployment URL |
 | `QUILR_GUARDRAILS_TIMEOUT` | No | `3` | Seconds before the check times out (request passes on timeout) |
 | `APPLY_QUILR_GUARDRAILS_FOR_MODELS` | No | (all) | Comma-separated list of models to restrict guardrails to |
 | `APPLY_QUILR_GUARDRAILS_FOR_KEY_NAMES` | No | (all) | Comma-separated list of LiteLLM key names to restrict guardrails to |
 
@@ -15,29 +15,34 @@ All endpoints are fully interchangeable - same API surface, same features, same
 | Endpoint | Region | Base URL |
 |----------|--------|----------|
 | **Global (auto-routed)** | Nearest | `https://guardrails.quilr.ai` |
-| **USA** | US East | `https://guardrails-usa-1.quilr.ai` |
+| **USA 1** | US Central West | `https://guardrails-usa-1.quilr.ai` |
+| **USA 2** | US East | `https://guardrails-usa-2.quilr.ai` |
 | **India** | Mumbai | `https://guardrails-india-1.quilr.ai` |
 
 Append the API format path to any base URL - for example, `https://guardrails-usa-1.quilr.ai/openai_compatible/`. See the [Integration Guide](./integration-guide) for all supported formats.
 
+For production traffic, choose the location-specific endpoint closest to your application as the primary base URL. Use `guardrails.quilr.ai` only when you explicitly want global auto-routing.
+
 :::info Expanding regions
 This list will continue to grow as we bring new regions online. Check this page or the [Integration Guide](./integration-guide) for the latest endpoints.
 :::
 
 ## Routing Architecture
 
-When you send a request to `guardrails.quilr.ai`, it automatically routes to the nearest available gateway server based on your geographic location. No configuration needed.
+Requests sent to `guardrails.quilr.ai` are automatically routed to the nearest available gateway server based on your geographic location. For predictable production routing, use a regional endpoint directly.
 
 ```mermaid
 flowchart TD
     A["Your Application"] --> B["guardrails.quilr.ai"]
     B --> C{"Auto-route to<br/>nearest server"}
-    C -->|"US traffic"| D["guardrails-usa-1.quilr.ai"]
-    C -->|"India traffic"| E["guardrails-india-1.quilr.ai"]
-    C -->|"Future regions"| F["..."]
+    C -->|"US Central West traffic"| D["guardrails-usa-1.quilr.ai"]
+    C -->|"US East traffic"| E["guardrails-usa-2.quilr.ai"]
+    C -->|"India traffic"| F["guardrails-india-1.quilr.ai"]
+    C -->|"Future regions"| H["..."]
     D --> G["LLM Providers"]
     E --> G
     F --> G
+    H --> G
 ```
 
 Each regional server runs the full QuilrAI pipeline - validation, scanning, transformation, routing, and observability - so there is no functional difference between endpoints.
@@ -48,17 +53,17 @@ Each regional server runs the full QuilrAI pipeline - validation, scanning, tran
   {
     label: "Attempt 1",
     items: [
-      "→ guardrails.quilr.ai",
-      "Auto-routes to nearest ✓",
-      "Optimal latency ✓",
+      "→ guardrails-usa-2.quilr.ai",
+      "Direct to US East server ✓",
+      "Primary regional endpoint ✓",
     ],
   },
   {
     label: "Attempt 2",
     items: [
       "→ guardrails-usa-1.quilr.ai",
-      "Direct to US server ✓",
-      "Bypasses auto-routing ✓",
+      "Direct to US Central West server ✓",
+      "Host-level redundancy ✓",
     ],
   },
   {
@@ -71,24 +76,26 @@ Each regional server runs the full QuilrAI pipeline - validation, scanning, tran
   },
 ]} />
 
-Even though `guardrails.quilr.ai` auto-routes to the nearest healthy server, we recommend a three-tier retry strategy that falls back to explicit regional endpoints:
+For production retry logic, use explicit regional endpoints. Start with the location-specific endpoint closest to your application, then fail over to other regional hosts. Do not include the global auto-routed endpoint in the retry chain.
+
+Example order for a US East deployment:
 
-1. **First attempt** - `guardrails.quilr.ai` - Uses auto-routing for optimal latency under normal conditions.
-2. **Second attempt** - `guardrails-usa-1.quilr.ai` - Direct connection to the US server, bypassing the auto-routing layer entirely.
+1. **First attempt** - `guardrails-usa-2.quilr.ai` - Direct connection to the nearest regional server.
+2. **Second attempt** - `guardrails-usa-1.quilr.ai` - Direct connection to another US server for host-level redundancy.
 3. **Third attempt** - `guardrails-india-1.quilr.ai` - Targets a geographically distinct server for maximum redundancy.
 
 ### Why retry with regional endpoints?
 
-Auto-routing handles most failure scenarios transparently. However, explicit regional fallbacks protect against edge cases that auto-routing alone cannot cover:
+Explicit regional fallbacks protect against edge cases that auto-routing alone cannot cover:
 
-- **DNS or routing-layer issues** - If the global endpoint's routing layer itself is degraded, direct regional URLs bypass it entirely.
-- **Auto-routing detection latency** - The auto-router takes 3-7 seconds to detect a downed host. During this window, your request may still be routed to the unhealthy server. Retrying with an explicit regional URL immediately targets a different host, avoiding the detection delay.
+- **DNS or routing-layer issues** - Direct regional URLs bypass the global routing layer entirely.
+- **Deterministic failover** - Retrying with an explicit regional URL immediately targets a different host instead of letting the auto-router choose.
 - **Regional propagation delays** - A server that has just recovered may not yet be visible to the auto-router. Hitting it directly avoids propagation lag.
 - **Geographic redundancy** - Retrying across regions ensures your request reaches an entirely independent infrastructure stack, eliminating single points of failure.
 
 The overhead is minimal - two additional fallback URLs in your retry logic - but the resilience improvement is significant.
 
-We recommend **one retry per QuilrAI host**. If a request fails on a given endpoint, move on to the next one rather than retrying the same host. This maximizes the chance of hitting a healthy server quickly, especially during the 3-7 second window before auto-routing detects a failure.
+We recommend **one attempt per QuilrAI host**. If a request fails on a given endpoint, move on to the next one rather than retrying the same host. This maximizes the chance of hitting a healthy server quickly.
 
 ### Code Example
 
@@ -97,8 +104,8 @@ import time
 import httpx
 
 ENDPOINTS = [
-    "https://guardrails.quilr.ai",           # auto-routes to nearest
-    "https://guardrails-usa-1.quilr.ai",      # direct US fallback
+    "https://guardrails-usa-2.quilr.ai",      # primary US East endpoint
+    "https://guardrails-usa-1.quilr.ai",      # direct US Central West fallback
     "https://guardrails-india-1.quilr.ai",    # direct India fallback
 ]
 
@@ -124,8 +131,8 @@ import time
 from openai import OpenAI
 
 ENDPOINTS = [
-    "https://guardrails.quilr.ai/openai_compatible/v1",        # auto-routes to nearest
-    "https://guardrails-usa-1.quilr.ai/openai_compatible/v1",   # direct US fallback
+    "https://guardrails-usa-2.quilr.ai/openai_compatible/v1",   # primary US East endpoint
+    "https://guardrails-usa-1.quilr.ai/openai_compatible/v1",   # direct US Central West fallback
     "https://guardrails-india-1.quilr.ai/openai_compatible/v1", # direct India fallback
 ]
 
@@ -146,8 +153,8 @@ def call_llm(messages: list) -> str:
 
 ```javascript
 const ENDPOINTS = [
-  "https://guardrails.quilr.ai",           // auto-routes to nearest
-  "https://guardrails-usa-1.quilr.ai",     // direct US fallback
+  "https://guardrails-usa-2.quilr.ai",     // primary US East endpoint
+  "https://guardrails-usa-1.quilr.ai",     // direct US Central West fallback
   "https://guardrails-india-1.quilr.ai",   // direct India fallback
 ];
Original file line number	Diff line number	Diff line change
`@@ -12,7 +12,7 @@ How the QuilrAI LLM Gateway processes every request - from your application to t`
`12`	`12`	`source={{`
`13`	`13`	`label: "Your Application",`
`14`	`14`	`` code: `client = OpenAI( ``
`15`		`- base_url='https://guardrails.quilr.ai/openai_compatible/',`
	`15`	`+ base_url='https://guardrails-usa-2.quilr.ai/openai_compatible/',`
`16`	`16`	`api_key='sk-quilr-xxx'`
`17`	`17`	`)`
`18`	`18`	`client.chat.completions.create(`
Original file line number	Diff line number	Diff line change
`@@ -79,7 +79,7 @@ This applies uniformly across Chat Completions, Anthropic Messages (both the top`
`79`	`79`	`from openai import OpenAI`
`80`	`80`
`81`	`81`	`client = OpenAI(`
`82`		`- base_url='https://guardrails.quilr.ai/openai_compatible/',`
	`82`	`+ base_url='https://guardrails-usa-2.quilr.ai/openai_compatible/',`
`83`	`83`	`api_key='sk-quilr-xxx'`
`84`	`84`	`)`
`85`	`85`
`@@ -101,7 +101,7 @@ response = client.chat.completions.create(`
`101`	`101`	`import anthropic`
`102`	`102`
`103`	`103`	`client = anthropic.Anthropic(`
`104`		`- base_url='https://guardrails.quilr.ai/anthropic_messages/',`
	`104`	`+ base_url='https://guardrails-usa-2.quilr.ai/anthropic_messages/',`
`105`	`105`	`api_key='sk-quilr-xxx'`
`106`	`106`	`)`
`107`	`107`