Skip to content

Commit c4b3431

Browse files
authored
removed http error code swallowing (#11)
1 parent d814407 commit c4b3431

3 files changed

Lines changed: 34 additions & 138 deletions

File tree

backend/main.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
1+
import json
12
import logging
23
from contextlib import asynccontextmanager
3-
from fastapi import FastAPI
4+
from fastapi import FastAPI, Request
45
from fastapi.middleware.cors import CORSMiddleware
6+
from fastapi.responses import Response
57
from sqlmodel import create_engine
68
from backend.config import get_settings
79
from backend.services.metrics_service import metrics_collector
810
from backend.middleware.logging import AccessLogMiddleware
11+
from backend.models.protocols import BackendHTTPError
912
from backend.routers import (
1013
completions,
1114
responses,
@@ -45,6 +48,19 @@ async def lifespan(app: FastAPI):
4548
allow_headers=["*"],
4649
)
4750

51+
52+
@app.exception_handler(BackendHTTPError)
53+
async def backend_http_error_handler(request: Request, exc: BackendHTTPError):
54+
try:
55+
json.loads(exc.body)
56+
media_type = "application/json"
57+
content = exc.body
58+
except (json.JSONDecodeError, TypeError):
59+
media_type = "text/plain"
60+
content = exc.body
61+
return Response(content=content, status_code=exc.status_code, media_type=media_type)
62+
63+
4864
app.include_router(completions.router)
4965
app.include_router(responses.router)
5066
app.include_router(embeddings.router)

backend/models/protocols.py

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -297,16 +297,14 @@ def json(self, **kwargs):
297297
return self.model_dump_json()
298298

299299

300-
class RetryConstantError(Exception):
301-
pass
302-
303-
304-
class RetryExpoError(Exception):
305-
pass
306-
307-
308-
class UnknownLLMError(Exception):
309-
pass
300+
class BackendHTTPError(Exception):
301+
"""Error from the backend (vllm/sglang) passed through to the client
302+
with the original status code and body. No retries — clients handle their own."""
303+
304+
def __init__(self, status_code: int, body: str):
305+
self.status_code = status_code
306+
self.body = body
307+
super().__init__(f"HTTP {status_code}: {body}")
310308

311309

312310
class ProviderKeySubmission(BaseModel):

backend/services/llm_service.py

Lines changed: 9 additions & 127 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,9 @@
11
import json
2-
import backoff
32
import aiohttp
43
from typing import Dict, Union
54
from backend.models.protocols import (
65
ModelResponse,
7-
RetryConstantError,
8-
RetryExpoError,
9-
UnknownLLMError,
6+
BackendHTTPError,
107
LLMRequest,
118
LLMCompletionsRequest,
129
)
@@ -122,18 +119,6 @@ async def response_generator_raw(response):
122119
active_requests -= 1
123120

124121

125-
def handle_llm_exception(e: Exception):
126-
if isinstance(e, aiohttp.ClientResponseError):
127-
if e.status in [408, 429, 500, 502, 503, 504]:
128-
raise RetryExpoError(f"HTTP {e.status}: {e.message}") from e
129-
else:
130-
raise RetryConstantError(f"HTTP {e.status}: {e.message}") from e
131-
elif isinstance(e, (aiohttp.ClientError, aiohttp.ServerTimeoutError)):
132-
raise RetryConstantError(str(e)) from e
133-
else:
134-
raise UnknownLLMError from e
135-
136-
137122
class StreamWrapper:
138123
def __init__(self, gen, headers=None):
139124
self.gen = gen
@@ -173,11 +158,7 @@ async def _execute_http_request(
173158
text = str(resp.status)
174159
await req_cm.__aexit__(None, None, None)
175160
await session.close()
176-
177-
if resp.status in [429, 500, 502, 503, 504]:
178-
raise RetryExpoError(f"HTTP {resp.status}: {text}")
179-
else:
180-
raise RetryConstantError(f"HTTP {resp.status}: {text}")
161+
raise BackendHTTPError(status_code=resp.status, body=text)
181162

182163
response_headers = dict(resp.headers)
183164
if stream:
@@ -274,26 +255,18 @@ async def _shared_proxy_handler(
274255

275256
return resp
276257

277-
except Exception as e:
258+
except BackendHTTPError:
278259
active_requests -= 1
279260
if not session.closed:
280261
await session.close()
281-
handle_llm_exception(e)
262+
raise
263+
except Exception:
264+
active_requests -= 1
265+
if not session.closed:
266+
await session.close()
267+
raise
282268

283269

284-
@backoff.on_exception(
285-
wait_gen=backoff.constant,
286-
exception=RetryConstantError,
287-
max_tries=3,
288-
interval=3,
289-
)
290-
@backoff.on_exception(
291-
wait_gen=backoff.expo,
292-
exception=RetryExpoError,
293-
jitter=backoff.full_jitter,
294-
max_value=100,
295-
factor=1.5,
296-
)
297270
async def llm_proxy(endpoint, api_key, request: LLMRequest) -> ModelResponse:
298271
return await _shared_proxy_handler(
299272
endpoint=endpoint,
@@ -306,19 +279,6 @@ async def llm_proxy(endpoint, api_key, request: LLMRequest) -> ModelResponse:
306279
)
307280

308281

309-
@backoff.on_exception(
310-
wait_gen=backoff.constant,
311-
exception=RetryConstantError,
312-
max_tries=3,
313-
interval=3,
314-
)
315-
@backoff.on_exception(
316-
wait_gen=backoff.expo,
317-
exception=RetryExpoError,
318-
jitter=backoff.full_jitter,
319-
max_value=100,
320-
factor=1.5,
321-
)
322282
async def llm_proxy_completions(
323283
endpoint, api_key, request: LLMCompletionsRequest
324284
) -> ModelResponse:
@@ -333,19 +293,6 @@ async def llm_proxy_completions(
333293
)
334294

335295

336-
@backoff.on_exception(
337-
wait_gen=backoff.constant,
338-
exception=RetryConstantError,
339-
max_tries=3,
340-
interval=3,
341-
)
342-
@backoff.on_exception(
343-
wait_gen=backoff.expo,
344-
exception=RetryExpoError,
345-
jitter=backoff.full_jitter,
346-
max_value=100,
347-
factor=1.5,
348-
)
349296
async def llm_proxy_embeddings(endpoint, api_key, **kwargs) -> ModelResponse:
350297
embedding_params = {
351298
"model": kwargs.get("model"),
@@ -368,19 +315,6 @@ async def llm_proxy_embeddings(endpoint, api_key, **kwargs) -> ModelResponse:
368315
)
369316

370317

371-
@backoff.on_exception(
372-
wait_gen=backoff.constant,
373-
exception=RetryConstantError,
374-
max_tries=3,
375-
interval=3,
376-
)
377-
@backoff.on_exception(
378-
wait_gen=backoff.expo,
379-
exception=RetryExpoError,
380-
jitter=backoff.full_jitter,
381-
max_value=100,
382-
factor=1.5,
383-
)
384318
async def llm_proxy_responses(
385319
endpoint, api_key, payload: dict, stream: bool, model: str
386320
):
@@ -396,19 +330,6 @@ async def llm_proxy_responses(
396330
)
397331

398332

399-
@backoff.on_exception(
400-
wait_gen=backoff.constant,
401-
exception=RetryConstantError,
402-
max_tries=3,
403-
interval=3,
404-
)
405-
@backoff.on_exception(
406-
wait_gen=backoff.expo,
407-
exception=RetryExpoError,
408-
jitter=backoff.full_jitter,
409-
max_value=100,
410-
factor=1.5,
411-
)
412333
async def llm_proxy_rerank(endpoint, api_key, payload: dict, model: str):
413334
return await _shared_proxy_handler(
414335
endpoint=endpoint,
@@ -422,19 +343,6 @@ async def llm_proxy_rerank(endpoint, api_key, payload: dict, model: str):
422343
)
423344

424345

425-
@backoff.on_exception(
426-
wait_gen=backoff.constant,
427-
exception=RetryConstantError,
428-
max_tries=3,
429-
interval=3,
430-
)
431-
@backoff.on_exception(
432-
wait_gen=backoff.expo,
433-
exception=RetryExpoError,
434-
jitter=backoff.full_jitter,
435-
max_value=100,
436-
factor=1.5,
437-
)
438346
async def llm_proxy_score(endpoint, api_key, payload: dict, model: str):
439347
return await _shared_proxy_handler(
440348
endpoint=endpoint,
@@ -448,19 +356,6 @@ async def llm_proxy_score(endpoint, api_key, payload: dict, model: str):
448356
)
449357

450358

451-
@backoff.on_exception(
452-
wait_gen=backoff.constant,
453-
exception=RetryConstantError,
454-
max_tries=3,
455-
interval=3,
456-
)
457-
@backoff.on_exception(
458-
wait_gen=backoff.expo,
459-
exception=RetryExpoError,
460-
jitter=backoff.full_jitter,
461-
max_value=100,
462-
factor=1.5,
463-
)
464359
async def llm_proxy_tokenize(endpoint, api_key, payload: dict, model: str):
465360
return await _shared_proxy_handler(
466361
endpoint=endpoint,
@@ -474,19 +369,6 @@ async def llm_proxy_tokenize(endpoint, api_key, payload: dict, model: str):
474369
)
475370

476371

477-
@backoff.on_exception(
478-
wait_gen=backoff.constant,
479-
exception=RetryConstantError,
480-
max_tries=3,
481-
interval=3,
482-
)
483-
@backoff.on_exception(
484-
wait_gen=backoff.expo,
485-
exception=RetryExpoError,
486-
jitter=backoff.full_jitter,
487-
max_value=100,
488-
factor=1.5,
489-
)
490372
async def llm_proxy_detokenize(endpoint, api_key, payload: dict, model: str):
491373
return await _shared_proxy_handler(
492374
endpoint=endpoint,

0 commit comments

Comments
 (0)