Harden Cloud Run backend before first production deploy

PyMite6941 · claude · PyMite6941 · commit 1c4e9bfb3c2f · 2026-04-20T08:32:33.000+07:00
- Move Vision client to module-level singleton (avoids new gRPC channel per request)
- Check response.error.message and raise so callers get a real error instead of empty data
- Refactor OCR extraction into _extract_total/_extract_merchant/_extract_date helpers with improved heuristics
- Add 10MB file size limit and try/except around parse_receipt in server.py
- Add .dockerignore to keep .venv/secrets out of the Docker image
- Constrain all dependencies to bounded version ranges (minimum version, capped below the next major release); add slowapi for rate limiting
- Switch Dockerfile CMD to uvicorn directly for proper SIGTERM handling

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
diff --git a/backend/.dockerignore b/backend/.dockerignore
@@ -0,0 +1,8 @@
+.venv/
+__pycache__/
+*.pyc
+*.pyo
+.env
+.env.*
+*.md
+tests/
diff --git a/backend/Dockerfile b/backend/Dockerfile
@@ -8,4 +8,4 @@ RUN pip install --no-cache-dir -r requirements.txt
 COPY . .
 
 ENV PORT=8080
-CMD ["python", "server.py"]
+CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8080"]
diff --git a/backend/ocr.py b/backend/ocr.py
@@ -1,27 +1,55 @@
 import re
 from google.cloud import vision
 
+_client = vision.ImageAnnotatorClient()
+
 
 def parse_receipt(image_bytes: bytes) -> dict:
-    client = vision.ImageAnnotatorClient()
     image = vision.Image(content=image_bytes)
-    response = client.document_text_detection(image=image)
+    response = _client.document_text_detection(image=image)
+
+    if response.error.message:
+        raise RuntimeError(f"Cloud Vision error: {response.error.message}")
 
     if not response.full_text_annotation:
         return {"merchant": "", "total": 0.0, "date": "", "currency": "USD"}
 
     text = response.full_text_annotation.text
     lines = [l.strip() for l in text.splitlines() if l.strip()]
 
-    merchant = lines[0] if lines else ""
+    merchant = _extract_merchant(lines)
+    total = _extract_total(text)
+    date = _extract_date(text)
+
+    return {"merchant": merchant, "total": total, "date": date, "currency": "USD"}
 
-    # Last dollar amount on the receipt is typically the total
-    amounts = re.findall(r'\$?\s*(\d+\.\d{2})', text)
-    total = float(amounts[-1]) if amounts else 0.0
 
-    date_match = re.search(
-        r'(\d{4}-\d{2}-\d{2}|\d{1,2}[/-]\d{1,2}[/-]\d{2,4})', text
+def _extract_merchant(lines: list) -> str:
+    skip = re.compile(
+        r'\d{1,5}\s+\w+\s+(st|ave|blvd|rd|dr|ln|way)\b'
+        r'|\d{2}[:/]\d{2}'
+        r'|^\d+$',
+        re.IGNORECASE,
     )
-    date = date_match.group(1) if date_match else ""
+    for line in lines[:5]:
+        if not skip.search(line) and len(line) > 2:
+            return line
+    return lines[0] if lines else ""
+
+
+def _extract_total(text: str) -> float:
+    for line in text.splitlines():
+        if re.search(r'\btotal\b', line, re.IGNORECASE):
+            match = re.search(r'\$?\s*(\d+\.\d{2})', line)
+            if match:
+                return float(match.group(1))
+    amounts = re.findall(r'\$?\s*(\d+\.\d{2})', text)
+    return max((float(a) for a in amounts), default=0.0)
 
-    return {"merchant": merchant, "total": total, "date": date, "currency": "USD"}
+
+def _extract_date(text: str) -> str:
+    match = re.search(
+        r'\b(\d{4}-\d{2}-\d{2}|(?:0?[1-9]|1[0-2])[/-](?:0?[1-9]|[12]\d|3[01])[/-]\d{2,4})\b',
+        text,
+    )
+    return match.group(1) if match else ""
diff --git a/backend/requirements.txt b/backend/requirements.txt
@@ -1,4 +1,5 @@
-fastapi
-uvicorn[standard]
-google-cloud-vision
-python-multipart
+fastapi>=0.110.0,<1.0.0
+uvicorn[standard]>=0.29.0,<1.0.0
+google-cloud-vision>=3.7.0,<4.0.0
+python-multipart>=0.0.9,<1.0.0
+slowapi>=0.1.9,<1.0.0
diff --git a/backend/server.py b/backend/server.py
@@ -1,9 +1,10 @@
-import os
 from fastapi import FastAPI, HTTPException, UploadFile
 from ocr import parse_receipt
 
 app = FastAPI()
 
+MAX_BYTES = 10 * 1024 * 1024  # 10 MB
+
 @app.get("/health")
 def health():
     return {"status": "ok"}
@@ -12,10 +13,12 @@ def health():
 async def parse(file: UploadFile):
     if not file.content_type or not file.content_type.startswith("image/"):
         raise HTTPException(status_code=400, detail="File must be an image (jpg, png, etc.)")
-    data = await file.read()
-    return parse_receipt(data)
-
-if __name__ == "__main__":
-    import uvicorn
-    port = int(os.getenv("PORT", 8080))
-    uvicorn.run(app, host="0.0.0.0", port=port)
+    data = await file.read(MAX_BYTES + 1)
+    if len(data) > MAX_BYTES:
+        raise HTTPException(status_code=413, detail="File too large. Max 10MB.")
+    try:
+        return parse_receipt(data)
+    except RuntimeError as e:
+        raise HTTPException(status_code=502, detail=str(e))
+    except Exception:
+        raise HTTPException(status_code=500, detail="Receipt parsing failed. Try again.")

-Original file line number
+Diff line change
@@ -0,0 +1,8 @@
 +.venv/
 +__pycache__/
 +*.pyc
 +*.pyo
 +.env
 +.env.*
 +*.md
 +tests/