|
1 | 1 | ############################################################ |
2 | 2 | # Example Environment (.env example) |
| 3 | +# |
| 4 | +# Copy this file to `.env` and tweak as needed. |
3 | 5 | ############################################################ |
4 | 6 |
|
5 | | -# Backend |
| 7 | +# ========================= |
| 8 | +# Backend (Embeddings) |
| 9 | +# ========================= |
| 10 | +# auto | mlx | torch (auto picks MLX on Apple Silicon) |
6 | 11 | BACKEND=auto |
| 12 | +# Default Apple-Silicon-friendly embedding model |
7 | 13 | MODEL_NAME=mlx-community/Qwen3-Embedding-4B-4bit-DWQ |
| 14 | +# Optional: local MLX-converted model directory (overrides HF cache) |
8 | 15 | MODEL_PATH= |
9 | | -CROSS_ENCODER_MODEL= |
10 | 16 |
|
11 | | -# Reranker (Cross-Encoder) — Optional |
12 | | -# Choose one of the following ways to enable reranking: |
13 | | -# 1) Torch CrossEncoder (sentence-transformers) |
14 | | -# RERANKER_BACKEND=torch |
15 | | -# CROSS_ENCODER_MODEL=cross-encoder/ms-marco-MiniLM-L-6-v2 |
16 | | -# |
17 | | -# 2) MLX Reranker (experimental v1; pooled token embeddings + linear head) |
18 | | -# RERANKER_BACKEND=mlx |
19 | | -# RERANKER_MODEL_ID=vserifsaglam/Qwen3-Reranker-4B-4bit-MLX |
20 | | -# |
21 | | -# Auto selection prefers Torch for stability (set RERANKER_BACKEND=auto) |
22 | | -RERANKER_BACKEND=auto |
23 | | -RERANKER_MODEL_ID= |
24 | | -# Alias for convenience; same as RERANKER_MODEL_ID |
25 | | -RERANKER_MODEL_NAME= |
26 | | -# Optional overrides |
27 | | -RERANK_MAX_SEQ_LEN=512 |
28 | | -RERANK_BATCH_SIZE=16 |
| 17 | +# Embedding dimension strategy |
| 18 | +# - as_is → use backend output dimension |
| 19 | +# - hidden_size → use model hidden_size / HF metadata (2560 for Qwen3-Embedding-4B) |
| 20 | +# - pad_or_truncate → force OUTPUT_EMBEDDING_DIMENSION (pads or truncates vectors) |
| 21 | +DIMENSION_STRATEGY=hidden_size |
| 22 | +# OUTPUT_EMBEDDING_DIMENSION=2560 # uncomment and set DIMENSION_STRATEGY=pad_or_truncate to force a fixed size |
| 23 | + |
| 24 | +# ========================= |
| 25 | +# Performance (Embeddings) |
| 26 | +# ========================= |
| 27 | +BATCH_SIZE=32 |
| 28 | +MAX_BATCH_SIZE=128 |
| 29 | +MAX_TEXTS_PER_REQUEST=100 |
| 30 | +MAX_PASSAGES_PER_RERANK=1000 |
| 31 | +# Logical max tokens per text; actual limits are inferred from model metadata |
| 32 | +MAX_SEQUENCE_LENGTH=8192 |
| 33 | +DEVICE_MEMORY_FRACTION=0.8 |
| 34 | +REQUEST_TIMEOUT=300 |
| 35 | + |
| 36 | +# ========================= |
| 37 | +# Reranker (Cross-Encoder) — optional |
| 38 | +# ========================= |
| 39 | +# Enable true cross-encoder reranking by setting a model ID. |
| 40 | +# When unset, /api/v1/rerank falls back to embedding-similarity. |
| 41 | +RERANKER_BACKEND=auto # auto | mlx | torch |
| 42 | +RERANKER_MODEL_ID= # e.g. cross-encoder/ms-marco-MiniLM-L-6-v2 |
| 43 | +# RERANKER_MODEL_NAME= # alias for RERANKER_MODEL_ID |
| 44 | +# CROSS_ENCODER_MODEL= # legacy alias, also accepted |
| 45 | + |
| 46 | +# Optional reranker overrides |
| 47 | +RERANK_MAX_SEQ_LEN=512 # pairwise (query+doc) max tokens |
| 48 | +RERANK_BATCH_SIZE=16 # reranker batch size |
| 49 | + |
29 | 50 | # MLX-only experimental options: |
30 | 51 | # - RERANK_POOLING: mean | cls (default: mean) |
31 | 52 | # - RERANK_SCORE_NORM: none | sigmoid | minmax (default: none) |
32 | 53 | # Use sigmoid to bound scores to [0,1] for schema-constrained clients. |
33 | 54 | RERANK_POOLING=mean |
34 | 55 | RERANK_SCORE_NORM=none |
35 | 56 |
|
36 | | -# OpenAI compatibility (scores normalization on native path for OpenAI clients) |
| 57 | +# ========================= |
| 58 | +# OpenAI compatibility |
| 59 | +# ========================= |
| 60 | +# Automatically apply sigmoid normalization for OpenAI-compatible rerank scores |
37 | 61 | # true | false (default true) |
38 | 62 | OPENAI_RERANK_AUTO_SIGMOID=true |
39 | 63 |
|
| 64 | +# ========================= |
40 | 65 | # Model Cache & Storage |
| 66 | +# ========================= |
41 | 67 | # MODEL_PATH: Custom path for MLX models (overrides auto cache detection) |
42 | 68 | # If empty, uses Hugging Face cache or environment variables below: |
43 | 69 | # TRANSFORMERS_CACHE: Override HF transformers cache location |
44 | | -# HF_HOME: Hugging Face cache home directory |
| 70 | +# HF_HOME: Hugging Face cache home directory |
45 | 71 | # Default cache location: ~/.cache/huggingface/hub/ |
46 | | -# |
| 72 | +# |
47 | 73 | # Examples: |
48 | 74 | # MODEL_PATH=/path/to/local/models/Qwen3-Embedding-4B-4bit-DWQ |
49 | 75 | # TRANSFORMERS_CACHE=/custom/cache/transformers |
50 | 76 | # HF_HOME=/custom/huggingface |
51 | 77 |
|
| 78 | +# ========================= |
52 | 79 | # Server |
| 80 | +# ========================= |
53 | 81 | HOST=0.0.0.0 |
54 | 82 | PORT=9000 |
55 | 83 | RELOAD=false |
56 | 84 |
|
57 | | -# Performance |
58 | | -BATCH_SIZE=32 |
59 | | -MAX_BATCH_SIZE=128 |
60 | | -MAX_TEXTS_PER_REQUEST=100 |
61 | | -MAX_PASSAGES_PER_RERANK=1000 |
62 | | -MAX_SEQUENCE_LENGTH=512 |
63 | | -DEVICE_MEMORY_FRACTION=0.8 |
64 | | -REQUEST_TIMEOUT=300 |
65 | | - |
66 | | -# 🚀 Text Processing Configuration (NEW!) |
67 | | -# Default text processing options for the service |
| 85 | +# ========================= |
| 86 | +# 🚀 Text Processing Defaults |
| 87 | +# ========================= |
| 88 | +# Long text handling is automatic (token-aware). These set the defaults. |
68 | 89 | DEFAULT_AUTO_TRUNCATE=true |
69 | 90 | DEFAULT_TRUNCATION_STRATEGY=smart_truncate |
70 | | -# DEFAULT_MAX_TOKENS_OVERRIDE=2048 |
| 91 | +# DEFAULT_MAX_TOKENS_OVERRIDE=4096 # up to absolute max (from model metadata) |
71 | 92 | DEFAULT_RETURN_PROCESSING_INFO=false |
72 | 93 |
|
73 | 94 | # Text processing strategies: |
74 | 95 | # - smart_truncate: Preserve sentence boundaries while truncating (recommended) |
75 | 96 | # - truncate: Simple token-based truncation |
76 | 97 | # - extract: Extract key sentences only |
77 | 98 | # - error: Raise error when token limit is exceeded |
78 | | -# |
79 | | -# Token limits (automatically detected from model metadata): |
80 | | -# - Recommended max tokens: 2048 (auto-truncation trigger) |
81 | | -# - Absolute max tokens: 8192 (hard limit, will raise error) |
82 | | -# - Users can override recommended limit via max_tokens_override (up to absolute max) |
83 | 99 |
|
| 100 | +# ========================= |
84 | 101 | # Logging |
| 102 | +# ========================= |
85 | 103 | LOG_LEVEL=INFO |
86 | 104 | LOG_FORMAT=json |
87 | 105 |
|
88 | | -# Security (optional) |
| 106 | +# ========================= |
| 107 | +# Security / CORS (optional) |
| 108 | +# ========================= |
89 | 109 | # ALLOWED_HOSTS=["example.com","api.example.com"] |
90 | 110 | # ALLOWED_ORIGINS=["https://example.com","https://app.example.com"] |
91 | 111 |
|
92 | | -# Copy to .env and adjust as needed. |
| 112 | +# Copy this file to `.env` and adjust values for your deployment. |
0 commit comments