
Commit 0ee45b4

Merge pull request #13 from joonsoome/fix/cohere-null-document
fix(cohere): omit null document when return_documents=false
2 parents: d049b48 + 4ccffbf

5 files changed: +93 −45 lines

.env.example

Lines changed: 62 additions & 42 deletions
@@ -1,92 +1,112 @@
 ############################################################
 # Example Environment (.env example)
+#
+# Copy this file to `.env` and tweak as needed.
 ############################################################

-# Backend
+# =========================
+# Backend (Embeddings)
+# =========================
+# auto | mlx | torch (auto picks MLX on Apple Silicon)
 BACKEND=auto
+# Default Apple‑Silicon‑friendly embedding model
 MODEL_NAME=mlx-community/Qwen3-Embedding-4B-4bit-DWQ
+# Optional: local MLX-converted model directory (overrides HF cache)
 MODEL_PATH=
-CROSS_ENCODER_MODEL=

-# Reranker (Cross-Encoder) — Optional
-# Choose one of the following ways to enable reranking:
-# 1) Torch CrossEncoder (sentence-transformers)
-# RERANKER_BACKEND=torch
-# CROSS_ENCODER_MODEL=cross-encoder/ms-marco-MiniLM-L-6-v2
-#
-# 2) MLX Reranker (experimental v1; pooled token embeddings + linear head)
-# RERANKER_BACKEND=mlx
-# RERANKER_MODEL_ID=vserifsaglam/Qwen3-Reranker-4B-4bit-MLX
-#
-# Auto selection prefers Torch for stability (set RERANKER_BACKEND=auto)
-RERANKER_BACKEND=auto
-RERANKER_MODEL_ID=
-# Alias for convenience; same as RERANKER_MODEL_ID
-RERANKER_MODEL_NAME=
-# Optional overrides
-RERANK_MAX_SEQ_LEN=512
-RERANK_BATCH_SIZE=16
+# Embedding dimension strategy
+# - as_is → use backend output dimension
+# - hidden_size → use model hidden_size / HF metadata (2560 for Qwen3-Embedding-4B)
+# - pad_or_truncate → force OUTPUT_EMBEDDING_DIMENSION (pads or truncates vectors)
+DIMENSION_STRATEGY=hidden_size
+# OUTPUT_EMBEDDING_DIMENSION=2560  # uncomment + set DIMENSION_STRATEGY=pad_or_truncate to force a fixed size
+
+# =========================
+# Performance (Embeddings)
+# =========================
+BATCH_SIZE=32
+MAX_BATCH_SIZE=128
+MAX_TEXTS_PER_REQUEST=100
+MAX_PASSAGES_PER_RERANK=1000
+# Logical max tokens per text; actual limits are inferred from model metadata
+MAX_SEQUENCE_LENGTH=8192
+DEVICE_MEMORY_FRACTION=0.8
+REQUEST_TIMEOUT=300
+
+# =========================
+# Reranker (Cross-Encoder) – optional
+# =========================
+# Enable true cross-encoder reranking by setting a model ID.
+# When unset, /api/v1/rerank falls back to embedding-similarity.
+RERANKER_BACKEND=auto   # auto | mlx | torch
+RERANKER_MODEL_ID=      # e.g. cross-encoder/ms-marco-MiniLM-L-6-v2
+# RERANKER_MODEL_NAME=  # alias for RERANKER_MODEL_ID
+# CROSS_ENCODER_MODEL=  # legacy alias, also accepted
+
+# Optional reranker overrides
+RERANK_MAX_SEQ_LEN=512  # pairwise (query+doc) max tokens
+RERANK_BATCH_SIZE=16    # reranker batch size
+
 # MLX-only experimental options:
 # - RERANK_POOLING: mean | cls (default: mean)
 # - RERANK_SCORE_NORM: none | sigmoid | minmax (default: none)
 # Use sigmoid to bound scores to [0,1] for schema-constrained clients.
 RERANK_POOLING=mean
 RERANK_SCORE_NORM=none

-# OpenAI compatibility (scores normalization on native path for OpenAI clients)
+# =========================
+# OpenAI compatibility
+# =========================
+# Automatically apply sigmoid normalization for OpenAI-compatible rerank scores
 # true | false (default true)
 OPENAI_RERANK_AUTO_SIGMOID=true

+# =========================
 # Model Cache & Storage
+# =========================
 # MODEL_PATH: Custom path for MLX models (overrides auto cache detection)
 # If empty, uses Hugging Face cache or environment variables below:
 # TRANSFORMERS_CACHE: Override HF transformers cache location
-# HF_HOME: Hugging Face cache home directory
+# HF_HOME: Hugging Face cache home directory
 # Default cache location: ~/.cache/huggingface/hub/
-#
+#
 # Examples:
 # MODEL_PATH=/path/to/local/models/Qwen3-Embedding-4B-4bit-DWQ
 # TRANSFORMERS_CACHE=/custom/cache/transformers
 # HF_HOME=/custom/huggingface

+# =========================
 # Server
+# =========================
 HOST=0.0.0.0
 PORT=9000
 RELOAD=false

-# Performance
-BATCH_SIZE=32
-MAX_BATCH_SIZE=128
-MAX_TEXTS_PER_REQUEST=100
-MAX_PASSAGES_PER_RERANK=1000
-MAX_SEQUENCE_LENGTH=512
-DEVICE_MEMORY_FRACTION=0.8
-REQUEST_TIMEOUT=300
-
-# 🚀 Text Processing Configuration (NEW!)
-# Default text processing options for the service
+# =========================
+# 🚀 Text Processing Defaults
+# =========================
+# Long text handling is automatic (token-aware). These set the defaults.
 DEFAULT_AUTO_TRUNCATE=true
 DEFAULT_TRUNCATION_STRATEGY=smart_truncate
-# DEFAULT_MAX_TOKENS_OVERRIDE=2048
+# DEFAULT_MAX_TOKENS_OVERRIDE=4096  # up to absolute max (from model metadata)
 DEFAULT_RETURN_PROCESSING_INFO=false

 # Text processing strategies:
 # - smart_truncate: Preserve sentence boundaries while truncating (recommended)
 # - truncate: Simple token-based truncation
 # - extract: Extract key sentences only
 # - error: Raise error when token limit is exceeded
-#
-# Token limits (automatically detected from model metadata):
-# - Recommended max tokens: 2048 (auto-truncation trigger)
-# - Absolute max tokens: 8192 (hard limit, will raise error)
-# - Users can override recommended limit via max_tokens_override (up to absolute max)

+# =========================
 # Logging
+# =========================
 LOG_LEVEL=INFO
 LOG_FORMAT=json

-# Security (optional)
+# =========================
+# Security / CORS (optional)
+# =========================
 # ALLOWED_HOSTS=["example.com","api.example.com"]
 # ALLOWED_ORIGINS=["https://example.com","https://app.example.com"]

-# Copy to .env and adjust as needed.
+# Copy this file to `.env` and adjust values for your deployment.
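
For reference, the pad_or_truncate strategy documented above forces every embedding to OUTPUT_EMBEDDING_DIMENSION. A minimal sketch of that behavior, assuming zero-padding on the tail (the helper name and the padding choice are illustrative, not the project's actual implementation):

from typing import List

def pad_or_truncate(vector: List[float], target_dim: int) -> List[float]:
    """Force an embedding to exactly `target_dim` entries (illustrative sketch)."""
    if len(vector) >= target_dim:
        return vector[:target_dim]                      # drop extra dimensions
    return vector + [0.0] * (target_dim - len(vector))  # zero-pad the tail

# e.g. pad_or_truncate([0.1, 0.2], 4) -> [0.1, 0.2, 0.0, 0.0]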

CHANGELOG.md

Lines changed: 9 additions & 0 deletions
@@ -5,6 +5,15 @@ All notable changes to the Apple MLX Embed-Rerank API project will be documented
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

+## [1.5.1] - 2025-11-14
+
+### Fixed
+- Cohere-compatible rerank endpoints now omit the `document` field entirely when `return_documents` is `false`, instead of returning `document: null`. This avoids validation errors in strict clients (e.g. LiteLLM’s `RerankResponse` schema).
+- Cohere v2 rerank tests tightened to assert that `document` is absent when `return_documents=false`, preventing regressions.
+
+### Changed
+- Example `.env.example` updated to reflect the current default configuration (MLX Qwen3 embedding model, `DIMENSION_STRATEGY=hidden_size`, and clarified reranker settings).
+
 ## [1.2.0] - 2025-09-10

 ## [1.2.3] - 2025-10-30
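
Illustrated with a made-up result object, the 1.5.1 fix changes the serialized shape from

    {"index": 0, "relevance_score": 0.97, "document": null}

to

    {"index": 0, "relevance_score": 0.97}

so strict response schemas that reject a null `document` field no longer fail.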

app/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -19,5 +19,5 @@
 Author: joonsoo-me
 """

-__version__ = "1.5.0"
+__version__ = "1.5.1"
 __author__ = "joonsoo-me"

app/routers/cohere_router.py

Lines changed: 20 additions & 1 deletion
@@ -8,6 +8,7 @@
 from typing import List

 from fastapi import APIRouter, Depends, HTTPException
+from fastapi.responses import JSONResponse

 from ..backends.base import BackendManager
 from ..models.cohere_models import CohereRerankRequest, CohereRerankResponse, CohereRerankResult, CohereDocument
@@ -25,6 +26,21 @@
     },
 )

+
+def _filter_none_values(data):
+    """
+    Recursively filter out keys with None values.
+
+    This ensures fields like `document: null` are omitted entirely
+    when `return_documents` is False, matching Cohere API behavior.
+    """
+    if isinstance(data, dict):
+        return {k: _filter_none_values(v) for k, v in data.items() if v is not None}
+    if isinstance(data, list):
+        return [_filter_none_values(item) for item in data]
+    return data
+
+
 # This will be set by the main app
 _backend_manager: BackendManager = None

@@ -113,7 +129,10 @@ async def rerank_v1(request: CohereRerankRequest, service: RerankingService = De
         # Convert back to Cohere format
         cohere_response = convert_to_cohere_response(internal_response, request)

-        return cohere_response
+        # Ensure None fields (e.g., document when return_documents=False) are omitted
+        payload = _filter_none_values(cohere_response.model_dump())
+
+        return JSONResponse(content=payload)

     except ValueError as e:
         raise HTTPException(status_code=400, detail=f"Invalid input: {str(e)}")
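A quick sanity check of `_filter_none_values` (the sample payload below is made up; the import path matches the module shown above):

from app.routers.cohere_router import _filter_none_values

payload = {
    "results": [
        {"index": 0, "relevance_score": 0.93, "document": None},
        {"index": 1, "relevance_score": 0.41, "document": {"text": "hello"}},
    ],
}

print(_filter_none_values(payload))
# {'results': [{'index': 0, 'relevance_score': 0.93},
#              {'index': 1, 'relevance_score': 0.41, 'document': {'text': 'hello'}}]}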

tests/test_cohere_api.py

Lines changed: 1 addition & 1 deletion
@@ -99,7 +99,7 @@ def test_cohere_v2_rerank():
         assert "index" in res, f"Result {i} missing 'index' field"
         assert "relevance_score" in res, f"Result {i} missing 'relevance_score' field"
         # Should not have document field when return_documents=False
-        assert res.get("document") is None, f"Result {i} should not have document field when return_documents=False"
+        assert "document" not in res, f"Result {i} should not have document field when return_documents=False"
         assert isinstance(res["index"], int), f"Result {i} index should be int"
         assert isinstance(res["relevance_score"], (int, float)), f"Result {i} relevance_score should be numeric"
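End to end, the behavior this test pins down can be exercised with a request like the following sketch (host and port follow the .env.example defaults; the /v2/rerank path and the request fields are assumptions based on Cohere's rerank API, not confirmed by this diff):

import requests

resp = requests.post(
    "http://localhost:9000/v2/rerank",
    json={
        "query": "what does MLX run on?",
        "documents": ["MLX targets Apple silicon.", "Unrelated text."],
        "return_documents": False,
    },
)
for res in resp.json()["results"]:
    # As of 1.5.1, no `document` key should be present at all
    assert "document" not in res
    print(res["index"], res["relevance_score"])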
