|
1 | 1 | ############################################################ |
2 | 2 | # Example Environment (.env example) |
| 3 | +# |
| 4 | +# Copy this file to `.env` and tweak as needed. |
3 | 5 | ############################################################ |
4 | 6 |
|
5 | | -# Backend |
| 7 | +# ========================= |
| 8 | +# Backend (Embeddings) |
| 9 | +# ========================= |
| 10 | +# auto | mlx | torch (auto picks MLX on Apple Silicon) |
6 | 11 | BACKEND=auto |
| 12 | +# Default Apple-Silicon-friendly embedding model |
7 | 13 | MODEL_NAME=mlx-community/Qwen3-Embedding-4B-4bit-DWQ |
| 14 | +# Optional: local MLX-converted model directory (overrides HF cache) |
8 | 15 | MODEL_PATH= |
9 | | -CROSS_ENCODER_MODEL= |
10 | 16 |
|
11 | | -# Reranker (Cross-Encoder) — Optional |
12 | | -# Choose one of the following ways to enable reranking: |
13 | | -# 1) Torch CrossEncoder (sentence-transformers) |
14 | | -# RERANKER_BACKEND=torch |
15 | | -# CROSS_ENCODER_MODEL=cross-encoder/ms-marco-MiniLM-L-6-v2 |
16 | | -# |
17 | | -# 2) MLX Reranker (experimental v1; pooled token embeddings + linear head) |
18 | | -# RERANKER_BACKEND=mlx |
19 | | -# RERANKER_MODEL_ID=vserifsaglam/Qwen3-Reranker-4B-4bit-MLX |
20 | | -# |
21 | | -# Auto selection prefers Torch for stability (set RERANKER_BACKEND=auto) |
22 | | -RERANKER_BACKEND=auto |
23 | | -RERANKER_MODEL_ID= |
24 | | -# Alias for convenience; same as RERANKER_MODEL_ID |
25 | | -RERANKER_MODEL_NAME= |
26 | | -# Optional overrides |
27 | | -RERANK_MAX_SEQ_LEN=512 |
28 | | -RERANK_BATCH_SIZE=16 |
| 17 | +# Embedding dimension strategy |
| 18 | +# - as_is → use backend output dimension |
| 19 | +# - hidden_size → use model hidden_size / HF metadata (2560 for Qwen3-Embedding-4B) |
| 20 | +# - pad_or_truncate → force OUTPUT_EMBEDDING_DIMENSION (pads or truncates vectors) |
| 21 | +DIMENSION_STRATEGY=hidden_size |
| 22 | +# OUTPUT_EMBEDDING_DIMENSION=2560 # uncomment and set DIMENSION_STRATEGY=pad_or_truncate to force a fixed size |
| 23 | + |
| 24 | +# ========================= |
| 25 | +# Performance (Embeddings) |
| 26 | +# ========================= |
| 27 | +BATCH_SIZE=32 |
| 28 | +MAX_BATCH_SIZE=128 |
| 29 | +MAX_TEXTS_PER_REQUEST=100 |
| 30 | +MAX_PASSAGES_PER_RERANK=1000 |
| 31 | +# Logical max tokens per text; actual limits are inferred from model metadata |
| 32 | +MAX_SEQUENCE_LENGTH=8192 |
| 33 | +DEVICE_MEMORY_FRACTION=0.8 |
| 34 | +REQUEST_TIMEOUT=300 |
| 35 | + |
| 36 | +# ========================= |
| 37 | +# Reranker (Cross-Encoder) — optional |
| 38 | +# ========================= |
| 39 | +# Enable true cross-encoder reranking by setting a model ID. |
| 40 | +# When unset, /api/v1/rerank falls back to embedding-similarity. |
| 41 | +RERANKER_BACKEND=auto # auto | mlx | torch |
| 42 | +RERANKER_MODEL_ID= # e.g. cross-encoder/ms-marco-MiniLM-L-6-v2 |
| 43 | +# RERANKER_MODEL_NAME= # alias for RERANKER_MODEL_ID |
| 44 | +# CROSS_ENCODER_MODEL= # legacy alias, also accepted |
| 45 | + |
| 46 | +# Optional reranker overrides |
| 47 | +RERANK_MAX_SEQ_LEN=512 # pairwise (query+doc) max tokens |
| 48 | +RERANK_BATCH_SIZE=16 # reranker batch size |
| 49 | + |
29 | 50 | # MLX-only experimental options: |
30 | 51 | # - RERANK_POOLING: mean | cls (default: mean) |
31 | 52 | # - RERANK_SCORE_NORM: none | sigmoid | minmax (default: none) |
32 | 53 | # Use sigmoid to bound scores to [0,1] for schema-constrained clients. |
33 | 54 | RERANK_POOLING=mean |
34 | 55 | RERANK_SCORE_NORM=none |
35 | 56 |
|
36 | | -# OpenAI compatibility (scores normalization on native path for OpenAI clients) |
| 57 | +# ========================= |
| 58 | +# OpenAI compatibility |
| 59 | +# ========================= |
| 60 | +# Automatically apply sigmoid normalization for OpenAI-compatible rerank scores |
37 | 61 | # true | false (default true) |
38 | 62 | OPENAI_RERANK_AUTO_SIGMOID=true |
39 | 63 |
|
| 64 | +# ========================= |
40 | 65 | # Model Cache & Storage |
| 66 | +# ========================= |
41 | 67 | # MODEL_PATH: Custom path for MLX models (overrides auto cache detection) |
42 | 68 | # If empty, uses Hugging Face cache or environment variables below: |
43 | 69 | # TRANSFORMERS_CACHE: Override HF transformers cache location |
44 | | -# HF_HOME: Hugging Face cache home directory |
| 70 | +# HF_HOME: Hugging Face cache home directory |
45 | 71 | # Default cache location: ~/.cache/huggingface/hub/ |
46 | | -# |
| 72 | +# |
47 | 73 | # Examples: |
48 | 74 | # MODEL_PATH=/path/to/local/models/Qwen3-Embedding-4B-4bit-DWQ |
49 | 75 | # TRANSFORMERS_CACHE=/custom/cache/transformers |
50 | 76 | # HF_HOME=/custom/huggingface |
51 | 77 |
|
| 78 | +# ========================= |
52 | 79 | # Server |
| 80 | +# ========================= |
53 | 81 | HOST=0.0.0.0 |
54 | 82 | PORT=9000 |
55 | 83 | RELOAD=false |
56 | 84 |
|
57 | | -# Performance |
58 | | -BATCH_SIZE=32 |
59 | | -MAX_BATCH_SIZE=128 |
60 | | -MAX_TEXTS_PER_REQUEST=100 |
61 | | -MAX_PASSAGES_PER_RERANK=1000 |
62 | | -MAX_SEQUENCE_LENGTH=512 |
63 | | -DEVICE_MEMORY_FRACTION=0.8 |
64 | | -REQUEST_TIMEOUT=300 |
65 | | - |
66 | | -# 🚀 Text Processing Configuration (NEW!) |
67 | | -# Default text processing options for the service |
| 85 | +# ========================= |
| 86 | +# 🚀 Text Processing Defaults |
| 87 | +# ========================= |
| 88 | +# Long text handling is automatic (token-aware). These set the defaults. |
68 | 89 | DEFAULT_AUTO_TRUNCATE=true |
69 | 90 | DEFAULT_TRUNCATION_STRATEGY=smart_truncate |
70 | | -# DEFAULT_MAX_TOKENS_OVERRIDE=2048 |
| 91 | +# DEFAULT_MAX_TOKENS_OVERRIDE=4096 # up to absolute max (from model metadata) |
71 | 92 | DEFAULT_RETURN_PROCESSING_INFO=false |
72 | 93 |
|
73 | 94 | # Text processing strategies: |
74 | 95 | # - smart_truncate: Preserve sentence boundaries while truncating (recommended) |
75 | 96 | # - truncate: Simple token-based truncation |
76 | 97 | # - extract: Extract key sentences only |
77 | 98 | # - error: Raise error when token limit is exceeded |
78 | | -# |
79 | | -# Token limits (automatically detected from model metadata): |
80 | | -# - Recommended max tokens: 2048 (auto-truncation trigger) |
81 | | -# - Absolute max tokens: 8192 (hard limit, will raise error) |
82 | | -# - Users can override recommended limit via max_tokens_override (up to absolute max) |
83 | 99 |
|
| 100 | +# ========================= |
84 | 101 | # Logging |
| 102 | +# ========================= |
85 | 103 | LOG_LEVEL=INFO |
86 | 104 | LOG_FORMAT=json |
87 | 105 |
|
88 | | -# Security (optional) |
| 106 | +# ========================= |
| 107 | +# Security / CORS (optional) |
| 108 | +# ========================= |
89 | 109 | # ALLOWED_HOSTS=["example.com","api.example.com"] |
90 | 110 | # ALLOWED_ORIGINS=["https://example.com","https://app.example.com"] |
91 | 111 |
|
92 | | -# Copy to .env and adjust as needed. |
| 112 | +# Copy this file to `.env` and adjust values for your deployment. |
0 commit comments