
Commit 524b18a

Merge pull request #8 from joonsoome/5-feature-automatic-forwarding-helper-for-mxarray-with-fallback-to-mxasarray
OpenAI compatibility: add base64 embedding encoding + optional dimens…
2 parents 47d34f2 + 24debf1 commit 524b18a

File tree

4 files changed: +92 −5 lines


CHANGELOG.md

Lines changed: 15 additions & 0 deletions

```diff
@@ -6,6 +6,21 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ## [1.2.0] - 2025-09-10
+
+## [1.2.3] - 2025-10-30
+
+### Added
+- OpenAI compatibility: base64 encoding support via `encoding_format="base64"` for `/v1/embeddings`.
+- OpenAI compatibility: optional `dimensions` handling (truncate/pad to requested size).
+
+### Documentation
+- README: Added LightRAG integration note (OpenAI embeddings + Cohere reranking tested successfully).
+- README: Added Qwen Embedding similarity scaling note and recommended starting threshold `COSINE_THRESHOLD=0.0` for LightRAG.
+- README: Example for requesting base64-encoded embeddings and decoding back to float32.
+
+### Notes
+- These updates maintain full compatibility with existing OpenAI SDK usage; default remains `encoding_format="float"`.
+
 
 ### Added
 - 🆕 **Cohere API v1/v2 Compatibility**: Full support for Cohere reranking API
```
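The base64 addition can be sketched in isolation. Below is a minimal, standalone illustration of the round-trip behavior described in the changelog entry (the helper names are ours for illustration, not the service's internals):

```python
import base64

import numpy as np


def encode_embedding_base64(vector):
    """Pack a float vector as base64-encoded float32 bytes (encoding_format="base64")."""
    return base64.b64encode(np.asarray(vector, dtype=np.float32).tobytes()).decode("ascii")


def decode_embedding_base64(b64_string):
    """Decode a base64 string back into a list of float32 values."""
    return np.frombuffer(base64.b64decode(b64_string), dtype=np.float32).tolist()


# Values exactly representable in float32 round-trip losslessly
vec = [0.5, -0.25, 1.0]
assert decode_embedding_base64(encode_embedding_base64(vec)) == vec
```

Non-representable values (e.g. `0.1`) round-trip to their nearest float32, which is why the default `encoding_format="float"` and the base64 path agree to float32 precision.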

README.md

Lines changed: 44 additions & 0 deletions

````diff
@@ -274,6 +274,31 @@ response = client.embeddings.create(
     model="text-embedding-ada-002"
 )
 # 🚀 10x faster than OpenAI, same code!
+
+"""
+Base64 encoding support
+-----------------------
+
+For OpenAI-compatible calls, you can request base64-encoded embeddings by setting `encoding_format` to `"base64"`. This is useful when transporting vectors through systems that expect strings only.
+
+Example (Python OpenAI SDK):
+
+```python
+response = client.embeddings.create(
+    input=["Hello world"],
+    model="text-embedding-ada-002",
+    encoding_format="base64",  # returns base64-encoded float32 bytes
+)
+
+# embedding string is base64; decode if you need floats again
+import base64, numpy as np
+arr = np.frombuffer(base64.b64decode(response.data[0].embedding), dtype=np.float32)
+```
+
+Notes:
+- `encoding_format` defaults to `"float"` (list[float]).
+- `dimensions` is accepted and will truncate/pad to the requested size when supported.
+"""
 ```
 
 ### TEI Compatible
@@ -311,6 +336,25 @@ response = requests.post("http://localhost:9000/v1/rerank", json={
 })
 ```
 
+---
+
+## 🧩 LightRAG Integration
+
+We validated an end-to-end workflow using LightRAG with this service:
+- Embeddings via the OpenAI-compatible endpoint (`/v1/embeddings`)
+- Reranking via the Cohere-compatible endpoint (`/v1/rerank` or `/v2/rerank`)
+
+Results: the integration tests succeeded using OpenAI embeddings and Cohere reranking.
+
+Qwen Embedding similarity scaling note: when using the Qwen Embedding model, we observed cosine similarity values that appear very small (e.g., `0.02`, `0.03`). This is expected due to vector scaling differences and does not indicate poor retrieval by itself. As a starting point, we recommend disabling the retrieval threshold in LightRAG to avoid filtering out good matches prematurely:
+
+```
+# === Retrieval threshold ===
+COSINE_THRESHOLD=0.0
+```
+
+Adjust upward later based on your dataset and evaluation results.
+
 ### Native API
 
 ```bash
````
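To make the `COSINE_THRESHOLD` note concrete, here is a standalone sketch (our own illustration, not LightRAG's actual code) of how a cosine-similarity cutoff filters retrieval candidates; with the threshold at `0.0`, only negatively correlated matches are dropped, so small-but-positive Qwen scores like `0.02` still pass:

```python
import math


def cosine_similarity(a, b):
    """Cosine similarity between two equal-length vectors."""
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(x * x for x in b))
    return dot / (norm_a * norm_b)


def filter_by_threshold(query, candidates, threshold):
    """Keep candidates whose similarity to the query meets the threshold."""
    return [c for c in candidates if cosine_similarity(query, c) >= threshold]


query = [0.1, 0.2, 0.1]
docs = [[0.1, 0.19, 0.11], [-0.2, 0.1, -0.1]]
# At threshold 0.0 the positively correlated doc is kept and the
# negatively correlated one is dropped; a higher threshold would
# also drop weakly similar docs.
kept = filter_by_threshold(query, docs, threshold=0.0)
```

This is why a conservative starting point of `0.0` is suggested: raising the threshold before measuring your model's typical score range risks filtering good matches.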

app/__init__.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -7,7 +7,7 @@
 - Apple Silicon MLX optimization with PyTorch fallback
 - Multi-API compatibility: Native, OpenAI, TEI, and Cohere formats
 
-🚀 NEW in v1.2.2: Fully resolved API compatibility test warnings!
+🚀 NEW in v1.2.3: OpenAI base64 encoding support + docs update
 - Fixed Cohere API tests with proper environment variable handling
 - Resolved pytest environment variable propagation issues
 - Eliminated false warnings while maintaining 100% API compatibility
@@ -17,5 +17,5 @@
 Author: joonsoo-me
 """
 
-__version__ = "1.2.2"
+__version__ = "1.2.3"
 __author__ = "joonsoo-me"
```

app/routers/openai_router.py

Lines changed: 31 additions & 3 deletions

```diff
@@ -17,6 +17,8 @@
 
 import time
 from typing import Any, Dict, List, Optional, Union
+import base64
+import numpy as np
 
 import structlog
 from fastapi import APIRouter, Depends, HTTPException, Request
@@ -149,7 +151,8 @@ class OpenAIEmbeddingData(BaseModel):
     """
 
     object: str = Field(default="embedding", description="Object type identifier")
-    embedding: List[float] = Field(..., description="The embedding vector")
+    # Allow either float list (default) or base64 string when encoding_format="base64"
+    embedding: Union[List[float], str] = Field(..., description="The embedding vector (float list or base64 string)")
     index: int = Field(..., description="Index of the input text")
 
 
@@ -296,8 +299,33 @@ async def create_embeddings(
     # 📊 Calculate comprehensive timing metrics
     total_time = time.time() - start_time
 
-    # 🔄 Transform MLX response to enhanced OpenAI format
-    embedding_data = [OpenAIEmbeddingData(embedding=vector, index=i) for i, vector in enumerate(mlx_result.vectors)]
+    # 🔄 Optionally adjust dimensions if requested
+    vectors: List[List[float]] = mlx_result.vectors
+    target_dims = request.dimensions
+    if target_dims is not None and target_dims > 0:
+        adjusted: List[List[float]] = []
+        for v in vectors:
+            if len(v) == target_dims:
+                adjusted.append(v)
+            elif len(v) > target_dims:
+                # Truncate to requested dimensions
+                adjusted.append(v[:target_dims])
+            else:
+                # Pad with zeros up to requested dimensions
+                padded = v + [0.0] * (target_dims - len(v))
+                adjusted.append(padded)
+        vectors = adjusted
+
+    # 🔄 Transform MLX response to enhanced OpenAI format (support base64 when requested)
+    embedding_data: List[OpenAIEmbeddingData] = []
+    if (request.encoding_format or "float").lower() == "base64":
+        for i, v in enumerate(vectors):
+            arr = np.asarray(v, dtype=np.float32)
+            b64 = base64.b64encode(arr.tobytes()).decode("ascii")
+            embedding_data.append(OpenAIEmbeddingData(embedding=b64, index=i))
+    else:
+        for i, v in enumerate(vectors):
+            embedding_data.append(OpenAIEmbeddingData(embedding=v, index=i))
 
     # 📈 Calculate token usage (approximate word-based counting)
     total_tokens = sum(len(text.split()) for text in texts)
```
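The truncate/pad branch in this diff can be exercised on its own. The following is a standalone re-implementation for illustration (the function name is ours, not part of the router):

```python
def adjust_dimensions(vector, target_dims):
    """Truncate or zero-pad a float vector to target_dims, mirroring the router's branches."""
    if target_dims is None or target_dims <= 0 or len(vector) == target_dims:
        # No adjustment requested or already the right size
        return vector
    if len(vector) > target_dims:
        # Truncate to requested dimensions
        return vector[:target_dims]
    # Pad with zeros up to requested dimensions
    return vector + [0.0] * (target_dims - len(vector))


assert adjust_dimensions([1.0, 2.0, 3.0], 2) == [1.0, 2.0]
assert adjust_dimensions([1.0, 2.0], 4) == [1.0, 2.0, 0.0, 0.0]
assert adjust_dimensions([1.0, 2.0], None) == [1.0, 2.0]
```

Note that zero-padding preserves dot products but not unit norms, so clients that renormalize after padding will see slightly different cosine similarities.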
