Skip to content

Commit 4b962ba

Browse files
authored
feat(visualizer): add HTTP content negotiation for visualizer API backward compatibility (#34)
* feat(visualizer): add HTTP content negotiation for visualizer API backward compatibility - Default to JSON responses for v2.2.0+ backward compatibility - Support MessagePack via Accept: application/msgpack header - Browser visualizer explicitly requests MessagePack for performance - Add test for JSON backward compatibility - Update docs with JSON and MessagePack client examples * fix(visualizer): improve dependency handling and multilingual JSON support - Add Request to ImportError fallback block for dependency-free imports - Place Request before default params (FastAPI auto-injects Request) - Change the Request type hint to a Request | None union type - Add ensure_ascii=False to json.dumps() to preserve non-ASCII characters
1 parent 08ec569 commit 4b962ba

5 files changed

Lines changed: 140 additions & 49 deletions

File tree

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
2222
- **Installation**: Added `viz` as alias for `visualization` extra
2323

2424
### Changed
25+
- **Visualizer API Content Negotiation**:
26+
- Added HTTP Accept header content negotiation to `/api/chunk` endpoint
27+
- Clients can request MessagePack via `Accept: application/msgpack` header
2528
- **Core Performance Overhaul**:
2629
- Replaced regex-based `_find_span` with deterministic finder (~2x faster span detection, avoiding regex backtracking)
2730
- Switched from `regex` library to stdlib `re` - ~2x faster for simple patterns
2831
- Replaced `box` library with `dotdict3` - 12x faster (0.467s vs 0.039s per 10k accesses)
2932
- Switched from JSON to MessagePack encoding in visualizer (~30-50% smaller payloads, faster encoding)
3033
- Added `@lru_cache(maxsize=52)` on `_get_special_lang_handler` for caching handler lookups
34+
- Added explicit MessagePack requests from the visualizer when launched via the CLI (~30-50% smaller payloads, faster encoding)
3135
- **Fallback Splitter**: Renamed `FallbackSplitter` to `_clean_sentences` (works for 50+ languages)
3236
- **SentenceSplitter Rename**: Renamed `_filter_sentences` method to `split_text`
3337
- **Lazy Imports**:

docs/getting-started/programmatic/visualizer.md

Lines changed: 87 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -201,64 +201,108 @@ python server.py
201201

202202
Use this Python client to chunk files programmatically:
203203

204-
``` py linenums="1"
205-
import requests
204+
=== "JSON (Default - Backward Compatible)"
205+
206+
``` py linenums="1"
207+
import requests
208+
209+
# Connect to your running server
210+
base_url = "http://127.0.0.1:8000"
211+
212+
# Check if token counter is available
213+
response = requests.get(f"{base_url}/api/token_counter_status")
214+
print(response.json()) # {"token_counter_available": false}
215+
216+
# Chunk a file
217+
with open("my_document.txt", "rb") as f:
218+
files = {"file": ("my_document.txt", f, "text/plain")}
219+
data = {
220+
"mode": "document", # or "code"
221+
"params": '{"max_sentences": 3, "overlap_percent": 20}'
222+
}
223+
224+
response = requests.post(f"{base_url}/api/chunk", files=files, data=data)
225+
226+
if response.status_code == 200:
227+
result = response.json()
228+
print(f"Created {result['stats']['chunk_count']} chunks")
229+
230+
# Access chunks
231+
for chunk in result["chunks"]:
232+
print(f"Chunk content: {chunk['content']}")
233+
print(f"Metadata: {chunk['metadata']}")
234+
else:
235+
print(f"Error: {response.status_code} - {response.text}")
236+
```
206237

207-
# Connect to your running server
208-
base_url = "http://127.0.0.1:8000"
238+
=== "MessagePack (Faster - ~30-50% smaller payloads)"
209239

210-
# Check if token counter is available
211-
response = requests.get(f"{base_url}/api/token_counter_status")
212-
print(response.json()) # {"token_counter_available": false}
240+
``` py linenums="1"
241+
import requests
242+
import msgpack
213243

214-
# Chunk a file
215-
with open("my_document.txt", "rb") as f:
216-
files = {"file": ("my_document.txt", f, "text/plain")}
217-
data = {
218-
"mode": "document", # or "code"
219-
"params": '{"max_sentences": 3, "overlap_percent": 20}'
220-
}
244+
# Connect to your running server
245+
base_url = "http://127.0.0.1:8000"
221246

222-
response = requests.post(f"{base_url}/api/chunk", files=files, data=data)
247+
# Request MessagePack response using Accept header
248+
headers = {"Accept": "application/msgpack"}
223249

224-
if response.status_code == 200:
225-
result = response.json()
226-
print(f"Created {result['stats']['chunk_count']} chunks")
250+
with open("my_document.txt", "rb") as f:
251+
files = {"file": ("my_document.txt", f, "text/plain")}
252+
data = {
253+
"mode": "document",
254+
"params": '{"max_sentences": 3, "overlap_percent": 20}'
255+
}
227256

228-
# Access chunks
229-
for chunk in result["chunks"]:
230-
print(f"Chunk content: {chunk['content']}")
231-
print(f"Metadata: {chunk['metadata']}")
232-
else:
233-
print(f"Error: {response.status_code} - {response.text}")
234-
```
257+
response = requests.post(f"{base_url}/api/chunk", files=files, data=data, headers=headers)
258+
259+
if response.status_code == 200:
260+
result = msgpack.unpackb(response.content, raw=False)
261+
print(f"Created {result['stats']['chunk_count']} chunks")
262+
263+
for chunk in result["chunks"]:
264+
print(f"Chunk content: {chunk['content']}")
265+
else:
266+
print(f"Error: {response.status_code} - {response.text}")
267+
```
235268

236269

237270

238271
#### Response Format
239272

240-
The `/api/chunk` endpoint returns:
273+
The `/api/chunk` endpoint supports content negotiation:
274+
275+
- **JSON (default)**: Returns JSON response - backward compatible with v2.2.0 and earlier
276+
- **MessagePack**: Add `Accept: application/msgpack` header for ~30-50% smaller payloads
241277

242-
```json
243-
{
244-
"text": "Original file content...",
245-
"chunks": [
278+
=== "JSON Response"
279+
280+
```json
246281
{
247-
"content": "Chunk text content...",
248-
"metadata": {
249-
"source": "filename.txt",
250-
"chunk_num": 1,
251-
"span": [0, 150],
252-
// ... additional metadata
282+
"text": "Original file content...",
283+
"chunks": [
284+
{
285+
"content": "Chunk text content...",
286+
"metadata": {
287+
"source": "filename.txt",
288+
"chunk_num": 1,
289+
"span": [0, 150],
290+
// ... additional metadata
291+
}
292+
}
293+
],
294+
"stats": {
295+
"text_length": 696,
296+
"chunk_count": 3,
297+
"mode": "document"
253298
}
254299
}
255-
],
256-
"stats": {
257-
"text_length": 696,
258-
"chunk_count": 3,
259-
"mode": "document"
260-
}
261-
}
300+
```
301+
302+
=== "MessagePack Response"
303+
304+
Same structure as JSON, but encoded in MessagePack binary format for smaller payloads.
305+
Decode using: `msgpack.unpackb(response.content, raw=False)`
262306
```
263307
264308
!!! tip "Perfect for Integration"

src/chunklet/visualizer/static/js/app.js

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -365,6 +365,9 @@ async function processUploadedFile() {
365365
// Send POST with FormData
366366
const response = await fetch('/api/chunk', {
367367
method: 'POST',
368+
headers: {
369+
'Accept': 'application/msgpack'
370+
},
368371
body: formData
369372
});
370373

src/chunklet/visualizer/visualizer.py

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
import msgpack
1111
import uvicorn
1212
from charset_normalizer import detect
13-
from fastapi import FastAPI, File, Form, HTTPException, UploadFile
13+
from fastapi import FastAPI, File, Form, HTTPException, Request, UploadFile
1414
from fastapi.responses import HTMLResponse, Response
1515
from fastapi.staticfiles import StaticFiles
1616
except ImportError: # pragma: no cover
@@ -24,6 +24,7 @@
2424
File = lambda x: x # noqa: E731
2525
Form = lambda x: x # noqa: E731
2626
HTTPException = None
27+
Request = None
2728
HTMLResponse = lambda x: x # noqa: E731
2829
Response = lambda x: x # noqa: E731
2930
StaticFiles = None
@@ -124,6 +125,7 @@ async def _get_index(self):
124125
@validate_input
125126
async def _chunk_file(
126127
self,
128+
request: Request,
127129
file: UploadFile = File(...),
128130
mode: str = Form("document"),
129131
params: str = Form("{}"),
@@ -135,9 +137,13 @@ async def _chunk_file(
135137
file: File uploaded by the client.
136138
mode: Determines which chunker to use ("document" or "code").
137139
params: JSON string containing chunking parameters.
140+
request: Incoming HTTP request; its Accept header selects the response format (MessagePack if it contains "application/msgpack", otherwise JSON).
138141
139142
Returns:
140-
MessagePack-encoded response with original text, chunks, and stats.
143+
MessagePack or JSON response with original text, chunks, and stats.
144+
Uses content negotiation: client can request MessagePack via
145+
Accept: application/msgpack header. Defaults to JSON for backward
146+
compatibility.
141147
142148
Raises:
143149
HTTPException: If chunking fails.
@@ -183,9 +189,16 @@ async def _chunk_file(
183189
},
184190
}
185191

192+
accept = request.headers.get("Accept", "") if request else ""
193+
if "application/msgpack" in accept:
194+
return Response(
195+
content=msgpack.packb(response_data, use_bin_type=True),
196+
media_type="application/msgpack",
197+
)
198+
186199
return Response(
187-
content=msgpack.packb(response_data, use_bin_type=True),
188-
media_type="application/msgpack",
200+
content=json.dumps(response_data, ensure_ascii=False),
201+
media_type="application/json",
189202
)
190203

191204
except Exception as e:

tests/test_visualization.py

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ def test_chunk_file(visualizer_server):
101101
sample_file_path = Path(__file__).parent.parent / "samples" / "sample_text.txt"
102102
assert sample_file_path.exists(), f"Sample file not found: {sample_file_path}"
103103

104-
# Test
104+
# Test with MessagePack format (explicit request)
105105
with open(sample_file_path, "rb") as f:
106106
files = {"file": ("sample_text.txt", f, "text/plain")}
107107
data = {
@@ -110,8 +110,9 @@ def test_chunk_file(visualizer_server):
110110
{"max_sentences": 3, "overlap_percent": 20} # Chunk by 3 sentences
111111
),
112112
}
113+
headers = {"Accept": "application/msgpack"}
113114

114-
response = requests.post(url, files=files, data=data)
115+
response = requests.post(url, files=files, data=data, headers=headers)
115116
assert response.status_code == 200
116117

117118
result = msgpack.unpackb(response.content, raw=False)
@@ -129,6 +130,32 @@ def test_chunk_file(visualizer_server):
129130
assert "metadata" in chunk
130131

131132

133+
def test_chunk_file_json_backward_compatible(visualizer_server):
134+
"""Test that JSON response works for backward compatibility."""
135+
url = f"{visualizer_server['url']}/api/chunk"
136+
137+
sample_file_path = Path(__file__).parent.parent / "samples" / "sample_text.txt"
138+
assert sample_file_path.exists(), f"Sample file not found: {sample_file_path}"
139+
140+
# Test JSON (default - backward compatible)
141+
with open(sample_file_path, "rb") as f:
142+
files = {"file": ("sample_text.txt", f, "text/plain")}
143+
data = {
144+
"mode": "document",
145+
"params": json.dumps({"max_sentences": 2}),
146+
}
147+
# No explicit Accept header (requests sends "Accept: */*" by default) -> JSON default
148+
149+
response = requests.post(url, files=files, data=data)
150+
assert response.status_code == 200
151+
152+
result = response.json()
153+
assert "text" in result
154+
assert "chunks" in result
155+
assert "stats" in result
156+
assert result["stats"]["chunk_count"] > 1
157+
158+
132159
def test_chunk_file_invalid_format(visualizer_server):
133160
"""Test uploading invalid file format."""
134161
url = f"{visualizer_server['url']}/api/chunk"

0 commit comments

Comments
 (0)