Skip to content

Commit 34abcbb

Browse files
authored
Add RenderChatCompletion and RenderCompletion gRPC RPCs via vLLM OpenAIServingRender (#428)
1 parent 729df62 commit 34abcbb

File tree

13 files changed

+797
-232
lines changed

13 files changed

+797
-232
lines changed

Makefile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -263,13 +263,14 @@ uds-tokenizer-install-deps: detect-python ## Set up venv and install UDS tokeniz
263263
echo "Virtual environment already exists"; \
264264
fi
265265
@echo "Installing dependencies..."
266-
@$(UDS_TOKENIZER_VENV_BIN)/pip install "$(UDS_TOKENIZER_DIR)[test]"
266+
@$(UDS_TOKENIZER_VENV_BIN)/pip install "$(UDS_TOKENIZER_DIR)[test]" --extra-index-url https://download.pytorch.org/whl/cpu
267267

268268
.PHONY: uds-tokenizer-service-test
269269
uds-tokenizer-service-test: uds-tokenizer-install-deps ## Run UDS tokenizer integration tests (starts server automatically)
270270
@printf "\033[33;1m==== Running UDS tokenizer integration tests ====\033[0m\n"
271271
@$(UDS_TOKENIZER_VENV_BIN)/python -m pytest \
272272
$(UDS_TOKENIZER_DIR)/tests/test_integration.py \
273+
$(UDS_TOKENIZER_DIR)/tests/test_renderer.py \
273274
-v --timeout=60
274275

275276
.PHONY: bench

api/tokenizerpb/tokenizer.proto

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,56 @@ message InitializeTokenizerResponse {
110110
string error_message = 2; // Error message if initialization failed
111111
}
112112

113+
// PlaceholderRange represents a range of placeholder tokens for a multimodal item
114+
message PlaceholderRange {
115+
int32 offset = 1; // Start index of the placeholder tokens in the prompt
116+
int32 length = 2; // Number of placeholder tokens
117+
}
118+
119+
// StringList holds a list of strings (used as map values)
120+
message StringList {
121+
repeated string values = 1;
122+
}
123+
124+
// PlaceholderRangeList holds a list of PlaceholderRange (used as map values)
125+
message PlaceholderRangeList {
126+
repeated PlaceholderRange ranges = 1;
127+
}
128+
129+
// MultiModalFeatures contains multimodal metadata produced by the render step
130+
message MultiModalFeatures {
131+
map<string, StringList> mm_hashes = 1; // Per-modality item hashes
132+
map<string, PlaceholderRangeList> mm_placeholders = 2; // Per-modality placeholder ranges
133+
}
134+
135+
// RenderChatCompletionRequest wraps an OpenAI chat completion request for rendering
136+
message RenderChatCompletionRequest {
137+
string request_json = 1; // JSON-serialized OpenAI ChatCompletionRequest
138+
string model_name = 2; // Model name to use for renderer selection
139+
}
140+
141+
// RenderChatCompletionResponse contains the rendered output from OpenAIServingRender
142+
message RenderChatCompletionResponse {
143+
string request_id = 1; // Request ID from the render response
144+
repeated uint32 token_ids = 2; // Token IDs for the rendered input
145+
MultiModalFeatures features = 3; // Multimodal features (populated for MM inputs)
146+
bool success = 4; // Whether the request was successful
147+
string error_message = 5; // Error message if the request failed
148+
}
149+
150+
// RenderCompletionRequest wraps an OpenAI completion request for rendering
151+
message RenderCompletionRequest {
152+
string request_json = 1; // JSON-serialized OpenAI CompletionRequest
153+
string model_name = 2; // Model name to use for renderer selection
154+
}
155+
156+
// RenderCompletionResponse contains the rendered output for each prompt in the completion request
157+
message RenderCompletionResponse {
158+
repeated RenderChatCompletionResponse items = 1; // One item per prompt in the request
159+
bool success = 2; // Whether the request was successful
160+
string error_message = 3; // Error message if the request failed
161+
}
162+
113163
// TokenizationService defines the gRPC service for tokenization
114164
service TokenizationService {
115165
// Tokenize converts a text input to token IDs
@@ -120,4 +170,12 @@ service TokenizationService {
120170

121171
// InitializeTokenizer initializes the tokenizer for a specific model
122172
rpc InitializeTokenizer(InitializeTokenizerRequest) returns (InitializeTokenizerResponse);
173+
174+
// RenderChatCompletion renders an OpenAI chat completion request via OpenAIServingRender,
175+
// returning token IDs and multimodal features as a GenerateRequest-compatible response
176+
rpc RenderChatCompletion(RenderChatCompletionRequest) returns (RenderChatCompletionResponse);
177+
178+
// RenderCompletion renders an OpenAI completion request via OpenAIServingRender,
179+
// returning one rendered item per prompt in the request
180+
rpc RenderCompletion(RenderCompletionRequest) returns (RenderCompletionResponse);
123181
}

services/uds_tokenizer/Dockerfile

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ WORKDIR /app
2323

2424
# Copy project metadata and install dependencies
2525
COPY pyproject.toml /app/pyproject.toml
26-
RUN pip install --no-cache-dir .
26+
RUN pip install --no-cache-dir . --extra-index-url https://download.pytorch.org/whl/cpu
2727

2828
# Runtime stage
2929
FROM --platform=$TARGETPLATFORM python:3.12-slim
@@ -60,6 +60,9 @@ RUN mkdir -p /.modelscope && chown -R 65532:65532 /.modelscope
6060
# Create and set permissions for Hugging Face cache directory
6161
RUN mkdir -p /.cache && chown -R 65532:65532 /.cache
6262

63+
# Create non-root user so getpwuid() works (required by torch/vllm)
64+
RUN useradd -u 65532 -m nonroot
65+
6366
# Switch to non-root user
6467
USER 65532:65532
6568

services/uds_tokenizer/pyproject.toml

Lines changed: 16 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,19 +4,22 @@ version = "0.1.0"
44
description = "UDS Tokenizer Service - gRPC tokenization over Unix Domain Socket"
55
requires-python = ">=3.12"
66
dependencies = [
7-
"pydantic==2.11.7",
8-
"shortuuid==1.0.13",
9-
"transformers==4.53.0",
10-
"safetensors==0.5.3",
11-
"Jinja2==3.1.6",
12-
"modelscope",
13-
"huggingface-hub",
14-
"aiohttp==3.9.5",
15-
"protobuf==6.31.1",
16-
"tiktoken>=0.7.0",
17-
"grpcio==1.76.0",
18-
"grpcio-tools==1.76.0",
19-
"grpcio-reflection==1.76.0",
7+
# vLLM 0.18.0rc1 CPU wheels — Linux-only, arch-conditional via PEP 508 markers
8+
"vllm @ https://wheels.vllm.ai/262ddd0d81a1e4687e209f988d6ea32616e736fa/vllm-0.18.0rc1%2Bcpu-cp38-abi3-manylinux_2_35_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64'",
9+
"vllm @ https://wheels.vllm.ai/262ddd0d81a1e4687e209f988d6ea32616e736fa/vllm-0.18.0rc1%2Bcpu-cp38-abi3-manylinux_2_35_aarch64.whl ; platform_system == 'Linux' and platform_machine == 'aarch64'",
10+
"pydantic>=2.12.0",
11+
"shortuuid>=1.0.13",
12+
"transformers>=4.57.0,<5",
13+
"safetensors>=0.7.0",
14+
"Jinja2>=3.1.6",
15+
"modelscope>=1.35.0",
16+
"huggingface-hub>=0.36.0",
17+
"aiohttp>=3.13.0",
18+
"protobuf>=6.31.1",
19+
"tiktoken>=0.12.0",
20+
"grpcio>=1.78.0",
21+
"grpcio-tools>=1.78.0",
22+
"grpcio-reflection>=1.78.0",
2023
]
2124

2225
[project.optional-dependencies]

services/uds_tokenizer/run_grpc_server.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727

2828
from aiohttp import web
2929
from tokenizer_service.tokenizer import TokenizerService
30+
from tokenizer_service.renderer import RendererService
3031
from tokenizer_grpc_service import create_grpc_server
3132
from utils.thread_pool_utils import get_thread_pool
3233

@@ -55,6 +56,7 @@
5556
probe_started_event = threading.Event() # Event to signal when probe server has started
5657
current_config = None
5758
tokenizer_service = None
59+
renderer_service = None
5860
tokenizer_ready = False
5961
shutdown_event = threading.Event() # Event to signal shutdown
6062

@@ -72,10 +74,11 @@ def _signal_handler(signum, frame):
7274

7375
def initialize_tokenizer():
7476
"""Initialize the tokenizer service without pre-loading a specific model"""
75-
global tokenizer_service, current_config, tokenizer_ready
77+
global tokenizer_service, renderer_service, current_config, tokenizer_ready
7678
try:
7779
# Initialize tokenizer service without pre-loading any model
7880
tokenizer_service = TokenizerService() # Empty constructor
81+
renderer_service = RendererService()
7982
tokenizer_ready = True
8083
logging.info("Tokenizer service initialized successfully")
8184
except Exception as e:
@@ -181,7 +184,7 @@ def run_server():
181184

182185
thread_pool = get_thread_pool()
183186
grpc_server = create_grpc_server(
184-
tokenizer_service, UDS_SOCKET_PATH, thread_pool, GRPC_PORT
187+
tokenizer_service, UDS_SOCKET_PATH, thread_pool, renderer_service, GRPC_PORT
185188
)
186189
grpc_server.start()
187190
logging.info(

services/uds_tokenizer/tests/conftest.py

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323

2424
import tokenizerpb.tokenizer_pb2_grpc as tokenizer_pb2_grpc
2525
from tokenizer_service.tokenizer import TokenizerService
26+
from tokenizer_service.renderer import RendererService
2627
from tokenizer_grpc_service import create_grpc_server
2728
from utils.thread_pool_utils import get_thread_pool
2829

@@ -49,24 +50,29 @@ def uds_socket_path() -> Iterator[str]:
4950

5051

5152
@pytest.fixture(scope="session")
52-
def tokenizer_service(uds_socket_path: str) -> Iterator[TokenizerService]:
53-
"""Provide the TokenizerService instance used by the gRPC server."""
54-
service = TokenizerService()
53+
def grpc_server(uds_socket_path: str) -> Iterator[None]:
54+
"""Start and stop the gRPC server for the test session."""
55+
tokenizer_service = TokenizerService()
56+
renderer_service = RendererService()
57+
5558
thread_pool = get_thread_pool()
56-
server = create_grpc_server(service, uds_socket_path, thread_pool)
59+
server = create_grpc_server(
60+
tokenizer_service,
61+
uds_socket_path,
62+
thread_pool,
63+
renderer_service=renderer_service,
64+
)
5765
server.start()
5866

59-
yield service
67+
yield
6068

6169
# Graceful shutdown with matching timeout
6270
stop_future = server.stop(grace=5)
6371
stop_future.wait(timeout=5)
6472

6573

6674
@pytest.fixture(scope="session")
67-
def grpc_channel(
68-
tokenizer_service: TokenizerService, uds_socket_path: str
69-
) -> Iterator[grpc.Channel]:
75+
def grpc_channel(grpc_server: None, uds_socket_path: str) -> Iterator[grpc.Channel]:
7076
"""Create a gRPC channel connected to the test server.
7177
7278
Uses wait_for_ready to automatically retry connection until server is ready.

services/uds_tokenizer/tests/test_integration.py

Lines changed: 15 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,9 @@
2626

2727
import grpc
2828
import pytest
29+
from transformers import AutoTokenizer
2930

3031
import tokenizerpb.tokenizer_pb2 as tokenizer_pb2
31-
from tokenizer_service.tokenizer import TokenizerService
3232

3333

3434
# ---------------------------------------------------------------------------
@@ -99,9 +99,7 @@ def test_tokenize_simple_text(self, grpc_stub, test_model):
9999
assert resp.success
100100
assert len(resp.input_ids) > 0
101101

102-
def test_tokenize_returns_offset_pairs(
103-
self, grpc_stub, test_model, tokenizer_service: TokenizerService
104-
):
102+
def test_tokenize_returns_offset_pairs(self, grpc_stub, test_model):
105103
"""Tokenize returns offset_pairs alongside token IDs."""
106104
grpc_stub.InitializeTokenizer(
107105
tokenizer_pb2.InitializeTokenizerRequest(model_name=test_model)
@@ -118,16 +116,14 @@ def test_tokenize_returns_offset_pairs(
118116
assert len(resp.offset_pairs) == 2 * len(resp.input_ids)
119117

120118
# Verify token count matches tokenizer
121-
tokenizer, _ = tokenizer_service.get_tokenizer_for_model(test_model)
119+
tokenizer = AutoTokenizer.from_pretrained(test_model)
122120
expected_tokens = tokenizer.encode("Hello world", add_special_tokens=True)
123121
assert list(resp.input_ids) == expected_tokens
124122

125-
def test_tokenize_without_special_tokens(
126-
self, grpc_stub, tokenizer_service: TokenizerService
127-
):
123+
def test_tokenize_without_special_tokens(self, grpc_stub):
128124
"""Tokenize with add_special_tokens=False omits special tokens."""
129125

130-
model_name = "google-bert/bert-base-uncased"
126+
model_name = "openai/gpt-oss-120b"
131127

132128
grpc_stub.InitializeTokenizer(
133129
tokenizer_pb2.InitializeTokenizerRequest(model_name=model_name)
@@ -147,19 +143,15 @@ def test_tokenize_without_special_tokens(
147143
)
148144
)
149145
assert with_special.success and without_special.success
150-
# With special tokens should produce > tokens as without.
151-
assert len(with_special.input_ids) > len(without_special.input_ids)
152146

153-
# Verify special tokens using actual tokenizer
154-
tokenizer, _ = tokenizer_service.get_tokenizer_for_model(model_name)
155-
156-
# BERT adds [CLS] at start and [SEP] at end
157-
assert with_special.input_ids[0] == tokenizer.cls_token_id
158-
assert with_special.input_ids[-1] == tokenizer.sep_token_id
159-
160-
# Without special tokens should not have [CLS] or [SEP]
161-
assert without_special.input_ids[0] != tokenizer.cls_token_id
162-
assert without_special.input_ids[-1] != tokenizer.sep_token_id
147+
# Verify both match the underlying tokenizer's behavior
148+
tokenizer = AutoTokenizer.from_pretrained(model_name)
149+
assert list(with_special.input_ids) == tokenizer.encode(
150+
"test", add_special_tokens=True
151+
)
152+
assert list(without_special.input_ids) == tokenizer.encode(
153+
"test", add_special_tokens=False
154+
)
163155

164156
def test_tokenize_empty_input(self, grpc_stub, test_model):
165157
grpc_stub.InitializeTokenizer(
@@ -191,9 +183,7 @@ def test_tokenize_long_input(self, grpc_stub, test_model):
191183
assert resp.success
192184
assert len(resp.input_ids) > 100 # Should have many tokens.
193185

194-
def test_tokenize_special_characters(
195-
self, grpc_stub, test_model, tokenizer_service: TokenizerService
196-
):
186+
def test_tokenize_special_characters(self, grpc_stub, test_model):
197187
"""Tokenize handles special / unicode characters."""
198188
grpc_stub.InitializeTokenizer(
199189
tokenizer_pb2.InitializeTokenizerRequest(model_name=test_model)
@@ -210,8 +200,7 @@ def test_tokenize_special_characters(
210200
assert len(resp.input_ids) > 0
211201

212202
# Verify tokenization matches actual tokenizer
213-
tokenizer, _ = tokenizer_service.get_tokenizer_for_model(test_model)
214-
203+
tokenizer = AutoTokenizer.from_pretrained(test_model)
215204
expected_tokens = tokenizer.encode(test_input, add_special_tokens=True)
216205
assert list(resp.input_ids) == expected_tokens
217206

0 commit comments

Comments (0)