- Config

```bash
cat <<'EOF' > /tmp/config.yaml
backend: pytorch
kv_cache_config:
  free_gpu_memory_fraction: 0.85
max_num_tokens: 8192
max_batch_size: 16
trust_remote_code: true
return_perf_metrics: true
enable_iter_perf_stats: true
enable_iter_req_stats: true
perf_metrics_max_requests: 1000
EOF

trtllm-serve serve /models/Qwen3-8B \
  --host 0.0.0.0 \
  --port 8000 \
  --backend pytorch \
  --extra_llm_api_options /tmp/config.yaml
```
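To give the per-request metrics something to report, traffic can be sent through the OpenAI-compatible completions endpoint before scraping. A minimal sketch, assuming the served model name resolves to `Qwen3-8B` (this depends on the deployment):

```bash
# Sketch: send one completion request so per-request perf metrics
# have data to aggregate; "Qwen3-8B" is an assumed served model name
curl -s http://localhost:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "Qwen3-8B", "prompt": "Hello", "max_tokens": 16}'
```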
- Curling the `/metrics` endpoint always returns an empty list:

```console
root@qwen-tensorrt-llm-5956796475-d89cn:/app/tensorrt_llm# curl localhost:8000/metrics
[]
```
- In the Prometheus metrics output, `engine_type` and `model_name` are `undefined`:

```console
root@qwen-tensorrt-llm-5956796475-d89cn:/app/tensorrt_llm# curl localhost:8000/prometheus/metrics
# HELP e2e_request_latency_seconds Histogram of end to end request latency in seconds.
# TYPE e2e_request_latency_seconds histogram
e2e_request_latency_seconds_sum{engine_type="undefined",model_name="undefined"} 3.4878239994868636
e2e_request_latency_seconds_bucket{engine_type="undefined",le="0.3",model_name="undefined"} 0.0
e2e_request_latency_seconds_bucket{engine_type="undefined",le="0.5",model_name="undefined"} 0.0
e2e_request_latency_seconds_bucket{engine_type="undefined",le="0.8",model_name="undefined"} 0.0
```
- Only a limited set of metrics is exposed:

```
e2e_request_latency_seconds_bucket
time_to_first_token_seconds_bucket
time_per_output_token_seconds_bucket
request_queue_time_seconds_bucket
request_success_total
```
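For reference, the endpoint can be scraped with a stock Prometheus configuration. This is a generic sketch, where the job name and target are assumptions about the deployment and `metrics_path` matches the endpoint queried above:

```yaml
# Generic Prometheus scrape sketch; job_name and target are assumptions
scrape_configs:
  - job_name: trtllm
    metrics_path: /prometheus/metrics
    static_configs:
      - targets: ['localhost:8000']
```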