Skip to content
Open
Show file tree
Hide file tree
Changes from 121 commits
Commits
Show all changes
138 commits
Select commit Hold shift + click to select a range
501c5ab
opt metrics structure
LJH-LBJ Jan 15, 2026
d261ba5
opt loggers
LJH-LBJ Jan 15, 2026
8538816
Merge branch 'vllm-project:main' into opt_metrics_structure
LJH-LBJ Jan 16, 2026
89493ac
opt metrics structure
LJH-LBJ Jan 16, 2026
f69ed2d
fix bug
LJH-LBJ Jan 16, 2026
e3a44db
fix bug
LJH-LBJ Jan 19, 2026
da0ad3d
fix bug
LJH-LBJ Jan 19, 2026
cb85a3d
fix bug
LJH-LBJ Jan 19, 2026
371beae
fix bug
LJH-LBJ Jan 19, 2026
2b98563
opt loggers
LJH-LBJ Jan 19, 2026
134a901
opt metrics structure
LJH-LBJ Jan 20, 2026
ee12352
opt format
LJH-LBJ Jan 21, 2026
0af170c
Merge branch 'vllm-project:main' into opt_metrics_structure
LJH-LBJ Jan 21, 2026
cf7e2c0
opt metrics structure
LJH-LBJ Jan 21, 2026
eb51d12
opt metrics structure
LJH-LBJ Jan 21, 2026
38785ed
opt metrics structure
LJH-LBJ Jan 21, 2026
aeb5fd6
opt test
LJH-LBJ Jan 21, 2026
924f747
opt loggers
LJH-LBJ Jan 21, 2026
8fd556b
Merge branch 'vllm-project:main' into opt_metrics_structure
LJH-LBJ Jan 22, 2026
a78af4d
fix bug
LJH-LBJ Jan 22, 2026
44c9635
Merge branch 'opt_metrics_structure' of https://github.com/LJH-LBJ/vl…
LJH-LBJ Jan 22, 2026
7c08073
fix bug
LJH-LBJ Jan 22, 2026
443022b
fix bug
LJH-LBJ Jan 22, 2026
fbbce79
fix bug
LJH-LBJ Jan 22, 2026
2b2edfc
fix bug
LJH-LBJ Jan 22, 2026
a42a656
opt metrics in offline
LJH-LBJ Jan 22, 2026
e7f3fae
fix bug
LJH-LBJ Jan 22, 2026
2e87f70
fix bug
LJH-LBJ Jan 22, 2026
93b53f8
fix pre-commit
LJH-LBJ Jan 23, 2026
d584e71
fix bug
LJH-LBJ Jan 23, 2026
d359775
fix bug
LJH-LBJ Jan 23, 2026
6ebe5ee
Merge branch 'main' into opt_metrics_structure
LJH-LBJ Jan 26, 2026
ab248bb
Merge remote-tracking branch 'origin/main' into opt_metrics_structure
LJH-LBJ Jan 26, 2026
04230c8
fix bug
LJH-LBJ Jan 26, 2026
fdfc9b5
fix bug
LJH-LBJ Jan 26, 2026
b5d154a
add audio frames
LJH-LBJ Jan 27, 2026
274a784
add audio frames
LJH-LBJ Jan 27, 2026
43a266b
add image image_num and resolution
LJH-LBJ Jan 27, 2026
e4ff53e
add image image_num and resolution
LJH-LBJ Jan 27, 2026
13a87f2
add image image_num and resolution
LJH-LBJ Jan 27, 2026
bacd480
add audio frames in offline
LJH-LBJ Jan 27, 2026
b339c38
add audio frames in offline
LJH-LBJ Jan 27, 2026
935481c
Merge branch 'main' into opt_metrics_structure
LJH-LBJ Jan 27, 2026
2263dd1
fix pre-commit
LJH-LBJ Jan 27, 2026
f3b88b1
Merge branch 'opt_metrics_structure' of https://github.com/LJH-LBJ/vl…
LJH-LBJ Jan 27, 2026
f0bdfaa
fix pre-commit
LJH-LBJ Jan 27, 2026
da7a271
change enable_stats to log_stats
LJH-LBJ Jan 27, 2026
bcd9ac4
fix bug
LJH-LBJ Jan 27, 2026
abf941e
fix pre-commit
LJH-LBJ Jan 27, 2026
03afeaf
delete 0 row
LJH-LBJ Jan 27, 2026
842af89
delete 0 row
LJH-LBJ Jan 27, 2026
56ecac3
fix pre-commit
LJH-LBJ Jan 27, 2026
cbdac45
delete 0 row
LJH-LBJ Jan 27, 2026
8ee59ce
delete 0 row
LJH-LBJ Jan 27, 2026
48707f0
opt
LJH-LBJ Jan 28, 2026
9d76475
fix pre-commit
LJH-LBJ Jan 28, 2026
2631578
fix bug
LJH-LBJ Jan 28, 2026
e0ce96f
fix pre-commit
LJH-LBJ Jan 28, 2026
04da676
Merge branch 'main' into opt_metrics_structure
LJH-LBJ Jan 28, 2026
26e18b3
Merge branch 'main' into opt_metrics_structure
LJH-LBJ Jan 28, 2026
a8bcbc0
fix pre-commit
LJH-LBJ Jan 28, 2026
114a6a3
fix pre-commit
LJH-LBJ Jan 28, 2026
9126e68
Merge branch 'main' into opt_metrics_structure
LJH-LBJ Jan 28, 2026
0b905cf
fix bug
LJH-LBJ Jan 28, 2026
3481dcc
Merge branch 'main' into opt_metrics_structure
hsliuustc0106 Jan 28, 2026
7665b29
opt
LJH-LBJ Jan 29, 2026
c78d420
opt
LJH-LBJ Jan 29, 2026
2b37f16
opt
LJH-LBJ Jan 29, 2026
78963fb
opt
LJH-LBJ Jan 29, 2026
141d8f8
remove ut in test_async_omni
LJH-LBJ Jan 29, 2026
7c95eb9
fix pre-commit
LJH-LBJ Jan 29, 2026
6687f65
Merge branch 'main' into opt_metrics_structure
LJH-LBJ Jan 29, 2026
68074ac
add test in pipeline.yaml
LJH-LBJ Jan 29, 2026
ef34329
fix bug
LJH-LBJ Jan 29, 2026
bff608c
Merge branch 'main' into opt_metrics_structure
LJH-LBJ Jan 29, 2026
a59c766
fix bug
LJH-LBJ Jan 29, 2026
4918ab1
fix pre-commit
LJH-LBJ Jan 29, 2026
4976551
rerun
LJH-LBJ Jan 29, 2026
d646401
Merge branch 'main' into opt_metrics_structure
LJH-LBJ Jan 29, 2026
5efbd55
Merge branch 'main' into opt_metrics_structure
LJH-LBJ Jan 29, 2026
e83a338
Merge branch 'main' into opt_metrics_structure
LJH-LBJ Jan 30, 2026
55c11c1
opt test
LJH-LBJ Jan 30, 2026
a94349b
Merge branch 'opt_metrics_structure' of https://github.com/LJH-LBJ/vl…
LJH-LBJ Jan 30, 2026
6e63657
rerun
LJH-LBJ Jan 30, 2026
9a31bae
rerun
LJH-LBJ Jan 30, 2026
232da73
Merge branch 'main' into opt_metrics_structure
LJH-LBJ Jan 30, 2026
13b0050
rerun
LJH-LBJ Jan 30, 2026
db0d866
Merge branch 'opt_metrics_structure' of https://github.com/LJH-LBJ/vl…
LJH-LBJ Jan 30, 2026
21de7db
fix bug
LJH-LBJ Jan 30, 2026
dd051b2
fix pre-commit
LJH-LBJ Jan 30, 2026
dd73daf
Merge branch 'main' into opt_metrics_structure
LJH-LBJ Jan 30, 2026
c1c48f9
Merge branch 'main' into opt_metrics_structure
hsliuustc0106 Jan 30, 2026
4e6acbe
Merge branch 'main' into opt_metrics_structure
LJH-LBJ Jan 30, 2026
d3c6f54
rerun
LJH-LBJ Jan 30, 2026
bd6d8cd
Merge branch 'main' into opt_metrics_structure
LJH-LBJ Jan 31, 2026
654073f
Merge branch 'main' into opt_metrics_structure
LJH-LBJ Jan 31, 2026
c9068a7
add doc
LJH-LBJ Jan 31, 2026
6626d62
Merge branch 'main' into opt_metrics_structure
LJH-LBJ Jan 31, 2026
fe0e4b9
add doc
LJH-LBJ Jan 31, 2026
89f3944
Merge branch 'opt_metrics_structure' of https://github.com/LJH-LBJ/vl…
LJH-LBJ Jan 31, 2026
3b311f4
Merge branch 'main' into opt_metrics_structure
LJH-LBJ Feb 3, 2026
4b39808
Merge branch 'main' into opt_metrics_structure
LJH-LBJ Feb 4, 2026
3fff139
fix pre-commit
LJH-LBJ Feb 4, 2026
b9c2d46
fix pre-commit
LJH-LBJ Feb 4, 2026
0bb732e
opt
LJH-LBJ Feb 4, 2026
7c91e96
fix pre-commit
LJH-LBJ Feb 4, 2026
da335c7
opt
LJH-LBJ Feb 4, 2026
5abc397
opt
LJH-LBJ Feb 4, 2026
fb3bacf
fix pre-commit
LJH-LBJ Feb 4, 2026
51f5e0a
fix pre-commit
LJH-LBJ Feb 4, 2026
41db219
opt
LJH-LBJ Feb 4, 2026
3a95be0
fix pre-commit
LJH-LBJ Feb 4, 2026
571f297
Merge branch 'vllm-project:main' into opt_metrics_structure
LJH-LBJ Feb 5, 2026
ca2cb26
fix bug
LJH-LBJ Feb 5, 2026
48a519c
Merge branch 'opt_metrics_structure' of https://github.com/LJH-LBJ/vl…
LJH-LBJ Feb 5, 2026
1bd59d8
fix bug
LJH-LBJ Feb 5, 2026
24f8bc8
fix bug
LJH-LBJ Feb 5, 2026
ef2d5d6
Merge branch 'main' into opt_metrics_structure
LJH-LBJ Feb 5, 2026
23a24ee
fix pre-commit
LJH-LBJ Feb 5, 2026
dd5d7b7
Merge branch 'opt_metrics_structure' of https://github.com/LJH-LBJ/vl…
LJH-LBJ Feb 5, 2026
41c58d4
fix bug
LJH-LBJ Feb 5, 2026
9145181
Merge branch 'main' into opt_metrics_structure
LJH-LBJ Feb 6, 2026
f1195f8
fix ut
LJH-LBJ Feb 6, 2026
4383b01
Merge branch 'opt_metrics_structure' of https://github.com/LJH-LBJ/vl…
LJH-LBJ Feb 6, 2026
41482ff
fix ut
LJH-LBJ Feb 6, 2026
f07d070
Merge branch 'main' into opt_metrics_structure
LJH-LBJ Feb 6, 2026
764151d
fix ut
LJH-LBJ Feb 6, 2026
f1b41d3
Merge branch 'opt_metrics_structure' of https://github.com/LJH-LBJ/vl…
LJH-LBJ Feb 6, 2026
382327e
fix dependencies
LJH-LBJ Feb 6, 2026
75be00c
opt stage_wall_time_ms and move metrics.md tp contributing
LJH-LBJ Feb 6, 2026
3ffa4cd
add stage's final_output_type in StageRequestStats
LJH-LBJ Feb 6, 2026
e352716
fix bug
LJH-LBJ Feb 6, 2026
7faa2e2
rerun
LJH-LBJ Feb 6, 2026
42f6f0f
update doc
LJH-LBJ Feb 6, 2026
e7c502f
opt
LJH-LBJ Feb 6, 2026
a71fa64
Merge branch 'main' into opt_metrics_structure
LJH-LBJ Feb 6, 2026
fd9d3d4
fix pre-commit
LJH-LBJ Feb 6, 2026
00e7b78
Merge branch 'opt_metrics_structure' of https://github.com/LJH-LBJ/vl…
LJH-LBJ Feb 6, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ steps:
tests/diffusion/lora/ \
tests/model_executor/models/qwen2_5_omni/test_audio_length.py \
tests/worker/ \
tests/metrics/test_stats.py \
tests/distributed/omni_connectors/test_kv_flow.py \
--cov=vllm_omni \
--cov-branch \
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/qwen3-omni/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ bash benchmarks/qwen3-omni/vllm_omni/eval_qwen3_moe_omni.sh
```

What it does:
- Runs `examples/offline_inference/qwen3_omni/end2end.py` with `--enable-stats`.
- Runs `examples/offline_inference/qwen3_omni/end2end.py` with `--log-stats`.
- Uses `benchmarks/build_dataset/top100.txt` and writes to:
- Logs: `benchmarks/qwen3-omni/vllm_omni/logs/`
- `omni_llm_pipeline_text.orchestrator.stats.jsonl` — per-stage latency stats.
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/qwen3-omni/vllm_omni/eval_qwen3_moe_omni.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ else
python $end2end_script_path --output-wav $outputs_dir \
--query-type text \
--txt-prompts $build_dataset_path \
--enable-stats \
--log-stats \
--log-dir $log_dir
echo "Logs and outputs are saved in ${log_dir} and ${outputs_dir} respectively:"
echo " - omni_llm_pipeline_text run dir/base name"
Expand Down
8 changes: 5 additions & 3 deletions docs/api/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,6 @@ Main entry points for vLLM-Omni inference and serving.
- [vllm_omni.entrypoints.cli.benchmark.serve.OmniBenchmarkServingSubcommand][]
- [vllm_omni.entrypoints.cli.serve.OmniServeCommand][]
- [vllm_omni.entrypoints.client_request_state.ClientRequestState][]
- [vllm_omni.entrypoints.log_utils.OrchestratorMetrics][]
- [vllm_omni.entrypoints.log_utils.StageRequestMetrics][]
- [vllm_omni.entrypoints.log_utils.StageStats][]
- [vllm_omni.entrypoints.omni.Omni][]
- [vllm_omni.entrypoints.omni.OmniBase][]
- [vllm_omni.entrypoints.omni_diffusion.OmniDiffusion][]
Expand Down Expand Up @@ -114,3 +111,8 @@ Worker classes and model runners for distributed inference.
- [vllm_omni.worker.gpu_generation_worker.GPUGenerationWorker][]
- [vllm_omni.worker.gpu_model_runner.OmniGPUModelRunner][]
- [vllm_omni.worker.mixins.OmniWorkerMixin][]


## Metrics

- [vllm_omni.metrics.OrchestratorAggregator][]
156 changes: 156 additions & 0 deletions docs/usage/metrics.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@

# Production Metrics
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please take a final check on the md and test the output data again.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Already check .md file and update the test result


You can use these metrics in production to monitor the health and performance of the vLLM-Omni system. Typical scenarios include:
- **Performance Monitoring**: Track throughput (e.g., `e2e_avg_tokens_per_s`), latency (e.g., `e2e_total_ms`), and resource utilization to verify that the system meets expected standards.
- **Debugging and Troubleshooting**: Use detailed per-request metrics to diagnose issues, such as high transfer times or unexpected token counts.

## How to Enable and View Metrics

### 1. Start the Service with Metrics Logging

```bash
vllm serve /workspace/models/Qwen3-Omni-30B-A3B-Instruct --omni --port 8014 --log-stats
```

### 2. Send a Request

```bash
python openai_chat_completion_client_for_multimodal_generation.py --query-type use_image
```

### 3. What You Will See

With `--log-stats` enabled, the server will output detailed metrics logs after each request. Example output:


#### Overall Summary

| Field | Value |
|-----------------------------|--------------|
| e2e_requests | 1 |
| e2e_wall_time_ms | 41,299.190 |
| e2e_total_tokens | 5,202 |
| e2e_avg_time_per_request_ms | 41,299.190 |
| e2e_avg_tokens_per_s | 125.959 |
| stage_wall_time_ms | 10,192.289, 30,541.409, 207.496 |

#### RequestE2EStats

| Field | Value |
|-------------------------|------------|
| e2e_total_ms | 41,299.133 |
| e2e_total_tokens | 5,202 |
| transfers_total_time_ms | 245.895 |
| transfers_total_kbytes | 138,089.939|

#### StageRequestStats

| Field | 0 | 1 | 2 |
|------------------------|--------|--------|--------|
| audio_generated_frames | 0 | 0 | 525,525|
| batch_id | 38 | 274 | 0 |
| batch_size | 1 | 1 | 1 |
| num_tokens_in | 4,860 | 4,826 | 4,384 |
| num_tokens_out | 67 | 275 | 0 |
| postprocess_time_ms | 256.158| 0.491 | 0.000 |
| stage_gen_time_ms | 9,910.007|30,379.198|160.745|

#### TransferEdgeStats

| Field | 0->1 | 1->2 |
|---------------------|-------------|------------|
| size_kbytes | 109,277.349 | 28,812.591 |
| tx_time_ms | 78.701 | 18.790 |
| rx_decode_time_ms | 111.865 | 31.706 |
| in_flight_time_ms | 2.015 | 2.819 |


These logs include:
- **Overall summary**: total requests, wall time, average tokens/sec, etc.
- **E2E table**: per-request latency and token counts.
- **Stage table**: per-stage batch and timing details.
- **Transfer table**: data transfer and timing for each edge.

You can use these logs to monitor system health, debug performance, and analyze request-level metrics as described above.

## Parameter Details

| Field | Meaning |
|---------------------------|----------------------------------------------------------------------------------------------|
| `e2e_requests` | Number of completed requests. |
| `e2e_wall_time_ms` | Wall-clock time span from run start to last completion, in ms. |
| `e2e_total_tokens` | Total tokens counted across all completed requests (stage0 input + all stage outputs). |
| `e2e_avg_time_per_request_ms` | Average wall time per request: `e2e_wall_time_ms / e2e_requests`. |
| `e2e_avg_tokens_per_s` | Average token throughput over wall time: `e2e_total_tokens * 1000 / e2e_wall_time_ms`. |
| `stage_wall_time_ms` | Wall-clock time span for each stage, in ms (list format). |

---

## E2E Table (per request)

| Field | Meaning |
|---------------------------|-----------------------------------------------------------------------|
| `e2e_total_ms` | End-to-end latency in ms. |
| `e2e_total_tokens` | Total tokens for the request (stage0 input + all stage outputs). |
| `transfers_total_time_ms` | Sum of `tx_time_ms + rx_decode_time_ms + in_flight_time_ms` over all transfer edges for this request. |
| `transfers_total_kbytes` | Sum of transfer kbytes for this request. |


---

## Stage Table (per stage event / request)

| Field | Meaning |
|---------------------------|-------------------------------------------------------------------------------------------------|
| `batch_id` | Batch index. |
| `batch_size` | Batch size. |
| `num_tokens_in` | Input tokens to the stage. |
| `num_tokens_out` | Output tokens from the stage. |
| `postprocess_time_ms` | Postprocessing time in ms (also reported for diffusion/image stages). |
| `stage_gen_time_ms` | Stage compute time in ms, excluding postprocessing time (reported separately as `postprocess_time_ms`). |
| `image_num` | Number of images generated (for diffusion/image stages). |
| `resolution` | Image resolution (for diffusion/image stages). |
| `trajectory_timesteps` | Diffusion/image: trajectory timesteps, if available. |

---

## Transfer Table (per edge / request)

| Field | Meaning |
|----------------------|---------------------------------------------------------------------------|
| `size_kbytes` | Total kbytes transferred. |
| `tx_time_ms` | Sender transfer time in ms. |
| `rx_decode_time_ms` | Receiver decode time in ms. |
| `in_flight_time_ms` | In-flight time in ms. |


## Expectation of the Numbers (Verification)

**Formulas:**
- `e2e_total_tokens = Stage0's num_tokens_in + sum(all stages' num_tokens_out)`
- `transfers_total_time_ms = sum(tx_time_ms + rx_decode_time_ms + in_flight_time_ms)` for every edge

**Using the example above:**

### e2e_total_tokens
- Stage0's `num_tokens_in`: **4,860**
- Stage0's `num_tokens_out`: **67**
- Stage1's `num_tokens_out`: **275**
- Stage2's `num_tokens_out`: **0**

So,
```
e2e_total_tokens = 4,860 + 67 + 275 + 0 = 5,202
```
This matches the table value: `e2e_total_tokens = 5,202`.

### transfers_total_time_ms
For each edge:
- 0->1: tx_time_ms (**78.701**) + rx_decode_time_ms (**111.865**) + in_flight_time_ms (**2.015**) = **192.581**
- 1->2: tx_time_ms (**18.790**) + rx_decode_time_ms (**31.706**) + in_flight_time_ms (**2.819**) = **53.315**

Sum: 192.581 + 53.315 = **245.896**

The table shows `transfers_total_time_ms = 245.895`, which matches the calculation (difference is due to rounding).
4 changes: 2 additions & 2 deletions examples/offline_inference/bagel/end2end.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def parse_args():
)

# OmniLLM init args
parser.add_argument("--enable-stats", action="store_true", default=False)
parser.add_argument("--log-stats", action="store_true", default=False)
parser.add_argument("--init-sleep-seconds", type=int, default=20)
parser.add_argument("--batch-timeout", type=int, default=5)
parser.add_argument("--init-timeout", type=int, default=300)
Expand Down Expand Up @@ -120,7 +120,7 @@ def main():

omni_kwargs.update(
{
"log_stats": args.enable_stats,
"log_stats": args.log_stats,
"init_sleep_seconds": args.init_sleep_seconds,
"batch_timeout": args.batch_timeout,
"init_timeout": args.init_timeout,
Expand Down
4 changes: 2 additions & 2 deletions examples/offline_inference/qwen2_5_omni/end2end.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,7 +322,7 @@ def main(args):
query_result = query_func()
omni_llm = Omni(
model=model_name,
log_stats=args.enable_stats,
log_stats=args.log_stats,
stage_init_timeout=args.stage_init_timeout,
batch_timeout=args.batch_timeout,
init_timeout=args.init_timeout,
Expand Down Expand Up @@ -439,7 +439,7 @@ def parse_args():
help="Query type.",
)
parser.add_argument(
"--enable-stats",
"--log-stats",
action="store_true",
default=False,
help="Enable writing detailed statistics (default: disabled)",
Expand Down
4 changes: 2 additions & 2 deletions examples/offline_inference/qwen3_omni/end2end.py
Original file line number Diff line number Diff line change
Expand Up @@ -328,7 +328,7 @@ def main(args):
omni_llm = Omni(
model=model_name,
stage_configs_path=args.stage_configs_path,
log_stats=args.enable_stats,
log_stats=args.log_stats,
stage_init_timeout=args.stage_init_timeout,
)

Expand Down Expand Up @@ -455,7 +455,7 @@ def parse_args():
help="Query type.",
)
parser.add_argument(
"--enable-stats",
"--log-stats",
action="store_true",
default=False,
help="Enable writing detailed statistics (default: disabled)",
Expand Down
4 changes: 2 additions & 2 deletions examples/offline_inference/qwen3_tts/end2end.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ def main(args):
omni = Omni(
model=model_name,
stage_configs_path=args.stage_configs_path,
log_stats=args.enable_stats,
log_stats=args.log_stats,
stage_init_timeout=args.stage_init_timeout,
)

Expand Down Expand Up @@ -275,7 +275,7 @@ def parse_args():
help="Query type.",
)
parser.add_argument(
"--enable-stats",
"--log-stats",
action="store_true",
default=False,
help="Enable writing detailed statistics (default: disabled)",
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ dependencies = [
"soundfile>=0.13.1",
"cache-dit==1.2.0",
"tqdm>=4.66.0",
"prettytable>=3.9.0",
"torchsde>=0.2.6", # Required for Stable Audio scheduler
"fa3-fwd==0.0.1", # flash attention 3, maintained by @ZJY0516
"openai-whisper>=20250625",
Expand Down
50 changes: 1 addition & 49 deletions tests/e2e/online_serving/test_async_omni.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from vllm import SamplingParams
from vllm.inputs import PromptType

from vllm_omni.entrypoints.async_omni import AsyncOmni, ClientRequestState
from vllm_omni.entrypoints.async_omni import AsyncOmni

os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

Expand Down Expand Up @@ -111,51 +111,3 @@ async def test_abort():
num_generated_tokens, request_id = await task
assert num_generated_tokens == NUM_EXPECTED_TOKENS
await asyncio.sleep(5)


@pytest.mark.asyncio
async def test_build_and_log_summary(monkeypatch):
from vllm_omni.entrypoints.utils import get_final_stage_id_for_e2e

RealCRS = ClientRequestState
capture_metrics = {}

class MockCRS(RealCRS):
def __init__(self, request_id: str):
super().__init__(request_id)
capture_metrics[request_id] = self

monkeypatch.setattr("vllm_omni.entrypoints.async_omni.ClientRequestState", MockCRS)
monkeypatch.setattr("vllm_omni.entrypoints.client_request_state.ClientRequestState", MockCRS)

with ExitStack() as after:
# Avoid SHM IPC in tests to prevent /dev/shm exhaustion and SIGBUS.
engine = AsyncOmni(
model=model,
stage_configs_path=stage_config,
shm_threshold_bytes=sys.maxsize,
)
after.callback(engine.shutdown)
prompt = "Hello my name is Robert and "
NUM_EXPECTED_TOKENS = 64
NUM_REQUESTS = 3
request_ids = [f"request-{i}" for i in range(NUM_REQUESTS)]

# Create concurrent requests.
tasks: list[asyncio.Task] = []
for idx, request_id in enumerate(request_ids):
tasks.append(asyncio.create_task(generate(engine, request_id, prompt, NUM_EXPECTED_TOKENS)))

# Confirm the requests are okay.
for idx, task in enumerate(tasks):
await task
output_modalities = ["text"]
final_stage_id_for_e2e = get_final_stage_id_for_e2e(
output_modalities, engine.output_modalities, engine.stage_list
)
summary = capture_metrics[request_ids[idx]].metrics.build_and_log_summary(final_stage_id_for_e2e)

# Check that total tokens matches sum of stage tokens.
assert summary["e2e_total_tokens"] == sum(stage["tokens"] for stage in summary["stages"])
# Check that total time matches sum of stage times.
assert summary["e2e_total_time_ms"] >= sum(stage["total_time_ms"] for stage in summary["stages"])
6 changes: 3 additions & 3 deletions tests/entrypoints/test_omni_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,9 +296,9 @@ def _setup_log_mocks(monkeypatch):
# Mock OrchestratorAggregator with a simple class that doesn't require file operations

class _FakeOrchestratorMetrics:
def __init__(self, num_stages, enable_stats, wall_start_ts):
def __init__(self, num_stages, log_stats, wall_start_ts):
self.num_stages = num_stages
self.enable_stats = enable_stats
self.log_stats = log_stats
self.stage_first_ts = [None] * num_stages
self.stage_last_ts = [None] * num_stages
self.e2e_done = set()
Expand All @@ -316,7 +316,7 @@ def build_and_log_summary(self, final_stage_id):
return "Fake summary"

monkeypatch.setattr(
"vllm_omni.entrypoints.omni.OrchestratorMetrics",
"vllm_omni.entrypoints.omni.OrchestratorAggregator",
_FakeOrchestratorMetrics,
raising=False,
)
Expand Down
Loading