Skip to content

Commit 637e9dd

Browse files
committed
collect more metrics
1 parent 79ea365 commit 637e9dd

File tree

3 files changed

+57
-13
lines changed

3 files changed

+57
-13
lines changed

perf-changelog.yaml

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1055,3 +1055,10 @@
10551055
- "Enable VLLM_USE_FLASHINFER_MOE_INT4=1 for Kimi K2.5 INT4 B200 benchmark"
10561056
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/935
10571057

1058+
- config-keys:
1059+
- gptoss-fp4-b200-vllm
1060+
description:
1061+
- "test"
1062+
pr-link: test
1063+
1064+

utils/bench_serving/benchmark_serving.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -930,18 +930,18 @@ def main(args: argparse.Namespace):
930930
parser.add_argument(
931931
"--percentile-metrics",
932932
type=str,
933-
default="ttft,tpot,itl",
933+
default="ttft,tpot,itl,e2el",
934934
help="Comma-seperated list of selected metrics to report percentils. "
935935
"This argument specifies the metrics to report percentiles. "
936936
"Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
937-
"Default value is \"ttft,tpot,itl\".")
937+
"Default value is \"ttft,tpot,itl,e2el\".")
938938
parser.add_argument(
939939
"--metric-percentiles",
940940
type=str,
941-
default="99",
941+
default="50,90,99,99.9",
942942
help="Comma-seperated list of percentiles for selected metrics. "
943943
"To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
944-
"Default value is \"99\". "
944+
"Default value is \"50,90,99,99.9\". "
945945
"Use \"--percentile-metrics\" to select metrics.",
946946
)
947947
parser.add_argument(

utils/summarize.py

Lines changed: 46 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -16,10 +16,19 @@
1616
EP = "EP"
1717
DP_ATTENTION = "DP Attention"
1818
CONC = "Conc"
19-
TTFT = "TTFT (ms)"
19+
TTFT_MEDIAN = "TTFT Median (ms)"
20+
TTFT_P90 = "TTFT P90 (ms)"
21+
TTFT_P99 = "TTFT P99 (ms)"
22+
TTFT_P999 = "TTFT P99.9 (ms)"
2023
TPOT = "TPOT (ms)"
21-
INTERACTIVITY = "Interactivity (tok/s/user)"
22-
E2EL = "E2EL (s)"
24+
INTVTY_MEDIAN = "Intvty Median (tok/s/user)"
25+
INTVTY_P90 = "Intvty P90 (tok/s/user)"
26+
INTVTY_P99 = "Intvty P99 (tok/s/user)"
27+
INTVTY_P999 = "Intvty P99.9 (tok/s/user)"
28+
E2EL_MEDIAN = "E2EL Median (s)"
29+
E2EL_P90 = "E2EL P90 (s)"
30+
E2EL_P99 = "E2EL P99 (s)"
31+
E2EL_P999 = "E2EL P99.9 (s)"
2332
TPUT_PER_GPU = "TPUT per GPU"
2433
OUTPUT_TPUT_PER_GPU = "Output TPUT per GPU"
2534
INPUT_TPUT_PER_GPU = "Input TPUT per GPU"
@@ -74,7 +83,12 @@ def main():
7483

7584
single_node_headers = [
7685
MODEL, SERVED_MODEL, HARDWARE, FRAMEWORK, PRECISION, ISL, OSL, TP, EP, DP_ATTENTION,
77-
CONC, TTFT, TPOT, INTERACTIVITY, E2EL, TPUT_PER_GPU, OUTPUT_TPUT_PER_GPU, INPUT_TPUT_PER_GPU
86+
CONC,
87+
TTFT_MEDIAN, TTFT_P90, TTFT_P99, TTFT_P999,
88+
TPOT,
89+
INTVTY_MEDIAN, INTVTY_P90, INTVTY_P99, INTVTY_P999,
90+
E2EL_MEDIAN, E2EL_P90, E2EL_P99, E2EL_P999,
91+
TPUT_PER_GPU, OUTPUT_TPUT_PER_GPU, INPUT_TPUT_PER_GPU
7892
]
7993

8094
single_node_rows = [
@@ -91,9 +105,18 @@ def main():
91105
r['dp_attention'],
92106
r['conc'],
93107
f"{r['median_ttft'] * 1000:.4f}",
108+
f"{r.get('p90_ttft', 0) * 1000:.4f}",
109+
f"{r.get('p99_ttft', 0) * 1000:.4f}",
110+
f"{r.get('p99.9_ttft', 0) * 1000:.4f}",
94111
f"{r['median_tpot'] * 1000:.4f}",
95-
f"{r['median_intvty']:.4f}",
96-
f"{r['median_e2el']:.4f}",
112+
f"{r.get('median_intvty', 0):.4f}",
113+
f"{r.get('p90_intvty', 0):.4f}",
114+
f"{r.get('p99_intvty', 0):.4f}",
115+
f"{r.get('p99.9_intvty', 0):.4f}",
116+
f"{r.get('median_e2el', 0):.4f}",
117+
f"{r.get('p90_e2el', 0):.4f}",
118+
f"{r.get('p99_e2el', 0):.4f}",
119+
f"{r.get('p99.9_e2el', 0):.4f}",
97120
f"{r['tput_per_gpu']:.4f}",
98121
f"{r['output_tput_per_gpu']:.4f}",
99122
f"{r['input_tput_per_gpu']:.4f}",
@@ -114,7 +137,12 @@ def main():
114137
MODEL, SERVED_MODEL, HARDWARE, FRAMEWORK, PRECISION, ISL, OSL,
115138
PREFILL_TP, PREFILL_EP, PREFILL_DP_ATTN, PREFILL_WORKERS, PREFILL_GPUS,
116139
DECODE_TP, DECODE_EP, DECODE_DP_ATTN, DECODE_WORKERS, DECODE_GPUS,
117-
CONC, TTFT, TPOT, INTERACTIVITY, E2EL, TPUT_PER_GPU, OUTPUT_TPUT_PER_GPU, INPUT_TPUT_PER_GPU
140+
CONC,
141+
TTFT_MEDIAN, TTFT_P90, TTFT_P99, TTFT_P999,
142+
TPOT,
143+
INTVTY_MEDIAN, INTVTY_P90, INTVTY_P99, INTVTY_P999,
144+
E2EL_MEDIAN, E2EL_P90, E2EL_P99, E2EL_P999,
145+
TPUT_PER_GPU, OUTPUT_TPUT_PER_GPU, INPUT_TPUT_PER_GPU
118146
]
119147

120148
multinode_rows = [
@@ -138,9 +166,18 @@ def main():
138166
r['num_decode_gpu'],
139167
r['conc'],
140168
f"{r['median_ttft'] * 1000:.4f}",
169+
f"{r.get('p90_ttft', 0) * 1000:.4f}",
170+
f"{r.get('p99_ttft', 0) * 1000:.4f}",
171+
f"{r.get('p99.9_ttft', 0) * 1000:.4f}",
141172
f"{r['median_tpot'] * 1000:.4f}",
142-
f"{r['median_intvty']:.4f}",
143-
f"{r['median_e2el']:.4f}",
173+
f"{r.get('median_intvty', 0):.4f}",
174+
f"{r.get('p90_intvty', 0):.4f}",
175+
f"{r.get('p99_intvty', 0):.4f}",
176+
f"{r.get('p99.9_intvty', 0):.4f}",
177+
f"{r.get('median_e2el', 0):.4f}",
178+
f"{r.get('p90_e2el', 0):.4f}",
179+
f"{r.get('p99_e2el', 0):.4f}",
180+
f"{r.get('p99.9_e2el', 0):.4f}",
144181
f"{r['tput_per_gpu']:.4f}",
145182
f"{r['output_tput_per_gpu']:.4f}",
146183
f"{r['input_tput_per_gpu']:.4f}",

0 commit comments

Comments
 (0)