Commit 7b4b6a6

[Standup] feat: well-lit paths (llm-d#311)
* [Standup] feat: well-lit paths. Resolves llm-d#305 by ensuring every well-lit path defined in llm-d-infra has a corresponding scenario and experiment.
* Added examples for multi-nic and rdma/roce_gdr (or rdma/ib).
* Fix for data collection.
* Made LLMDBENCH_GATEWAY_API_INFERENCE_EXTENSION_CRD_REVISION and LLMDBENCH_GATEWAY_API_CRD_REVISION overridable.
* Fixed the tag for the inference scheduler in the well-lit paths.

Signed-off-by: maugustosilva <maugusto.silva@gmail.com>
1 parent: 6e1315b
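As context for the overridable CRD revisions: the two gateway CRD revision variables can now be set from the environment before standing up a scenario. A minimal sketch (the revision values are illustrative placeholders, not project defaults):

```bash
# Hypothetical override of the gateway CRD revisions before standing up a scenario
export LLMDBENCH_GATEWAY_API_CRD_REVISION="v1.3.0"                       # placeholder value
export LLMDBENCH_GATEWAY_API_INFERENCE_EXTENSION_CRD_REVISION="v0.5.1"   # placeholder value
./setup/standup.sh -c ocp_l40_fb -t modelservice
```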

26 files changed: +816, -363 lines

.github/workflows/benchmark1.yaml

12 additions, 12 deletions

```diff
@@ -68,63 +68,63 @@ jobs:
       - name: Cleanup target cloud (modelservice)
         env:
           LLMDBENCH_HF_TOKEN: ${{ secrets.LLMDBENCH_HF_TOKEN }}
-        run: ./setup/teardown.sh -c cicd -t modelservice -d
+        run: ./setup/teardown.sh -c ocp_l40_fb -t modelservice -d

       - name: Cleanup target cloud (standalone)
         env:
           LLMDBENCH_HF_TOKEN: ${{ secrets.LLMDBENCH_HF_TOKEN }}
-        run: ./setup/teardown.sh -c cicd -t standalone -d
+        run: ./setup/teardown.sh -c ocp_l40_fb -t standalone -d

       - name: Standup target cloud (standalone)
         env:
           LLMDBENCH_HF_TOKEN: ${{ secrets.LLMDBENCH_HF_TOKEN }}
-        run: ./setup/standup.sh -c cicd -t standalone
+        run: ./setup/standup.sh -c ocp_l40_fb -t standalone

       - name: Run benchmark (standalone, inference-perf)
         env:
           LLMDBENCH_HF_TOKEN: ${{ secrets.LLMDBENCH_HF_TOKEN }}
-        run: ./setup/run.sh -c cicd -t standalone
+        run: ./setup/run.sh -c ocp_l40_fb -t standalone

       - name: Run benchmark (standalone, fmperf)
         env:
           LLMDBENCH_HF_TOKEN: ${{ secrets.LLMDBENCH_HF_TOKEN }}
-        run: ./setup/run.sh -c cicd -t standalone -l fmperf -w sanity_short-input
+        run: ./setup/run.sh -c ocp_l40_fb -t standalone -l fmperf -w sanity_short-input

       - name: Run benchmark (standalone, guidellm)
         env:
           LLMDBENCH_HF_TOKEN: ${{ secrets.LLMDBENCH_HF_TOKEN }}
-        run: ./setup/run.sh -c cicd -t standalone -l guidellm -w sanity_concurrent
+        run: ./setup/run.sh -c ocp_l40_fb -t standalone -l guidellm -w sanity_concurrent

       - name: Run benchmark (standalone, vllm-benchmark)
         env:
           LLMDBENCH_HF_TOKEN: ${{ secrets.LLMDBENCH_HF_TOKEN }}
-        run: ./setup/run.sh -c cicd -t standalone -l vllm-benchmark
+        run: ./setup/run.sh -c ocp_l40_fb -t standalone -l vllm-benchmark

       - name: Cleanup target cloud (standalone)
         env:
           LLMDBENCH_HF_TOKEN: ${{ secrets.LLMDBENCH_HF_TOKEN }}
-        run: ./setup/teardown.sh -c cicd -t standalone -d
+        run: ./setup/teardown.sh -c ocp_l40_fb -t standalone -d

       - name: E2E target cloud (modelservice, inference-perf)
         env:
           LLMDBENCH_HF_TOKEN: ${{ secrets.LLMDBENCH_HF_TOKEN }}
-        run: ./setup/e2e.sh -c cicd -t modelservice --deep
+        run: ./setup/e2e.sh -c ocp_l40_fb -t modelservice --deep

       - name: E2E target cloud (modelservice, fmperf)
         env:
           LLMDBENCH_HF_TOKEN: ${{ secrets.LLMDBENCH_HF_TOKEN }}
-        run: ./setup/e2e.sh -c cicd -t modelservice --deep -l fmperf -w sanity_short-input.yaml
+        run: ./setup/e2e.sh -c ocp_l40_fb -t modelservice --deep -l fmperf -w sanity_short-input.yaml

       - name: E2E target cloud (modelservice, guidellm)
         env:
           LLMDBENCH_HF_TOKEN: ${{ secrets.LLMDBENCH_HF_TOKEN }}
-        run: ./setup/e2e.sh -c cicd -t modelservice --deep -l guidellm -w sanity_concurrent.yaml
+        run: ./setup/e2e.sh -c ocp_l40_fb -t modelservice --deep -l guidellm -w sanity_concurrent.yaml


       - name: E2E target cloud (modelservice, vllm-benchmark)
         env:
           LLMDBENCH_HF_TOKEN: ${{ secrets.LLMDBENCH_HF_TOKEN }}
-        run: ./setup/e2e.sh -c cicd -t modelservice --deep -l vllm-benchmark
+        run: ./setup/e2e.sh -c ocp_l40_fb -t modelservice --deep -l vllm-benchmark


       - name: Install AWS CLI
```
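The same standalone lifecycle the workflow drives can be reproduced by hand against the renamed scenario; the commands below are taken directly from the steps above (LLMDBENCH_HF_TOKEN must be exported first, as in the workflow):

```bash
# Stand up, benchmark, and tear down the standalone path for the ocp_l40_fb scenario
./setup/standup.sh  -c ocp_l40_fb -t standalone
./setup/run.sh      -c ocp_l40_fb -t standalone -l guidellm -w sanity_concurrent
./setup/teardown.sh -c ocp_l40_fb -t standalone -d
```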

.github/workflows/ci-pr-benchmark.yaml

3 additions, 3 deletions

```diff
@@ -39,17 +39,17 @@ jobs:
         env:
           LLMDBENCH_HF_TOKEN: hf-token-placeholder
         run: |
-          ./setup/standup.sh -c kind_modelservice_inference-sim -t modelservice -s 0,1,2,4,7,8,9
+          ./setup/standup.sh -c kind_sim_fb -t modelservice -s 0,1,2,4,7,8,9

       - name: Run harness (mock)
         env:
           LLMDBENCH_HF_TOKEN: hf-token-placeholder
           LLMD_CONTROL_DRY_RUN: 1 # TODO: harness doesn't work now for kind bc no harness endpoint
         run: |
-          ./setup/run.sh -c kind_modelservice_inference-sim --dry-run
+          ./setup/run.sh -c kind_sim_fb --dry-run

       - name: Teardown
         env:
           LLMDBENCH_HF_TOKEN: hf-token-placeholder
         run: |
-          ./setup/teardown.sh -c kind_modelservice_inference-sim
+          ./setup/teardown.sh -c kind_sim_fb
```
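The PR check can be reproduced locally against a Kind cluster with the same three calls; a sketch, assuming the repository root as the working directory and the placeholder token used by the workflow:

```bash
export LLMDBENCH_HF_TOKEN=hf-token-placeholder
./setup/standup.sh  -c kind_sim_fb -t modelservice -s 0,1,2,4,7,8,9   # selected setup steps only
./setup/run.sh      -c kind_sim_fb --dry-run                          # harness endpoint not yet available on Kind
./setup/teardown.sh -c kind_sim_fb
```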
New file: 20 additions, 0 deletions

```yaml
setup:
  factors:
    - LLMDBENCH_VLLM_MODELSERVICE_GAIE_PLUGINS_CONFIGFILE
  levels:
    LLMDBENCH_VLLM_MODELSERVICE_GAIE_PLUGINS_CONFIGFILE: "default,prefix-cache-estimate-config,prefix-cache-tracking-config"
  treatments:
    default: "default"
    cache_estimate: "prefix-cache-estimate-config"
    cache_tracking: "prefix-cache-tracking-config"
run:
  factors:
    - num_groups
    - system_prompt_len
  levels:
    num_groups: "40,60"
    system_prompt_len: "80000,5000,1000"
  treatments:
    long: "40,8000"
    medium: "60,5000"
    short: "60,1000"
```

experiments/precise_prefix_cache_aware.yaml

2 additions, 2 deletions

```diff
@@ -1,8 +1,8 @@
 setup:
   factors:
-    - LLMDBENCH_VLLM_MODELSERVICE_GAIE_PRESETS
+    - LLMDBENCH_VLLM_MODELSERVICE_GAIE_PLUGINS_CONFIGFILE
   levels:
-    LLMDBENCH_VLLM_MODELSERVICE_GAIE_PRESETS: "default,prefix-cache-estimate-config,prefix-cache-tracking-config"
+    LLMDBENCH_VLLM_MODELSERVICE_GAIE_PLUGINS_CONFIGFILE: "default,prefix-cache-estimate-config,prefix-cache-tracking-config"
   treatments:
     default: "default"
     cache_estimate: "prefix-cache-estimate-config"
```
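A quick way to confirm the rename from GAIE_PRESETS to GAIE_PLUGINS_CONFIGFILE left nothing behind (a suggested check, not part of the commit):

```bash
# Report any remaining references to the old variable name
grep -rn "LLMDBENCH_VLLM_MODELSERVICE_GAIE_PRESETS" experiments/ scenarios/ setup/ \
  && echo "stale references found" || echo "rename is complete"
```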

scenarios/cicd.sh

This file was deleted (9 lines removed).

scenarios/cicd/kind_sim_fb.sh

6 additions, 4 deletions

```diff
@@ -1,10 +1,13 @@
 # A scenario to capture running inference-sim on a Kind cluster without requiring GPUs
 export LLMDBENCH_DEPLOY_METHODS=modelservice
 export LLMDBENCH_VLLM_COMMON_REPLICAS=1
-export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_ACCELERATOR_NR=0
-export LLMDBENCH_VLLM_MODELSERVICE_DECODE_ACCELERATOR_NR=0
+export LLMDBENCH_VLLM_COMMON_ACCELERATOR_RESOURCE=
+export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_ACCELERATOR_NR=
+export LLMDBENCH_VLLM_MODELSERVICE_DECODE_ACCELERATOR_NR=
 export LLMDBENCH_VLLM_COMMON_AFFINITY=kubernetes.io/os:linux
+export LLMDBENCH_CONTROL_WAIT_TIMEOUT=90
 export LLMDBENCH_LLMD_IMAGE_NAME="llm-d-inference-sim"
+export LLMDBENCH_VLLM_MODELSERVICE_GAIE_PLUGINS_CONFIGFILE="plugins-v2.yaml"
 export LLMDBENCH_LLMD_ROUTINGSIDECAR_IMAGE_TAG="v0.2.0@sha256:a623a0752af0a71b7b05ebf95517848b5dbc3d8d235c1897035905632d5b7d80"
 export LLMDBENCH_VLLM_MODELSERVICE_DECODE_MODEL_COMMAND=imageDefault
 export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_MODEL_COMMAND=imageDefault
@@ -17,5 +20,4 @@ export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_CPU_MEM=100Mi
 export LLMDBENCH_VLLM_MODELSERVICE_URI_PROTOCOL="hf"
 export LLMDBENCH_DEPLOY_MODEL_LIST="facebook/opt-125m"
 export LLMDBENCH_HARNESS_PVC_SIZE=3Gi
-export LLMDBENCH_VLLM_MODELSERVICE_INFERENCE_MODEL=true
-
+export LLMDBENCH_VLLM_MODELSERVICE_INFERENCE_MODEL=true
```
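Because the accelerator resource and counts are now left empty, the simulated pods should request no GPUs at all. One way to spot-check this after standup (the namespace is a placeholder for wherever the benchmark deploys):

```bash
# Print each pod's resource limits; no nvidia.com/gpu entries should appear
kubectl get pods -n <llmdbench-namespace> \
  -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.containers[*].resources.limits}{"\n"}{end}'
```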

scenarios/disaggregated_vs_llmd.sh

This file was deleted (37 lines removed).
New file: 98 additions, 0 deletions

```bash
# INFERENCE SCHEDULING WELL LIT PATH
# Based on https://github.com/llm-d-incubation/llm-d-infra/tree/main/quickstart/examples/inference-scheduling
# Removed pod monitoring; can be added using LLMDBENCH_VLLM_MODELSERVICE_EXTRA_POD_CONFIG
# Removed extra volumes metrics-volume and torch-compile-volume; they are not needed for this model and tested hardware.
# Use LLMDBENCH_VLLM_MODELSERVICE_EXTRA_VOLUME_MOUNTS and LLMDBENCH_VLLM_MODELSERVICE_EXTRA_VOLUMES to add them if needed.

# IMPORTANT NOTE
# All parameters not defined here or exported externally will be the default values found in setup/env.sh
# Many commonly defined values were left blank (default) so that this scenario is applicable to as many environments as possible.

# Model parameters
# export LLMDBENCH_DEPLOY_MODEL_LIST="Qwen/Qwen3-0.6B"
# export LLMDBENCH_DEPLOY_MODEL_LIST="facebook/opt-125m"
export LLMDBENCH_DEPLOY_MODEL_LIST="meta-llama/Llama-3.1-8B-Instruct"
#export LLMDBENCH_DEPLOY_MODEL_LIST="meta-llama/Llama-3.1-70B-Instruct"
export LLMDBENCH_VLLM_COMMON_PVC_MODEL_CACHE_SIZE=1Ti


# Workload parameters
export LLMDBENCH_HARNESS_EXPERIMENT_PROFILE=shared_prefix_synthetic.yaml
export LLMDBENCH_HARNESS_NAME=inference-perf

# Routing configuration (via gaie)
#export LLMDBENCH_VLLM_MODELSERVICE_GAIE_PLUGINS_CONFIGFILE="plugins-v2.yaml" (default is default-plugins.yaml)
export LLMDBENCH_LLMD_INFERENCESCHEDULER_IMAGE_TAG=v0.2.1

# Routing configuration (via modelservice)
# export LLMDBENCH_VLLM_MODELSERVICE_INFERENCE_MODEL=false # already the default
# export LLMDBENCH_LLMD_ROUTINGSIDECAR_CONNECTOR=nixlv2 # already the default

# Common parameters across standalone and llm-d (prefill and decode) pods
export LLMDBENCH_VLLM_COMMON_MAX_MODEL_LEN=16000
export LLMDBENCH_VLLM_COMMON_BLOCK_SIZE=64

# Affinity to select node with appropriate accelerator (leave uncommented to automatically detect GPU)
#export LLMDBENCH_VLLM_COMMON_AFFINITY=nvidia.com/gpu.product:NVIDIA-H100-80GB-HBM3
#export LLMDBENCH_VLLM_COMMON_AFFINITY=gpu.nvidia.com/model:H200
#export LLMDBENCH_VLLM_COMMON_AFFINITY=nvidia.com/gpu.product:NVIDIA-L40S
#export LLMDBENCH_VLLM_COMMON_AFFINITY=nvidia.com/gpu.product:NVIDIA-A100-SXM4-80GB

# Uncomment to request specific network devices
#export LLMDBENCH_VLLM_COMMON_NETWORK_RESOURCE=rdma/roce_gdr
#export LLMDBENCH_VLLM_COMMON_NETWORK_RESOURCE=rdma/ib
#export LLMDBENCH_VLLM_COMMON_NETWORK_NR=4

export LLMDBENCH_VLLM_COMMON_ENVVARS_TO_YAML=$(mktemp)
cat << EOF > $LLMDBENCH_VLLM_COMMON_ENVVARS_TO_YAML
- name: UCX_TLS
  value: "cuda_ipc,cuda_copy,tcp"
- name: VLLM_NIXL_SIDE_CHANNEL_PORT
  value: "5557"
- name: VLLM_NIXL_SIDE_CHANNEL_HOST
  valueFrom:
    fieldRef:
      fieldPath: status.podIP
- name: VLLM_LOGGING_LEVEL
  value: DEBUG
- name: VLLM_ALLOW_LONG_MAX_MODEL_LEN
  value: "1"
EOF

export LLMDBENCH_VLLM_MODELSERVICE_EXTRA_CONTAINER_CONFIG=$(mktemp)
cat << EOF > ${LLMDBENCH_VLLM_MODELSERVICE_EXTRA_CONTAINER_CONFIG}
ports:
  - containerPort: 5557
    protocol: TCP
  - containerPort: 8200
    name: metrics
    protocol: TCP
EOF

# Prefill parameters
export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_REPLICAS=0

# Decode parameters
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_ACCELERATOR_NR=4
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_CPU_NR=16
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_CPU_MEM=64Gi
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_REPLICAS=2
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_INFERENCE_PORT=8200
# Uncomment the following line to enable multi-nic
#export LLMDBENCH_VLLM_MODELSERVICE_DECODE_PODANNOTATIONS=deployed-by:$(id -un),modelservice:llm-d-benchmark,k8s.v1.cni.cncf.io/networks:multi-nic-compute
# Uncomment the following two lines to enable roce/gdr (or switch to rdma/ib for infiniband)
#export LLMDBENCH_VLLM_MODELSERVICE_DECODE_NETWORK_RESOURCE=rdma/roce_gdr
#export LLMDBENCH_VLLM_MODELSERVICE_DECODE_NETWORK_NR=4
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_MODEL_COMMAND=vllmServe
export LLMDBENCH_VLLM_MODELSERVICE_DECODE_EXTRA_ARGS="[\
  --enforce-eager____\
  --block-size____REPLACE_ENV_LLMDBENCH_VLLM_COMMON_BLOCK_SIZE____\
  --kv-transfer-config____'{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'____\
  --tensor-parallel-size____REPLACE_ENV_LLMDBENCH_VLLM_MODELSERVICE_PREFILL_ACCELERATOR_NR____\
  --disable-log-requests____\
  --disable-uvicorn-access-log____\
  --max-model-len____REPLACE_ENV_LLMDBENCH_VLLM_COMMON_MAX_MODEL_LEN\
]"

# Local directory to copy benchmark runtime files and results
export LLMDBENCH_CONTROL_WORK_DIR=~/data/inference-scheduling
```
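For readers unfamiliar with the `____` separator and the `REPLACE_ENV_*` placeholders in `LLMDBENCH_VLLM_MODELSERVICE_DECODE_EXTRA_ARGS`, the decode command roughly expands to the following once the values defined above are substituted. This is an illustrative sketch only; the real expansion is performed by the modelservice tooling, and note that the tensor-parallel size is taken from the prefill accelerator variable as written:

```bash
# Approximate expanded decode command for the values in this scenario (illustrative only)
vllm serve meta-llama/Llama-3.1-8B-Instruct \
  --enforce-eager \
  --block-size 64 \
  --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \
  --tensor-parallel-size "${LLMDBENCH_VLLM_MODELSERVICE_PREFILL_ACCELERATOR_NR}" \
  --disable-log-requests \
  --disable-uvicorn-access-log \
  --max-model-len 16000
```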
