Remove hardcoded model dependencies in benchmark script #848

Closed · wants to merge 1 commit
29 changes: 23 additions & 6 deletions benchmarks/inference/mii/run_all.sh
@@ -6,18 +6,35 @@ TP_SIZES["7b"]="1"
 TP_SIZES["13b"]="1:2:4"
 TP_SIZES["70b"]="4:8"

+# model dependent parameters
+# LLAMA-2
+MAX_PROMPT_LENGTH=4000
+PROMPT_LENGTH_LIST=(2600 1200)
+MAX_NEW_TOKENS_LIST=(60 128)

+# Falcon
+# MAX_PROMPT_LENGTH=2000
+# PROMPT_LENGTH_LIST=(1900 1200)
+# MAX_NEW_TOKENS_LIST=(60 128)

 for PARAM_SIZE in ${PARAM_SIZES[@]}; do
+    MODEL_NAME=meta-llama/Llama-2-${PARAM_SIZE}-hf
+    # MODEL_NAME=tiiuae/falcon-${PARAM_SIZE}

     IFS=':' read -ra TP_VALUES <<< ${TP_SIZES[${PARAM_SIZE}]}
     for TP in ${TP_VALUES[@]}; do
         DEPLOYMENT_NAME=llama2-${PARAM_SIZE}-tp${TP}-b${RAGGED_BATCH_SIZE}
-        python server.py --model_name meta-llama/Llama-2-${PARAM_SIZE}-hf -d ${DEPLOYMENT_NAME} -m ${TP} -b ${RAGGED_BATCH_SIZE} start

-        DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=2600 MAX_NEW_TOKENS=60 bash ./run_benchmark_client.sh
-        DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=2600 MAX_NEW_TOKENS=128 bash ./run_benchmark_client.sh
-        DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=1200 MAX_NEW_TOKENS=60 bash ./run_benchmark_client.sh
-        DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=1200 MAX_NEW_TOKENS=128 bash ./run_benchmark_client.sh
+        # DEPLOYMENT_NAME=falcon-${PARAM_SIZE}-tp${TP}-b${RAGGED_BATCH_SIZE}

+        echo "Starting server"
+        python server.py --model_name ${MODEL_NAME} -d ${DEPLOYMENT_NAME} -m ${TP} -b ${RAGGED_BATCH_SIZE} start

+        for PROMPT_LENGTH in ${PROMPT_LENGTH_LIST[@]}; do
+            for MAX_NEW_TOKENS in ${MAX_NEW_TOKENS_LIST[@]}; do
+                source ./run_benchmark_client.sh
+            done
+        done

         echo "Stopping server"
         python server.py -d ${DEPLOYMENT_NAME} stop
         sleep 120
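The Falcon settings are kept as comments, so switching models is still a manual edit of the script. A minimal sketch of what the Falcon variant of these variables could look like once swapped in (the 40b size and its TP_SIZES entry are assumptions, not part of this commit):

    # hypothetical Falcon configuration, adapted from the commented lines above
    PARAM_SIZES=(7b 40b)                  # assumed Falcon sizes
    TP_SIZES["40b"]="2:4"                 # assumed tensor-parallel mapping
    MAX_PROMPT_LENGTH=2000
    PROMPT_LENGTH_LIST=(1900 1200)
    MAX_NEW_TOKENS_LIST=(60 128)
    MODEL_NAME=tiiuae/falcon-${PARAM_SIZE}                               # set inside the PARAM_SIZE loop
    DEPLOYMENT_NAME=falcon-${PARAM_SIZE}-tp${TP}-b${RAGGED_BATCH_SIZE}   # set inside the TP loop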
30 changes: 24 additions & 6 deletions benchmarks/inference/mii/run_all_vllm.sh
@@ -6,18 +6,36 @@ TP_SIZES["7b"]="1"
 TP_SIZES["13b"]="1:2:4"
 TP_SIZES["70b"]="4:8"

+# model dependent parameters
+# LLAMA-2
+MAX_PROMPT_LENGTH=4000
+PROMPT_LENGTH_LIST=(2600 1200)
+MAX_NEW_TOKENS_LIST=(60 128)

+# # Falcon
+# MAX_PROMPT_LENGTH=2000
+# PROMPT_LENGTH_LIST=(1900 1200)
+# MAX_NEW_TOKENS_LIST=(60 128)

 for PARAM_SIZE in ${PARAM_SIZES[@]}; do

+    MODEL_NAME=meta-llama/Llama-2-${PARAM_SIZE}-hf
+    # MODEL_NAME=tiiuae/falcon-${PARAM_SIZE}

     IFS=':' read -ra TP_VALUES <<< ${TP_SIZES[${PARAM_SIZE}]}
     for TP in ${TP_VALUES[@]}; do
         DEPLOYMENT_NAME=vllm-llama2-${PARAM_SIZE}-tp${TP}
-        python -m vllm.entrypoints.api_server --host 127.0.0.1 --port 26500 --tensor-parallel-size ${TP} --model meta-llama/Llama-2-${PARAM_SIZE}-hf &
+        # DEPLOYMENT_NAME=falcon-${PARAM_SIZE}-tp${TP}-b${RAGGED_BATCH_SIZE}

+        echo "Starting server"
+        python -m vllm.entrypoints.api_server --host 127.0.0.1 --port 26500 --tensor-parallel-size ${TP} --model ${MODEL_NAME} &
         sleep 60

-        DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=2600 MAX_NEW_TOKENS=60 VLLM="--vllm" bash ./run_benchmark_client.sh
-        DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=2600 MAX_NEW_TOKENS=128 VLLM="--vllm" bash ./run_benchmark_client.sh
-        DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=1200 MAX_NEW_TOKENS=60 VLLM="--vllm" bash ./run_benchmark_client.sh
-        DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=1200 MAX_NEW_TOKENS=128 VLLM="--vllm" bash ./run_benchmark_client.sh
+        for PROMPT_LENGTH in ${PROMPT_LENGTH_LIST[@]}; do
+            for MAX_NEW_TOKENS in ${MAX_NEW_TOKENS_LIST[@]}; do
+                VLLM="--vllm"
+                source ./run_benchmark_client.sh
+            done
+        done

         echo "Stopping server"
         pkill -u ${USER} -f vllm.entrypoints.api_server
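For a single configuration, the new loop body is roughly equivalent to starting the vLLM server by hand and calling the client wrapper once. A minimal sketch using the default Llama-2 7B values (the deployment name and lengths are illustrative; the loop sources the script, so passing the same variables through the environment as shown should behave the same):

    python -m vllm.entrypoints.api_server --host 127.0.0.1 --port 26500 \
        --tensor-parallel-size 1 --model meta-llama/Llama-2-7b-hf &
    sleep 60

    MODEL_NAME=meta-llama/Llama-2-7b-hf DEPLOYMENT_NAME=vllm-llama2-7b-tp1 \
        PROMPT_LENGTH=2600 MAX_NEW_TOKENS=60 MAX_PROMPT_LENGTH=4000 VLLM="--vllm" \
        bash ./run_benchmark_client.sh

    pkill -u ${USER} -f vllm.entrypoints.api_server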
21 changes: 14 additions & 7 deletions benchmarks/inference/mii/run_benchmark_client.py
@@ -22,7 +22,6 @@

 from postprocess_results import get_summary, ResponseDetails

-MAX_PROMPT_LENGTH = 4000
 PROMPT_LENGTH_VAR = 0.3
 MAX_NEW_TOKENS_VAR = 0.3

@@ -33,6 +32,15 @@ def parse_args():
                         type=int,
                         default=60,
                         help="min and max num tokens argument for huggingface")
+    parser.add_argument("-p",
+                        "--max_prompt_length",
+                        type=int,
+                        default=60,
+                        help="Maximum prompt length allowed")
+    parser.add_argument("-m",
+                        "--model_name",
+                        type=str,
+                        default="model")
     parser.add_argument("-d",
                         "--deployment_name",
                         type=str,
@@ -197,7 +205,7 @@ def _run_parallel(deployment_name, warmup, barrier, query_queue, result_queue, c
     print(f"Worker ({pid}) finished. session_id: {session_id}")


-def run_client(client_num, deployment_name, prompt_length, max_new_tokens, num_queries, warmup, stream, vllm, use_thread=False):
+def run_client(client_num, model_name, deployment_name, prompt_length, max_new_tokens, max_prompt_length, num_queries, warmup, stream, vllm, use_thread=False):
     """
     Run MII client for benchmarking. The scenario is a bit complicated:
     1. The main process puts `num_queries` queries into the input queue
@@ -228,10 +236,9 @@ def run_client(client_num, deployment_name, prompt_length, max_new_tokens, num_q
     for p in processes:
         p.start()

-    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
     query_generator = RandomQueryGenerator(all_text, tokenizer, seed=42)
-    MAX_PROMPT_LENGTH = 4000
-    request_text = query_generator.get_random_request_text(prompt_length, prompt_length*PROMPT_LENGTH_VAR, MAX_PROMPT_LENGTH, num_queries + warmup*client_num)
+    request_text = query_generator.get_random_request_text(prompt_length, prompt_length*PROMPT_LENGTH_VAR, max_prompt_length, num_queries + warmup*client_num)

     for t in request_text:
         req_max_new_tokens = int(np.random.normal(max_new_tokens, MAX_NEW_TOKENS_VAR*max_new_tokens))
@@ -262,9 +269,9 @@ def run_client(client_num, deployment_name, prompt_length, max_new_tokens, num_q
     if args.out_json_path is not None and not args.out_json_path.parent.exists():
         raise ValueError(f"Parent directory of {args.out_json_path}")

-    response_details = run_client(args.client_num, args.deployment_name,
+    response_details = run_client(args.client_num, args.model_name, args.deployment_name,
                                   args.prompt_length,
-                                  args.max_new_tokens, args.num_queries, args.warmup,
+                                  args.max_new_tokens, args.max_prompt_length, args.num_queries, args.warmup,
                                   args.stream, args.vllm, args.use_thread)

     args_dict = vars(args)
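With the new -m/--model_name and -p/--max_prompt_length arguments, the client no longer assumes Llama-2. A sketch of a standalone invocation against an already-running Falcon deployment (the deployment name and output path are illustrative):

    python run_benchmark_client.py -w 1 \
        -m tiiuae/falcon-7b \
        -d falcon-7b-tp1-b768 \
        -n 512 -c 16 \
        -k 60 -l 1900 -p 2000 \
        -o logs.falcon-7b-tp1-b768/falcon-7b-tp1-b768_c16_p1900_g60.json \
        --stream

As the guard near the bottom of the file shows, the parent directory of the -o path must already exist, otherwise the script raises a ValueError.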
6 changes: 4 additions & 2 deletions benchmarks/inference/mii/run_benchmark_client.sh
@@ -1,11 +1,12 @@
 #!/bin/bash

+MODEL_NAME=${MODEL_NAME:-meta-llama/Llama-2-7b-hf}
 DEPLOYMENT_NAME=${DEPLOYMENT_NAME:-llama2-7b}
 VLLM=${VLLM:-""}

 CLIENT_NUMS=${CLIENT_NUMS:-1 2 4 6 8 12 16 20 24 28 32}
 MAX_NEW_TOKENS=${MAX_NEW_TOKENS:-60}
 PROMPT_LENGTH=${PROMPT_LENGTH:-3072}
+MAX_PROMPT_LENGTH=${MAX_PROMPT_LENGTH:-4000}
 REQUEST_NUM=${REQUEST_NUM:-512}

 LOG_DIR=logs.${DEPLOYMENT_NAME}
@@ -15,8 +16,9 @@ for client_num in ${CLIENT_NUMS[@]}; do
     RESULT_FILE=${DEPLOYMENT_NAME}_c${client_num}_p${PROMPT_LENGTH}_g${MAX_NEW_TOKENS}.json

     python run_benchmark_client.py -w 1 \
+        -m ${MODEL_NAME} \
         -d ${DEPLOYMENT_NAME} -n ${REQUEST_NUM} -c ${client_num} \
-        -k ${MAX_NEW_TOKENS} -l ${PROMPT_LENGTH} \
+        -k ${MAX_NEW_TOKENS} -l ${PROMPT_LENGTH} -p ${MAX_PROMPT_LENGTH} \
         -o ${LOG_DIR}/${RESULT_FILE} \
         ${VLLM} --stream \
         2>&1 | tee ${LOG_DIR}/bench_client_num_c${client_num}_p${PROMPT_LENGTH}_g${MAX_NEW_TOKENS}.log
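Because every parameter now falls back to a ${VAR:-default} expansion, the wrapper can be driven entirely from the environment without editing the script. A small usage sketch (values illustrative; the chosen deployment must already be serving):

    MODEL_NAME=meta-llama/Llama-2-13b-hf \
    DEPLOYMENT_NAME=llama2-13b-tp2-b768 \
    PROMPT_LENGTH=1200 MAX_NEW_TOKENS=128 MAX_PROMPT_LENGTH=4000 \
    CLIENT_NUMS="1 4 16" \
    bash ./run_benchmark_client.sh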