Update run-docs to avoid code duplication (#1439)
* Update run-docs to avoid duplicate code

Update run-docs to avoid duplicate code

* Update run-docs

Add back comment explaining the seemingly extraneous `echo exit 1`

* Update build_native.sh

Update to C++11 ABI for AOTI, similar to ET

* Update run-docs

* Update run-docs

Update to run distributed inference test with open-llama instead of llama3.1

* Update run-docs

open-llama -> stories to avoid needing HF access tokens.

* Update README.md

Remove -l 3 since no longer necessary after Angea's change

* Update quantization.md

Remove -l 3 from aoti_run, and write -l3 for et_run

* Update run-docs

-l 3:-l 2 -> -l3:-l2

after modifying the command lines. Hopefully this is legal for et_run

* Update run.cpp

Update to support non-space separated args

* Update run.cpp

typo

* Create cuda-32.json

Add a gs=32 cuda.json for test runs with stories15M

* Create mobile-32.json

add gs=32 variant of mobile for tests

* Update run-docs

Use gs=32 variants with stories models

* Update run-docs

undo gs32

* Update run-readme-pr-mps.yml

Extend timeout to avoid timeout of mps quantization test

* Update run.cpp

Enforce that an argument must have at least length 2, and refine the check for a uniarg (i.e., arg plus flag value in one option) to be args with more than 2 characters

* Update run.cpp

typos

---------

Co-authored-by: Jack-Khuu <[email protected]>
mikekgfb and Jack-Khuu authored Jan 30, 2025
1 parent 5684175 commit 4356b4c
Showing 7 changed files with 105 additions and 161 deletions.
202 changes: 62 additions & 140 deletions .ci/scripts/run-docs
@@ -1,145 +1,67 @@
# /bin/bash -x
#!/bin/bash -x

if [ "X$1" == "X" ]; then
# Check if an argument was provided
if [ -z "$1" ]; then
echo "Must specify document to run"
exit 1
fi

if [ "$1" == "readme" ]; then
echo "::group::Create script to run README"
python3 torchchat/utils/scripts/updown.py --create-sections --file README.md --replace 'llama3.1:stories15M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-readme.sh
# for good measure, if something happened to updown processor,
# and it did not error out, fail with an exit 1
echo "exit 1" >> ./run-readme.sh
echo "::endgroup::"

echo "::group::Run README"
echo "*******************************************"
cat ./run-readme.sh
echo "*******************************************"
bash -x ./run-readme.sh
echo "::endgroup::"

exit 0
fi

if [ "$1" == "quantization" ]; then
echo "::group::Create script to run quantization"
python3 torchchat/utils/scripts/updown.py --create-sections --file docs/quantization.md --replace llama3:stories15M --suppress huggingface-cli,HF_TOKEN > ./run-quantization.sh
# for good measure, if something happened to updown processor,
# and it did not error out, fail with an exit 1
echo "exit 1" >> ./run-quantization.sh
echo "::endgroup::"

echo "::group::Run quantization"
echo "*******************************************"
cat ./run-quantization.sh
echo "*******************************************"
bash -x ./run-quantization.sh
echo "::endgroup::"

exit 0
fi

if [ "$1" == "gguf" ]; then
echo "::group::Create script to run gguf"
python3 torchchat/utils/scripts/updown.py --file docs/GGUF.md --replace 'llama3:stories15M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-gguf.sh
# for good measure, if something happened to updown processor,
# and it did not error out, fail with an exit 1
echo "exit 1" >> ./run-gguf.sh
echo "::endgroup::"

echo "::group::Run gguf"
echo "*******************************************"
cat ./run-gguf.sh
echo "*******************************************"
bash -x ./run-gguf.sh
echo "::endgroup::"
fi


if [ "$1" == "advanced" ]; then
echo "::group::Create script to run advanced"
python3 torchchat/utils/scripts/updown.py --file docs/ADVANCED-USERS.md --replace 'llama3:stories15M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-advanced.sh
# for good measure, if something happened to updown processor,
# and it did not error out, fail with an exit 1
echo "exit 1" >> ./run-advanced.sh
echo "::endgroup::"

echo "::group::Run advanced"
echo "*******************************************"
cat ./run-advanced.sh
echo "*******************************************"
bash -x ./run-advanced.sh
echo "::endgroup::"
fi

if [ "$1" == "evaluation" ]; then
echo "::group::Create script to run evaluation"
python3 torchchat/utils/scripts/updown.py --file torchchat/utils/docs/evaluation.md --replace 'llama3:stories15M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-evaluation.sh
# for good measure, if something happened to updown processor,
# and it did not error out, fail with an exit 1
echo "exit 1" >> ./run-evaluation.sh
echo "::endgroup::"

echo "::group::Run evaluation"
echo "*******************************************"
cat ./run-evaluation.sh
echo "*******************************************"
bash -x ./run-evaluation.sh
fi

if [ "$1" == "multimodal" ]; then

# Expecting that this might fail this test as-is, because
# it's the first on-pr test depending on github secrets for access with HF token access

echo "::group::Create script to run multimodal"
python3 torchchat/utils/scripts/updown.py --file docs/multimodal.md > ./run-multimodal.sh
# for good measure, if something happened to updown processor,
# and it did not error out, fail with an exit 1
echo "exit 1" >> ./run-multimodal.sh
echo "::endgroup::"

echo "::group::Run multimodal"
echo "*******************************************"
cat ./run-multimodal.sh
echo "*******************************************"
bash -x ./run-multimodal.sh
echo "::endgroup::"
fi

if [ "$1" == "native" ]; then

echo "::group::Create script to run native-execution"
python3 torchchat/utils/scripts/updown.py --file docs/native-execution.md > ./run-native.sh
# for good measure, if something happened to updown processor,
# and it did not error out, fail with an exit 1
echo "exit 1" >> ./run-native.sh
echo "::endgroup::"

echo "::group::Run native-execution"
echo "*******************************************"
cat ./run-native.sh
echo "*******************************************"
bash -x ./run-native.sh
echo "::endgroup::"
fi

if [ "$1" == "distributed" ]; then

echo "::group::Create script to run distributed"
python3 torchchat/utils/scripts/updown.py --file docs/distributed.md --replace 'llama3.1:stories110M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-distributed.sh
python3 torchchat/utils/scripts/updown.py --file docs/distributed.md --suppress huggingface-cli,HF_TOKEN > ./run-distributed.sh
# for good measure, if something happened to updown processor,
# and it did not error out, fail with an exit 1
echo "exit 1" >> ./run-distributed.sh
echo "::endgroup::"

echo "::group::Run distributed"
echo "*******************************************"
cat ./run-distributed.sh
echo "*******************************************"
bash -x ./run-distributed.sh
echo "::endgroup::"
fi
# Pre-initialize variables
filepath=""
parameters="--replace 'llama3:stories15M,-l3:-l2' --suppress huggingface-cli,HF_TOKEN"
script_name="./run-${1}.sh" # Dynamically initialize script name

# Use a case statement to handle the $1 argument
case "$1" in
"readme")
filepath="README.md"
;;
"quantization")
filepath="docs/quantization.md"
;;
"gguf")
filepath="docs/GGUF.md"
;;
"advanced")
filepath="docs/ADVANCED-USERS.md"
;;
"evaluation")
filepath="torchchat/utils/docs/evaluation.md"
;;
"multimodal")
filepath="docs/multimodal.md"
parameters="" # Clear parameters
;;
"native")
filepath="docs/native-execution.md"
parameters="" # Clear parameters
;;
"distributed")
filepath="docs/distributed.md"
parameters="--replace 'llama3.1:stories110M,-l3:-l2' --suppress huggingface-cli,HF_TOKEN" # Use stories110M to avoid need for authentication
;;
"local")
filepath="docs/local-model.md"
parameters="" # Clear parameters
;;

*)
echo "Unknown option: $1"
exit 1
;;
esac

# Generate the script
echo "::group::Create script to run $1"
python3 torchchat/utils/scripts/updown.py --file "$filepath" $parameters > "$script_name"
# if something happened to updown processor, and it did not error out, fail with an exit 1
echo "exit 1" >> "$script_name"
echo "::endgroup::"

# Run the script
echo "::group::Run $1"
echo "*******************************************"
cat "$script_name"
echo "*******************************************"
bash -x "$script_name"
echo "::endgroup::"
5 changes: 3 additions & 2 deletions .github/workflows/run-readme-pr-mps.yml
@@ -15,8 +15,8 @@ jobs:
conda create -y -n test-readme-mps-macos python=3.10.11 llvm-openmp
conda activate test-readme-mps-macos
set -x
# NS: Remove previous installation of torch first
# as this script does not isntall anything into conda env but rather as system dep
# NS: Remove previous installation of torch first
# as this script does not install anything into conda env but rather as system dep
pip3 uninstall -y torch || true
set -eou pipefail
@@ -37,6 +37,7 @@ jobs:
uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
with:
runner: macos-m1-14
timeout: 60
script: |
set -x
conda create -y -n test-quantization-mps-macos python=3.10.11
2 changes: 1 addition & 1 deletion README.md
@@ -413,7 +413,7 @@ torchchat/utils/scripts/build_native.sh et

Execute using the runner
```bash
cmake-out/et_run llama3.1.pte -z `python3 torchchat.py where llama3.1`/tokenizer.model -l 3 -i "Once upon a time"
cmake-out/et_run llama3.1.pte -z `python3 torchchat.py where llama3.1`/tokenizer.model -i "Once upon a time"
```

</details>
4 changes: 2 additions & 2 deletions docs/quantization.md
@@ -182,7 +182,7 @@ OMP_NUM_THREADS=6 python3 torchchat.py generate llama3.1 --dso-path llama3_1.so
If you built the AOTI runner with link_torchao_ops as discussed in the setup section, you can also use the C++ runner:

```
OMP_NUM_THREADS=6 ./cmake-out/aoti_run llama3_1.so -z $HOME/.torchchat/model-cache/meta-llama/Meta-Llama-3.1-8B-Instruct/tokenizer.model -l 3 -i "Once upon a time,"
OMP_NUM_THREADS=6 ./cmake-out/aoti_run llama3_1.so -z $HOME/.torchchat/model-cache/meta-llama/Meta-Llama-3.1-8B-Instruct/tokenizer.model -i "Once upon a time," # -l 3
```

#### ExecuTorch
@@ -193,7 +193,7 @@ python torchchat.py export llama3.1 --device cpu --dtype float32 --quantize '{"e
Note: only the ExecuTorch C++ runner in torchchat when built using the instructions in the setup can run the exported *.pte file. It will not work with the `python torchchat.py generate` command.

```
./cmake-out/et_run llama3_1.pte -z $HOME/.torchchat/model-cache/meta-llama/Meta-Llama-3.1-8B-Instruct/tokenizer.model -l 3 -i "Once upon a time,"
./cmake-out/et_run llama3_1.pte -z $HOME/.torchchat/model-cache/meta-llama/Meta-Llama-3.1-8B-Instruct/tokenizer.model -l3 -i "Once upon a time,"
```

## Experimental TorchAO MPS lowbit kernels
44 changes: 28 additions & 16 deletions runner/run.cpp
@@ -803,41 +803,53 @@ int main(int argc, char *argv[]) {
} else {
error_usage();
}
for (int i = 2; i < argc; i += 2) {
for (int i = 2; i < argc; i += 1) {
// do some basic validation
if (i + 1 >= argc) {
error_usage();
} // must have arg after flag
char *parm = argv[i+1];
// uniarg means the arg comes right after the letter in accordance with posix
int uniarg = strlen(argv[i]) > 2;

if (argv[i][0] != '-') {
error_usage();
} // must start with dash
if (strlen(argv[i]) != 2) {

if (strlen(argv[i]) < 2) {
error_usage();
} // must be -x (one dash, one letter)
} // must have at least dash '-' and option letter

if (uniarg) {
parm=&argv[i][2];
} else if (i + 1 >= argc) {
error_usage();
} // must have arg after option if flag is not contiguous to option

// read in the args
if (argv[i][1] == 't') {
temperature = atof(argv[i + 1]);
temperature = atof(parm);
} else if (argv[i][1] == 'p') {
topp = atof(argv[i + 1]);
topp = atof(parm);
} else if (argv[i][1] == 's') {
rng_seed = atoi(argv[i + 1]);
rng_seed = atoi(parm);
} else if (argv[i][1] == 'n') {
steps = atoi(argv[i + 1]);
steps = atoi(parm);
} else if (argv[i][1] == 'v') {
vocab_size = atoi(argv[i + 1]);
vocab_size = atoi(parm);
} else if (argv[i][1] == 'i') {
prompt = argv[i + 1];
prompt = parm;
} else if (argv[i][1] == 'z') {
tokenizer_path = argv[i + 1];
tokenizer_path = parm;
} else if (argv[i][1] == 'm') {
mode = argv[i + 1];
mode = parm;
} else if (argv[i][1] == 'y') {
system_prompt = argv[i + 1];
system_prompt = parm;
} else if (argv[i][1] == 'l') {
llama_ver = atoi(argv[i + 1]);
llama_ver = atoi(parm);
} else {
error_usage();
}

// account for parameter
i += (uniarg)?0:1;
}

if (model_path == NULL) {
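The net effect of the parsing change is that a single-letter option can now take its value either as a separate token or fused into the same token. A hedged sketch of the accepted forms (the model, tokenizer, and prompt values below are placeholders):

```bash
# space-separated: option and value are distinct argv entries
cmake-out/et_run stories15M.pte -z tokenizer.model -l 2 -i "Once upon a time"

# fused (POSIX-style): value follows the option letter inside one token
cmake-out/et_run stories15M.pte -z tokenizer.model -l2 -i "Once upon a time"

# a trailing bare option with no value (e.g. just "-l") now fails the
# length / lookahead checks above and prints the usage message
```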
5 changes: 5 additions & 0 deletions torchchat/quant_config/cuda-32.json
@@ -0,0 +1,5 @@
{
"executor": {"accelerator": "cuda"},
"precision": {"dtype": "bf16"},
"linear:int4": {"groupsize" : 32}
}
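A hedged usage sketch of the new config, assuming the usual torchchat export/generate flags; the model and output names are illustrative:

```bash
# export stories15M with int4 linear quantization at group size 32 on CUDA
python3 torchchat.py export stories15M --quantize torchchat/quant_config/cuda-32.json --output-dso-path stories15M.so

# run the resulting AOTI artifact
python3 torchchat.py generate stories15M --dso-path stories15M.so --prompt "Once upon a time"
```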
4 changes: 4 additions & 0 deletions torchchat/quant_config/mobile-32.json
@@ -0,0 +1,4 @@
{
"embedding": {"bitwidth": 4, "groupsize" : 32},
"linear:a8w4dq": {"groupsize" : 32}
}
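Likewise for the mobile variant, a sketch of an ExecuTorch test run with the group-size-32 config (paths and the model name are assumptions, mirroring the README commands above):

```bash
# export a .pte with 4-bit embedding and a8w4dq linear quantization at group size 32
python3 torchchat.py export stories15M --quantize torchchat/quant_config/mobile-32.json --output-pte-path stories15M.pte

# run it with the ExecuTorch runner; stories models take the llama2 version flag
cmake-out/et_run stories15M.pte -z `python3 torchchat.py where stories15M`/tokenizer.model -l2 -i "Once upon a time"
```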
