Update run-docs to avoid code duplication #1439

Merged: 29 commits, merged on Jan 30, 2025

Commits
e90ee12
Update run-docs to avoid duplicate code
mikekgfb Dec 23, 2024
ddb3773
Update run-docs
mikekgfb Dec 24, 2024
d834661
Merge branch 'main' into patch-35
mikekgfb Dec 27, 2024
9e82bbc
Update build_native.sh
mikekgfb Dec 29, 2024
6087a58
Update run-docs
mikekgfb Jan 1, 2025
347c64e
Merge branch 'main' into patch-35
mikekgfb Jan 6, 2025
92a2f8a
Merge branch 'main' into patch-35
mikekgfb Jan 15, 2025
d602eed
Merge branch 'main' into patch-35
mikekgfb Jan 17, 2025
f0df24e
Merge branch 'main' into patch-35
mikekgfb Jan 18, 2025
a3772f1
Merge branch 'main' into patch-35
mikekgfb Jan 22, 2025
f670dc9
Merge branch 'main' into patch-35
mikekgfb Jan 23, 2025
158b3e6
Merge branch 'pytorch:main' into patch-35
mikekgfb Jan 23, 2025
dcb2a60
Update run-docs
mikekgfb Jan 23, 2025
adcb28a
Update run-docs
mikekgfb Jan 24, 2025
053058d
Merge branch 'main' into patch-35
Jack-Khuu Jan 24, 2025
5e21fff
Merge branch 'main' into patch-35
Jack-Khuu Jan 24, 2025
680937b
Merge branch 'main' into patch-35
mikekgfb Jan 27, 2025
bd594fb
Update README.md
mikekgfb Jan 28, 2025
1015de7
Update quantization.md
mikekgfb Jan 28, 2025
02dd5db
Update run-docs
mikekgfb Jan 28, 2025
da1b98d
Update run.cpp
mikekgfb Jan 28, 2025
f3ee3e4
Update run.cpp
mikekgfb Jan 28, 2025
5629e29
Create cuda-32.json
mikekgfb Jan 28, 2025
902a5da
Create mobile-32.json
mikekgfb Jan 28, 2025
0ac7096
Update run-docs
mikekgfb Jan 28, 2025
4d97e78
Update run-docs
mikekgfb Jan 28, 2025
c787e1a
Update run-readme-pr-mps.yml
mikekgfb Jan 29, 2025
156ceda
Update run.cpp
mikekgfb Jan 29, 2025
b77ddf3
Update run.cpp
mikekgfb Jan 30, 2025
202 changes: 62 additions & 140 deletions .ci/scripts/run-docs
@@ -1,145 +1,67 @@
# /bin/bash -x
#!/bin/bash -x

if [ "X$1" == "X" ]; then
# Check if an argument was provided
if [ -z "$1" ]; then
echo "Must specify document to run"
exit 1
fi

if [ "$1" == "readme" ]; then
echo "::group::Create script to run README"
python3 torchchat/utils/scripts/updown.py --create-sections --file README.md --replace 'llama3.1:stories15M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-readme.sh
# for good measure, if something happened to updown processor,
# and it did not error out, fail with an exit 1
echo "exit 1" >> ./run-readme.sh
echo "::endgroup::"

echo "::group::Run README"
echo "*******************************************"
cat ./run-readme.sh
echo "*******************************************"
bash -x ./run-readme.sh
echo "::endgroup::"

exit 0
fi

if [ "$1" == "quantization" ]; then
echo "::group::Create script to run quantization"
python3 torchchat/utils/scripts/updown.py --create-sections --file docs/quantization.md --replace llama3:stories15M --suppress huggingface-cli,HF_TOKEN > ./run-quantization.sh
# for good measure, if something happened to updown processor,
# and it did not error out, fail with an exit 1
echo "exit 1" >> ./run-quantization.sh
echo "::endgroup::"

echo "::group::Run quantization"
echo "*******************************************"
cat ./run-quantization.sh
echo "*******************************************"
bash -x ./run-quantization.sh
echo "::endgroup::"

exit 0
fi

if [ "$1" == "gguf" ]; then
echo "::group::Create script to run gguf"
python3 torchchat/utils/scripts/updown.py --file docs/GGUF.md --replace 'llama3:stories15M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-gguf.sh
# for good measure, if something happened to updown processor,
# and it did not error out, fail with an exit 1
echo "exit 1" >> ./run-gguf.sh
echo "::endgroup::"

echo "::group::Run gguf"
echo "*******************************************"
cat ./run-gguf.sh
echo "*******************************************"
bash -x ./run-gguf.sh
echo "::endgroup::"
fi


if [ "$1" == "advanced" ]; then
echo "::group::Create script to run advanced"
python3 torchchat/utils/scripts/updown.py --file docs/ADVANCED-USERS.md --replace 'llama3:stories15M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-advanced.sh
# for good measure, if something happened to updown processor,
# and it did not error out, fail with an exit 1
echo "exit 1" >> ./run-advanced.sh
echo "::endgroup::"

echo "::group::Run advanced"
echo "*******************************************"
cat ./run-advanced.sh
echo "*******************************************"
bash -x ./run-advanced.sh
echo "::endgroup::"
fi

if [ "$1" == "evaluation" ]; then
echo "::group::Create script to run evaluation"
python3 torchchat/utils/scripts/updown.py --file torchchat/utils/docs/evaluation.md --replace 'llama3:stories15M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-evaluation.sh
# for good measure, if something happened to updown processor,
# and it did not error out, fail with an exit 1
echo "exit 1" >> ./run-evaluation.sh
echo "::endgroup::"

echo "::group::Run evaluation"
echo "*******************************************"
cat ./run-evaluation.sh
echo "*******************************************"
bash -x ./run-evaluation.sh
fi

if [ "$1" == "multimodal" ]; then

# Expecting that this might fail this test as-is, because
# it's the first on-pr test depending on github secrets for access with HF token access

echo "::group::Create script to run multimodal"
python3 torchchat/utils/scripts/updown.py --file docs/multimodal.md > ./run-multimodal.sh
# for good measure, if something happened to updown processor,
# and it did not error out, fail with an exit 1
echo "exit 1" >> ./run-multimodal.sh
echo "::endgroup::"

echo "::group::Run multimodal"
echo "*******************************************"
cat ./run-multimodal.sh
echo "*******************************************"
bash -x ./run-multimodal.sh
echo "::endgroup::"
fi

if [ "$1" == "native" ]; then

echo "::group::Create script to run native-execution"
python3 torchchat/utils/scripts/updown.py --file docs/native-execution.md > ./run-native.sh
# for good measure, if something happened to updown processor,
# and it did not error out, fail with an exit 1
echo "exit 1" >> ./run-native.sh
echo "::endgroup::"

echo "::group::Run native-execution"
echo "*******************************************"
cat ./run-native.sh
echo "*******************************************"
bash -x ./run-native.sh
echo "::endgroup::"
fi

if [ "$1" == "distributed" ]; then

echo "::group::Create script to run distributed"
python3 torchchat/utils/scripts/updown.py --file docs/distributed.md --replace 'llama3.1:stories110M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-distributed.sh
python3 torchchat/utils/scripts/updown.py --file docs/distributed.md --suppress huggingface-cli,HF_TOKEN > ./run-distributed.sh
# for good measure, if something happened to updown processor,
# and it did not error out, fail with an exit 1
echo "exit 1" >> ./run-distributed.sh
echo "::endgroup::"

echo "::group::Run distributed"
echo "*******************************************"
cat ./run-distributed.sh
echo "*******************************************"
bash -x ./run-distributed.sh
echo "::endgroup::"
fi
# Pre-initialize variables
filepath=""
parameters="--replace 'llama3:stories15M,-l3:-l2' --suppress huggingface-cli,HF_TOKEN"
script_name="./run-${1}.sh" # Dynamically initialize script name

# Use a case statement to handle the $1 argument
case "$1" in
"readme")
filepath="README.md"
;;
"quantization")
filepath="docs/quantization.md"
;;
"gguf")
filepath="docs/GGUF.md"
;;
"advanced")
filepath="docs/ADVANCED-USERS.md"
;;
"evaluation")
filepath="torchchat/utils/docs/evaluation.md"
;;
"multimodal")
filepath="docs/multimodal.md"
parameters="" # Clear parameters
;;
"native")
filepath="docs/native-execution.md"
parameters="" # Clear parameters
;;
"distributed")
filepath="docs/distributed.md"
parameters="--replace 'llama3.1:stories110M,-l3:-l2' --suppress huggingface-cli,HF_TOKEN" # Use stories110M to avoid need for authentication
;;
"local")
filepath="docs/local-model.md"
parameters="" # Clear parameters
;;

*)
echo "Unknown option: $1"
exit 1
;;
esac

# Generate the script
echo "::group::Create script to run $1"
python3 torchchat/utils/scripts/updown.py --file "$filepath" $parameters > "$script_name"
# if something happened to updown processor, and it did not error out, fail with an exit 1
echo "exit 1" >> "$script_name"
echo "::endgroup::"

# Run the script
echo "::group::Run $1"
echo "*******************************************"
cat "$script_name"
echo "*******************************************"
bash -x "$script_name"
echo "::endgroup::"
5 changes: 3 additions & 2 deletions .github/workflows/run-readme-pr-mps.yml
@@ -15,8 +15,8 @@ jobs:
conda create -y -n test-readme-mps-macos python=3.10.11 llvm-openmp
conda activate test-readme-mps-macos
set -x
# NS: Remove previous installation of torch first
# as this script does not isntall anything into conda env but rather as system dep
# NS: Remove previous installation of torch first
# as this script does not install anything into conda env but rather as system dep
pip3 uninstall -y torch || true
set -eou pipefail
@@ -37,6 +37,7 @@ jobs:
uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
with:
runner: macos-m1-14
timeout: 60
script: |
set -x
conda create -y -n test-quantization-mps-macos python=3.10.11
2 changes: 1 addition & 1 deletion README.md
@@ -413,7 +413,7 @@ torchchat/utils/scripts/build_native.sh et

Execute using the runner
```bash
cmake-out/et_run llama3.1.pte -z `python3 torchchat.py where llama3.1`/tokenizer.model -l 3 -i "Once upon a time"
cmake-out/et_run llama3.1.pte -z `python3 torchchat.py where llama3.1`/tokenizer.model -i "Once upon a time"
```

</details>
4 changes: 2 additions & 2 deletions docs/quantization.md
@@ -182,7 +182,7 @@ OMP_NUM_THREADS=6 python3 torchchat.py generate llama3.1 --dso-path llama3_1.so
If you built the AOTI runner with link_torchao_ops as discussed in the setup section, you can also use the C++ runner:

```
OMP_NUM_THREADS=6 ./cmake-out/aoti_run llama3_1.so -z $HOME/.torchchat/model-cache/meta-llama/Meta-Llama-3.1-8B-Instruct/tokenizer.model -l 3 -i "Once upon a time,"
OMP_NUM_THREADS=6 ./cmake-out/aoti_run llama3_1.so -z $HOME/.torchchat/model-cache/meta-llama/Meta-Llama-3.1-8B-Instruct/tokenizer.model -i "Once upon a time," # -l 3
```

#### ExecuTorch
@@ -193,7 +193,7 @@ python torchchat.py export llama3.1 --device cpu --dtype float32 --quantize '{"e
Note: only the ExecuTorch C++ runner in torchchat when built using the instructions in the setup can run the exported *.pte file. It will not work with the `python torchchat.py generate` command.

```
./cmake-out/et_run llama3_1.pte -z $HOME/.torchchat/model-cache/meta-llama/Meta-Llama-3.1-8B-Instruct/tokenizer.model -l 3 -i "Once upon a time,"
./cmake-out/et_run llama3_1.pte -z $HOME/.torchchat/model-cache/meta-llama/Meta-Llama-3.1-8B-Instruct/tokenizer.model -l3 -i "Once upon a time,"
```

## Experimental TorchAO MPS lowbit kernels
44 changes: 28 additions & 16 deletions runner/run.cpp
@@ -803,41 +803,53 @@ int main(int argc, char *argv[]) {
} else {
error_usage();
}
for (int i = 2; i < argc; i += 2) {
for (int i = 2; i < argc; i += 1) {
// do some basic validation
if (i + 1 >= argc) {
error_usage();
} // must have arg after flag
char *parm = argv[i+1];
// uniarg means the arg comes right after the letter in accordance with posix
int uniarg = strlen(argv[i]) > 2;

if (argv[i][0] != '-') {
error_usage();
} // must start with dash
if (strlen(argv[i]) != 2) {

if (strlen(argv[i]) < 2) {
error_usage();
} // must be -x (one dash, one letter)
} // must have at least dash '-' and option letter

if (uniarg) {
parm=&argv[i][2];
} else if (i + 1 >= argc) {
error_usage();
} // must have arg after option if flag is not contiguous to option

// read in the args
if (argv[i][1] == 't') {
temperature = atof(argv[i + 1]);
temperature = atof(parm);
} else if (argv[i][1] == 'p') {
topp = atof(argv[i + 1]);
topp = atof(parm);
} else if (argv[i][1] == 's') {
rng_seed = atoi(argv[i + 1]);
rng_seed = atoi(parm);
} else if (argv[i][1] == 'n') {
steps = atoi(argv[i + 1]);
steps = atoi(parm);
} else if (argv[i][1] == 'v') {
vocab_size = atoi(argv[i + 1]);
vocab_size = atoi(parm);
} else if (argv[i][1] == 'i') {
prompt = argv[i + 1];
prompt = parm;
} else if (argv[i][1] == 'z') {
tokenizer_path = argv[i + 1];
tokenizer_path = parm;
} else if (argv[i][1] == 'm') {
mode = argv[i + 1];
mode = parm;
} else if (argv[i][1] == 'y') {
system_prompt = argv[i + 1];
system_prompt = parm;
} else if (argv[i][1] == 'l') {
llama_ver = atoi(argv[i + 1]);
llama_ver = atoi(parm);
} else {
error_usage();
}

// account for parameter
i += (uniarg)?0:1;
}

if (model_path == NULL) {
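
The argument-parsing change above lets the runner accept POSIX-style contiguous options such as `-l3` in addition to the original space-separated `-l 3`, advancing the loop by one or two positions accordingly. A short usage sketch; `stories15M.pte` and `tokenizer.model` are placeholder paths, not files provided by this PR:

```bash
# Both spellings are parsed identically by the updated runner/run.cpp:
cmake-out/et_run stories15M.pte -z tokenizer.model -l 3 -i "Once upon a time"   # separate flag and value
cmake-out/et_run stories15M.pte -z tokenizer.model -l3  -i "Once upon a time"   # contiguous, POSIX-style
```

This is what allows the README and quantization docs in this PR to drop or inline the `-l 3` flag without breaking the documented commands.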
5 changes: 5 additions & 0 deletions torchchat/quant_config/cuda-32.json
@@ -0,0 +1,5 @@
{
"executor": {"accelerator": "cuda"},
"precision": {"dtype": "bf16"},
"linear:int4": {"groupsize" : 32}
}
4 changes: 4 additions & 0 deletions torchchat/quant_config/mobile-32.json
@@ -0,0 +1,4 @@
{
"embedding": {"bitwidth": 4, "groupsize" : 32},
"linear:a8w4dq": {"groupsize" : 32}
}
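
The two new config files appear to be group-size-32 variants of torchchat's quantization presets, suited to the small story models exercised by the docs CI. A hedged sketch of how such a config is typically passed to torchchat's export step; the model name, output paths, and `--output-*-path` flags are assumptions based on torchchat's documented export interface, not part of this diff:

```bash
# CUDA int4 config added by this PR (bf16 precision, group size 32):
python3 torchchat.py export stories15M \
  --quantize torchchat/quant_config/cuda-32.json \
  --output-dso-path stories15M.so

# Mobile/ExecuTorch config (4-bit embeddings, a8w4dq linear, group size 32):
python3 torchchat.py export stories15M \
  --quantize torchchat/quant_config/mobile-32.json \
  --output-pte-path stories15M.pte
```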