From 92e0a9d54134dda66d2579083200f6707528b51f Mon Sep 17 00:00:00 2001 From: vmpuri Date: Thu, 24 Oct 2024 12:43:48 -0700 Subject: [PATCH 01/83] Replace WeightOnlyInt8Linear with TorchAO int8_weight_only quantization --- torchchat/utils/quantize.py | 229 +++++++++++------------------------- 1 file changed, 66 insertions(+), 163 deletions(-) diff --git a/torchchat/utils/quantize.py b/torchchat/utils/quantize.py index 31c639dfd..bda695ae2 100644 --- a/torchchat/utils/quantize.py +++ b/torchchat/utils/quantize.py @@ -26,7 +26,7 @@ # from functools import reduce # from math import gcd -from typing import Dict, Optional, Callable, Any, List +from typing import Any, Callable, Dict, List, Optional import torch import torch.nn as nn @@ -37,6 +37,7 @@ from torchao.quantization.quant_api import ( int4_weight_only, Int4WeightOnlyQuantizer, + int8_weight_only, Int8DynActInt4WeightQuantizer, quantize_, ) @@ -45,8 +46,8 @@ find_multiple, get_device_str, get_precision, - set_precision, name_to_dtype, + set_precision, state_dict_device, use_et_backend, ) @@ -60,28 +61,36 @@ import inspect + def get_named_parameters(func: Callable) -> List[str]: # Get the signature of the function signature = inspect.signature(func) - + # Extract the parameters from the signature parameters = signature.parameters - + # Filter and return named parameters named_params = [ - name for name, param in parameters.items() - if param.kind in (inspect.Parameter.POSITIONAL_OR_KEYWORD, inspect.Parameter.KEYWORD_ONLY) + name + for name, param in parameters.items() + if param.kind + in (inspect.Parameter.POSITIONAL_OR_KEYWORD, inspect.Parameter.KEYWORD_ONLY) ] return named_params -def validate_args(named_params: List[str], q_kwargs: Dict[str, Any], quantizer: Optional[str] = None) -> Dict[str, Any]: + +def validate_args( + named_params: List[str], q_kwargs: Dict[str, Any], quantizer: Optional[str] = None +) -> Dict[str, Any]: for key in q_kwargs.keys(): if key not in named_params: - print(f"Specification for quantizer {quantizer} has extraneous key {key}. Ignoring.") + print( + f"Specification for quantizer {quantizer} has extraneous key {key}. Ignoring." + ) del q_kwargs[key] return q_kwargs - - + + ######################################################################### ### torchchat quantization API ### @@ -110,21 +119,30 @@ def quantize_model( if quantizer not in quantizer_class_dict: raise RuntimeError(f"unknown quantizer {quantizer} specified") else: + ao_quant = True # Use tensor subclass API for int4 weight only. if device == "cuda" and quantizer == "linear:int4": quantize_(model, int4_weight_only(q_kwargs["groupsize"])) + elif quantizer == "linear:int8": + print("quantizer is linear int8") + quantize_(model, int8_weight_only()) + else: + ao_quant = False + if ao_quant: if not support_tensor_subclass: unwrap_tensor_subclass(model) continue - + if quantizer in ["linear:a8wxdq", "embedding:wx"]: # These quantizers require float32 input weights. Note that after quantization, # the weights will no longer be float32, but lowbit integers if get_precision() != torch.float32: - print(f"Quantizer {quantizer} requires float32 inputs, but received {get_precision()}. Changing dtype to float32. Note that after quantization, the weights will be lowbit integers, not float32.") + print( + f"Quantizer {quantizer} requires float32 inputs, but received {get_precision()}. Changing dtype to float32. Note that after quantization, the weights will be lowbit integers, not float32." 
+ ) set_precision(torch.float32) - - # We set global precision from quantize options if it is specified at cli.py:485 + + # We set global precision from quantize options if it is specified at cli.py:485 # so the precision returned by get_precision() is always the authoritative precision/dtype in torchchat precision = get_precision() @@ -141,14 +159,19 @@ def quantize_model( model = quant_handler.quantize(model) - ######################################################################### ### QuantHandler API definition ### ### (unify with torchao in future) ### class QuantHandler: - def __init__(self, model: Optional[nn.Module] = None, device="cpu", precision=None, tokenizer=None): + def __init__( + self, + model: Optional[nn.Module] = None, + device="cpu", + precision=None, + tokenizer=None, + ): self.model_ = model self.device = device self.tokenizer = tokenizer @@ -176,7 +199,15 @@ def quantize(self, model: nn.Module) -> nn.Module: class PrecisionHandler(QuantHandler): - def __init__(self, model: Optional[nn.Module]=None, device="cpu", precision=None, tokenizer=None, *, dtype): + def __init__( + self, + model: Optional[nn.Module] = None, + device="cpu", + precision=None, + tokenizer=None, + *, + dtype, + ): self.model_ = model self.device = device self.tokenizer = tokenizer @@ -205,7 +236,15 @@ def quantized_model(self) -> nn.Module: class ExecutorHandler(QuantHandler): - def __init__(self, model: Optional[nn.Module]=None, device="cpu", precision=None, tokenizer=None, *, accelerator): + def __init__( + self, + model: Optional[nn.Module] = None, + device="cpu", + precision=None, + tokenizer=None, + *, + accelerator, + ): self.model_ = model if isinstance(accelerator, str): @@ -529,147 +568,6 @@ def linear_int8_et(input, weight, scales): ) -class WeightOnlyInt8Linear(nn.Module): - __constants__ = ["in_features", "out_features"] - in_features: int - out_features: int - weight: torch.Tensor - scales: torch.Tensor - - def __init__( - self, - in_features, - out_features, - bias=None, - device=None, - dtype=None, - *, - weight: Optional[torch.Tensor] = None, - scales: Optional[torch.Tensor] = None, - groupsize: Optional[int] = None, - ): - super().__init__() - if dtype is None: - dtype = torch.get_default_dtype() - - if device is None: - device = "cpu" - - assert not bias, "Bias is not supported by LinearInt8" - self.in_features = in_features - self.out_features = out_features - - assert (weight is None) == bool( - scales is None - ), "must specify both weights and scales, or neither" - if weight is None: - weight = torch.empty( - (out_features, in_features), - dtype=torch.int8, - device=device, - ) - if groupsize is None or (groupsize == 0): - scales = torch.empty(out_features, dtype=dtype, device=device) - else: - n_groups = (in_features + groupsize - 1) // groupsize - scales = torch.empty(out_features, n_groups, dtype=dtype, device=device) - - self.register_buffer("weight", weight.to(device)) - self.register_buffer("scales", scales.to(device)) - - if use_et_backend(): - self.forward = self.et_forward - else: - self.forward = self.aoti_forward - - def aoti_forward(self, input: torch.Tensor) -> torch.Tensor: - return linear_int8_aoti(input, self.weight, self.scales) - - def et_forward(self, input: torch.Tensor) -> torch.Tensor: - return linear_int8_et(input, self.weight, self.scales) - - -class WeightOnlyInt8QuantHandler(QuantHandler): - def __init__( - self, - model: Optional[nn.Module] = None, - device = None, - precision=None, - tokenizer=None, - *, - node_type: str = "*", - bitwidth: 
Optional[int] = None, - groupsize: Optional[int] = None, - ): - self.model_ = model - self.device = device - self.groupsize = groupsize - self.node_type = node_type - if bitwidth is None: - self.bitwidth = 8 - else: - self.bitwidth = bitwidth - - @torch.no_grad() - def quantize(self, module): - # cur_state_dict = state_dict_device(self.model_.state_dict()) - # dict_device = "cpu" # self.device - - if self.bitwidth == 4: - range_min = -8 - range_max = 7 - elif self.bitwidth == 8: - range_min = -128 - range_max = 127 - else: - raise ValueError(f"Unsupported bitwidth {self.bitwidth}") - - for name, child in module.named_children(): - # print(f"name: {name}") - if isinstance(child, nn.Linear): - if ( - (self.node_type == "*") - or (self.node_type == "output" and name == "output") - or (self.node_type == "!output" and name != "output") - ): - # print(f"{name, child}") - input_weight = child.weight.float() - # print(f"{name, child}") - # print(f"in_features: {child.in_features}") - # print(f"out_features: {child.out_features}") - - # print(f"expanded weight shape {input_weight.shape}") - weight, scales, _ = dynamically_quantize_per_channel( - input_weight, - range_min, - range_max, - torch.int8, - self.groupsize, - scales_dtype=child.weight.dtype, - ) - - setattr( - module, - name, - WeightOnlyInt8Linear( - in_features=child.in_features, - out_features=child.out_features, - device=self.device, - # update variables from quantization - weight=weight, - scales=scales, - groupsize=self.groupsize, - ), - ) - else: - self.quantize(child) - - return module - - def quantized_model(self) -> nn.Module: - return self.quantize(self.model_) - - ######################################################################### ##### embedding table quantization ###### ### (unify with torchao in future) ### @@ -886,10 +784,10 @@ def quantized_model(self) -> nn.Module: # class references quantizer_class_dict = { "embedding": EmbeddingOnlyQuantHandler, - "linear:int8": WeightOnlyInt8QuantHandler, "precision": PrecisionHandler, "executor": ExecutorHandler, "linear:int4": Int4WeightOnlyQuantizer, + "linear:int8": int8_weight_only, "linear:a8w4dq": Int8DynActInt4WeightQuantizer, } @@ -932,11 +830,16 @@ def quantized_model(self) -> nn.Module: print("Slow fallback kernels will be used.") except Exception as e: + class ErrorHandler(QuantHandler): - def __init__(self, model: Optional[nn.Module]=None, device="cpu", precision=None): + def __init__( + self, model: Optional[nn.Module] = None, device="cpu", precision=None + ): global torchao_experimental_load_error - raise Exception(f"Note: Failed to load torchao experimental quantizer with error: {torchao_experimental_load_error}") - + raise Exception( + f"Note: Failed to load torchao experimental quantizer with error: {torchao_experimental_load_error}" + ) + torchao_experimental_load_error = e quantizer_class_dict["linear:a8wxdq"] = ErrorHandler quantizer_class_dict["embedding:wx"] = ErrorHandler From 93d9876cdedd1c0e3ceff894bb7ba431d4ac2508 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Tue, 12 Nov 2024 23:15:11 +0100 Subject: [PATCH 02/83] fix: enforce python version install requirements (#1345) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous Python version check was incorrect, allowing installations on unsupported interpreter versions, which caused installation failures. 
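For reference, the gate this patch adds to install/install_requirements.sh amounts to a bounds check on `sys.version_info`, executed through `$PYTHON_EXECUTABLE -c`. Below is a minimal standalone sketch of that check — Python shown here instead of the installer's shell wrapper; the helper name and default bounds are illustrative, with the `(3, 12)` upper bound matching this patch (a later commit in this series relaxes it to only require 3.10+):

```python
import sys

def check_python_version(lower=(3, 10), upper=(3, 12)) -> None:
    # Compare (major, minor) as tuples, mirroring the bounds the patch enforces;
    # the authoritative values live in install/install_requirements.sh.
    detected = f"{sys.version_info.major}.{sys.version_info.minor}"
    if not (lower <= sys.version_info[:2] < upper):
        raise SystemExit(
            f"Python version must be >= {lower[0]}.{lower[1]} and "
            f"< {upper[0]}.{upper[1]}. Detected version: {detected}"
        )

if __name__ == "__main__":
    check_python_version()
```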
Additionally, we now respect the specified interpreter version if provided, consistently using it throughout the installation process by enforcing it with pip. Signed-off-by: Sébastien Han --- install/install_requirements.sh | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/install/install_requirements.sh b/install/install_requirements.sh index 6344509d8..2b623e831 100755 --- a/install/install_requirements.sh +++ b/install/install_requirements.sh @@ -9,26 +9,39 @@ set -eou pipefail # Install required python dependencies for developing # Dependencies are defined in .pyproject.toml -PYTHON_EXECUTABLE=${PYTHON_EXECUTABLE:-python} -if [[ -z ${CONDA_DEFAULT_ENV:-} ]] || [[ ${CONDA_DEFAULT_ENV:-} == "base" ]] || [[ ! -x "$(command -v python)" ]]; +if [ -z "${PYTHON_EXECUTABLE:-}" ]; then - PYTHON_EXECUTABLE=python3 + if [[ -z ${CONDA_DEFAULT_ENV:-} ]] || [[ ${CONDA_DEFAULT_ENV:-} == "base" ]] || [[ ! -x "$(command -v python)" ]]; + then + PYTHON_EXECUTABLE=python3 + fi fi +echo "Using python executable: $PYTHON_EXECUTABLE" +PYTHON_SYS_VERSION="$($PYTHON_EXECUTABLE -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")" # Check python version. Expect 3.10.x or 3.11.x -printf "import sys\nif sys.version_info.major != 3 or sys.version_info.minor < 10 :\n\tprint('Please use Python >=3.10');sys.exit(1)\n" | $PYTHON_EXECUTABLE -if [[ $? -ne 0 ]] +if ! $PYTHON_EXECUTABLE -c " +import sys +if sys.version_info < (3, 10) or sys.version_info >= (3, 12): + sys.exit(1) +"; then + echo "Python version must be 3.10.x or 3.11.x. Detected version: $PYTHON_SYS_VERSION" exit 1 fi if [[ "$PYTHON_EXECUTABLE" == "python" ]]; then PIP_EXECUTABLE=pip -else +elif [[ "$PYTHON_EXECUTABLE" == "python3" ]]; +then PIP_EXECUTABLE=pip3 +else + PIP_EXECUTABLE=pip${PYTHON_SYS_VERSION} fi +echo "Using pip executable: $PIP_EXECUTABLE" + # # First install requirements in install/requirements.txt. Older torch may be # installed from the dependency of other models. It will be overridden by From d1d6aa1f31f1e3883905752624cd9d4e6a78ff2f Mon Sep 17 00:00:00 2001 From: Matthias Reso <13337103+mreso@users.noreply.github.com> Date: Tue, 12 Nov 2024 17:09:34 -0800 Subject: [PATCH 03/83] Remove last references to use_distributed argument (#1353) Co-authored-by: Jack-Khuu --- torchchat/cli/builder.py | 74 +--------------------------------------- torchchat/generate.py | 16 ++------- 2 files changed, 3 insertions(+), 87 deletions(-) diff --git a/torchchat/cli/builder.py b/torchchat/cli/builder.py index fb2bfb299..f67cb9d0a 100644 --- a/torchchat/cli/builder.py +++ b/torchchat/cli/builder.py @@ -16,12 +16,6 @@ import torch._inductor.config import torch.nn as nn -from torch.distributed.device_mesh import DeviceMesh -from torch.distributed.elastic.multiprocessing.errors import record -from torch.distributed.elastic.utils.distributed import get_free_port - -from torchchat.distributed import launch_distributed, ParallelDims, parallelize_llama - from torchchat.model import Model, ModelArgs, ModelType from torchchat.model_config.model_config import resolve_model_config @@ -464,77 +458,11 @@ def _load_model_default(builder_args: BuilderArgs) -> Model: return model -def _maybe_init_distributed( - builder_args: BuilderArgs, -) -> Tuple[Optional[DeviceMesh], Optional[ParallelDims]]: - """ - Initialize distributed related setups if the user specified - using distributed inference. If not, this is a no-op. - - Args: - builder_args (:class:`BuilderArgs`): - Command args for model building. 
- Returns: - Tuple[Optional[DeviceMesh], Optional[ParallelDims]]: - - The first element is an optional DeviceMesh object, - which which describes the mesh topology of devices for the DTensor. - - The second element is an optional ParallelDims object, - which represents the parallel dimensions configuration. - """ - if not builder_args.use_distributed: - return None, None - dist_config = "llama3_8B.toml" # TODO - integrate with chat cmd line - - world_mesh, parallel_dims = launch_distributed(dist_config) - - assert ( - world_mesh is not None and parallel_dims is not None - ), f"failed to launch distributed using {dist_config}" - - return world_mesh, parallel_dims - - -def _maybe_parallelize_model( - model: nn.Module, - builder_args: BuilderArgs, - world_mesh: DeviceMesh, - parallel_dims: ParallelDims, -) -> nn.Module: - """ - We parallelize the module and load the distributed checkpoint to the model - if the user specifies using distributed inference. If not, this is a no-op. - - Args: - model (:class:`nn.Module`): - Module to be parallelized. - builder_args (:class:`BuilderArgs`): - Command args for model building. - world_mesh (:class:`DeviceMesh`): - Object which describes the mesh topology - of devices for the DTensor. - parallel_dims (:class:`ParallelDims`): - Object which represents the parallel dimensions configuration. - Returns: - A :class:`nn.Module` object which is parallelized and checkpoint loaded - if the user specifies using distributed inference. - """ - if world_mesh is None: - return model - assert parallel_dims is not None - print("Applying model parallel to model ...") - parallelize_llama(model, world_mesh, parallel_dims) - return load_checkpoints_to_model(model, builder_args, world_mesh) - - def _load_model(builder_args: BuilderArgs) -> Model: - # world_mesh, parallel_dims = _maybe_init_distributed(builder_args) if builder_args.gguf_path: model = _load_model_gguf(builder_args) - # elif builder_args.use_distributed: - # model = _init_model_on_meta_device(builder_args) else: model = _load_model_default(builder_args) - # model = _maybe_parallelize_model(model, builder_args, world_mesh, parallel_dims) if builder_args.dso_path or builder_args.aoti_package_path: # AOTI-compoiled model will load its own weights. 
@@ -706,4 +634,4 @@ def tokenizer_setting_to_name(tiktoken: bool, tokenizers: bool) -> str: return "TikToken" if tokenizers: return "Tokenizers" - return "SentencePiece" \ No newline at end of file + return "SentencePiece" diff --git a/torchchat/generate.py b/torchchat/generate.py index dd423b58a..fcae18d87 100644 --- a/torchchat/generate.py +++ b/torchchat/generate.py @@ -915,13 +915,6 @@ def chat( ] ) if generator_args.compile: - if ( - self.is_speculative and self.builder_args.use_distributed - ): # and ("cuda" in builder_args.device): - torch._inductor.config.triton.cudagraph_trees = ( - False # Bug with cudagraph trees in this case - ) - if self.builder_args.device == "cpu": if generator_args.max_autotune: kwargs = {"mode": "max-autotune"} @@ -1091,9 +1084,7 @@ def callback(x, *, done_generating=False): torch._inductor.config.profiler_mark_wrapper_call = True torch._inductor.config.cpp.enable_kernel_profile = True - if (i != generator_args.num_samples - 1 or not self.profile) or ( - self.builder_args.use_distributed and self.rank != 0 - ): + if i != generator_args.num_samples - 1 or not self.profile: import contextlib prof = contextlib.nullcontext() @@ -1136,10 +1127,7 @@ def callback(x, *, done_generating=False): print(prof.key_averages().table(sort_by="self_cpu_time_total")) else: print(prof.key_averages().table(sort_by="self_cuda_time_total")) - if self.builder_args.use_distributed: - prof.export_chrome_trace(f"{self.profile}_rank_{self.rank}.json") - else: - prof.export_chrome_trace(f"{self.profile}.json") + prof.export_chrome_trace(f"{self.profile}.json") if start_pos >= max_seq_length: print( From a286e58b2b6b1810c79138162815efa7fefca7e6 Mon Sep 17 00:00:00 2001 From: Jesse White <5281939+byjlw@users.noreply.github.com> Date: Tue, 12 Nov 2024 17:10:23 -0800 Subject: [PATCH 04/83] Add cstdint to tokenizer (missing include) (#1339) * toeknizer was missing an include * fix a nit --------- Co-authored-by: Jesse Co-authored-by: Jack-Khuu --- tokenizer/base64.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tokenizer/base64.h b/tokenizer/base64.h index dfeefef55..12b8703a8 100644 --- a/tokenizer/base64.h +++ b/tokenizer/base64.h @@ -25,6 +25,7 @@ #pragma once #include +#include #include #include From a655d5848171d6df5df61c234ed885bb3341c5c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Wed, 13 Nov 2024 02:11:14 +0100 Subject: [PATCH 05/83] Setup a SIGINT handler to gracefully exit the program once the user presses ctrl+c (#1352) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Setup a SIGINT handler to gracefully exit the program once the user presses ctrl+c. Signed-off-by: Sébastien Han Co-authored-by: Jack-Khuu --- torchchat.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/torchchat.py b/torchchat.py index 35cdcabae..1eeee0120 100644 --- a/torchchat.py +++ b/torchchat.py @@ -6,7 +6,7 @@ import argparse import logging -import subprocess +import signal import sys # MPS ops missing with Multimodal torchtune @@ -25,7 +25,15 @@ default_device = "cpu" +def signal_handler(sig, frame): + print("\nInterrupted by user. 
Bye!\n") + sys.exit(0) + + if __name__ == "__main__": + # Set the signal handler for SIGINT + signal.signal(signal.SIGINT, signal_handler) + # Initialize the top-level parser parser = argparse.ArgumentParser( prog="torchchat", From 8811c7e4d61e8d22f8c1cbe15f74aa56565dcf6a Mon Sep 17 00:00:00 2001 From: Michael Gschwind <61328285+mikekgfb@users.noreply.github.com> Date: Tue, 12 Nov 2024 17:12:38 -0800 Subject: [PATCH 06/83] Update cli.py to make --device/--dtype pre-empt quantize dict-specified values (#1359) * Update cli.py to make --device/--dtype pre-empt quantize dict-specified values Users may expect that cli parameters override the JSON, as per #1278. Invert logic - case split: 1 - if none (no value) is specified, use value specified in quantize dict, if present; else 2 - if value is specified, override the respective handler if present. * Fix typo in cli.py fix typo --------- Co-authored-by: Jack-Khuu --- torchchat/cli/cli.py | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/torchchat/cli/cli.py b/torchchat/cli/cli.py index a8a2c7da8..f7d00181b 100644 --- a/torchchat/cli/cli.py +++ b/torchchat/cli/cli.py @@ -21,6 +21,8 @@ logger = logging.getLogger(__name__) default_device = os.getenv("TORCHCHAT_DEVICE", "fast") +default_dtype = os.getenv("TORCHCHAT_PRECISION", "fast") + default_model_dir = Path( os.getenv("TORCHCHAT_MODELDIR", "~/.torchchat/model-cache") ).expanduser() @@ -149,9 +151,9 @@ def _add_model_config_args(parser, verb: str) -> None: model_config_parser.add_argument( "--dtype", - default="fast", + default=None, choices=allowable_dtype_names(), - help="Override the dtype of the model (default is the checkpoint dtype). Options: bf16, fp16, fp32, fast16, fast", + help="Override the dtype of the model. Options: bf16, fp16, fp32, fast16, fast", ) model_config_parser.add_argument( "--quantize", @@ -165,9 +167,9 @@ def _add_model_config_args(parser, verb: str) -> None: model_config_parser.add_argument( "--device", type=str, - default=default_device, + default=None, choices=["fast", "cpu", "cuda", "mps"], - help="Hardware device to use. Options: cpu, cuda, mps", + help="Hardware device to use. 
Options: fast, cpu, cuda, mps", ) @@ -513,20 +515,34 @@ def arg_init(args): if isinstance(args.quantize, str): args.quantize = json.loads(args.quantize) - # if we specify dtype in quantization recipe, replicate it as args.dtype - args.dtype = args.quantize.get("precision", {}).get("dtype", args.dtype) + # if we specify dtype in quantization recipe, allow args.dtype top override if specified + if args.dtype is None: + args.dtype = args.quantize.get("precision", {}).get("dtype", default_dtype) + else: + precision_handler = args.quantize.get("precision", None) + if precision_handler: + if precision_handler["dtype"] != args.dtype: + print('overriding json-specified dtype {precision_handler["dtype"]} with cli dtype {args.dtype}') + precision_handler["dtype"] = args.dtype if getattr(args, "output_pte_path", None): - if args.device not in ["cpu", "fast"]: + if args.device not in [None, "cpu", "fast"]: raise RuntimeError("Device not supported by ExecuTorch") args.device = "cpu" else: # Localized import to minimize expensive imports from torchchat.utils.build_utils import get_device_str - args.device = get_device_str( - args.quantize.get("executor", {}).get("accelerator", args.device) - ) + if args.device is None: + args.device = get_device_str( + args.quantize.get("executor", {}).get("accelerator", default_device) + ) + else: + executor_handler = args.quantize.get("executor", None) + if executor_handler: + if executor_handler["accelerator"] != args.device: + print('overriding json-specified device {executor_handler["accelerator"]} with cli device {args.device}') + executor_handler["accelerator"] = args.device if "mps" in args.device: if getattr(args, "compile", False) or getattr(args, "compile_prefill", False): From 483928bb1d4bf2c4c183ecfd6c47cfe8cb44f333 Mon Sep 17 00:00:00 2001 From: Jack-Khuu Date: Wed, 13 Nov 2024 11:47:17 -0800 Subject: [PATCH 07/83] Update Caching logic to only trigger on the first inference sample (#1369) * Only set up during the first sample * Cleaner --- torchchat/generate.py | 48 +++++++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/torchchat/generate.py b/torchchat/generate.py index fcae18d87..4a67195fb 100644 --- a/torchchat/generate.py +++ b/torchchat/generate.py @@ -591,6 +591,7 @@ def generate( Dict[str, Any] ] = None, # List of Image prompt tensors for multimodal models start_pos: int = 0, + skip_cache_setup: bool = False, draft_model: Model, speculate_k: Optional[int] = 8, sequential_prefill=True, @@ -614,26 +615,27 @@ def generate( max_new_tokens = min(max_new_tokens, max_seq_length - start_pos - prompt_length) # set up caches only if first inference if start_pos == 0: - model = model.to(device=device) - with torch.device(device): - if ( - self.is_torchtune_model - or self.model.config.model_type == ModelType.Flamingo - ): - # 6404 is one-gpu affordable max_seq_length for single image input - model.setup_caches( - batch_size=1, - dtype=self.dtype, - encoder_max_seq_len=6404, - decoder_max_seq_len=max_seq_length, - ) - else: - model.setup_caches(max_batch_size=1, max_seq_length=max_seq_length) - if is_speculative and draft_model is not model: - draft_model.setup_caches( - max_batch_size=1, - max_seq_length=max_seq_length, - ) + if not skip_cache_setup: + model = model.to(device=device) + with torch.device(device): + if ( + self.is_torchtune_model + or self.model.config.model_type == ModelType.Flamingo + ): + # 6404 is one-gpu affordable max_seq_length for single image input + model.setup_caches( + batch_size=1, 
+ dtype=self.dtype, + encoder_max_seq_len=6404, + decoder_max_seq_len=max_seq_length, + ) + else: + model.setup_caches(max_batch_size=1, max_seq_length=max_seq_length) + if is_speculative and draft_model is not model: + draft_model.setup_caches( + max_batch_size=1, + max_seq_length=max_seq_length, + ) if model.config.model_type == ModelType.Flamingo: model.reset_caches() @@ -1013,6 +1015,7 @@ def chat( ) for i in range(num_samples): device_sync(device=self.builder_args.device) + is_first_sample: bool = i == 0 if generator_args.chat_mode: prompt = input("User: ") if prompt == "/bye": @@ -1038,7 +1041,7 @@ def chat( ] ) self.system_prompt = None - elif i == 0: + elif is_first_sample: encoded = self.chat_formatter.encode_dialog_prompt( [{"role": "user", "content": prompt}] ) @@ -1107,6 +1110,7 @@ def callback(x, *, done_generating=False): top_k=generator_args.top_k, sequential_prefill=generator_args.sequential_prefill, start_pos=start_pos, + skip_cache_setup=not is_first_sample, max_seq_length=max_seq_length, ) for token_tensor, metrics in generator_func: @@ -1116,7 +1120,7 @@ def callback(x, *, done_generating=False): if metrics is not None: aggregate_metrics.update(metrics) yield token_tensor, metrics - jit_compile = (i == 0) and ( + jit_compile = is_first_sample and ( generator_args.compile or generator_args.compile_prefill ) compilation_time = time.perf_counter() - t0 From add35e816d707daa5e7861b6ae2d6d180e021bff Mon Sep 17 00:00:00 2001 From: Jack-Khuu Date: Wed, 13 Nov 2024 12:02:30 -0800 Subject: [PATCH 08/83] Minor typo + Update install_requirements.sh to support python 3.10 >= (#1368) * Update install_requirements.sh to support python 3.10 >= , <3.13 * Update install_requirements.sh * Update install_requirements.sh --- install/install_requirements.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/install/install_requirements.sh b/install/install_requirements.sh index 2b623e831..635789de6 100755 --- a/install/install_requirements.sh +++ b/install/install_requirements.sh @@ -14,19 +14,21 @@ then if [[ -z ${CONDA_DEFAULT_ENV:-} ]] || [[ ${CONDA_DEFAULT_ENV:-} == "base" ]] || [[ ! -x "$(command -v python)" ]]; then PYTHON_EXECUTABLE=python3 + else + PYTHON_EXECUTABLE=python fi fi echo "Using python executable: $PYTHON_EXECUTABLE" PYTHON_SYS_VERSION="$($PYTHON_EXECUTABLE -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")" -# Check python version. Expect 3.10.x or 3.11.x +# Check python version. Expect at least 3.10.x if ! $PYTHON_EXECUTABLE -c " import sys -if sys.version_info < (3, 10) or sys.version_info >= (3, 12): +if sys.version_info < (3, 10): sys.exit(1) "; then - echo "Python version must be 3.10.x or 3.11.x. Detected version: $PYTHON_SYS_VERSION" + echo "Python version must be at least 3.10.x. Detected version: $PYTHON_SYS_VERSION" exit 1 fi From 008fea0d22d0e9c700c3465c08282897bc6f216e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Thu, 14 Nov 2024 02:15:37 +0100 Subject: [PATCH 09/83] fix: Remove dup gguf dependency (#1371) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `gguf` was listed twice on the dependency list. 
Signed-off-by: Sébastien Han --- install/requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/install/requirements.txt b/install/requirements.txt index d051d29cd..8fb1832ba 100644 --- a/install/requirements.txt +++ b/install/requirements.txt @@ -14,7 +14,6 @@ snakeviz sentencepiece # numpy version range required by GGUF util numpy >= 1.17, < 2.0 -gguf blobfile tomli >= 1.1.0 ; python_version < "3.11" openai From d2e4995a57877287a6df588b83017f13ebfc375b Mon Sep 17 00:00:00 2001 From: Jack-Khuu Date: Wed, 13 Nov 2024 18:04:34 -0800 Subject: [PATCH 10/83] Bug Fix: Check for explicit cli device (fast) (#1374) --- torchchat/cli/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchchat/cli/cli.py b/torchchat/cli/cli.py index f7d00181b..3a7c85937 100644 --- a/torchchat/cli/cli.py +++ b/torchchat/cli/cli.py @@ -533,7 +533,7 @@ def arg_init(args): # Localized import to minimize expensive imports from torchchat.utils.build_utils import get_device_str - if args.device is None: + if args.device is None or args.device == "fast": args.device = get_device_str( args.quantize.get("executor", {}).get("accelerator", default_device) ) From bc2c2d0e149198185d3a0e7e6ce633dd4748be0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Fri, 15 Nov 2024 18:46:04 +0100 Subject: [PATCH 11/83] fix: do not print perf stat when NaN (#1375) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the chat is exited or interrupted it will still print the stats with NaN values which is unnecessary. Signed-off-by: Sébastien Han --- torchchat/generate.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/torchchat/generate.py b/torchchat/generate.py index 4a67195fb..66f26ff9f 100644 --- a/torchchat/generate.py +++ b/torchchat/generate.py @@ -1189,12 +1189,27 @@ def callback(x, *, done_generating=False): f"Mean Accepted: {sum([idx * i for idx, i in enumerate(counts_aggregated)])/sum(counts_aggregated)}" ) - print( - f"\n Average tokens/sec (total): {torch.mean(torch.tensor(aggregate_metrics['tokens_per_sec'])).item():.2f} \ - \nAverage tokens/sec (first token): {torch.mean(torch.tensor(aggregate_metrics['first_token_per_sec'])).item():.2f} \ - \nAverage tokens/sec (next tokens): {torch.mean(torch.tensor(aggregate_metrics['next_tokens_per_sec'])).item():.2f} \n\ + avg_tokens_sec = torch.mean( + torch.tensor(aggregate_metrics["tokens_per_sec"]) + ).item() + avg_first_token_sec = torch.mean( + torch.tensor(aggregate_metrics["first_token_per_sec"]) + ).item() + avg_next_tokens_sec = torch.mean( + torch.tensor(aggregate_metrics["next_tokens_per_sec"]) + ).item() + + if not ( + torch.isnan(torch.tensor(avg_tokens_sec)) + or torch.isnan(torch.tensor(avg_first_token_sec)) + or torch.isnan(torch.tensor(avg_next_tokens_sec)) + ): + print( + f"\n Average tokens/sec (total): {avg_tokens_sec:.2f} \ + \nAverage tokens/sec (first token): {avg_first_token_sec:.2f} \ + \nAverage tokens/sec (next tokens): {avg_next_tokens_sec:.2f} \n\ " - ) + ) if torch.cuda.is_available(): print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB") From 4eb7fbbc83ebb7b5f9ecc48a8d809b3e3396fc14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Han?= Date: Sat, 16 Nov 2024 03:51:33 +0100 Subject: [PATCH 12/83] fix: Fail gracefully when "model" arg is missing when downloading (#1372) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's gracefully fail if no model is 
given to the `download` command. Signed-off-by: Sébastien Han --- torchchat/cli/download.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/torchchat/cli/download.py b/torchchat/cli/download.py index f145c93fb..f334eb555 100644 --- a/torchchat/cli/download.py +++ b/torchchat/cli/download.py @@ -110,6 +110,8 @@ def _download_direct( def download_and_convert( model: str, models_dir: Path, hf_token: Optional[str] = None ) -> None: + if model is None: + raise ValueError("'download' command needs a model name or alias.") model_config = resolve_model_config(model) model_dir = models_dir / model_config.name @@ -234,4 +236,8 @@ def where_main(args) -> None: # Subcommand to download model artifacts. def download_main(args) -> None: - download_and_convert(args.model, args.model_directory, args.hf_token) + try: + download_and_convert(args.model, args.model_directory, args.hf_token) + except ValueError as e: + print(e, file=sys.stderr) + sys.exit(1) From d62680c8690341a56930b4155b57e0964586bee8 Mon Sep 17 00:00:00 2001 From: YanbingJiang Date: Wed, 20 Nov 2024 00:09:02 +0800 Subject: [PATCH 13/83] Ignore tokens per sec from jit_compile iteration (#1378) * Remove tokens per sec in aggregate_metrics when jit_compile * Add warning to user * Update --------- Co-authored-by: Jack-Khuu --- torchchat/generate.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/torchchat/generate.py b/torchchat/generate.py index 66f26ff9f..9b4c6430a 100644 --- a/torchchat/generate.py +++ b/torchchat/generate.py @@ -1149,9 +1149,11 @@ def callback(x, *, done_generating=False): print( f"just-in-time compilation time (incl run time): {compilation_time:.2} seconds" ) - aggregate_metrics["tokens_per_sec"].append(tokens_sec) - aggregate_metrics["first_token_per_sec"].append(first_token_sec) - aggregate_metrics["next_tokens_per_sec"].append(next_tokens_sec) + else: + # aggregate_metrics will not append when is jit_compile, which will affect the average numbers. + aggregate_metrics["tokens_per_sec"].append(tokens_sec) + aggregate_metrics["first_token_per_sec"].append(first_token_sec) + aggregate_metrics["next_tokens_per_sec"].append(next_tokens_sec) logging.info( f"\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\ @@ -1205,7 +1207,8 @@ def callback(x, *, done_generating=False): or torch.isnan(torch.tensor(avg_next_tokens_sec)) ): print( - f"\n Average tokens/sec (total): {avg_tokens_sec:.2f} \ + f"\nWarning: Excluding compile in calculations \ + \n Average tokens/sec (total): {avg_tokens_sec:.2f} \ \nAverage tokens/sec (first token): {avg_first_token_sec:.2f} \ \nAverage tokens/sec (next tokens): {avg_next_tokens_sec:.2f} \n\ " From c0630a64a99b86f184082ede33166f4d6bc896c3 Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Tue, 19 Nov 2024 16:14:03 -0700 Subject: [PATCH 14/83] Download fix (#1366) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix: allow multiple weight mapping files for mistral Downloading a Mistral model fails because it includes multiple weight mapping files. The regression was introduced in commit `766bee9f4a1fcb187fae543a525495d3ff482097`. I'm unclear on the original intent, but perhaps the exception was meant to apply only to Granite models. This isn’t an ideal fix, but it does enable Mistral to be downloaded and used for chat. 
Signed-off-by: Sébastien Han * fix(download): Fix safetensors/bin/pth download logic The previous logic didn't handle .bin files, so if a model (like mistral) has both .bin and .safetensors, it would download both. Branch: download-fix Signed-off-by: Gabe Goodhart * fix(convert hf): Better logic to handle multiple weight mapping files This will not actually be needed for mistral with the fix in download to handle .bin files, but it may be needed for other models, so it's worth having. Branch: download-fix Signed-off-by: Gabe Goodhart --------- Signed-off-by: Sébastien Han Signed-off-by: Gabe Goodhart Co-authored-by: Sébastien Han --- torchchat/cli/convert_hf_checkpoint.py | 35 ++++++++++++++++++-------- torchchat/cli/download.py | 9 ++++--- 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/torchchat/cli/convert_hf_checkpoint.py b/torchchat/cli/convert_hf_checkpoint.py index f428e4cc6..122ab0f28 100644 --- a/torchchat/cli/convert_hf_checkpoint.py +++ b/torchchat/cli/convert_hf_checkpoint.py @@ -39,19 +39,14 @@ def convert_hf_checkpoint( config = TransformerArgs.from_params(config_args) print(f"Model config {config.__dict__}") - # Load the json file containing weight mapping + # Find all candidate weight mapping index files model_map_json_matches = [Path(m) for m in glob.glob(str(model_dir / "*.index.json"))] - assert len(model_map_json_matches) <= 1, "Found multiple weight mapping files" - if len(model_map_json_matches): - model_map_json = model_map_json_matches[0] - else: - model_map_json = model_dir / "pytorch_model.bin.index.json" # If there is no weight mapping, check for a consolidated model and # tokenizer we can move. Llama 2 and Mistral have weight mappings, while # Llama 3 has a consolidated model and tokenizer. # Otherwise raise an error. - if not model_map_json.is_file(): + if not model_map_json_matches: consolidated_pth = model_dir / "original" / "consolidated.00.pth" tokenizer_pth = model_dir / "original" / "tokenizer.model" if consolidated_pth.is_file() and tokenizer_pth.is_file(): @@ -68,11 +63,30 @@ def convert_hf_checkpoint( return else: raise RuntimeError( - f"Could not find {model_map_json} or {consolidated_pth} plus {tokenizer_pth}" + f"Could not find a valid model weight map or {consolidated_pth} plus {tokenizer_pth}" ) - with open(model_map_json) as json_map: - bin_index = json.load(json_map) + # Load the json file(s) containing weight mapping + # + # NOTE: If there are multiple index files, there are two possibilities: + # 1. The files could be mapped to different weight format files (e.g. .bin + # vs .safetensors) + # 2. The files could be split subsets of the mappings that need to be + # merged + # + # In either case, we can simply keep the mappings where the target file is + # valid in the model dir. 
+ bin_index = {} + for weight_map_file in model_map_json_matches: + with open(weight_map_file, "r") as handle: + weight_map = json.load(handle) + valid_mappings = { + k: model_dir / v + for (k, v) in weight_map.get("weight_map", {}).items() + if (model_dir / v).is_file() + } + bin_index.update(valid_mappings) + bin_files = set(bin_index.values()) weight_map = { "model.embed_tokens.weight": "tok_embeddings.weight", @@ -96,7 +110,6 @@ def convert_hf_checkpoint( "model.norm.weight": "norm.weight", "lm_head.weight": "output.weight", } - bin_files = {model_dir / bin for bin in bin_index["weight_map"].values()} def permute(w, n_heads): return ( diff --git a/torchchat/cli/download.py b/torchchat/cli/download.py index f334eb555..4da2bc390 100644 --- a/torchchat/cli/download.py +++ b/torchchat/cli/download.py @@ -35,11 +35,12 @@ def _download_hf_snapshot( model_info = model_info(model_config.distribution_path, token=hf_token) model_fnames = [f.rfilename for f in model_info.siblings] - # Check the model config for preference between safetensors and pth + # Check the model config for preference between safetensors and pth/bin has_pth = any(f.endswith(".pth") for f in model_fnames) + has_bin = any(f.endswith(".bin") for f in model_fnames) has_safetensors = any(f.endswith(".safetensors") for f in model_fnames) - # If told to prefer safetensors, ignore pth files + # If told to prefer safetensors, ignore pth/bin files if model_config.prefer_safetensors: if not has_safetensors: print( @@ -47,10 +48,10 @@ def _download_hf_snapshot( file=sys.stderr, ) exit(1) - ignore_patterns = "*.pth" + ignore_patterns = ["*.pth", "*.bin"] # If the model has both, prefer pth files over safetensors - elif has_pth and has_safetensors: + elif (has_pth or has_bin) and has_safetensors: ignore_patterns = "*safetensors*" # Otherwise, download everything From fe76c858e33dff56edf0e85c869d8ddf3810dc06 Mon Sep 17 00:00:00 2001 From: Michael Gschwind <61328285+mikekgfb@users.noreply.github.com> Date: Tue, 19 Nov 2024 15:15:08 -0800 Subject: [PATCH 15/83] Update builder.py (#1387) Wording of error message to include AOTI package --- torchchat/cli/builder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torchchat/cli/builder.py b/torchchat/cli/builder.py index f67cb9d0a..7a0f761a5 100644 --- a/torchchat/cli/builder.py +++ b/torchchat/cli/builder.py @@ -74,7 +74,7 @@ def __post_init__(self): or (self.pte_path and Path(self.pte_path).is_file()) ): raise RuntimeError( - "need to specified a valid checkpoint path, checkpoint dir, gguf path, DSO path, or PTE path" + "need to specified a valid checkpoint path, checkpoint dir, gguf path, DSO path, AOTI PACKAGE or PTE path" ) if self.aoti_package_path and self.pte_path: @@ -91,7 +91,7 @@ def __post_init__(self): for param, param_msg in ignored_params: if param: print( - f"Warning: {param_msg} ignored because an exported DSO or PTE path was specified" + f"Warning: {param_msg} ignored because an exported model was specified using a DSO, AOTI PACKAGE or PTE path argument" ) else: self.prefill_possible = True From 8478e5d854311e49217191b3bb51256c9852fbb9 Mon Sep 17 00:00:00 2001 From: Michael Gschwind <61328285+mikekgfb@users.noreply.github.com> Date: Tue, 19 Nov 2024 15:18:44 -0800 Subject: [PATCH 16/83] Add multimodal to possible tests (#1382) * Update multimodal.md Complete markup for testing * Update run-docs Add ability to run on docs/multimodal.md * Update run-readme-pr.yml --- .ci/scripts/run-docs | 20 +++++++++++++ .github/workflows/run-readme-pr.yml | 45 
++++++++++++++++++++++++++++- docs/multimodal.md | 6 ++++ 3 files changed, 70 insertions(+), 1 deletion(-) diff --git a/.ci/scripts/run-docs b/.ci/scripts/run-docs index a09944ad5..cc88bedac 100755 --- a/.ci/scripts/run-docs +++ b/.ci/scripts/run-docs @@ -91,3 +91,23 @@ if [ "$1" == "evaluation" ]; then echo "*******************************************" bash -x ./run-evaluation.sh fi + +if [ "$1" == "multimodal" ]; then + + # Expecting that this might fail this test as-is, because + # it's the first on-pr test depending on githib secrets for access with HF token access + + echo "::group::Create script to run multimodal" + python3 torchchat/utils/scripts/updown.py --file docs/multimodal.md > ./run-multimodal.sh + # for good measure, if something happened to updown processor, + # and it did not error out, fail with an exit 1 + echo "exit 1" >> ./run-multimodal.sh + echo "::endgroup::" + + echo "::group::Run multimodal" + echo "*******************************************" + cat ./run-multimodal.sh + echo "*******************************************" + bash -x ./run-multimodal.sh + echo "::endgroup::" +fi diff --git a/.github/workflows/run-readme-pr.yml b/.github/workflows/run-readme-pr.yml index cd6a95681..4e5e6d014 100644 --- a/.github/workflows/run-readme-pr.yml +++ b/.github/workflows/run-readme-pr.yml @@ -243,4 +243,47 @@ jobs: echo "::group::Completion" echo "tests complete" echo "*******************************************" - echo "::endgroup::" \ No newline at end of file + echo "::endgroup::" + + test-multimodal-any: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + with: + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "12.1" + timeout: 60 + script: | + echo "::group::Print machine info" + uname -a + echo "::endgroup::" + + echo "::group::Install newer objcopy that supports --set-section-alignment" + yum install -y devtoolset-10-binutils + export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH + echo "::endgroup::" + + .ci/scripts/run-docs multimodal + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" + + test-multimodal-cpu: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + with: + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "12.1" + timeout: 60 + script: | + echo "::group::Print machine info" + uname -a + echo "::endgroup::" + + echo "::group::Install newer objcopy that supports --set-section-alignment" + yum install -y devtoolset-10-binutils + export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH + echo "::endgroup::" + + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs multimodal diff --git a/docs/multimodal.md b/docs/multimodal.md index f3e3f0fe2..6a3cb2be8 100644 --- a/docs/multimodal.md +++ b/docs/multimodal.md @@ -14,9 +14,11 @@ This page goes over the different commands you can run with LLama 3.2 11B Vision While we strongly encourage you to use the Hugging Face checkpoint (which is the default for torchchat when utilizing the commands with the argument `llama3.2-11B`), we also provide support for manually providing the checkpoint. This can be done by replacing the `llama3.2-11B` argument in the commands below with the following: +[skip default]: begin ``` --checkpoint-path --tokenizer-path --params-path torchchat/model_params/Llama-3.2-11B-Vision.json ``` +[skip default]: end ## Generation This generates text output based on a text prompt and (optional) image prompt. 
@@ -48,6 +50,7 @@ Setting `stream` to "true" in the request emits a response in chunks. If `stream **Example Input + Output** +[skip default]: begin ``` curl http://127.0.0.1:5000/v1/chat/completions \ -H "Content-Type: application/json" \ @@ -75,6 +78,7 @@ curl http://127.0.0.1:5000/v1/chat/completions \ ``` {"id": "chatcmpl-cb7b39af-a22e-4f71-94a8-17753fa0d00c", "choices": [{"message": {"role": "assistant", "content": "The image depicts a simple black and white cartoon-style drawing of an animal face. It features a profile view, complete with two ears, expressive eyes, and a partial snout. The animal looks to the left, with its eye and mouth implied, suggesting that the drawn face might belong to a rabbit, dog, or pig. The graphic face has a bold black outline and a smaller, solid black nose. A small circle, forming part of the face, has a white background with two black quirkly short and long curved lines forming an outline of what was likely a mouth, complete with two teeth. The presence of the curve lines give the impression that the animal is smiling or speaking. Grey and black shadows behind the right ear and mouth suggest that this face is looking left and upwards. Given the prominent outline of the head and the outline of the nose, it appears that the depicted face is most likely from the side profile of a pig, although the ears make it seem like a dog and the shape of the nose makes it seem like a rabbit. Overall, it seems that this image, possibly part of a character illustration, is conveying a playful or expressive mood through its design and positioning."}, "finish_reason": "stop"}], "created": 1727487574, "model": "llama3.2", "system_fingerprint": "cpu_torch.float16", "object": "chat.completion"}% ``` +[skip default]: end @@ -90,6 +94,8 @@ First, follow the steps in the Server section above to start a local server. 
The streamlit run torchchat/usages/browser.py ``` +[skip default]: end + --- # Future Work From 5e18de7da7226a6c864d3c0a33366384751c2515 Mon Sep 17 00:00:00 2001 From: Michael Gschwind <61328285+mikekgfb@users.noreply.github.com> Date: Tue, 19 Nov 2024 18:27:20 -0800 Subject: [PATCH 17/83] Fix typo in RuntimeException in builder.py (#1386) Fix English usage Co-authored-by: Jack-Khuu --- torchchat/cli/builder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchchat/cli/builder.py b/torchchat/cli/builder.py index 7a0f761a5..a39a2ed95 100644 --- a/torchchat/cli/builder.py +++ b/torchchat/cli/builder.py @@ -74,7 +74,7 @@ def __post_init__(self): or (self.pte_path and Path(self.pte_path).is_file()) ): raise RuntimeError( - "need to specified a valid checkpoint path, checkpoint dir, gguf path, DSO path, AOTI PACKAGE or PTE path" + "need to specify a valid checkpoint path, checkpoint dir, gguf path, DSO path, AOTI PACKAGE or PTE path" ) if self.aoti_package_path and self.pte_path: From 8475c79d3619f2f4f4439de0cf38927abf3b0847 Mon Sep 17 00:00:00 2001 From: Jack-Khuu Date: Tue, 19 Nov 2024 22:18:50 -0500 Subject: [PATCH 18/83] Bug fix: Enable fast to override quantize json (#1377) * Bug fix: Enable fast to override quantize json * collapse conditional --- torchchat/cli/cli.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/torchchat/cli/cli.py b/torchchat/cli/cli.py index 3a7c85937..a7f7bbba2 100644 --- a/torchchat/cli/cli.py +++ b/torchchat/cli/cli.py @@ -533,16 +533,16 @@ def arg_init(args): # Localized import to minimize expensive imports from torchchat.utils.build_utils import get_device_str - if args.device is None or args.device == "fast": + if args.device is None: args.device = get_device_str( args.quantize.get("executor", {}).get("accelerator", default_device) ) else: + args.device = get_device_str(args.device) executor_handler = args.quantize.get("executor", None) - if executor_handler: - if executor_handler["accelerator"] != args.device: - print('overriding json-specified device {executor_handler["accelerator"]} with cli device {args.device}') - executor_handler["accelerator"] = args.device + if executor_handler and executor_handler["accelerator"] != args.device: + print(f'overriding json-specified device {executor_handler["accelerator"]} with cli device {args.device}') + executor_handler["accelerator"] = args.device if "mps" in args.device: if getattr(args, "compile", False) or getattr(args, "compile_prefill", False): From 731936d6f9f2ff37920531a19f613cf3e649cfed Mon Sep 17 00:00:00 2001 From: Joe Bowser Date: Fri, 22 Nov 2024 21:28:24 -0800 Subject: [PATCH 19/83] Changing the referenced AAR so that it uses the AAR from the docs (#1390) --- .gitignore | 4 ++++ torchchat/edge/android/torchchat/app/build.gradle.kts | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 74d0a28fa..61ab1ee4d 100644 --- a/.gitignore +++ b/.gitignore @@ -19,6 +19,10 @@ runner-et/cmake-out/* runner-aoti/cmake-out/* cmake-out/ +# Example project Android Studio ignore +torchchat/edge/android/torchchat/.idea/* + + # pte files *.pte diff --git a/torchchat/edge/android/torchchat/app/build.gradle.kts b/torchchat/edge/android/torchchat/app/build.gradle.kts index e0c9c196b..a98a70cab 100644 --- a/torchchat/edge/android/torchchat/app/build.gradle.kts +++ b/torchchat/edge/android/torchchat/app/build.gradle.kts @@ -57,7 +57,7 @@ dependencies { implementation("androidx.constraintlayout:constraintlayout:2.2.0-alpha12") 
implementation("com.facebook.fbjni:fbjni:0.5.1") implementation("com.google.code.gson:gson:2.8.6") - implementation(files("libs/executorch-llama.aar")) + implementation(files("libs/executorch.aar")) implementation("com.google.android.material:material:1.12.0") implementation("androidx.activity:activity:1.9.0") testImplementation("junit:junit:4.13.2") From 554cf86ca3ef63e8fbd5807d87b28c7a19378786 Mon Sep 17 00:00:00 2001 From: Michael Gschwind <61328285+mikekgfb@users.noreply.github.com> Date: Tue, 26 Nov 2024 12:00:57 -0800 Subject: [PATCH 20/83] Typo fixes in native-execution.md (#1394) Typo fixes in native-execution.md --- docs/native-execution.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/native-execution.md b/docs/native-execution.md index 790547e21..c22d3c3ba 100644 --- a/docs/native-execution.md +++ b/docs/native-execution.md @@ -16,14 +16,14 @@ The 'llama runner' is a native standalone application capable of running a model exported and compiled ahead-of-time with either Executorch (ET) or AOT Inductor (AOTI). Which model format to use depends on your requirements and preferences. Executorch models are -optimized for portability across a range of decices, including mobile +optimized for portability across a range of devices, including mobile and edge devices. AOT Inductor models are optimized for a particular target architecture, which may result in better performance and efficiency. Building the runners is straightforward with the included cmake build files and is covered in the next sections. We will showcase the -runners using ~~stories15M~~ llama2 7B and llama3. +runners using llama2 7B and llama3. ## What can you do with torchchat's llama runner for native execution? @@ -160,7 +160,7 @@ and native execution environments, respectively. After exporting a model, you will want to verify that the model delivers output of high quality, and works as expected. Both can be -achieved with the Python environment. All torchchat Python comands +achieved with the Python environment. All torchchat Python commands can work with exported models. Instead of loading the model from a checkpoint or GGUF file, use the `--dso-path model.so` and `--pte-path model.pte` for loading both types of exported models. This From dadaade5aa86afc5427f302fb0a4fce11deb3309 Mon Sep 17 00:00:00 2001 From: Michael Gschwind <61328285+mikekgfb@users.noreply.github.com> Date: Tue, 26 Nov 2024 12:02:30 -0800 Subject: [PATCH 21/83] Improvements for readability in ADVANCED-USERS.md (#1393) * Various spelling corrections * Remove empty performance tables * Remove CONTRIBUTING section that is covered in the project root README --- docs/ADVANCED-USERS.md | 90 +++++++++--------------------------------- 1 file changed, 18 insertions(+), 72 deletions(-) diff --git a/docs/ADVANCED-USERS.md b/docs/ADVANCED-USERS.md index 417a823f8..8f66b8a29 100644 --- a/docs/ADVANCED-USERS.md +++ b/docs/ADVANCED-USERS.md @@ -18,10 +18,10 @@ Torchchat is currently in a pre-release state and under extensive development. [shell default]: TORCHCHAT_ROOT=${PWD} ./torchchat/utils/scripts/install_et.sh -This is the advanced users guide, if you're looking to get started +This is the advanced users' guide, if you're looking to get started with LLMs, please refer to the README at the root directory of the torchchat distro. 
This is an advanced user guide, so we will have -many more concepts and options to discuss and taking advantage of them +many more concepts and options to discuss and take advantage of them may take some effort. We welcome community contributions of all kinds. If you find @@ -41,7 +41,7 @@ While we strive to support a broad range of models, we can't test them all. We classify supported models as tested ✅, work in progress 🚧 or some restrictions ❹. -We invite community contributions of new model suport and test results! +We invite community contributions of new model support and test results! | Model | Tested | Eager | torch.compile | AOT Inductor | ExecuTorch | Fits on Mobile | |-----|--------|-------|-----|-----|-----|-----| @@ -86,7 +86,7 @@ Server C++ runtime | n/a | run.cpp model.pte | ✅ | Mobile C++ runtime | n/a | app model.pte | ✅ | Mobile C++ runtime | n/a | app + AOTI | 🚧 | -**Getting help:** Each command implements the --help option to give addititonal information about available options: +**Getting help:** Each command implements the --help option to give additional information about available options: [skip default]: begin ``` @@ -96,8 +96,8 @@ python3 torchchat.py [ export | generate | chat | eval | ... ] --help Exported models can be loaded back into torchchat for chat or text generation, letting you experiment with the exported model and valid -model quality. The python interface is the same in all cases and is -used for testing nad test harnesses too. +model quality. The Python interface is the same in all cases and is +used for testing and test harnesses, too. Torchchat comes with server C++ runtimes to execute AOT Inductor and ExecuTorch models. A mobile C++ runtimes allow you to deploy @@ -115,7 +115,7 @@ Some common models are recognized by torchchat based on their filename through `Model.from_name()` to perform a fuzzy match against a table of known model architectures. Alternatively, you can specify the index into that table with the option `--params-table ${INDEX}` where -the index is the lookup key key in the [the list of known +the index is the lookup key in the [the list of known pconfigurations](https://github.com/pytorch/torchchat/tree/main/torchchat/model_params) For example, for the stories15M model, this would be expressed as `--params-table stories15M`. (We use the model constructor @@ -237,7 +237,7 @@ which chooses the best 16-bit floating point type. The virtual device fast and virtual floating point data types fast and fast16 are best used for eager/torch.compiled execution. For export, -specify the your device choice for the target system with --device for +specify your device choice for the target system with --device for AOTI-exported DSO models, and using ExecuTorch delegate selection for ExecuTorch-exported PTE models. @@ -250,8 +250,7 @@ python3 torchchat.py generate [--compile] --checkpoint-path ${MODEL_PATH} --prom To improve performance, you can compile the model with `--compile` trading off the time to first token processed with time per token. To improve performance further, you may also compile the prefill with -`--compile_prefill`. This will increase further compilation times though. The -`--compile-prefill` option is not compatible with `--prefill-prefill`. +`--compile-prefill`. This will increase further compilation times though. Parallel prefill is not yet supported by exported models, and may be supported in a future release. @@ -265,7 +264,7 @@ the introductory README. 
In addition to running eval on models in eager mode and JIT-compiled mode with `torch.compile()`, you can also load dso and pte models back into the PyTorch to evaluate the accuracy of exported model objects -(e.g., after applying quantization or other traqnsformations to +(e.g., after applying quantization or other transformations to improve speed or reduce model size). Loading exported models back into a Python-based Pytorch allows you to @@ -297,14 +296,14 @@ for ExecuTorch.) We export the stories15M model with the following command for execution with the ExecuTorch runtime (and enabling execution on a -wide range of community and vendor supported backends): +wide range of community and vendor-supported backends): ``` python3 torchchat.py export --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_NAME}.pte ``` Alternatively, we may generate a native instruction stream binary -using AOT Inductor for CPU oor GPUs (the latter using Triton for +using AOT Inductor for CPU or GPUs (the latter using Triton for optimizations such as operator fusion): ``` @@ -319,10 +318,10 @@ the exported model artifact back into a model container with a compatible API surface for the `model.forward()` function. This enables users to test, evaluate and exercise the exported model artifact with familiar interfaces, and in conjunction with -pre-exiisting Python model unit tests and common environments such as +pre-existing Python model unit tests and common environments such as Jupyter notebooks and/or Google colab. -Here is how to load an exported model into the python environment on the example of using an exported model with `generate.oy`. +Here is how to load an exported model into the Python environment using an exported model with the `generate` command. ``` python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --pte-path ${MODEL_NAME}.pte --device cpu --prompt "Once upon a time" @@ -452,7 +451,7 @@ strategies: You can find instructions for quantizing models in [docs/quantization.md](file:///./quantization.md). Advantageously, quantization is available in eager mode as well as during export, -enabling you to do an early exploration of your quantization setttings +enabling you to do an early exploration of your quantization settings in eager mode. However, final accuracy should always be confirmed on the actual execution target, since all targets have different build processes, compilers, and kernel implementations with potentially @@ -464,9 +463,8 @@ significant impact on accuracy. ## Native (Stand-Alone) Execution of Exported Models -Refer to the [README](README.md] for an introduction toNative -execution on servers, desktops and laptops is described under -[runner-build.md]. Mobile and Edge executipon for Android and iOS are +Refer to the [README](README.md] for an introduction to native +execution on servers, desktops, and laptops. Mobile and Edge execution for Android and iOS are described under [torchchat/edge/docs/Android.md] and [torchchat/edge/docs/iOS.md], respectively. @@ -475,7 +473,7 @@ described under [torchchat/edge/docs/Android.md] and [torchchat/edge/docs/iOS.md PyTorch and ExecuTorch support a broad range of devices for running PyTorch with python (using either eager or eager + `torch.compile`) or -in a python-free environment with AOT Inductor and ExecuTorch. +in a Python-free environment with AOT Inductor and ExecuTorch. | Hardware | OS | Eager | Eager + Compile | AOT Compile | ET Runtime | @@ -499,58 +497,6 @@ in a python-free environment with AOT Inductor and ExecuTorch. 
*Key*: n/t -- not tested -## Runtime performance with Llama 7B, in tokens per second (4b quantization) - -| Hardware | OS | eager | eager + compile | AOT compile | ET Runtime | -|-----|------|-----|-----|-----|-----| -| x86 | Linux | ? | ? | ? | ? | -| x86 | macOS | ? | ? | ? | ? | -| aarch64 | Linux | ? | ? | ? | ? | -| aarch64 | macOS | ? | ? | ? | ? | -| AMD GPU | Linux | ? | ? | ? | ? | -| Nvidia GPU | Linux | ? | ? | ? | ? | -| MPS | macOS | ? | ? | ? | ? | -| MPS | iOS | ? | ? | ? | ? | -| aarch64 | Android | ? | ? | ? | ? | -| Mobile GPU (Vulkan) | Android | ? | ? | ? | ? | -| CoreML | iOS | | ? | ? | ? | ? | -| Hexagon DSP | Android | | ? | ? | ? | ? | -| Raspberry Pi 4/5 | Raspbian | ? | ? | ? | ? | -| Raspberry Pi 4/5 | Android | ? | ? | ? | ? | -| ARM 32b (up to v7) | any | | ? | ? | ? | ? | - - -## Runtime performance with Llama3, in tokens per second (4b quantization) - -| Hardware | OS | eager | eager + compile | AOT compile | ET Runtime | -|-----|------|-----|-----|-----|-----| -| x86 | Linux | ? | ? | ? | ? | -| x86 | macOS | ? | ? | ? | ? | -| aarch64 | Linux | ? | ? | ? | ? | -| aarch64 | macOS | ? | ? | ? | ? | -| AMD GPU | Linux | ? | ? | ? | ? | -| Nvidia GPU | Linux | ? | ? | ? | ? | -| MPS | macOS | ? | ? | ? | ? | -| MPS | iOS | ? | ? | ? | ? | -| aarch64 | Android | ? | ? | ? | ? | -| Mobile GPU (Vulkan) | Android | ? | ? | ? | ? | -| CoreML | iOS | | ? | ? | ? | ? | -| Hexagon DSP | Android | | ? | ? | ? | ? | -| Raspberry Pi 4/5 | Raspbian | ? | ? | ? | ? | -| Raspberry Pi 4/5 | Android | ? | ? | ? | ? | -| ARM 32b (up to v7) | any | | ? | ? | ? | ? | - - - - -# CONTRIBUTING to torchchat - -We welcome any feature requests, bug reports, or pull requests from -the community. See the [CONTRIBUTING](CONTRIBUTING.md) for -instructions how to contribute to torchchat. - - - # LICENSE Torchchat is released under the [BSD 3 license](./LICENSE). However From c7bb8b96f6c4efa546a3dc771fe7d3af1c02eff8 Mon Sep 17 00:00:00 2001 From: Michael Gschwind <61328285+mikekgfb@users.noreply.github.com> Date: Tue, 26 Nov 2024 12:03:59 -0800 Subject: [PATCH 22/83] Update multimodal.md to exercise server as part of test (#1391) Similar to #1384 to exercise the server , but for multimodal 1 - Run server: 1a - in background 1b - capture server_pid 2 - enable query using curl 3 - shutdown server with server pid captured in server_pid --- docs/multimodal.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/multimodal.md b/docs/multimodal.md index 6a3cb2be8..cd249a1fb 100644 --- a/docs/multimodal.md +++ b/docs/multimodal.md @@ -41,6 +41,9 @@ python3 torchchat.py server llama3.2-11B ``` [skip default]: end +[shell default]: python3 torchchat.py server llama3.2-11B & server_pid=$! + + In another terminal, query the server using `curl`. This query might take a few minutes to respond.
@@ -50,7 +53,6 @@ Setting `stream` to "true" in the request emits a response in chunks. If `stream **Example Input + Output** -[skip default]: begin ``` curl http://127.0.0.1:5000/v1/chat/completions \ -H "Content-Type: application/json" \ @@ -74,12 +76,14 @@ curl http://127.0.0.1:5000/v1/chat/completions \ "max_tokens": 300 }' ``` - +[skip default]: begin ``` {"id": "chatcmpl-cb7b39af-a22e-4f71-94a8-17753fa0d00c", "choices": [{"message": {"role": "assistant", "content": "The image depicts a simple black and white cartoon-style drawing of an animal face. It features a profile view, complete with two ears, expressive eyes, and a partial snout. The animal looks to the left, with its eye and mouth implied, suggesting that the drawn face might belong to a rabbit, dog, or pig. The graphic face has a bold black outline and a smaller, solid black nose. A small circle, forming part of the face, has a white background with two black quirkly short and long curved lines forming an outline of what was likely a mouth, complete with two teeth. The presence of the curve lines give the impression that the animal is smiling or speaking. Grey and black shadows behind the right ear and mouth suggest that this face is looking left and upwards. Given the prominent outline of the head and the outline of the nose, it appears that the depicted face is most likely from the side profile of a pig, although the ears make it seem like a dog and the shape of the nose makes it seem like a rabbit. Overall, it seems that this image, possibly part of a character illustration, is conveying a playful or expressive mood through its design and positioning."}, "finish_reason": "stop"}], "created": 1727487574, "model": "llama3.2", "system_fingerprint": "cpu_torch.float16", "object": "chat.completion"}% ``` [skip default]: end +[shell default]: kill ${server_pid} +
## Browser From b0abf27736dbe4e054777d3f61bccf21cc804e5a Mon Sep 17 00:00:00 2001 From: Jack-Khuu Date: Tue, 3 Dec 2024 14:49:50 -0800 Subject: [PATCH 23/83] Update quantization.md link to quantize.py (#1392) https://github.com/pytorch/torchchat/issues/1385 --- docs/quantization.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/quantization.md b/docs/quantization.md index 3415d8cb8..5007946bb 100644 --- a/docs/quantization.md +++ b/docs/quantization.md @@ -59,7 +59,7 @@ for valid `bitwidth` and `groupsize` values. | linear with dynamic activations (symmetric) | `'{"linear:a8w4dq" : {"groupsize" : }}'`| | embedding | `'{"embedding": {"bitwidth": , "groupsize":}}'` | -See the available quantization schemes [here](https://github.com/pytorch/torchchat/blob/main/torchchat/utils/quantize.py#L1260-L1266). +See the available quantization schemes [here](https://github.com/pytorch/torchchat/blob/b809b69e03f8f4b75a4b27b0778f0d3695ce94c2/torchchat/utils/quantize.py#L887-L894). In addition to quantization, the [accelerator](model_customization.md#device) and [precision](model_customization.md#model-precision) can also be specified. From b870f7e8153c9460696488696ce37fdd6fe238c6 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Fri, 6 Dec 2024 14:12:56 -0800 Subject: [PATCH 24/83] Bump torch pin to 20241010 (#1400) * Bump torch pin to 20241010 As titled * bump ET and gguf Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags: --- .github/workflows/pull.yml | 1 + install/.pins/et-pin.txt | 2 +- install/install_requirements.sh | 4 ++-- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index ee7270a5d..c48436a80 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -731,6 +731,7 @@ jobs: git clone https://github.com/ggerganov/llama.cpp.git pushd llama.cpp + git checkout 64ed2091b24b2f9747148fdf49a34ed5938762c3 make popd diff --git a/install/.pins/et-pin.txt b/install/.pins/et-pin.txt index e61fae3a5..bb70ed39d 100644 --- a/install/.pins/et-pin.txt +++ b/install/.pins/et-pin.txt @@ -1 +1 @@ -72b3bb3194c611f7c4861e6f3b24af5de868af72 +98e4dd524f2cb08414ee015b27616229cabc06ba diff --git a/install/install_requirements.sh b/install/install_requirements.sh index 635789de6..a39c55cc8 100755 --- a/install/install_requirements.sh +++ b/install/install_requirements.sh @@ -62,10 +62,10 @@ echo "Using pip executable: $PIP_EXECUTABLE" # NOTE: If a newly-fetched version of the executorch repo changes the value of # PYTORCH_NIGHTLY_VERSION, you should re-run this script to install the necessary # package versions. -PYTORCH_NIGHTLY_VERSION=dev20241002 +PYTORCH_NIGHTLY_VERSION=dev20241010 # Nightly version for torchvision -VISION_NIGHTLY_VERSION=dev20241002 +VISION_NIGHTLY_VERSION=dev20241010 # Nightly version for torchtune TUNE_NIGHTLY_VERSION=dev20241010 From 4e621ce85427b94a7de6d676febc133ae5d19a06 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Fri, 6 Dec 2024 16:18:32 -0800 Subject: [PATCH 25/83] Use pytorch-labs/tokenizers and remove tokenizer/ (#1401) * Use pytorch-labs/tokenizers and remove tokenizer/ Summary: Use our shiny new repo https://github.com/pytorch-labs/tokenizers. 
Test Plan: Rely on CI jobs Reviewers: Subscribers: Tasks: Tags: * Fix pull.yml Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags: --- .github/workflows/pull.yml | 4 +- .gitmodules | 12 +- CMakeLists.txt | 9 +- runner/run.cpp | 517 +++++++++++------------- runner/third-party/tokenizers | 1 + tokenizer/CMakeLists.txt | 29 -- tokenizer/__init__.py | 0 tokenizer/base.py | 32 -- tokenizer/base64.h | 187 --------- tokenizer/hf_tokenizer.py | 92 ----- tokenizer/sentencepiece.cpp | 125 ------ tokenizer/third-party/abseil-cpp | 1 - tokenizer/third-party/re2 | 1 - tokenizer/third-party/sentencepiece | 1 - tokenizer/tiktoken.cpp | 390 ------------------ tokenizer/tiktoken.py | 241 ----------- tokenizer/tokenizer.h | 147 ------- torchchat/utils/scripts/build_native.sh | 2 +- 18 files changed, 244 insertions(+), 1547 deletions(-) create mode 160000 runner/third-party/tokenizers delete mode 100644 tokenizer/CMakeLists.txt delete mode 100644 tokenizer/__init__.py delete mode 100644 tokenizer/base.py delete mode 100644 tokenizer/base64.h delete mode 100644 tokenizer/hf_tokenizer.py delete mode 100644 tokenizer/sentencepiece.cpp delete mode 160000 tokenizer/third-party/abseil-cpp delete mode 160000 tokenizer/third-party/re2 delete mode 160000 tokenizer/third-party/sentencepiece delete mode 100644 tokenizer/tiktoken.cpp delete mode 100644 tokenizer/tiktoken.py delete mode 100644 tokenizer/tokenizer.h diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index c48436a80..c86e8ab62 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -942,7 +942,7 @@ jobs: path: | ./et-build ./torchchat/utils/scripts - key: et-build-${{runner.os}}-${{runner.arch}}-${{env.et-git-hash}}-${{ hashFiles('**/install_et.sh') }} + key: et-build-${{runner.os}}-${{runner.arch}}-${{env.et-git-hash}}-${{ hashFiles('**/install_et.sh', '**/build_native.sh') }} - if: ${{ steps.install-et.outputs.cache-hit != 'true' }} continue-on-error: true run: | @@ -1053,7 +1053,7 @@ jobs: # Pull submodules (re2, abseil) for Tiktoken git submodule sync - git submodule update --init + git submodule update --init --recursive ./runner/build_android.sh echo "Tests complete." 
diff --git a/.gitmodules b/.gitmodules index 7681823df..76bc1b9fd 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,9 +1,3 @@ -[submodule "tokenizer/third-party/abseil-cpp"] - path = tokenizer/third-party/abseil-cpp - url = https://github.com/abseil/abseil-cpp.git -[submodule "tokenizer/third-party/re2"] - path = tokenizer/third-party/re2 - url = https://github.com/google/re2.git -[submodule "tokenizer/third-party/sentencepiece"] - path = tokenizer/third-party/sentencepiece - url = https://github.com/google/sentencepiece.git +[submodule "runner/third-party/tokenizers"] + path = runner/third-party/tokenizers + url = https://github.com/pytorch-labs/tokenizers diff --git a/CMakeLists.txt b/CMakeLists.txt index 61fd4d5a6..e004dbfcb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,18 +7,21 @@ ELSE() ENDIF() project(Torchchat) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-attributes") # include tokenizer -add_subdirectory(tokenizer) +add_subdirectory(runner/third-party/tokenizers) # include et_run executable include(runner/et.cmake) if(TARGET et_run) - target_link_libraries(et_run PUBLIC tokenizer microkernels-prod) + target_link_libraries(et_run PUBLIC tokenizers microkernels-prod) + target_include_directories(et_run PUBLIC runner/third-party/tokenizers/include) endif() # include aoti_run executable include(runner/aoti.cmake) if(TARGET aoti_run) - target_link_libraries(aoti_run tokenizer) + target_link_libraries(aoti_run tokenizers) + target_include_directories(aoti_run PUBLIC runner/third-party/tokenizers/include) endif() diff --git a/runner/run.cpp b/runner/run.cpp index abfbb4584..f2b8e8e6b 100644 --- a/runner/run.cpp +++ b/runner/run.cpp @@ -7,20 +7,21 @@ LICENSE file in the root directory of this source tree. */ /* Inference for Llama-2 Transformer model in pure C++ */ +#include "sentencepiece.h" +#include "tiktoken.h" +#include +#include +#include +#include #include +#include #include #include #include #include #include -#include -#include -#include -#include -#include -#include #include - +#include #ifdef DEBUG #include #include @@ -47,13 +48,25 @@ torch::Device aoti_device(torch::kCPU); #endif using exec_aten::ScalarType; -using torch::executor::EValue; -using executorch::extension::TensorPtr; using executorch::extension::make_tensor_ptr; +using executorch::extension::TensorPtr; +using torch::executor::EValue; using torch::executor::Module; using torch::executor::Result; #endif +using tokenizers::SPTokenizer; +using tokenizers::Tiktoken; +using tokenizers::Tokenizer; + +#define UNWRAP(x) \ + ({ \ + if (!(x).ok()) { \ + fprintf(stderr, "Got error code % " PRIu32, x.error()); \ + exit(EXIT_FAILURE); \ + } \ + std::move(x.get()); \ + }) // ---------------------------------------------------------------------------- // Transformer model @@ -65,56 +78,56 @@ enum ModelType { ModelType get_model_type(int model_int) { switch (model_int) { - case 2: - return LLAMA2_MODEL; - break; - case 3: - return LLAMA3_MODEL; - break; - default: - return UNKNOWN_MODEL; + case 2: + return LLAMA2_MODEL; + break; + case 3: + return LLAMA3_MODEL; + break; + default: + return UNKNOWN_MODEL; } } typedef struct { int vocab_size; // vocabulary size, usually 256 (byte-level) - int seq_len; // max sequence length + int seq_len; // max sequence length } Config; typedef struct { - float* logits; // output logits - int64_t* toks; // tokens seen so far; no kv-cache :( + float *logits; // output logits + int64_t *toks; // tokens seen so far; no kv-cache :( } RunState; typedef struct { - Config config; // the 
hyperparameters of the architecture (the blueprint) + Config config; // the hyperparameters of the architecture (the blueprint) RunState state; // buffers for the "wave" of activations in the forward pass #ifdef __AOTI_MODEL__ - torch::inductor::AOTIModelPackageLoader* runner; + torch::inductor::AOTIModelPackageLoader *runner; #else // __ET_MODEL__ - Module* runner; + Module *runner; #endif } Transformer; -void malloc_run_state(RunState* s, Config* p) { +void malloc_run_state(RunState *s, Config *p) { // we calloc instead of malloc to keep valgrind happy - s->logits = (float*)calloc(p->vocab_size, sizeof(float)); - s->toks = (int64_t*)calloc(p->seq_len, sizeof(int64_t)); + s->logits = (float *)calloc(p->vocab_size, sizeof(float)); + s->toks = (int64_t *)calloc(p->seq_len, sizeof(int64_t)); if (!s->logits || !s->toks) { fprintf(stderr, "malloc failed!\n"); exit(EXIT_FAILURE); } } -void free_run_state(RunState* s) { +void free_run_state(RunState *s) { free(s->logits); free(s->toks); } -void read_checkpoint(char* checkpoint, Config* config) { - FILE* file = fopen(checkpoint, "rb"); +void read_checkpoint(char *checkpoint, Config *config) { + FILE *file = fopen(checkpoint, "rb"); if (!file) { fprintf(stderr, "Couldn't open file %s\n", checkpoint); exit(EXIT_FAILURE); @@ -128,11 +141,8 @@ void read_checkpoint(char* checkpoint, Config* config) { config->vocab_size = abs(config->vocab_size); } -void build_transformer( - Transformer* t, - char* model_path, - int vocab_size, - int seq_len) { +void build_transformer(Transformer *t, char *model_path, int vocab_size, + int seq_len) { // read in the Config and the Weights from the model // read_checkpoint(model_path, &t->config); // allocate the RunState buffers @@ -142,7 +152,9 @@ void build_transformer( #ifdef __AOTI_MODEL__ t->runner = new torch::inductor::AOTIModelPackageLoader(model_path); - aoti_device = t->runner->get_metadata()["AOTI_DEVICE_KEY"] == "cpu" ? torch::Device(torch::kCPU) : torch::Device(torch::kCUDA); + aoti_device = t->runner->get_metadata()["AOTI_DEVICE_KEY"] == "cpu" + ? 
torch::Device(torch::kCPU) + : torch::Device(torch::kCUDA); #else //__ET_MODEL__ t->runner = new Module( /* path to PTE model */ model_path, @@ -150,7 +162,7 @@ void build_transformer( #endif } -void free_transformer(Transformer* t) { +void free_transformer(Transformer *t) { // free the RunState buffers free_run_state(&t->state); delete t->runner; @@ -159,7 +171,7 @@ void free_transformer(Transformer* t) { // ---------------------------------------------------------------------------- // neural net blocks; the dynamics of the Transformer -void softmax(float* x, int size) { +void softmax(float *x, int size) { // find max value (for numerical stability) float max_val = x[0]; for (int i = 1; i < size; i++) { @@ -179,9 +191,9 @@ void softmax(float* x, int size) { } } -float* forward(Transformer* transformer, int token, int pos) { - Config* p = &transformer->config; - RunState* s = &transformer->state; +float *forward(Transformer *transformer, int token, int pos) { + Config *p = &transformer->config; + RunState *s = &transformer->state; s->toks[pos] = token; long token_buffer[1] = {token}; long pos_buffer[1] = {pos}; @@ -194,8 +206,8 @@ float* forward(Transformer* transformer, int token, int pos) { torch::Tensor token_tensor = torch::from_blob(token_buffer, {1, 1}, torch::kLong); torch::Tensor pos_tensor = torch::from_blob(pos_buffer, {1}, torch::kLong); - std::vector inputs{ - token_tensor.to(aoti_device), pos_tensor.to(aoti_device)}; + std::vector inputs{token_tensor.to(aoti_device), + pos_tensor.to(aoti_device)}; torch::Tensor result = transformer->runner->run(inputs)[0] .to(torch::dtype(torch::kFloat32)) @@ -204,7 +216,8 @@ float* forward(Transformer* transformer, int token, int pos) { memcpy(s->logits, logits, p->vocab_size * sizeof(float)); #else // __ET_MODEL__ TensorPtr pos_managed = make_tensor_ptr({1}, pos_buffer, ScalarType::Long); - TensorPtr tokens_managed = make_tensor_ptr({1, 1}, token_buffer, ScalarType::Long); + TensorPtr tokens_managed = + make_tensor_ptr({1, 1}, token_buffer, ScalarType::Long); std::vector inputs; auto tmp1 = EValue(tokens_managed); auto tmp2 = EValue(pos_managed); @@ -221,17 +234,12 @@ float* forward(Transformer* transformer, int token, int pos) { // HACK: the rest of this runner assumes that logits must be float, // so we simply convert them rather than plumbing // templating/switch-on-type through the rest of this file. 
- const auto& result_tensor = result[0].toTensor(); + const auto &result_tensor = result[0].toTensor(); ET_SWITCH_REALHBBF16_TYPES( - result_tensor.scalar_type(), - unused, - "forward", - CTYPE, - [&]() { - const CTYPE* logits = result_tensor.const_data_ptr(); - std::transform(logits, logits + p->vocab_size, s->logits, [](auto x) { - return static_cast(x); - }); + result_tensor.scalar_type(), unused, "forward", CTYPE, [&]() { + const CTYPE *logits = result_tensor.const_data_ptr(); + std::transform(logits, logits + p->vocab_size, s->logits, + [](auto x) { return static_cast(x); }); }); #endif @@ -249,13 +257,13 @@ typedef struct { typedef struct { int vocab_size; - ProbIndex* probindex; // buffer used in top-p sampling + ProbIndex *probindex; // buffer used in top-p sampling float temperature; float topp; unsigned long long rng_state; } Sampler; -int sample_argmax(float* probabilities, int n) { +int sample_argmax(float *probabilities, int n) { // return the index that has the highest probability int max_i = 0; float max_p = probabilities[0]; @@ -268,7 +276,7 @@ int sample_argmax(float* probabilities, int n) { return max_i; } -int sample_mult(float* probabilities, int n, float coin) { +int sample_mult(float *probabilities, int n, float coin) { // sample index from probabilities (they must sum to 1!) // coin is a random number in [0, 1), usually from random_f32() float cdf = 0.0f; @@ -281,9 +289,9 @@ int sample_mult(float* probabilities, int n, float coin) { return n - 1; // in case of rounding errors } -int compare(const void* a, const void* b) { - ProbIndex* a_ = (ProbIndex*)a; - ProbIndex* b_ = (ProbIndex*)b; +int compare(const void *a, const void *b) { + ProbIndex *a_ = (ProbIndex *)a; + ProbIndex *b_ = (ProbIndex *)b; if (a_->prob > b_->prob) return -1; if (a_->prob < b_->prob) @@ -291,12 +299,8 @@ int compare(const void* a, const void* b) { return 0; } -int sample_topp( - float* probabilities, - int n, - float topp, - ProbIndex* probindex, - float coin) { +int sample_topp(float *probabilities, int n, float topp, ProbIndex *probindex, + float coin) { // top-p sampling (or "nucleus sampling") samples from the smallest set of // tokens that exceed probability topp. This way we never sample tokens that // have very low probabilities and are less likely to go "off the rails". 
@@ -339,37 +343,31 @@ int sample_topp( return probindex[last_idx].index; // in case of rounding errors } -void build_sampler( - Sampler* sampler, - int vocab_size, - float temperature, - float topp, - unsigned long long rng_seed) { +void build_sampler(Sampler *sampler, int vocab_size, float temperature, + float topp, unsigned long long rng_seed) { sampler->vocab_size = vocab_size; sampler->temperature = temperature; sampler->topp = topp; sampler->rng_state = rng_seed; // buffer only used with nucleus sampling; may not need but it's ~small sampler->probindex = - (ProbIndex*)malloc(sampler->vocab_size * sizeof(ProbIndex)); + (ProbIndex *)malloc(sampler->vocab_size * sizeof(ProbIndex)); } -void free_sampler(Sampler* sampler) { - free(sampler->probindex); -} +void free_sampler(Sampler *sampler) { free(sampler->probindex); } -unsigned int random_u32(unsigned long long* state) { +unsigned int random_u32(unsigned long long *state) { // xorshift rng: https://en.wikipedia.org/wiki/Xorshift#xorshift.2A *state ^= *state >> 12; *state ^= *state << 25; *state ^= *state >> 27; return (*state * 0x2545F4914F6CDD1Dull) >> 32; } -float random_f32(unsigned long long* state) { // random float32 in [0,1) +float random_f32(unsigned long long *state) { // random float32 in [0,1) return (random_u32(state) >> 8) / 16777216.0f; } -int sample(Sampler* sampler, float* logits) { +int sample(Sampler *sampler, float *logits) { // sample the token given the logits and some hyperparameters int next; if (sampler->temperature == 0.0f) { @@ -390,39 +388,37 @@ int sample(Sampler* sampler, float* logits) { next = sample_mult(logits, sampler->vocab_size, coin); } else { // top-p (nucleus) sampling, clamping the least likely tokens to zero - next = sample_topp( - logits, sampler->vocab_size, sampler->topp, sampler->probindex, coin); + next = sample_topp(logits, sampler->vocab_size, sampler->topp, + sampler->probindex, coin); } } return next; } -Tokenizer* build_tokenizer(const char* tokenizer_path, ModelType model_type) { - Tokenizer* tokenizer = NULL; +Tokenizer *build_tokenizer(const char *tokenizer_path, ModelType model_type) { + Tokenizer *tokenizer = NULL; switch (model_type) { - case LLAMA2_MODEL: - tokenizer = new SPTokenizer(); - tokenizer->load(tokenizer_path); - break; - case LLAMA3_MODEL: - tokenizer = new Tiktoken(); - tokenizer->load(tokenizer_path); - break; - default: - fprintf(stderr, "No tokenizer defined for model type %d.\n", model_type); - exit(EXIT_FAILURE); + case LLAMA2_MODEL: + tokenizer = new SPTokenizer(); + tokenizer->load(tokenizer_path); + break; + case LLAMA3_MODEL: + tokenizer = new Tiktoken(); + tokenizer->load(tokenizer_path); + break; + default: + fprintf(stderr, "No tokenizer defined for model type %d.\n", model_type); + exit(EXIT_FAILURE); } return tokenizer; } -void free_tokenizer(Tokenizer* tokenizer) { - delete tokenizer; -} +void free_tokenizer(Tokenizer *tokenizer) { delete tokenizer; } // ---------------------------------------------------------------------------- // utilities: time -void safe_printf(const char* piece) { +void safe_printf(const char *piece) { // piece might be a raw byte token, and we only want to print printable chars // or whitespace because some of the other bytes can be various control codes, // backspace, etc. @@ -454,21 +450,18 @@ long time_in_ms() { // Prints decoded tokens generated from the transformer. 
// The first token is not printed and is assumed to be a BOS or other similar // token -unsigned generate_from_prompt_tokens( - Transformer* transformer, - Tokenizer* tokenizer, - Sampler* sampler, - const std::vector& prompt_tokens, - unsigned pos, - const std::vector& stop_tokens, - int stop_pos, - bool print_prompt, - bool print_tok_per_sec) { +unsigned generate_from_prompt_tokens(Transformer *transformer, + Tokenizer *tokenizer, Sampler *sampler, + const std::vector &prompt_tokens, + unsigned pos, + const std::vector &stop_tokens, + int stop_pos, bool print_prompt, + bool print_tok_per_sec) { if (prompt_tokens.size() == 0) { return pos; } - uint64_t next; // will store the next token in the sequence + uint64_t next; // will store the next token in the sequence uint64_t token; // stores the current token to feed into the transformer bool done_with_prompt; // whether we are done processing prompt @@ -486,7 +479,7 @@ unsigned generate_from_prompt_tokens( if (pos_in_prompt < prompt_tokens.size()) { // Token comes from prompt token = prompt_tokens[pos_in_prompt++]; - float* logits = forward(transformer, token, pos); + float *logits = forward(transformer, token, pos); // Next token is either from prompt or if on last // prompt token, next is sampled @@ -498,29 +491,27 @@ unsigned generate_from_prompt_tokens( } else { // Token comes from next sampled from previous round. token = next; - float* logits = forward(transformer, token, pos); + float *logits = forward(transformer, token, pos); next = sample(sampler, logits); } done_with_prompt = (pos_in_prompt >= prompt_tokens.size()); // we terminate on finding the stop_token if we are done processing the // prompt (stop_tokens in the prompt do not terminate the loop) - if (done_with_prompt && - (std::find(stop_tokens.begin(), stop_tokens.end(), token) != - stop_tokens.end())) { + if (done_with_prompt && (std::find(stop_tokens.begin(), stop_tokens.end(), + token) != stop_tokens.end())) { found_stop_token = true; } // We print next in each iteration of the loop, not token if (!found_stop_token && (print_prompt || done_with_prompt)) { // The stop_token is printed as newline - bool next_is_stop = - std::find(stop_tokens.begin(), stop_tokens.end(), next) != - stop_tokens.end(); + bool next_is_stop = std::find(stop_tokens.begin(), stop_tokens.end(), + next) != stop_tokens.end(); if (next_is_stop) { printf("\n"); } else { - std::string piece = tokenizer->decode(token, next); + std::string piece = UNWRAP(tokenizer->decode(token, next)); safe_printf(piece.c_str()); // same as printf("%s", piece), but skips // "unsafe" bytes fflush(stdout); @@ -538,23 +529,16 @@ unsigned generate_from_prompt_tokens( // iteration) if (print_tok_per_sec && pos > 1) { long end = time_in_ms(); - fprintf( - stderr, - "\n\nachieved tok/s: %f\n", - (pos - 1) / (double)(end - start) * 1000); + fprintf(stderr, "\n\nachieved tok/s: %f\n", + (pos - 1) / (double)(end - start) * 1000); } return pos; } -void generate( - Transformer* transformer, - Tokenizer* tokenizer, - Sampler* sampler, - const char* prompt, - int steps, - ModelType model_type) { - const char* default_prompt = "Once upon a time"; +void generate(Transformer *transformer, Tokenizer *tokenizer, Sampler *sampler, + const char *prompt, int steps, ModelType model_type) { + const char *default_prompt = "Once upon a time"; if (prompt == NULL) { prompt = default_prompt; } @@ -566,33 +550,30 @@ void generate( std::vector prompt_tokens; std::vector stop_tokens; switch (model_type) { - case LLAMA2_MODEL: - prompt_tokens = 
tokenizer->encode(prompt, 1, 0); - stop_tokens.push_back(tokenizer->eos_tok()); - break; - case LLAMA3_MODEL: - prompt_tokens = tokenizer->encode(prompt, 1, 0); - stop_tokens.push_back(tokenizer->encode("<|end_of_text|>", 0, 0)[0]); - stop_tokens.push_back(tokenizer->encode("<|eot_id|>", 0, 0)[0]); - break; - default: - fprintf(stderr, "Generate does not support model type %d.\n", model_type); - exit(EXIT_FAILURE); - } - - generate_from_prompt_tokens( - transformer, - tokenizer, - sampler, - prompt_tokens, - /*pos=*/0, - /*stop_tokens=*/stop_tokens, - /*stop_pos=*/steps - 1, - /*print_prompt=*/true, - /*print_tok_per_sec=*/true); + case LLAMA2_MODEL: + prompt_tokens = UNWRAP(tokenizer->encode(prompt, 1, 0)); + stop_tokens.push_back(tokenizer->eos_tok()); + break; + case LLAMA3_MODEL: + prompt_tokens = UNWRAP(tokenizer->encode(prompt, 1, 0)); + stop_tokens.push_back( + UNWRAP(tokenizer->encode("<|end_of_text|>", 0, 0))[0]); + stop_tokens.push_back(UNWRAP(tokenizer->encode("<|eot_id|>", 0, 0))[0]); + break; + default: + fprintf(stderr, "Generate does not support model type %d.\n", model_type); + exit(EXIT_FAILURE); + } + + generate_from_prompt_tokens(transformer, tokenizer, sampler, prompt_tokens, + /*pos=*/0, + /*stop_tokens=*/stop_tokens, + /*stop_pos=*/steps - 1, + /*print_prompt=*/true, + /*print_tok_per_sec=*/true); } -void read_stdin(const char* guide, char* buffer, size_t bufsize) { +void read_stdin(const char *guide, char *buffer, size_t bufsize) { // read a line from stdin, up to but not including \n printf("%s", guide); if (fgets(buffer, bufsize, stdin) != NULL) { @@ -609,11 +590,10 @@ void read_stdin(const char* guide, char* buffer, size_t bufsize) { // python reference and that seemed ok, but this was not thoroughly tested and // is not safely implemented, it's more a proof of concept atm. 
-std::vector get_initial_prompt_tokens( - const char* cli_system_prompt, - const char* cli_user_prompt, - Tokenizer* tokenizer, - ModelType model_type) { +std::vector get_initial_prompt_tokens(const char *cli_system_prompt, + const char *cli_user_prompt, + Tokenizer *tokenizer, + ModelType model_type) { char system_prompt[512]; char user_prompt[512]; char rendered_prompt[512 * 2 + 200]; // the prompt template is ~170 @@ -622,10 +602,8 @@ std::vector get_initial_prompt_tokens( if (cli_system_prompt != NULL) { strcpy(system_prompt, cli_system_prompt); } else { - read_stdin( - "Enter system prompt (optional): ", - system_prompt, - sizeof(system_prompt)); + read_stdin("Enter system prompt (optional): ", system_prompt, + sizeof(system_prompt)); } if (cli_user_prompt != NULL) { @@ -637,48 +615,40 @@ std::vector get_initial_prompt_tokens( std::vector tokens; switch (model_type) { - case LLAMA2_MODEL: - if (system_prompt[0] != '\0') { - snprintf( - rendered_prompt, - sizeof(rendered_prompt) - 1, - "[INST] <>\n%s\n<>\n\n%s [/INST]", - system_prompt, - user_prompt); - } else { - snprintf( - rendered_prompt, - sizeof(rendered_prompt) - 1, - "[INST] %s [/INST]", - user_prompt); - } + case LLAMA2_MODEL: + if (system_prompt[0] != '\0') { + snprintf(rendered_prompt, sizeof(rendered_prompt) - 1, + "[INST] <>\n%s\n<>\n\n%s [/INST]", system_prompt, + user_prompt); + } else { + snprintf(rendered_prompt, sizeof(rendered_prompt) - 1, + "[INST] %s [/INST]", user_prompt); + } - // We need to add BOS token here and not in template because llama2 - // tokenizer does not pattern match special tokens - tokens = tokenizer->encode(rendered_prompt, 1, 0); - break; - - case LLAMA3_MODEL: - if (system_prompt[0] != '\0') { - snprintf( - rendered_prompt, - sizeof(rendered_prompt) - 1, - "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n%s<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n%s<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", - system_prompt, - user_prompt); - } else { - snprintf( - rendered_prompt, - sizeof(rendered_prompt) - 1, - "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n%s<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", - user_prompt); - } - tokens = tokenizer->encode(rendered_prompt, 0, 0); - break; + // We need to add BOS token here and not in template because llama2 + // tokenizer does not pattern match special tokens + tokens = UNWRAP(tokenizer->encode(rendered_prompt, 1, 0)); + break; + + case LLAMA3_MODEL: + if (system_prompt[0] != '\0') { + snprintf(rendered_prompt, sizeof(rendered_prompt) - 1, + "<|begin_of_text|><|start_header_id|>system<|end_header_id|>" + "\n\n%s<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n%s<" + "|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", + system_prompt, user_prompt); + } else { + snprintf(rendered_prompt, sizeof(rendered_prompt) - 1, + "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n%" + "s<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", + user_prompt); + } + tokens = UNWRAP(tokenizer->encode(rendered_prompt, 0, 0)); + break; - default: - fprintf(stderr, "Chat does not support model type %d.\n", model_type); - exit(EXIT_FAILURE); + default: + fprintf(stderr, "Chat does not support model type %d.\n", model_type); + exit(EXIT_FAILURE); } #ifdef DEBUG @@ -695,9 +665,8 @@ std::vector get_initial_prompt_tokens( return tokens; } -std::vector get_next_user_prompt_tokens( - Tokenizer* tokenizer, - ModelType model_type) { +std::vector get_next_user_prompt_tokens(Tokenizer 
*tokenizer, + ModelType model_type) { char user_prompt[512]; char rendered_prompt[512 + 150]; // the prompt template is ~100 characters. We // use 150 to be safe. @@ -706,30 +675,26 @@ std::vector get_next_user_prompt_tokens( std::vector tokens; switch (model_type) { - case LLAMA2_MODEL: - snprintf( - rendered_prompt, - sizeof(rendered_prompt) - 1, - "[INST] %s [/INST]", - user_prompt); - - // We need to add BOS token here and not in template because llama2 - // tokenizer does not pattern match special tokens - tokens = tokenizer->encode(rendered_prompt, /*bos*/ 1, /*eos*/ 0); - break; - - case LLAMA3_MODEL: - snprintf( - rendered_prompt, - sizeof(rendered_prompt) - 1, - "<|start_header_id|>user<|end_header_id|>\n\n%s<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", - user_prompt); - tokens = tokenizer->encode(rendered_prompt, 0, 0); - break; - - default: - fprintf(stderr, "Chat does not support model type %d.\n", model_type); - exit(EXIT_FAILURE); + case LLAMA2_MODEL: + snprintf(rendered_prompt, sizeof(rendered_prompt) - 1, "[INST] %s [/INST]", + user_prompt); + + // We need to add BOS token here and not in template because llama2 + // tokenizer does not pattern match special tokens + tokens = UNWRAP(tokenizer->encode(rendered_prompt, /*bos*/ 1, /*eos*/ 0)); + break; + + case LLAMA3_MODEL: + snprintf(rendered_prompt, sizeof(rendered_prompt) - 1, + "<|start_header_id|>user<|end_header_id|>\n\n%s<|eot_id|><|start_" + "header_id|>assistant<|end_header_id|>\n\n", + user_prompt); + tokens = UNWRAP(tokenizer->encode(rendered_prompt, 0, 0)); + break; + + default: + fprintf(stderr, "Chat does not support model type %d.\n", model_type); + exit(EXIT_FAILURE); } #ifdef DEBUG @@ -746,14 +711,9 @@ std::vector get_next_user_prompt_tokens( return tokens; } -void chat( - Transformer* transformer, - Tokenizer* tokenizer, - Sampler* sampler, - const char* cli_user_prompt, - const char* cli_system_prompt, - unsigned steps, - ModelType model_type) { +void chat(Transformer *transformer, Tokenizer *tokenizer, Sampler *sampler, + const char *cli_user_prompt, const char *cli_system_prompt, + unsigned steps, ModelType model_type) { if (steps == 0) { return; } @@ -761,16 +721,16 @@ void chat( uint64_t eot_token; std::vector prompt_tokens; switch (model_type) { - case LLAMA2_MODEL: - // llama2 uses EOS as EOT token - eot_token = tokenizer->eos_tok(); - break; - case LLAMA3_MODEL: - eot_token = tokenizer->encode("<|eot_id|>", 0, 0)[0]; - break; - default: - fprintf(stderr, "Chat does not support model type %d.\n", model_type); - exit(EXIT_FAILURE); + case LLAMA2_MODEL: + // llama2 uses EOS as EOT token + eot_token = tokenizer->eos_tok(); + break; + case LLAMA3_MODEL: + eot_token = UNWRAP(tokenizer->encode("<|eot_id|>", 0, 0))[0]; + break; + default: + fprintf(stderr, "Chat does not support model type %d.\n", model_type); + exit(EXIT_FAILURE); } std::vector stop_tokens{eot_token}; @@ -784,11 +744,7 @@ void chat( } printf("Assistant: "); pos = generate_from_prompt_tokens( - transformer, - tokenizer, - sampler, - prompt_tokens, - pos, + transformer, tokenizer, sampler, prompt_tokens, pos, /*stop_tokens=*/stop_tokens, /*stop_pos=*/steps - 1, // We could pass in -1 here if we do not want // the model to stop mid-reply @@ -803,46 +759,43 @@ void chat( void error_usage() { fprintf(stderr, "Usage: run [options]\n"); - fprintf( - stderr, "Example: run model.{so,pte} -n 256 -i \"Once upon a time\"\n"); + fprintf(stderr, + "Example: run model.{so,pte} -n 256 -i \"Once upon a time\"\n"); fprintf(stderr, 
"Options:\n"); fprintf(stderr, " -t temperature in [0,inf], default 1.0\n"); - fprintf( - stderr, - " -p p value in top-p (nucleus) sampling in [0,1], default 0.9\n"); + fprintf(stderr, " -p p value in top-p (nucleus) sampling in [0,1], " + "default 0.9\n"); fprintf(stderr, " -s random seed, default time(NULL)\n"); - fprintf( - stderr, - " -n number of steps to run for, default 256. 0 = max_seq_len\n"); + fprintf(stderr, " -n number of steps to run for, default 256. 0 = " + "max_seq_len\n"); fprintf(stderr, " -i input prompt\n"); fprintf(stderr, " -z path to tokenizer\n"); fprintf(stderr, " -m mode: generate|chat, default: generate\n"); fprintf(stderr, " -y (optional) system prompt in chat mode\n"); - fprintf( - stderr, - " -v (optional) vocab size, default is model-specific.\n"); - fprintf( - stderr, " -l (optional) llama version (2 or 3), default 2.\n"); + fprintf(stderr, + " -v (optional) vocab size, default is model-specific.\n"); + fprintf(stderr, + " -l (optional) llama version (2 or 3), default 2.\n"); fprintf( stderr, " -d (optional) device(CUDA or CPU) model was exported for\n"); exit(EXIT_FAILURE); } -int main(int argc, char* argv[]) { +int main(int argc, char *argv[]) { // default parameters - char* model_path = NULL; - char* tokenizer_path = NULL; + char *model_path = NULL; + char *tokenizer_path = NULL; float temperature = 1.0f; // 0.0 = greedy deterministic. 1.0 = original. don't set higher float topp = 0.9f; // top-p in nucleus sampling. 1.0 = off. 0.9 works well, // but slower - int steps = 128; // number of steps to run for - const char* prompt = NULL; // prompt string + int steps = 128; // number of steps to run for + const char *prompt = NULL; // prompt string unsigned long long rng_seed = 0; // seed rng with time by default - const char* mode = "generate"; // generate|chat - char* system_prompt = + const char *mode = "generate"; // generate|chat + char *system_prompt = NULL; // the (optional) system prompt to use in chat mode int vocab_size = -1; @@ -916,10 +869,8 @@ int main(int argc, char* argv[]) { ModelType model_type = get_model_type(llama_ver); if (model_type == UNKNOWN_MODEL) { - fprintf( - stderr, - "Unknown model type passed by -l argument. Received l=%d.", - llama_ver); + fprintf(stderr, "Unknown model type passed by -l argument. 
Received l=%d.", + llama_ver); error_usage(); } @@ -943,7 +894,7 @@ int main(int argc, char* argv[]) { if (steps < 0) steps = 0; - Tokenizer* tokenizer = build_tokenizer(tokenizer_path, model_type); + Tokenizer *tokenizer = build_tokenizer(tokenizer_path, model_type); // If no tokenizer path provided, get default for model_type if (vocab_size == -1) { @@ -959,14 +910,8 @@ int main(int argc, char* argv[]) { if (strcmp(mode, "generate") == 0) { generate(&transformer, tokenizer, &sampler, prompt, steps, model_type); } else if (strcmp(mode, "chat") == 0) { - chat( - &transformer, - tokenizer, - &sampler, - prompt, - system_prompt, - steps, - model_type); + chat(&transformer, tokenizer, &sampler, prompt, system_prompt, steps, + model_type); } else { fprintf(stderr, "unknown mode: %s\n", mode); error_usage(); diff --git a/runner/third-party/tokenizers b/runner/third-party/tokenizers new file mode 160000 index 000000000..19e463d66 --- /dev/null +++ b/runner/third-party/tokenizers @@ -0,0 +1 @@ +Subproject commit 19e463d665110e1d23145df1ad72bb8db111618b diff --git a/tokenizer/CMakeLists.txt b/tokenizer/CMakeLists.txt deleted file mode 100644 index 39c20885d..000000000 --- a/tokenizer/CMakeLists.txt +++ /dev/null @@ -1,29 +0,0 @@ -cmake_minimum_required(VERSION 3.24) -set(CMAKE_CXX_STANDARD 17) -IF(DEFINED ENV{TORCHCHAT_ROOT}) - set(TORCHCHAT_ROOT $ENV{TORCHCHAT_ROOT}) -ELSE() - set(TORCHCHAT_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/..) -ENDIF() - -# build tokenizer library -add_library( - tokenizer - tokenizer.h - sentencepiece.cpp - tiktoken.cpp) - -target_include_directories(tokenizer PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} third-party/sentencepiece/src) - -# add RE2 as subdirectory -set(ABSL_ENABLE_INSTALL ON) -set(ABSL_PROPAGATE_CXX_STD ON) -set(_pic_flag -${CMAKE_POSITION_INDEPENDENT_CODE}) -set(CMAKE_POSITION_INDEPENDENT_CODE ON) -add_subdirectory(third-party/abseil-cpp) -add_subdirectory(third-party/re2) -add_subdirectory(third-party/sentencepiece) -set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag}) - -target_link_libraries(tokenizer PUBLIC re2::re2 sentencepiece-static) diff --git a/tokenizer/__init__.py b/tokenizer/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/tokenizer/base.py b/tokenizer/base.py deleted file mode 100644 index 75998b32b..000000000 --- a/tokenizer/base.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. -""" -Abstract base class for all tokenizer classes in python matching c++ interface. -""" - -# Standard -from abc import ABC, abstractmethod -from typing import List - - -class TokenizerBase(ABC): - __doc__ = __doc__ - - @abstractmethod - def encode(self, s: str, *, bos: bool = False, eos: bool = False) -> List[int]: - """Encode the given string and optionally include bos/eos tokens""" - - @abstractmethod - def decode(self, ids: List[int]) -> str: - """Decode the given token ids into a string""" - - @abstractmethod - def bos_id(self) -> int: - """The id of the begin-of-string token""" - - @abstractmethod - def eos_id(self) -> int: - """The id of the end-of-string token""" diff --git a/tokenizer/base64.h b/tokenizer/base64.h deleted file mode 100644 index 12b8703a8..000000000 --- a/tokenizer/base64.h +++ /dev/null @@ -1,187 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ -// @lint-ignore-every LICENSELINT -/************************************************************************** - Copyright (c) 2023 sewenew - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - *************************************************************************/ - -#pragma once - -#include -#include -#include -#include - -namespace base64 { - -std::string decode(const std::string_view& input); - -namespace detail { - -constexpr uint32_t DECODE_TABLE[] = { - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255, - 255, 255, 63, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, - 255, 255, 255, 255, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, - 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 25, 255, 255, 255, 255, 255, 255, 26, 27, 28, 29, 30, 31, 32, 33, - 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, - 49, 50, 51, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255}; - -inline void validate(uint32_t v) { - if (v == 255) { - fprintf(stderr, "invalid char"); - exit(EXIT_FAILURE); - } -} - -inline void decode(const std::string_view& input, std::string& output) { - if (input.size() != 4) { - fprintf(stderr, "input length must be 4, got %zu", input.size()); - exit(EXIT_FAILURE); - } - - uint32_t val = 0; - - uint8_t c = input[0]; - auto v = DECODE_TABLE[c]; - validate(v); - val = v; - - c = input[1]; - v = DECODE_TABLE[c]; - validate(v); - val = (val << 6) | v; - - c = input[2]; - v = DECODE_TABLE[c]; - validate(v); - val = (val << 6) | v; - - c = input[3]; - v = DECODE_TABLE[c]; - validate(v); - val = (val << 6) | v; - - output.push_back(static_cast((val >> 16) & 0xFF)); - output.push_back(static_cast((val >> 8) & 0xFF)); - output.push_back(static_cast(val & 0xFF)); -} - -inline void decode_1_padding( - const std::string_view& input, - std::string& output) { - if (input.size() != 3) { - fprintf(stderr, "input length must be 3, got %zu", input.size()); - exit(EXIT_FAILURE); - } - - uint32_t val = 0; - - uint8_t c = input[0]; - auto v = DECODE_TABLE[c]; - validate(v); - val = v; - - c = input[1]; - v = DECODE_TABLE[c]; - validate(v); - val = (val << 6) | v; - - 
c = input[2]; - v = DECODE_TABLE[c]; - validate(v); - val = (val << 6) | v; - - output.push_back(static_cast((val >> 10) & 0xFF)); - output.push_back(static_cast((val >> 2) & 0xFF)); -} - -inline void decode_2_padding( - const std::string_view& input, - std::string& output) { - assert(input.size() == 2); - - uint32_t val = 0; - - uint8_t c = input[0]; - auto v = DECODE_TABLE[c]; - validate(v); - val = v; - - c = input[1]; - v = DECODE_TABLE[c]; - validate(v); - val = (val << 6) | v; - - output.push_back(static_cast((val >> 4) & 0xFF)); -} - -} // namespace detail - -inline std::string decode(const std::string_view& input) { - if (input.empty()) { - fprintf(stderr, "empty input"); - exit(EXIT_FAILURE); - } - - // Faster than `input.size() % 4`. - if ((input.size() & 3) != 0 || input.size() < 4) { - fprintf( - stderr, - "input length must be larger than 4 and is multiple of 4, got %zu", - input.size()); - exit(EXIT_FAILURE); - } - - std::string output; - output.reserve(input.size() / 4 * 3); - auto idx = 0U; - for (; idx < input.size() - 4; idx += 4) { - detail::decode(input.substr(idx, 4), output); - } - - // Last 4 bytes. Might contain paddings. - if (input[idx + 3] == '=') { - if (input[idx + 2] == '=') { - // Tow paddings. - detail::decode_2_padding(input.substr(idx, 2), output); - } else { - // One padding. - detail::decode_1_padding(input.substr(idx, 3), output); - } - } else { - // No padding. - detail::decode(input.substr(idx, 4), output); - } - - return output; -} -} // namespace base64 diff --git a/tokenizer/hf_tokenizer.py b/tokenizer/hf_tokenizer.py deleted file mode 100644 index 7ad5807d1..000000000 --- a/tokenizer/hf_tokenizer.py +++ /dev/null @@ -1,92 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# Standard -from typing import List, Optional -import json -import os - -# Third Party -from tokenizers import Tokenizer - -# Local -from .base import TokenizerBase - - -class HFTokenizer(TokenizerBase): - """ - Wrapper around the Huggingface `tokenizers` library for API compatibility - """ - - def __init__(self, file_path: str): - # If the path is a directory, look for "tokenizer.json" which is - # standard for transformers checkpoints and also look for the - # "tokenizer_config.json" file to parse eos/bos tokens - if os.path.isdir(file_path): - tokenizer_path = os.path.join(file_path, "tokenizer.json") - tokenizer_config_path = os.path.join(file_path, "tokenizer_config.json") - else: - tokenizer_path = file_path - tokenizer_config_path = os.path.join(os.path.dirname(file_path), "tokenizer_config.json") - if not os.path.isfile(tokenizer_path): - tokenizer_config_path = None - - # Load the tokenizer itself - self._tokenizer = Tokenizer.from_file(tokenizer_path) - - # If available, parse bos/eos tokens from the tokenizer config - self._bos_id, self._eos_id = None, None - if tokenizer_config_path is not None: - with open(tokenizer_config_path, "r") as handle: - tok_config = json.load(handle) - bos_token = tok_config.get("bos_token") - eos_token = tok_config.get("eos_token") - if bos_token is not None: - self._bos_id = self._tokenizer.token_to_id(bos_token) - if eos_token is not None: - self._eos_id = self._tokenizer.token_to_id(eos_token) - - # If no eos/bos tokens found, go looking for them! 
- if None in [self._bos_id, self._eos_id]: - tok_content = json.loads(self._tokenizer.to_str()) - if self._bos_id is None: - self._bos_id = self._look_for_special_token(tok_content, ["begin", "text"]) - if self._eos_id is None: - self._eos_id = self._look_for_special_token(tok_content, ["end", "text"]) - - assert None not in [self._bos_id, self._eos_id], "Unable to find an BOS/EOS tokens" - - @staticmethod - def _look_for_special_token(added_tokens: dict, search_strs: List[str]) -> Optional[int]: - candidate_toks = added_tokens - for search_str in search_strs: - candidate_toks = [ - tok for tok in candidate_toks - if tok["special"] and search_str in tok["content"] - ] - if len(candidate_toks) == 1: - return candidate_toks[0]["id"] - - def encode( - self, - s: str, - *, - bos: bool = False, - eos: bool = False, - ) -> List[int]: - res = self._tokenizer.encode(s, add_special_tokens=bos).ids - if eos and (not res or res[-1] != self._eos_token): - res.append(self._eos_token) - return res - - def decode(self, ids: List[int]) -> str: - return self._tokenizer.decode(ids) - - def bos_id(self) -> int: - return self._bos_id - - def eos_id(self) -> int: - return self._eos_id diff --git a/tokenizer/sentencepiece.cpp b/tokenizer/sentencepiece.cpp deleted file mode 100644 index 0cdfc7e30..000000000 --- a/tokenizer/sentencepiece.cpp +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -// sentencepiece tokenizer - -#include -#include -#include -#include -#include "absl/strings/str_replace.h" - -const char kSpaceSymbol[] = "\xe2\x96\x81"; - -SPTokenizer::SPTokenizer() - : Tokenizer(), - _processor(std::make_unique()) {} - -/** - * @brief Load the tokenizer from a file. The tokenizer file contains the - * vocabulary and scores. The format is: the first integer is the maximum - * token length, followed by a list of (word_len, word) pairs. Here we - * are reading all the vocabulary into memory and keep it sorted for fast - * lookup. - * - * @param tokenizer_path The path to the tokenizer file. - * @return void - */ -void SPTokenizer::load(const std::string& tokenizer_path) { - if (initialized_) { - fprintf(stderr, "Tokenizer already initialized.\n"); - return; - } - // read in the file - const auto status = _processor->Load(tokenizer_path); - if (!status.ok()) { - fprintf(stderr, "couldn't load %s\n. If this tokenizer artifact is for llama3, please pass `-l 3`.", tokenizer_path.c_str()); - exit(EXIT_FAILURE); - } - // load vocab_size, bos_tok, eos_tok - vocab_size_ = _processor->GetPieceSize(); - bos_tok_ = _processor->bos_id(); - eos_tok_ = _processor->eos_id(); - initialized_ = true; -} - -SPTokenizer::~SPTokenizer() {} - -/** - * @brief Decode a token into string. - * - * @param prev_token The previous token. - * @param token The current token. - * @return std::string A pointer to the string representation of the - * token. - */ -std::string SPTokenizer::decode(uint64_t prev_token, uint64_t token) { - if (!initialized_) { - fprintf(stderr, "Tokenizer not initialized\n"); - exit(EXIT_FAILURE); - } - // get rid of the control ids and - if (_processor->IsControl(token)) { - // NB: returning empty string doesn't work for some reason. It causes - // free(): invalid pointer error. 
- return " "; - } - - std::string result = - absl::StrReplaceAll(_processor->IdToPiece(token), {{kSpaceSymbol, " "}}); - - // following BOS token, sentencepiece decoder strips any leading - // whitespace - if (prev_token == bos_tok_ && result[0] == ' ') { - result = result.substr(1); - } - - // handle <0x0A> - result = absl::StrReplaceAll(result, {{"<0x0A>", "\n"}}); - - return result; -} - -/** - * @brief Encode a string into a sequence of tokens. - * - * @param text The string to be encoded. - * @param bos The number of BOS to prepend to the token list. - * @param eos The number of EOS to append to the token list. - * @return std::vector - */ -std::vector -SPTokenizer::encode(const std::string& text, int8_t bos, int8_t eos) { - if (!initialized_) { - fprintf(stderr, "Tokenizer not initialized\n"); - exit(EXIT_FAILURE); - } - // workaround a weird issue that text doesn't have correct size() - std::string input(text.c_str()); - // should we reserve memory? - std::vector res; - auto status = _processor->Encode(input, &res); - if (!status.ok()) { - fprintf(stderr, "couldn't encode %s\n", text.c_str()); - exit(EXIT_FAILURE); - } - - std::vector tokens; - for (auto i = 0; i < bos; ++i) { - tokens.push_back(bos_tok_); - } - - for (auto i = 0; i < res.size(); ++i) { - tokens.push_back(res[i]); - } - - for (auto i = 0; i < eos; ++i) { - tokens.push_back(eos_tok_); - } - return tokens; -} diff --git a/tokenizer/third-party/abseil-cpp b/tokenizer/third-party/abseil-cpp deleted file mode 160000 index 854193071..000000000 --- a/tokenizer/third-party/abseil-cpp +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 854193071498f330b71083d7e06a7cd18e02a4cc diff --git a/tokenizer/third-party/re2 b/tokenizer/third-party/re2 deleted file mode 160000 index ac82d4f62..000000000 --- a/tokenizer/third-party/re2 +++ /dev/null @@ -1 +0,0 @@ -Subproject commit ac82d4f628a2045d89964ae11c48403d3b091af1 diff --git a/tokenizer/third-party/sentencepiece b/tokenizer/third-party/sentencepiece deleted file mode 160000 index 7dcb54145..000000000 --- a/tokenizer/third-party/sentencepiece +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 7dcb541451b1862d73f473b3804ccf8f2a9e10f6 diff --git a/tokenizer/tiktoken.cpp b/tokenizer/tiktoken.cpp deleted file mode 100644 index 2f31f057a..000000000 --- a/tokenizer/tiktoken.cpp +++ /dev/null @@ -1,390 +0,0 @@ -// @lint-ignore-every LICENSELINT -/************************************************************************** - Copyright (c) 2023 sewenew - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
- *************************************************************************/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// ------------------------------Util start------------------------------------ - -static uint64_t _max_size() { - return std::numeric_limits::max(); -} - -static Re2UPtr _create_regex(const std::string& pattern) { - assert(!pattern.empty()); - - return std::make_unique("(" + pattern + ")"); -} - -static Re2UPtr _build_special_token_regex(const Encoder& special_encoder) { - std::string special_pattern; - for (const auto& ele : special_encoder) { - if (!special_pattern.empty()) { - special_pattern += "|"; - } - special_pattern += re2::RE2::QuoteMeta(ele.first); - } - - if (special_pattern.empty()) { - return nullptr; - } - - return _create_regex(special_pattern); -} - -static std::pair _parse(const std::string& line) { - auto pos = line.find(" "); - if (pos == std::string::npos) { - throw std::invalid_argument("invalid encoder line: " + line); - } - - auto token = base64::decode({line.data(), pos}); - uint64_t rank = 0; - try { - rank = std::stoul(line.substr(pos + 1)); - } catch (const std::exception&) { - throw std::invalid_argument("invalid encoder rank: " + line); - } - - return {std::move(token), rank}; -} - -static Encoder _load_encoder(const std::string& path) { - std::ifstream file(path); - if (!file) { - fprintf(stderr, "failed to open encoder file: %s\n", path.c_str()); - exit(EXIT_FAILURE); - } - - Encoder encoder; - std::string line; - while (std::getline(file, line)) { - auto [token, rank] = _parse(line); - - if (!encoder.emplace(std::move(token), rank).second) { - fprintf(stderr, "duplicate item: %s\n", line.c_str()); - } - } - return encoder; -} - -static Decoder _build_decoder(const Encoder& encoder) { - Decoder decoder; - for (const auto& [k, v] : encoder) { - decoder.emplace(v, k); - } - - if (encoder.size() != decoder.size()) { - fprintf(stderr, "duplicate items in encoder"); - exit(EXIT_FAILURE); - } - - return decoder; -} - -static std::vector _byte_pair_merge( - const std::string& piece, - const std::unordered_map& ranks, - std::function func) { - // This is a vector of (start, rank). - // The rank is of the byte pair starting at position start. - // The rank of the last item in the vector is not a valid value. - std::vector> parts; - parts.reserve(piece.size() + 1); - for (auto idx = 0U; idx < piece.size() + 1; ++idx) { - parts.emplace_back(idx, _max_size()); - } - - auto get_rank = [&piece, &ranks]( - const std::vector>& parts, - uint64_t start_idx, - uint64_t skip) -> std::optional { - if (start_idx + skip + 2 < parts.size()) { - auto s = parts[start_idx].first; - auto e = parts[start_idx + skip + 2].first; - auto key = piece.substr(s, e - s); - auto iter = ranks.find(key); - if (iter != ranks.end()) { - return iter->second; - } - } - return std::nullopt; - }; - - // We look up the ranks once in the beginning and iteratively update - // them during each merge, which reduces the number of rank lookups. - for (auto i = 0U; i < parts.size() - 2; ++i) { - auto rank = get_rank(parts, i, 0); - if (rank) { - // usize::MAX is a sentinel value and cannot be a valid rank - if (*rank == _max_size()) { - fprintf(stderr, "at %" PRIu32 " rank is too large\n", i); - } - parts[i].second = *rank; - } - } - - // If you have n parts and m merges, this does O(mn) work. - // We could do something with a heap and do O(m log n) work. 
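  // (m here is the number of merges actually performed, n the number of
  // initial single-byte parts.) As a toy illustration with assumed ranks
  // {"ab": 0, "bc": 5, "cd": 2} and piece "abcd": the loop merges "ab" first
  // (lowest rank), then "cd", and stops because neither "abc" nor "abcd" has
  // a rank, so func() is invoked on the byte ranges spelling "ab" and "cd".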
- // It is important to consider that n is often small (<100), and as such - // the cache-locality benefits outweigh the algorithmic complexity downsides - // of the `parts` vector data structure above. - - // Note that we hash bytes, not token pairs. As long as we train BPE the way - // we currently do, this is equivalent. An easy way to break this would be - // to decouple merge priority from token index or to prevent specific token - // merges. - while (true) { - if (parts.size() == 1) { - break; - } - - // usize::MAX is a sentinel rank value allowing us to - // take the min more quickly - auto min_rank = std::make_pair(_max_size(), 0); - for (auto i = 0U; i < parts.size() - 1; ++i) { - auto rank = parts[i].second; - if (rank < min_rank.first) { - min_rank.first = rank; - min_rank.second = i; - } - } - - if (min_rank.first != _max_size()) { - auto i = min_rank.second; - - // NOTE: We are about to remove parts[i + 1]. We do not do it - // yet because there are cache-locality benefits to updating - // parts[i] and parts[i-1] before removing, which could thrash - // the cache. Thus, we update the rank calculation by skipping over - // parts[i + 1], by invoking `get_rank!` with `skip = 1`. - auto rank = get_rank(parts, i, 1); - if (rank) { - parts[i].second = *rank; - } else { - parts[i].second = _max_size(); - } - if (i > 0) { - rank = get_rank(parts, i - 1, 1); - if (rank) { - parts[i - 1].second = *rank; - } else { - parts[i - 1].second = _max_size(); - } - } - - parts.erase(parts.begin() + (i + 1)); - } else { - break; - } - } - std::vector out; - out.reserve(parts.size() - 1); - for (auto i = 0U; i < parts.size() - 1; ++i) { - auto s = parts[i].first; - auto e = parts[i + 1].first; - out.push_back(func(s, e)); - } - return out; -} - -static std::vector _byte_pair_encode( - const std::string& piece, - const Encoder& encoder) { - if (piece.size() == 1) { - auto iter = encoder.find(piece); - if (iter != encoder.end()) { - return std::vector({iter->second}); - } else { - // TODO: is it possible? - return {}; - } - } - - return _byte_pair_merge( - piece, encoder, [&piece, &encoder](uint64_t start, uint64_t stop) { - std::string key = piece.substr(start, stop - start); - auto iter = encoder.find(key); - if (iter != encoder.end()) { - return iter->second; - } else { - // TODO: what if key does not exist? Should we return `unknown`? - // assert(false); // ?? - return uint64_t(0); - } - }); -} -// ------------------------------Util end------------------------------------ -// -------------------------private method start------------------------------- - -template -std::pair, re2::StringPiece> -Tiktoken::_split_with_allowed_special_token( - re2::StringPiece& input, - const T& allowed_special) { - if (!_special_token_regex) { - return std::make_pair(std::nullopt, input); - } - - auto start = input.begin(); - std::string special; - while (true) { - if (!re2::RE2::FindAndConsume(&input, *_special_token_regex, &special)) { - // No special token. - break; - } - - if (allowed_special.count(special) == 1) { - // Found an allowed special token, split the text with it. 
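      // FindAndConsume has already advanced `input` past the match, so the
      // StringPiece built below spans everything from the original start up
      // to (but not including) the special token; that prefix is then
      // tokenized with the ordinary BPE regex by the caller.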
- return std::make_pair( - special, - re2::StringPiece(start, input.begin() - start - special.size())); - } // else try to find the next special token - } - - return std::make_pair(std::nullopt, input); -} - -void Tiktoken::_encode( - re2::StringPiece& input, - std::vector& ret, - uint64_t& last_piece_token_len) { - std::string piece; - assert(_regex); - while (re2::RE2::FindAndConsume(&input, *_regex, &piece)) { - auto iter = _encoder.find(piece); - if (iter != _encoder.end()) { - last_piece_token_len = 1; - ret.push_back(iter->second); - continue; - } - auto tokens = _byte_pair_encode(piece, _encoder); - last_piece_token_len = tokens.size(); - ret.insert(ret.end(), tokens.begin(), tokens.end()); - } -} - -template -std::pair, uint64_t> Tiktoken::_encode_with_special_token( - const std::string& text, - const T& allowed_special) { - std::vector tokens; - uint64_t last_piece_token_len = 0; - re2::StringPiece input(text); - while (true) { - auto [special, sub_input] = - _split_with_allowed_special_token(input, allowed_special); - - _encode(sub_input, tokens, last_piece_token_len); - - if (special) { - uint64_t token = 0; - try { - token = _special_token_encoder.at(*special); - } catch (const std::out_of_range&) { - // Should never go here, since special pattern includes all special - // chars. - fprintf(stderr, "unknown special token: %s\n", special->c_str()); - exit(EXIT_FAILURE); - } - - tokens.push_back(token); - last_piece_token_len = 0; - } else { - break; - } - } - - // last_piece_token_len is how many tokens came from the last regex split. - // This is used for determining unstable tokens, since you can't merge - // across (stable) regex splits - return std::make_pair(tokens, last_piece_token_len); -} - -// -------------------------private method end------------------------------- -// -------------------------public method start------------------------------- - -Tiktoken::Tiktoken() : Tokenizer() {} - -void Tiktoken::load(const std::string& path) { - _encoder = _load_encoder(path); - _special_token_encoder = _get_special_tokens(_encoder.size()); - - _decoder = _build_decoder(_encoder); - _special_token_decoder = _build_decoder(_special_token_encoder); - - _regex = _create_regex(_pattern); - _special_token_regex = _build_special_token_regex(_special_token_encoder); - - // initialize vocab_size, bos_tok, eos_tok - vocab_size_ = _encoder.size() + _special_token_encoder.size(); - bos_tok_ = _encoder.size(); // hardcoded (see _get_special_tokens) - eos_tok_ = _encoder.size() + 1; // hardcoded (see _get_special_tokens) - initialized_ = true; -} - -std::vector -Tiktoken::encode(const std::string& text, int8_t bos, int8_t eos) { - if (!initialized_) { - exit(EXIT_FAILURE); - } - auto res = _encode_with_special_token(text, _special_token_encoder).first; - for (auto i = 0; i < bos; ++i) { - res.insert(res.begin(), bos_tok_); - } - for (auto i = 0; i < eos; ++i) { - res.push_back(eos_tok_); - } - return res; -} - -std::string Tiktoken::decode(uint64_t prev, uint64_t cur) { - (void)prev; - if (!initialized_) { - exit(EXIT_FAILURE); - } - std::string ret; - - std::string token_bytes; - auto iter = _decoder.find(cur); - if (iter != _decoder.end()) { - token_bytes = iter->second; - } else { - iter = _special_token_decoder.find(cur); - if (iter != _special_token_decoder.end()) { - token_bytes = iter->second; - } else { - fprintf(stderr, "unknown token: %" PRIu64 "\n", cur); - exit(EXIT_FAILURE); - } - } - ret += token_bytes; - - return ret; -} -// -------------------------public method 
end------------------------------- diff --git a/tokenizer/tiktoken.py b/tokenizer/tiktoken.py deleted file mode 100644 index 30eb98624..000000000 --- a/tokenizer/tiktoken.py +++ /dev/null @@ -1,241 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -import os -from logging import getLogger -from pathlib import Path -from typing import ( - AbstractSet, - cast, - Collection, - Dict, - Iterator, - List, - Literal, - Sequence, - TypedDict, - Union, -) - -import tiktoken -from tiktoken.load import load_tiktoken_bpe - -from .base import TokenizerBase - - -logger = getLogger(__name__) - - -Role = Literal["system", "user", "assistant"] - - -class Message(TypedDict): - role: Role - content: str - - -Dialog = Sequence[Message] - - -class Tokenizer(TokenizerBase): - """ - tokenizing and encoding/decoding text using the Tiktoken tokenizer. - """ - - special_tokens: Dict[str, int] - - num_reserved_special_tokens = 256 - - pat_str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+" # noqa: E501 - - def __init__(self, model_path: str): - """ - Initializes the Tokenizer with a Tiktoken model. - - Args: - model_path (str): The path to the Tiktoken model file. - """ - # reload tokenizer - assert os.path.isfile(model_path), model_path - - mergeable_ranks = load_tiktoken_bpe(model_path) - num_base_tokens = len(mergeable_ranks) - special_tokens = [ - "<|begin_of_text|>", - "<|end_of_text|>", - "<|reserved_special_token_0|>", - "<|reserved_special_token_1|>", - "<|reserved_special_token_2|>", - "<|reserved_special_token_3|>", - "<|start_header_id|>", - "<|end_header_id|>", - "<|reserved_special_token_4|>", - "<|eot_id|>", # end of turn - ] + [ - f"<|reserved_special_token_{i}|>" - for i in range(5, self.num_reserved_special_tokens - 5) - ] - self.special_tokens = { - token: num_base_tokens + i for i, token in enumerate(special_tokens) - } - self.model = tiktoken.Encoding( - name=Path(model_path).name, - pat_str=self.pat_str, - mergeable_ranks=mergeable_ranks, - special_tokens=self.special_tokens, - ) - logger.debug(f"Reloaded Tiktoken model from {model_path}") - - # BOS / EOS token IDs - self.n_words: int = self.model.n_vocab - self._bos_id: int = self.special_tokens["<|begin_of_text|>"] - self._eos_id: int = self.special_tokens["<|end_of_text|>"] - self.pad_id: int = -1 - self.stop_tokens = { - self.special_tokens["<|end_of_text|>"], - self.special_tokens["<|eot_id|>"], - } - logger.debug( - f"#words: {self.n_words} - BOS ID: {self._bos_id} - EOS ID: {self._eos_id}" - ) - - def encode( - self, - s: str, - *, - bos: bool = False, - eos: bool = False, - allowed_special: Union[Literal["all"], AbstractSet[str]] = set(), # noqa B006 - disallowed_special: Union[Literal["all"], Collection[str]] = (), - ) -> List[int]: - """ - Encodes a string into a list of token IDs. - - Args: - s (str): The input string to be encoded. - bos (bool): Whether to prepend the beginning-of-sequence token. - eos (bool): Whether to append the end-of-sequence token. - allowed_special ("all"|set[str]): allowed special tokens in string - disallowed_special ("all"|set[str]): special tokens that raise an error when in string - - Returns: - list[int]: A list of token IDs. - - By default, setting disallowed_special=() encodes a string by ignoring - special tokens. 
Specifically: - - Setting `disallowed_special` to () will cause all text corresponding - to special tokens to be encoded as natural text (instead of raising - an error). - - Setting `allowed_special` to "all" will treat all text corresponding - to special tokens to be encoded as special tokens. - """ - assert type(s) is str - - # The tiktoken tokenizer can handle <=400k chars without - # pyo3_runtime.PanicException (may go beyond 400k) - TIKTOKEN_MAX_ENCODE_CHARS = 400_000 - - # https://github.com/openai/tiktoken/issues/195 - # Here we iterate over subsequences and split if we exceed the limit - # of max consecutive non-whitespace or whitespace characters. - MAX_NO_WHITESPACES_CHARS = 25_000 - - substrs = ( - substr - for i in range(0, len(s), TIKTOKEN_MAX_ENCODE_CHARS) - for substr in self._split_whitespaces_or_nonwhitespaces( - s[i : i + TIKTOKEN_MAX_ENCODE_CHARS], MAX_NO_WHITESPACES_CHARS - ) - ) - t: List[int] = [] - for substr in substrs: - t.extend( - self.model.encode( - substr, - allowed_special=allowed_special, - disallowed_special=disallowed_special, - ) - ) - if bos: - t.insert(0, self._bos_id) - if eos: - t.append(self._eos_id) - return t - - def bos_id(self) -> int: - return self._bos_id - - def eos_id(self) -> int: - return self._eos_id - - def decode(self, t: Sequence[int]) -> str: - """ - Decodes a list of token IDs into a string. - - Args: - t (List[int]): The list of token IDs to be decoded. - - Returns: - str: The decoded string. - """ - # typecast is safe here, Tiktoken doesn't do anything list-related with the sequence. - return self.model.decode(cast(List[int], t)) - - @staticmethod - def _split_whitespaces_or_nonwhitespaces( - s: str, max_consecutive_slice_len: int - ) -> Iterator[str]: - """ - Split the string `s` so that each substring contains no more than `max_consecutive_slice_len` - consecutive whitespaces or consecutive non-whitespaces - """ - current_slice_len = 0 - current_slice_is_space = s[0].isspace() if len(s) > 0 else False - slice_start = 0 - - for i in range(len(s)): - is_now_space = s[i].isspace() - - if current_slice_is_space ^ is_now_space: - current_slice_len = 1 - current_slice_is_space = is_now_space - else: - current_slice_len += 1 - if current_slice_len > max_consecutive_slice_len: - yield s[slice_start:i] - slice_start = i - current_slice_len = 1 - yield s[slice_start:] - - -class ChatFormat: - def __init__(self, tokenizer: Tokenizer): - self.tokenizer = tokenizer - - def encode_header(self, message: Message) -> List[int]: - tokens = [] - tokens.append(self.tokenizer.special_tokens["<|start_header_id|>"]) - tokens.extend(self.tokenizer.encode(message["role"], bos=False, eos=False)) - tokens.append(self.tokenizer.special_tokens["<|end_header_id|>"]) - tokens.extend(self.tokenizer.encode("\n\n", bos=False, eos=False)) - return tokens - - def encode_message(self, message: Message) -> List[int]: - tokens = self.encode_header(message) - tokens.extend( - self.tokenizer.encode(message["content"].strip(), bos=False, eos=False) - ) - tokens.append(self.tokenizer.special_tokens["<|eot_id|>"]) - return tokens - - def encode_dialog_prompt(self, dialog: Dialog) -> List[int]: - tokens = [] - tokens.append(self.tokenizer.special_tokens["<|begin_of_text|>"]) - for message in dialog: - tokens.extend(self.encode_message(message)) - # Add the start of an assistant message for the model to complete - tokens.extend(self.encode_header({"role": "assistant", "content": ""})) - return tokens diff --git a/tokenizer/tokenizer.h b/tokenizer/tokenizer.h deleted file mode 
100644 index 9e1977b71..000000000 --- a/tokenizer/tokenizer.h +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -// A simple Tokenizer interface. -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "sentencepiece_processor.h" - -class Tokenizer { - public: - explicit Tokenizer() {} - virtual ~Tokenizer() {} - - virtual void load(const std::string& tokenizer_path) = 0; - - virtual std::vector - encode(const std::string& input, int8_t bos, int8_t eos) = 0; - - virtual std::string decode(uint64_t prev_token, uint64_t token) = 0; - - // getters - int32_t vocab_size() const { - return vocab_size_; - } - - uint64_t bos_tok() const { - return bos_tok_; - } - - uint64_t eos_tok() const { - return eos_tok_; - } - - protected: - bool initialized_ = false; - int32_t vocab_size_; - uint64_t bos_tok_, eos_tok_; -}; - -// ----------------------- SPTokenizer ----------------------- -// Used by sentencepiece. Adapted from llama2.c. -struct TokenIndex { - const char* str; - int32_t id; -}; - -class SPTokenizer : public Tokenizer { - public: - explicit SPTokenizer(); - ~SPTokenizer() override; - - void load(const std::string& tokenizer_path) override; - - std::vector encode(const std::string& input, int8_t bos, int8_t eos) - override; - - std::string decode(uint64_t prev_token, uint64_t token) override; - - private: - std::unique_ptr _processor; -}; - -// ----------------------- Tiktoken ----------------------- -// Used by OpenAI, adapted from https://github.com/sewenew/tokenizer - -using Encoder = std::unordered_map; -using Decoder = std::unordered_map; -using Re2UPtr = std::unique_ptr; - -class Tiktoken : public Tokenizer { - public: - explicit Tiktoken(); - ~Tiktoken(){}; - - void load(const std::string& tokenizer_path); - - std::vector - encode(const std::string& input, int8_t bos, int8_t eos); - - std::string decode(uint64_t prev_token, uint64_t token); - - private: - static inline const Encoder _get_special_tokens(ssize_t num_base_tokens) { - Encoder special_tokens; - special_tokens.emplace("<|begin_of_text|>", num_base_tokens++); - special_tokens.emplace("<|end_of_text|>", num_base_tokens++); - special_tokens.emplace("<|reserved_special_token_0|>", num_base_tokens++); - special_tokens.emplace("<|reserved_special_token_1|>", num_base_tokens++); - special_tokens.emplace("<|reserved_special_token_2|>", num_base_tokens++); - special_tokens.emplace("<|reserved_special_token_3|>", num_base_tokens++); - special_tokens.emplace("<|start_header_id|>", num_base_tokens++); - special_tokens.emplace("<|end_header_id|>", num_base_tokens++); - special_tokens.emplace("<|reserved_special_token_4|>", num_base_tokens++); - special_tokens.emplace("<|eot_id|>", num_base_tokens++); - for (auto i = 5; i < 251; ++i) { - special_tokens.emplace( - "<|reserved_special_token_" + std::to_string(i) + "|>", - num_base_tokens++); - } - return special_tokens; - } - - template - std::pair, re2::StringPiece> - _split_with_allowed_special_token( - re2::StringPiece& input, - const T& allowed_special); - - void _encode( - re2::StringPiece& input, - std::vector& ret, - uint64_t& last_piece_token_len); - - template - std::pair, uint64_t> _encode_with_special_token( - const std::string& text, - const T& allowed_special); - - // 
Removed negative lookahead \s+(?!\S) since it's not supported by RE2. - const std::string _pattern = - R"((?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+)"; - Encoder _encoder; - Encoder _special_token_encoder; - Decoder _decoder; - Decoder _special_token_decoder; - - Re2UPtr _regex; - Re2UPtr _special_token_regex; -}; diff --git a/torchchat/utils/scripts/build_native.sh b/torchchat/utils/scripts/build_native.sh index 3c2c1c846..909fd2b97 100755 --- a/torchchat/utils/scripts/build_native.sh +++ b/torchchat/utils/scripts/build_native.sh @@ -64,7 +64,7 @@ fi pushd ${TORCHCHAT_ROOT} -git submodule update --init +git submodule update --init --recursive git submodule sync if [[ "$TARGET" == "et" ]]; then if [ ! -d "${TORCHCHAT_ROOT}/${ET_BUILD_DIR}/install" ]; then From 6e40ec00276d740fc7d369faad7f1e02c8160e96 Mon Sep 17 00:00:00 2001 From: Jack-Khuu Date: Mon, 9 Dec 2024 12:29:58 -0800 Subject: [PATCH 26/83] Update PT Pin to 1013 (#1407) * Update PT Pin to 1015 * Update install_requirements.sh --- install/install_requirements.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/install/install_requirements.sh b/install/install_requirements.sh index a39c55cc8..3e1f9a655 100755 --- a/install/install_requirements.sh +++ b/install/install_requirements.sh @@ -62,13 +62,13 @@ echo "Using pip executable: $PIP_EXECUTABLE" # NOTE: If a newly-fetched version of the executorch repo changes the value of # PYTORCH_NIGHTLY_VERSION, you should re-run this script to install the necessary # package versions. -PYTORCH_NIGHTLY_VERSION=dev20241010 +PYTORCH_NIGHTLY_VERSION=dev20241013 # Nightly version for torchvision -VISION_NIGHTLY_VERSION=dev20241010 +VISION_NIGHTLY_VERSION=dev20241013 # Nightly version for torchtune -TUNE_NIGHTLY_VERSION=dev20241010 +TUNE_NIGHTLY_VERSION=dev20241013 # Uninstall triton, as nightly will depend on pytorch-triton, which is one and the same ( From d979da14c28bdb9287e427adc4bb9d7cee20d4bb Mon Sep 17 00:00:00 2001 From: YanbingJiang Date: Tue, 10 Dec 2024 05:17:39 +0800 Subject: [PATCH 27/83] Update docs for max-autotune usage (#1405) Co-authored-by: Jack-Khuu --- docs/ADVANCED-USERS.md | 2 ++ docs/model_customization.md | 3 +++ 2 files changed, 5 insertions(+) diff --git a/docs/ADVANCED-USERS.md b/docs/ADVANCED-USERS.md index 8f66b8a29..b996bf202 100644 --- a/docs/ADVANCED-USERS.md +++ b/docs/ADVANCED-USERS.md @@ -251,6 +251,8 @@ To improve performance, you can compile the model with `--compile` trading off the time to first token processed with time per token. To improve performance further, you may also compile the prefill with `--compile-prefill`. This will increase further compilation times though. +For CPU, you can use `--max-autotune` to further improve the performance +with `--compile` and `compile-prefill`. See [`max-autotune on CPU tutorial`](https://pytorch.org/tutorials/prototype/max_autotune_on_CPU_tutorial.html). Parallel prefill is not yet supported by exported models, and may be supported in a future release. diff --git a/docs/model_customization.md b/docs/model_customization.md index 3c076fa71..7108b4ce2 100644 --- a/docs/model_customization.md +++ b/docs/model_customization.md @@ -34,6 +34,9 @@ prefill with `--compile_prefill`. To learn more about compilation, check out: https://pytorch.org/get-started/pytorch-2.0/ +For CPU, you can use `--max-autotune` to further improve the performance with `--compile` and `compile-prefill`. 
+ +See [`max-autotune on CPU tutorial`](https://pytorch.org/tutorials/prototype/max_autotune_on_CPU_tutorial.html). ## Model Precision From 6d6f2b94408e5c0a4d69db888188efbaccb63937 Mon Sep 17 00:00:00 2001 From: Michael Gschwind <61328285+mikekgfb@users.noreply.github.com> Date: Mon, 9 Dec 2024 13:19:16 -0800 Subject: [PATCH 28/83] Update run-docs to include `run-docs native` (#1403) * Update run-docs to include `run native` run docs/native-execution.md commands during execution * Update run-readme-pr.yml Include `run-docs native` in workflow --------- Co-authored-by: Jack-Khuu --- .ci/scripts/run-docs | 19 ++++++++++++- .github/workflows/run-readme-pr.yml | 43 +++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 1 deletion(-) diff --git a/.ci/scripts/run-docs b/.ci/scripts/run-docs index cc88bedac..871814593 100755 --- a/.ci/scripts/run-docs +++ b/.ci/scripts/run-docs @@ -95,7 +95,7 @@ fi if [ "$1" == "multimodal" ]; then # Expecting that this might fail this test as-is, because - # it's the first on-pr test depending on githib secrets for access with HF token access + # it's the first on-pr test depending on github secrets for access with HF token access echo "::group::Create script to run multimodal" python3 torchchat/utils/scripts/updown.py --file docs/multimodal.md > ./run-multimodal.sh @@ -111,3 +111,20 @@ if [ "$1" == "multimodal" ]; then bash -x ./run-multimodal.sh echo "::endgroup::" fi + +if [ "$1" == "native" ]; then + + echo "::group::Create script to run native-execution" + python3 torchchat/utils/scripts/updown.py --file docs/native-execution.md > ./run-native.sh + # for good measure, if something happened to updown processor, + # and it did not error out, fail with an exit 1 + echo "exit 1" >> ./run-native.sh + echo "::endgroup::" + + echo "::group::Run native-execution" + echo "*******************************************" + cat ./run-native.sh + echo "*******************************************" + bash -x ./run-native.sh + echo "::endgroup::" +fi diff --git a/.github/workflows/run-readme-pr.yml b/.github/workflows/run-readme-pr.yml index 4e5e6d014..1dc2942ef 100644 --- a/.github/workflows/run-readme-pr.yml +++ b/.github/workflows/run-readme-pr.yml @@ -287,3 +287,46 @@ jobs: echo "::endgroup::" TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs multimodal + + test-native-any: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + with: + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "12.1" + timeout: 60 + script: | + echo "::group::Print machine info" + uname -a + echo "::endgroup::" + + echo "::group::Install newer objcopy that supports --set-section-alignment" + yum install -y devtoolset-10-binutils + export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH + echo "::endgroup::" + + .ci/scripts/run-docs native + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" + + test-native-cpu: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + with: + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "12.1" + timeout: 60 + script: | + echo "::group::Print machine info" + uname -a + echo "::endgroup::" + + echo "::group::Install newer objcopy that supports --set-section-alignment" + yum install -y devtoolset-10-binutils + export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH + echo "::endgroup::" + + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs native From 46b784ea1addb0f71bfa6d82d15f7396c0b0e011 Mon Sep 17 00:00:00 2001 From: 
Michael Gschwind <61328285+mikekgfb@users.noreply.github.com> Date: Mon, 9 Dec 2024 13:19:53 -0800 Subject: [PATCH 29/83] Update README.md to run and query server during test (#1384) * Update README.md to run and query server 1 - Run server: 1a - in background 1b - capture server_pid 2 - enable query using curl 3 - shutdown server with server pid captured in server_pid * Punctuation in README.md Fix a punctuation issue in README. While this is a valid change to improve language, it is really a decoy to trigger rerunning a test that failed due to a SEV. * Extend timeout for run-readme-pr-mps.yml Readme run on M1 with MPS takes over 30 minutes, and may be hitting default timeout. Extending timeout. --------- Co-authored-by: Jack-Khuu --- .github/workflows/run-readme-pr-mps.yml | 1 + README.md | 8 +++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/run-readme-pr-mps.yml b/.github/workflows/run-readme-pr-mps.yml index 718d5cf9e..3e90265f5 100644 --- a/.github/workflows/run-readme-pr-mps.yml +++ b/.github/workflows/run-readme-pr-mps.yml @@ -10,6 +10,7 @@ jobs: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: runner: macos-m1-14 + timeout-minutes: 50 script: | conda create -y -n test-readme-mps-macos python=3.10.11 llvm-openmp conda activate test-readme-mps-macos diff --git a/README.md b/README.md index 4b910e575..3c37edf09 100644 --- a/README.md +++ b/README.md @@ -231,6 +231,8 @@ python3 torchchat.py server llama3.1 ``` [skip default]: end +[shell default]: python3 torchchat.py server llama3.1 & server_pid=$! + In another terminal, query the server using `curl`. Depending on the model configuration, this query might take a few minutes to respond. > [!NOTE] @@ -244,8 +246,6 @@ Setting `stream` to "true" in the request emits a response in chunks. If `stream **Example Input + Output** -[skip default]: begin - ``` curl http://127.0.0.1:5000/v1/chat/completions \ -H "Content-Type: application/json" \ @@ -265,12 +265,14 @@ curl http://127.0.0.1:5000/v1/chat/completions \ ] }' ``` +[skip default]: begin ``` {"response":" I'm a software developer with a passion for building innovative and user-friendly applications. I have experience in developing web and mobile applications using various technologies such as Java, Python, and JavaScript. I'm always looking for new challenges and opportunities to learn and grow as a developer.\n\nIn my free time, I enjoy reading books on computer science and programming, as well as experimenting with new technologies and techniques. I'm also interested in machine learning and artificial intelligence, and I'm always looking for ways to apply these concepts to real-world problems.\n\nI'm excited to be a part of the developer community and to have the opportunity to share my knowledge and experience with others. I'm always happy to help with any questions or problems you may have, and I'm looking forward to learning from you as well.\n\nThank you for visiting my profile! I hope you find my information helpful and interesting. If you have any questions or would like to discuss any topics, please feel free to reach out to me. I"} ``` [skip default]: end +[shell default]: kill ${server_pid} @@ -664,6 +666,6 @@ awesome libraries and tools you've built around local LLM inference. torchchat is released under the [BSD 3 license](LICENSE). (Additional code in this distribution is covered by the MIT and Apache Open Source -licenses.) However you may have other legal obligations that govern +licenses.) 
However, you may have other legal obligations that govern your use of content, such as the terms of service for third-party models. From 2c03a2a8585efb0d34346232db2a9bf0673da442 Mon Sep 17 00:00:00 2001 From: Michael Gschwind <61328285+mikekgfb@users.noreply.github.com> Date: Mon, 9 Dec 2024 14:13:13 -0800 Subject: [PATCH 30/83] Update run-docs to enable `run-docs evaluation` (#1383) * Update run-docs Enable evaluation tests from docs/evaluation.md * Update evaluation.md Install executorch for running test * Update evaluation.md wording * Update evaluation.md Avoid bleedthru of markup for test --------- Co-authored-by: Jack-Khuu --- .ci/scripts/run-docs | 3 --- torchchat/utils/docs/evaluation.md | 7 ++++++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/.ci/scripts/run-docs b/.ci/scripts/run-docs index 871814593..6f5ee46c7 100755 --- a/.ci/scripts/run-docs +++ b/.ci/scripts/run-docs @@ -75,9 +75,6 @@ if [ "$1" == "advanced" ]; then fi if [ "$1" == "evaluation" ]; then - - exit 0 - echo "::group::Create script to run evaluation" python3 torchchat/utils/scripts/updown.py --file torchchat/utils/docs/evaluation.md --replace 'llama3:stories15M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-evaluation.sh # for good measure, if something happened to updown processor, diff --git a/torchchat/utils/docs/evaluation.md b/torchchat/utils/docs/evaluation.md index 490500223..8bc995ca7 100644 --- a/torchchat/utils/docs/evaluation.md +++ b/torchchat/utils/docs/evaluation.md @@ -4,8 +4,13 @@ # Evaluation Features + Torchchat provides evaluation functionality for your language model on a variety of tasks using the @@ -14,7 +19,7 @@ library. ## Usage -The evaluation mode of `torchchat.py` script can be used to evaluate your language model on various tasks available in the `lm_eval` library such as "wikitext". You can specify the task(s) you want to evaluate using the `--tasks` option, and limit the evaluation using the `--limit` option. If no task is specified, it will default to evaluating on "wikitext". +The evaluation mode of `torchchat.py` script can be used to evaluate your language model on various tasks available in the `lm_eval` library such as "wikitext". You can specify the task(s) you want to evaluate using the `--tasks` option, and limit the evaluation using the `--limit` option. If no task is specified, the task will default to evaluating on "wikitext". **Examples** From e1fefc095947a901ad5337cf5d067941f8034fb6 Mon Sep 17 00:00:00 2001 From: Jack-Khuu Date: Tue, 10 Dec 2024 11:32:05 -0800 Subject: [PATCH 31/83] Revert "Use pytorch-labs/tokenizers and remove tokenizer/ (#1401)" (#1414) This reverts commit fff956c5c3a68025025c5d906af80eb44e960ce4. 
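For orientation, a minimal caller of the tokenizer interface that this revert restores (tokenizer/tokenizer.h) might look like the hedged sketch below; the tokenizer path and prompt are placeholders and error handling is omitted, so treat it as an illustration rather than part of the patch.

```cpp
#include <cstdio>
#include <string>
#include <vector>
#include "tokenizer.h"

int main() {
  Tiktoken tok;                 // or SPTokenizer for llama2-style models
  tok.load("tokenizer.model");  // placeholder path
  std::vector<uint64_t> ids = tok.encode("Once upon a time", /*bos=*/1, /*eos=*/0);
  for (size_t i = 1; i < ids.size(); ++i) {
    // decode() is incremental: it takes the previous token to apply spacing rules
    std::string piece = tok.decode(ids[i - 1], ids[i]);
    printf("%s", piece.c_str());
  }
  printf("\nvocab=%d bos=%llu eos=%llu\n",
         tok.vocab_size(),
         (unsigned long long)tok.bos_tok(),
         (unsigned long long)tok.eos_tok());
  return 0;
}
```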
--- .github/workflows/pull.yml | 4 +- .gitmodules | 12 +- CMakeLists.txt | 9 +- runner/run.cpp | 517 +++++++++++++----------- runner/third-party/tokenizers | 1 - tokenizer/CMakeLists.txt | 29 ++ tokenizer/__init__.py | 0 tokenizer/base.py | 32 ++ tokenizer/base64.h | 187 +++++++++ tokenizer/hf_tokenizer.py | 92 +++++ tokenizer/sentencepiece.cpp | 125 ++++++ tokenizer/third-party/abseil-cpp | 1 + tokenizer/third-party/re2 | 1 + tokenizer/third-party/sentencepiece | 1 + tokenizer/tiktoken.cpp | 390 ++++++++++++++++++ tokenizer/tiktoken.py | 241 +++++++++++ tokenizer/tokenizer.h | 147 +++++++ torchchat/utils/scripts/build_native.sh | 2 +- 18 files changed, 1547 insertions(+), 244 deletions(-) delete mode 160000 runner/third-party/tokenizers create mode 100644 tokenizer/CMakeLists.txt create mode 100644 tokenizer/__init__.py create mode 100644 tokenizer/base.py create mode 100644 tokenizer/base64.h create mode 100644 tokenizer/hf_tokenizer.py create mode 100644 tokenizer/sentencepiece.cpp create mode 160000 tokenizer/third-party/abseil-cpp create mode 160000 tokenizer/third-party/re2 create mode 160000 tokenizer/third-party/sentencepiece create mode 100644 tokenizer/tiktoken.cpp create mode 100644 tokenizer/tiktoken.py create mode 100644 tokenizer/tokenizer.h diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index c86e8ab62..c48436a80 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -942,7 +942,7 @@ jobs: path: | ./et-build ./torchchat/utils/scripts - key: et-build-${{runner.os}}-${{runner.arch}}-${{env.et-git-hash}}-${{ hashFiles('**/install_et.sh', '**/build_native.sh') }} + key: et-build-${{runner.os}}-${{runner.arch}}-${{env.et-git-hash}}-${{ hashFiles('**/install_et.sh') }} - if: ${{ steps.install-et.outputs.cache-hit != 'true' }} continue-on-error: true run: | @@ -1053,7 +1053,7 @@ jobs: # Pull submodules (re2, abseil) for Tiktoken git submodule sync - git submodule update --init --recursive + git submodule update --init ./runner/build_android.sh echo "Tests complete." 
diff --git a/.gitmodules b/.gitmodules index 76bc1b9fd..7681823df 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,9 @@ -[submodule "runner/third-party/tokenizers"] - path = runner/third-party/tokenizers - url = https://github.com/pytorch-labs/tokenizers +[submodule "tokenizer/third-party/abseil-cpp"] + path = tokenizer/third-party/abseil-cpp + url = https://github.com/abseil/abseil-cpp.git +[submodule "tokenizer/third-party/re2"] + path = tokenizer/third-party/re2 + url = https://github.com/google/re2.git +[submodule "tokenizer/third-party/sentencepiece"] + path = tokenizer/third-party/sentencepiece + url = https://github.com/google/sentencepiece.git diff --git a/CMakeLists.txt b/CMakeLists.txt index e004dbfcb..61fd4d5a6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,21 +7,18 @@ ELSE() ENDIF() project(Torchchat) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-attributes") # include tokenizer -add_subdirectory(runner/third-party/tokenizers) +add_subdirectory(tokenizer) # include et_run executable include(runner/et.cmake) if(TARGET et_run) - target_link_libraries(et_run PUBLIC tokenizers microkernels-prod) - target_include_directories(et_run PUBLIC runner/third-party/tokenizers/include) + target_link_libraries(et_run PUBLIC tokenizer microkernels-prod) endif() # include aoti_run executable include(runner/aoti.cmake) if(TARGET aoti_run) - target_link_libraries(aoti_run tokenizers) - target_include_directories(aoti_run PUBLIC runner/third-party/tokenizers/include) + target_link_libraries(aoti_run tokenizer) endif() diff --git a/runner/run.cpp b/runner/run.cpp index f2b8e8e6b..abfbb4584 100644 --- a/runner/run.cpp +++ b/runner/run.cpp @@ -7,21 +7,20 @@ LICENSE file in the root directory of this source tree. */ /* Inference for Llama-2 Transformer model in pure C++ */ -#include "sentencepiece.h" -#include "tiktoken.h" -#include -#include -#include -#include #include -#include #include #include #include #include #include -#include #include +#include +#include +#include +#include +#include +#include + #ifdef DEBUG #include #include @@ -48,25 +47,13 @@ torch::Device aoti_device(torch::kCPU); #endif using exec_aten::ScalarType; -using executorch::extension::make_tensor_ptr; -using executorch::extension::TensorPtr; using torch::executor::EValue; +using executorch::extension::TensorPtr; +using executorch::extension::make_tensor_ptr; using torch::executor::Module; using torch::executor::Result; #endif -using tokenizers::SPTokenizer; -using tokenizers::Tiktoken; -using tokenizers::Tokenizer; - -#define UNWRAP(x) \ - ({ \ - if (!(x).ok()) { \ - fprintf(stderr, "Got error code % " PRIu32, x.error()); \ - exit(EXIT_FAILURE); \ - } \ - std::move(x.get()); \ - }) // ---------------------------------------------------------------------------- // Transformer model @@ -78,56 +65,56 @@ enum ModelType { ModelType get_model_type(int model_int) { switch (model_int) { - case 2: - return LLAMA2_MODEL; - break; - case 3: - return LLAMA3_MODEL; - break; - default: - return UNKNOWN_MODEL; + case 2: + return LLAMA2_MODEL; + break; + case 3: + return LLAMA3_MODEL; + break; + default: + return UNKNOWN_MODEL; } } typedef struct { int vocab_size; // vocabulary size, usually 256 (byte-level) - int seq_len; // max sequence length + int seq_len; // max sequence length } Config; typedef struct { - float *logits; // output logits - int64_t *toks; // tokens seen so far; no kv-cache :( + float* logits; // output logits + int64_t* toks; // tokens seen so far; no kv-cache :( } RunState; typedef struct { - Config config; 
// the hyperparameters of the architecture (the blueprint) + Config config; // the hyperparameters of the architecture (the blueprint) RunState state; // buffers for the "wave" of activations in the forward pass #ifdef __AOTI_MODEL__ - torch::inductor::AOTIModelPackageLoader *runner; + torch::inductor::AOTIModelPackageLoader* runner; #else // __ET_MODEL__ - Module *runner; + Module* runner; #endif } Transformer; -void malloc_run_state(RunState *s, Config *p) { +void malloc_run_state(RunState* s, Config* p) { // we calloc instead of malloc to keep valgrind happy - s->logits = (float *)calloc(p->vocab_size, sizeof(float)); - s->toks = (int64_t *)calloc(p->seq_len, sizeof(int64_t)); + s->logits = (float*)calloc(p->vocab_size, sizeof(float)); + s->toks = (int64_t*)calloc(p->seq_len, sizeof(int64_t)); if (!s->logits || !s->toks) { fprintf(stderr, "malloc failed!\n"); exit(EXIT_FAILURE); } } -void free_run_state(RunState *s) { +void free_run_state(RunState* s) { free(s->logits); free(s->toks); } -void read_checkpoint(char *checkpoint, Config *config) { - FILE *file = fopen(checkpoint, "rb"); +void read_checkpoint(char* checkpoint, Config* config) { + FILE* file = fopen(checkpoint, "rb"); if (!file) { fprintf(stderr, "Couldn't open file %s\n", checkpoint); exit(EXIT_FAILURE); @@ -141,8 +128,11 @@ void read_checkpoint(char *checkpoint, Config *config) { config->vocab_size = abs(config->vocab_size); } -void build_transformer(Transformer *t, char *model_path, int vocab_size, - int seq_len) { +void build_transformer( + Transformer* t, + char* model_path, + int vocab_size, + int seq_len) { // read in the Config and the Weights from the model // read_checkpoint(model_path, &t->config); // allocate the RunState buffers @@ -152,9 +142,7 @@ void build_transformer(Transformer *t, char *model_path, int vocab_size, #ifdef __AOTI_MODEL__ t->runner = new torch::inductor::AOTIModelPackageLoader(model_path); - aoti_device = t->runner->get_metadata()["AOTI_DEVICE_KEY"] == "cpu" - ? torch::Device(torch::kCPU) - : torch::Device(torch::kCUDA); + aoti_device = t->runner->get_metadata()["AOTI_DEVICE_KEY"] == "cpu" ? 
torch::Device(torch::kCPU) : torch::Device(torch::kCUDA); #else //__ET_MODEL__ t->runner = new Module( /* path to PTE model */ model_path, @@ -162,7 +150,7 @@ void build_transformer(Transformer *t, char *model_path, int vocab_size, #endif } -void free_transformer(Transformer *t) { +void free_transformer(Transformer* t) { // free the RunState buffers free_run_state(&t->state); delete t->runner; @@ -171,7 +159,7 @@ void free_transformer(Transformer *t) { // ---------------------------------------------------------------------------- // neural net blocks; the dynamics of the Transformer -void softmax(float *x, int size) { +void softmax(float* x, int size) { // find max value (for numerical stability) float max_val = x[0]; for (int i = 1; i < size; i++) { @@ -191,9 +179,9 @@ void softmax(float *x, int size) { } } -float *forward(Transformer *transformer, int token, int pos) { - Config *p = &transformer->config; - RunState *s = &transformer->state; +float* forward(Transformer* transformer, int token, int pos) { + Config* p = &transformer->config; + RunState* s = &transformer->state; s->toks[pos] = token; long token_buffer[1] = {token}; long pos_buffer[1] = {pos}; @@ -206,8 +194,8 @@ float *forward(Transformer *transformer, int token, int pos) { torch::Tensor token_tensor = torch::from_blob(token_buffer, {1, 1}, torch::kLong); torch::Tensor pos_tensor = torch::from_blob(pos_buffer, {1}, torch::kLong); - std::vector inputs{token_tensor.to(aoti_device), - pos_tensor.to(aoti_device)}; + std::vector inputs{ + token_tensor.to(aoti_device), pos_tensor.to(aoti_device)}; torch::Tensor result = transformer->runner->run(inputs)[0] .to(torch::dtype(torch::kFloat32)) @@ -216,8 +204,7 @@ float *forward(Transformer *transformer, int token, int pos) { memcpy(s->logits, logits, p->vocab_size * sizeof(float)); #else // __ET_MODEL__ TensorPtr pos_managed = make_tensor_ptr({1}, pos_buffer, ScalarType::Long); - TensorPtr tokens_managed = - make_tensor_ptr({1, 1}, token_buffer, ScalarType::Long); + TensorPtr tokens_managed = make_tensor_ptr({1, 1}, token_buffer, ScalarType::Long); std::vector inputs; auto tmp1 = EValue(tokens_managed); auto tmp2 = EValue(pos_managed); @@ -234,12 +221,17 @@ float *forward(Transformer *transformer, int token, int pos) { // HACK: the rest of this runner assumes that logits must be float, // so we simply convert them rather than plumbing // templating/switch-on-type through the rest of this file. 
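  // The switch below dispatches on the tensor's actual element type (float,
  // half, bfloat16, ...); in the matching branch CTYPE is bound to that type
  // and std::transform widens each logit to float into s->logits.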
- const auto &result_tensor = result[0].toTensor(); + const auto& result_tensor = result[0].toTensor(); ET_SWITCH_REALHBBF16_TYPES( - result_tensor.scalar_type(), unused, "forward", CTYPE, [&]() { - const CTYPE *logits = result_tensor.const_data_ptr(); - std::transform(logits, logits + p->vocab_size, s->logits, - [](auto x) { return static_cast(x); }); + result_tensor.scalar_type(), + unused, + "forward", + CTYPE, + [&]() { + const CTYPE* logits = result_tensor.const_data_ptr(); + std::transform(logits, logits + p->vocab_size, s->logits, [](auto x) { + return static_cast(x); + }); }); #endif @@ -257,13 +249,13 @@ typedef struct { typedef struct { int vocab_size; - ProbIndex *probindex; // buffer used in top-p sampling + ProbIndex* probindex; // buffer used in top-p sampling float temperature; float topp; unsigned long long rng_state; } Sampler; -int sample_argmax(float *probabilities, int n) { +int sample_argmax(float* probabilities, int n) { // return the index that has the highest probability int max_i = 0; float max_p = probabilities[0]; @@ -276,7 +268,7 @@ int sample_argmax(float *probabilities, int n) { return max_i; } -int sample_mult(float *probabilities, int n, float coin) { +int sample_mult(float* probabilities, int n, float coin) { // sample index from probabilities (they must sum to 1!) // coin is a random number in [0, 1), usually from random_f32() float cdf = 0.0f; @@ -289,9 +281,9 @@ int sample_mult(float *probabilities, int n, float coin) { return n - 1; // in case of rounding errors } -int compare(const void *a, const void *b) { - ProbIndex *a_ = (ProbIndex *)a; - ProbIndex *b_ = (ProbIndex *)b; +int compare(const void* a, const void* b) { + ProbIndex* a_ = (ProbIndex*)a; + ProbIndex* b_ = (ProbIndex*)b; if (a_->prob > b_->prob) return -1; if (a_->prob < b_->prob) @@ -299,8 +291,12 @@ int compare(const void *a, const void *b) { return 0; } -int sample_topp(float *probabilities, int n, float topp, ProbIndex *probindex, - float coin) { +int sample_topp( + float* probabilities, + int n, + float topp, + ProbIndex* probindex, + float coin) { // top-p sampling (or "nucleus sampling") samples from the smallest set of // tokens that exceed probability topp. This way we never sample tokens that // have very low probabilities and are less likely to go "off the rails". 
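To make the top-p truncation concrete, here is a hedged, standalone sketch (toy numbers, not tied to this runner, and omitting the small-probability cutoff the real code applies before sorting): sort descending, keep the smallest prefix whose cumulative mass exceeds topp, then sample inside it with a renormalized coin.

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

int toy_sample_topp(std::vector<float> probs, float topp, float coin) {
  // sort indices by descending probability
  std::vector<int> idx(probs.size());
  for (size_t i = 0; i < probs.size(); ++i) idx[i] = (int)i;
  std::sort(idx.begin(), idx.end(),
            [&](int a, int b) { return probs[a] > probs[b]; });

  // keep the smallest prefix whose mass exceeds topp
  float cumulative = 0.0f;
  size_t last = probs.size() - 1;
  for (size_t i = 0; i < idx.size(); ++i) {
    cumulative += probs[idx[i]];
    if (cumulative > topp) { last = i; break; }
  }

  // sample within the truncated set using a renormalized coin in [0, 1)
  float r = coin * cumulative;
  float cdf = 0.0f;
  for (size_t i = 0; i <= last; ++i) {
    cdf += probs[idx[i]];
    if (r < cdf) return idx[i];
  }
  return idx[last]; // in case of rounding errors
}

int main() {
  // With topp = 0.9, only the first two tokens (mass 0.95) stay in play.
  std::vector<float> probs = {0.60f, 0.35f, 0.03f, 0.02f};
  printf("sampled index: %d\n", toy_sample_topp(probs, 0.9f, 0.42f));
  return 0;
}
```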
@@ -343,31 +339,37 @@ int sample_topp(float *probabilities, int n, float topp, ProbIndex *probindex, return probindex[last_idx].index; // in case of rounding errors } -void build_sampler(Sampler *sampler, int vocab_size, float temperature, - float topp, unsigned long long rng_seed) { +void build_sampler( + Sampler* sampler, + int vocab_size, + float temperature, + float topp, + unsigned long long rng_seed) { sampler->vocab_size = vocab_size; sampler->temperature = temperature; sampler->topp = topp; sampler->rng_state = rng_seed; // buffer only used with nucleus sampling; may not need but it's ~small sampler->probindex = - (ProbIndex *)malloc(sampler->vocab_size * sizeof(ProbIndex)); + (ProbIndex*)malloc(sampler->vocab_size * sizeof(ProbIndex)); } -void free_sampler(Sampler *sampler) { free(sampler->probindex); } +void free_sampler(Sampler* sampler) { + free(sampler->probindex); +} -unsigned int random_u32(unsigned long long *state) { +unsigned int random_u32(unsigned long long* state) { // xorshift rng: https://en.wikipedia.org/wiki/Xorshift#xorshift.2A *state ^= *state >> 12; *state ^= *state << 25; *state ^= *state >> 27; return (*state * 0x2545F4914F6CDD1Dull) >> 32; } -float random_f32(unsigned long long *state) { // random float32 in [0,1) +float random_f32(unsigned long long* state) { // random float32 in [0,1) return (random_u32(state) >> 8) / 16777216.0f; } -int sample(Sampler *sampler, float *logits) { +int sample(Sampler* sampler, float* logits) { // sample the token given the logits and some hyperparameters int next; if (sampler->temperature == 0.0f) { @@ -388,37 +390,39 @@ int sample(Sampler *sampler, float *logits) { next = sample_mult(logits, sampler->vocab_size, coin); } else { // top-p (nucleus) sampling, clamping the least likely tokens to zero - next = sample_topp(logits, sampler->vocab_size, sampler->topp, - sampler->probindex, coin); + next = sample_topp( + logits, sampler->vocab_size, sampler->topp, sampler->probindex, coin); } } return next; } -Tokenizer *build_tokenizer(const char *tokenizer_path, ModelType model_type) { - Tokenizer *tokenizer = NULL; +Tokenizer* build_tokenizer(const char* tokenizer_path, ModelType model_type) { + Tokenizer* tokenizer = NULL; switch (model_type) { - case LLAMA2_MODEL: - tokenizer = new SPTokenizer(); - tokenizer->load(tokenizer_path); - break; - case LLAMA3_MODEL: - tokenizer = new Tiktoken(); - tokenizer->load(tokenizer_path); - break; - default: - fprintf(stderr, "No tokenizer defined for model type %d.\n", model_type); - exit(EXIT_FAILURE); + case LLAMA2_MODEL: + tokenizer = new SPTokenizer(); + tokenizer->load(tokenizer_path); + break; + case LLAMA3_MODEL: + tokenizer = new Tiktoken(); + tokenizer->load(tokenizer_path); + break; + default: + fprintf(stderr, "No tokenizer defined for model type %d.\n", model_type); + exit(EXIT_FAILURE); } return tokenizer; } -void free_tokenizer(Tokenizer *tokenizer) { delete tokenizer; } +void free_tokenizer(Tokenizer* tokenizer) { + delete tokenizer; +} // ---------------------------------------------------------------------------- // utilities: time -void safe_printf(const char *piece) { +void safe_printf(const char* piece) { // piece might be a raw byte token, and we only want to print printable chars // or whitespace because some of the other bytes can be various control codes, // backspace, etc. @@ -450,18 +454,21 @@ long time_in_ms() { // Prints decoded tokens generated from the transformer. 
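// Generation resumes from `pos`: prompt tokens are fed first, then sampled
// tokens, and the updated position is returned so chat mode can keep
// extending the same sequence across turns.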
// The first token is not printed and is assumed to be a BOS or other similar // token -unsigned generate_from_prompt_tokens(Transformer *transformer, - Tokenizer *tokenizer, Sampler *sampler, - const std::vector &prompt_tokens, - unsigned pos, - const std::vector &stop_tokens, - int stop_pos, bool print_prompt, - bool print_tok_per_sec) { +unsigned generate_from_prompt_tokens( + Transformer* transformer, + Tokenizer* tokenizer, + Sampler* sampler, + const std::vector& prompt_tokens, + unsigned pos, + const std::vector& stop_tokens, + int stop_pos, + bool print_prompt, + bool print_tok_per_sec) { if (prompt_tokens.size() == 0) { return pos; } - uint64_t next; // will store the next token in the sequence + uint64_t next; // will store the next token in the sequence uint64_t token; // stores the current token to feed into the transformer bool done_with_prompt; // whether we are done processing prompt @@ -479,7 +486,7 @@ unsigned generate_from_prompt_tokens(Transformer *transformer, if (pos_in_prompt < prompt_tokens.size()) { // Token comes from prompt token = prompt_tokens[pos_in_prompt++]; - float *logits = forward(transformer, token, pos); + float* logits = forward(transformer, token, pos); // Next token is either from prompt or if on last // prompt token, next is sampled @@ -491,27 +498,29 @@ unsigned generate_from_prompt_tokens(Transformer *transformer, } else { // Token comes from next sampled from previous round. token = next; - float *logits = forward(transformer, token, pos); + float* logits = forward(transformer, token, pos); next = sample(sampler, logits); } done_with_prompt = (pos_in_prompt >= prompt_tokens.size()); // we terminate on finding the stop_token if we are done processing the // prompt (stop_tokens in the prompt do not terminate the loop) - if (done_with_prompt && (std::find(stop_tokens.begin(), stop_tokens.end(), - token) != stop_tokens.end())) { + if (done_with_prompt && + (std::find(stop_tokens.begin(), stop_tokens.end(), token) != + stop_tokens.end())) { found_stop_token = true; } // We print next in each iteration of the loop, not token if (!found_stop_token && (print_prompt || done_with_prompt)) { // The stop_token is printed as newline - bool next_is_stop = std::find(stop_tokens.begin(), stop_tokens.end(), - next) != stop_tokens.end(); + bool next_is_stop = + std::find(stop_tokens.begin(), stop_tokens.end(), next) != + stop_tokens.end(); if (next_is_stop) { printf("\n"); } else { - std::string piece = UNWRAP(tokenizer->decode(token, next)); + std::string piece = tokenizer->decode(token, next); safe_printf(piece.c_str()); // same as printf("%s", piece), but skips // "unsafe" bytes fflush(stdout); @@ -529,16 +538,23 @@ unsigned generate_from_prompt_tokens(Transformer *transformer, // iteration) if (print_tok_per_sec && pos > 1) { long end = time_in_ms(); - fprintf(stderr, "\n\nachieved tok/s: %f\n", - (pos - 1) / (double)(end - start) * 1000); + fprintf( + stderr, + "\n\nachieved tok/s: %f\n", + (pos - 1) / (double)(end - start) * 1000); } return pos; } -void generate(Transformer *transformer, Tokenizer *tokenizer, Sampler *sampler, - const char *prompt, int steps, ModelType model_type) { - const char *default_prompt = "Once upon a time"; +void generate( + Transformer* transformer, + Tokenizer* tokenizer, + Sampler* sampler, + const char* prompt, + int steps, + ModelType model_type) { + const char* default_prompt = "Once upon a time"; if (prompt == NULL) { prompt = default_prompt; } @@ -550,30 +566,33 @@ void generate(Transformer *transformer, Tokenizer *tokenizer, 
Sampler *sampler, std::vector prompt_tokens; std::vector stop_tokens; switch (model_type) { - case LLAMA2_MODEL: - prompt_tokens = UNWRAP(tokenizer->encode(prompt, 1, 0)); - stop_tokens.push_back(tokenizer->eos_tok()); - break; - case LLAMA3_MODEL: - prompt_tokens = UNWRAP(tokenizer->encode(prompt, 1, 0)); - stop_tokens.push_back( - UNWRAP(tokenizer->encode("<|end_of_text|>", 0, 0))[0]); - stop_tokens.push_back(UNWRAP(tokenizer->encode("<|eot_id|>", 0, 0))[0]); - break; - default: - fprintf(stderr, "Generate does not support model type %d.\n", model_type); - exit(EXIT_FAILURE); - } - - generate_from_prompt_tokens(transformer, tokenizer, sampler, prompt_tokens, - /*pos=*/0, - /*stop_tokens=*/stop_tokens, - /*stop_pos=*/steps - 1, - /*print_prompt=*/true, - /*print_tok_per_sec=*/true); + case LLAMA2_MODEL: + prompt_tokens = tokenizer->encode(prompt, 1, 0); + stop_tokens.push_back(tokenizer->eos_tok()); + break; + case LLAMA3_MODEL: + prompt_tokens = tokenizer->encode(prompt, 1, 0); + stop_tokens.push_back(tokenizer->encode("<|end_of_text|>", 0, 0)[0]); + stop_tokens.push_back(tokenizer->encode("<|eot_id|>", 0, 0)[0]); + break; + default: + fprintf(stderr, "Generate does not support model type %d.\n", model_type); + exit(EXIT_FAILURE); + } + + generate_from_prompt_tokens( + transformer, + tokenizer, + sampler, + prompt_tokens, + /*pos=*/0, + /*stop_tokens=*/stop_tokens, + /*stop_pos=*/steps - 1, + /*print_prompt=*/true, + /*print_tok_per_sec=*/true); } -void read_stdin(const char *guide, char *buffer, size_t bufsize) { +void read_stdin(const char* guide, char* buffer, size_t bufsize) { // read a line from stdin, up to but not including \n printf("%s", guide); if (fgets(buffer, bufsize, stdin) != NULL) { @@ -590,10 +609,11 @@ void read_stdin(const char *guide, char *buffer, size_t bufsize) { // python reference and that seemed ok, but this was not thoroughly tested and // is not safely implemented, it's more a proof of concept atm. 
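// Roughly, the rendered dialog formats below are: llama2 wraps each user
// turn in [INST] ... [/INST] (with an optional system block) and relies on
// the tokenizer call to prepend BOS, while llama3 emits explicit
// <|start_header_id|>role<|end_header_id|> headers, ends each turn with
// <|eot_id|>, and leaves an empty assistant header for the model to
// complete.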
-std::vector get_initial_prompt_tokens(const char *cli_system_prompt, - const char *cli_user_prompt, - Tokenizer *tokenizer, - ModelType model_type) { +std::vector get_initial_prompt_tokens( + const char* cli_system_prompt, + const char* cli_user_prompt, + Tokenizer* tokenizer, + ModelType model_type) { char system_prompt[512]; char user_prompt[512]; char rendered_prompt[512 * 2 + 200]; // the prompt template is ~170 @@ -602,8 +622,10 @@ std::vector get_initial_prompt_tokens(const char *cli_system_prompt, if (cli_system_prompt != NULL) { strcpy(system_prompt, cli_system_prompt); } else { - read_stdin("Enter system prompt (optional): ", system_prompt, - sizeof(system_prompt)); + read_stdin( + "Enter system prompt (optional): ", + system_prompt, + sizeof(system_prompt)); } if (cli_user_prompt != NULL) { @@ -615,40 +637,48 @@ std::vector get_initial_prompt_tokens(const char *cli_system_prompt, std::vector tokens; switch (model_type) { - case LLAMA2_MODEL: - if (system_prompt[0] != '\0') { - snprintf(rendered_prompt, sizeof(rendered_prompt) - 1, - "[INST] <>\n%s\n<>\n\n%s [/INST]", system_prompt, - user_prompt); - } else { - snprintf(rendered_prompt, sizeof(rendered_prompt) - 1, - "[INST] %s [/INST]", user_prompt); - } + case LLAMA2_MODEL: + if (system_prompt[0] != '\0') { + snprintf( + rendered_prompt, + sizeof(rendered_prompt) - 1, + "[INST] <>\n%s\n<>\n\n%s [/INST]", + system_prompt, + user_prompt); + } else { + snprintf( + rendered_prompt, + sizeof(rendered_prompt) - 1, + "[INST] %s [/INST]", + user_prompt); + } - // We need to add BOS token here and not in template because llama2 - // tokenizer does not pattern match special tokens - tokens = UNWRAP(tokenizer->encode(rendered_prompt, 1, 0)); - break; - - case LLAMA3_MODEL: - if (system_prompt[0] != '\0') { - snprintf(rendered_prompt, sizeof(rendered_prompt) - 1, - "<|begin_of_text|><|start_header_id|>system<|end_header_id|>" - "\n\n%s<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n%s<" - "|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", - system_prompt, user_prompt); - } else { - snprintf(rendered_prompt, sizeof(rendered_prompt) - 1, - "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n%" - "s<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", - user_prompt); - } - tokens = UNWRAP(tokenizer->encode(rendered_prompt, 0, 0)); - break; + // We need to add BOS token here and not in template because llama2 + // tokenizer does not pattern match special tokens + tokens = tokenizer->encode(rendered_prompt, 1, 0); + break; + + case LLAMA3_MODEL: + if (system_prompt[0] != '\0') { + snprintf( + rendered_prompt, + sizeof(rendered_prompt) - 1, + "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n%s<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n%s<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", + system_prompt, + user_prompt); + } else { + snprintf( + rendered_prompt, + sizeof(rendered_prompt) - 1, + "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n%s<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", + user_prompt); + } + tokens = tokenizer->encode(rendered_prompt, 0, 0); + break; - default: - fprintf(stderr, "Chat does not support model type %d.\n", model_type); - exit(EXIT_FAILURE); + default: + fprintf(stderr, "Chat does not support model type %d.\n", model_type); + exit(EXIT_FAILURE); } #ifdef DEBUG @@ -665,8 +695,9 @@ std::vector get_initial_prompt_tokens(const char *cli_system_prompt, return tokens; } -std::vector get_next_user_prompt_tokens(Tokenizer 
*tokenizer, - ModelType model_type) { +std::vector get_next_user_prompt_tokens( + Tokenizer* tokenizer, + ModelType model_type) { char user_prompt[512]; char rendered_prompt[512 + 150]; // the prompt template is ~100 characters. We // use 150 to be safe. @@ -675,26 +706,30 @@ std::vector get_next_user_prompt_tokens(Tokenizer *tokenizer, std::vector tokens; switch (model_type) { - case LLAMA2_MODEL: - snprintf(rendered_prompt, sizeof(rendered_prompt) - 1, "[INST] %s [/INST]", - user_prompt); - - // We need to add BOS token here and not in template because llama2 - // tokenizer does not pattern match special tokens - tokens = UNWRAP(tokenizer->encode(rendered_prompt, /*bos*/ 1, /*eos*/ 0)); - break; - - case LLAMA3_MODEL: - snprintf(rendered_prompt, sizeof(rendered_prompt) - 1, - "<|start_header_id|>user<|end_header_id|>\n\n%s<|eot_id|><|start_" - "header_id|>assistant<|end_header_id|>\n\n", - user_prompt); - tokens = UNWRAP(tokenizer->encode(rendered_prompt, 0, 0)); - break; - - default: - fprintf(stderr, "Chat does not support model type %d.\n", model_type); - exit(EXIT_FAILURE); + case LLAMA2_MODEL: + snprintf( + rendered_prompt, + sizeof(rendered_prompt) - 1, + "[INST] %s [/INST]", + user_prompt); + + // We need to add BOS token here and not in template because llama2 + // tokenizer does not pattern match special tokens + tokens = tokenizer->encode(rendered_prompt, /*bos*/ 1, /*eos*/ 0); + break; + + case LLAMA3_MODEL: + snprintf( + rendered_prompt, + sizeof(rendered_prompt) - 1, + "<|start_header_id|>user<|end_header_id|>\n\n%s<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", + user_prompt); + tokens = tokenizer->encode(rendered_prompt, 0, 0); + break; + + default: + fprintf(stderr, "Chat does not support model type %d.\n", model_type); + exit(EXIT_FAILURE); } #ifdef DEBUG @@ -711,9 +746,14 @@ std::vector get_next_user_prompt_tokens(Tokenizer *tokenizer, return tokens; } -void chat(Transformer *transformer, Tokenizer *tokenizer, Sampler *sampler, - const char *cli_user_prompt, const char *cli_system_prompt, - unsigned steps, ModelType model_type) { +void chat( + Transformer* transformer, + Tokenizer* tokenizer, + Sampler* sampler, + const char* cli_user_prompt, + const char* cli_system_prompt, + unsigned steps, + ModelType model_type) { if (steps == 0) { return; } @@ -721,16 +761,16 @@ void chat(Transformer *transformer, Tokenizer *tokenizer, Sampler *sampler, uint64_t eot_token; std::vector prompt_tokens; switch (model_type) { - case LLAMA2_MODEL: - // llama2 uses EOS as EOT token - eot_token = tokenizer->eos_tok(); - break; - case LLAMA3_MODEL: - eot_token = UNWRAP(tokenizer->encode("<|eot_id|>", 0, 0))[0]; - break; - default: - fprintf(stderr, "Chat does not support model type %d.\n", model_type); - exit(EXIT_FAILURE); + case LLAMA2_MODEL: + // llama2 uses EOS as EOT token + eot_token = tokenizer->eos_tok(); + break; + case LLAMA3_MODEL: + eot_token = tokenizer->encode("<|eot_id|>", 0, 0)[0]; + break; + default: + fprintf(stderr, "Chat does not support model type %d.\n", model_type); + exit(EXIT_FAILURE); } std::vector stop_tokens{eot_token}; @@ -744,7 +784,11 @@ void chat(Transformer *transformer, Tokenizer *tokenizer, Sampler *sampler, } printf("Assistant: "); pos = generate_from_prompt_tokens( - transformer, tokenizer, sampler, prompt_tokens, pos, + transformer, + tokenizer, + sampler, + prompt_tokens, + pos, /*stop_tokens=*/stop_tokens, /*stop_pos=*/steps - 1, // We could pass in -1 here if we do not want // the model to stop mid-reply @@ -759,43 +803,46 @@ void 
chat(Transformer *transformer, Tokenizer *tokenizer, Sampler *sampler, void error_usage() { fprintf(stderr, "Usage: run [options]\n"); - fprintf(stderr, - "Example: run model.{so,pte} -n 256 -i \"Once upon a time\"\n"); + fprintf( + stderr, "Example: run model.{so,pte} -n 256 -i \"Once upon a time\"\n"); fprintf(stderr, "Options:\n"); fprintf(stderr, " -t temperature in [0,inf], default 1.0\n"); - fprintf(stderr, " -p p value in top-p (nucleus) sampling in [0,1], " - "default 0.9\n"); + fprintf( + stderr, + " -p p value in top-p (nucleus) sampling in [0,1], default 0.9\n"); fprintf(stderr, " -s random seed, default time(NULL)\n"); - fprintf(stderr, " -n number of steps to run for, default 256. 0 = " - "max_seq_len\n"); + fprintf( + stderr, + " -n number of steps to run for, default 256. 0 = max_seq_len\n"); fprintf(stderr, " -i input prompt\n"); fprintf(stderr, " -z path to tokenizer\n"); fprintf(stderr, " -m mode: generate|chat, default: generate\n"); fprintf(stderr, " -y (optional) system prompt in chat mode\n"); - fprintf(stderr, - " -v (optional) vocab size, default is model-specific.\n"); - fprintf(stderr, - " -l (optional) llama version (2 or 3), default 2.\n"); + fprintf( + stderr, + " -v (optional) vocab size, default is model-specific.\n"); + fprintf( + stderr, " -l (optional) llama version (2 or 3), default 2.\n"); fprintf( stderr, " -d (optional) device(CUDA or CPU) model was exported for\n"); exit(EXIT_FAILURE); } -int main(int argc, char *argv[]) { +int main(int argc, char* argv[]) { // default parameters - char *model_path = NULL; - char *tokenizer_path = NULL; + char* model_path = NULL; + char* tokenizer_path = NULL; float temperature = 1.0f; // 0.0 = greedy deterministic. 1.0 = original. don't set higher float topp = 0.9f; // top-p in nucleus sampling. 1.0 = off. 0.9 works well, // but slower - int steps = 128; // number of steps to run for - const char *prompt = NULL; // prompt string + int steps = 128; // number of steps to run for + const char* prompt = NULL; // prompt string unsigned long long rng_seed = 0; // seed rng with time by default - const char *mode = "generate"; // generate|chat - char *system_prompt = + const char* mode = "generate"; // generate|chat + char* system_prompt = NULL; // the (optional) system prompt to use in chat mode int vocab_size = -1; @@ -869,8 +916,10 @@ int main(int argc, char *argv[]) { ModelType model_type = get_model_type(llama_ver); if (model_type == UNKNOWN_MODEL) { - fprintf(stderr, "Unknown model type passed by -l argument. Received l=%d.", - llama_ver); + fprintf( + stderr, + "Unknown model type passed by -l argument. 
Received l=%d.", + llama_ver); error_usage(); } @@ -894,7 +943,7 @@ int main(int argc, char *argv[]) { if (steps < 0) steps = 0; - Tokenizer *tokenizer = build_tokenizer(tokenizer_path, model_type); + Tokenizer* tokenizer = build_tokenizer(tokenizer_path, model_type); // If no tokenizer path provided, get default for model_type if (vocab_size == -1) { @@ -910,8 +959,14 @@ int main(int argc, char *argv[]) { if (strcmp(mode, "generate") == 0) { generate(&transformer, tokenizer, &sampler, prompt, steps, model_type); } else if (strcmp(mode, "chat") == 0) { - chat(&transformer, tokenizer, &sampler, prompt, system_prompt, steps, - model_type); + chat( + &transformer, + tokenizer, + &sampler, + prompt, + system_prompt, + steps, + model_type); } else { fprintf(stderr, "unknown mode: %s\n", mode); error_usage(); diff --git a/runner/third-party/tokenizers b/runner/third-party/tokenizers deleted file mode 160000 index 19e463d66..000000000 --- a/runner/third-party/tokenizers +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 19e463d665110e1d23145df1ad72bb8db111618b diff --git a/tokenizer/CMakeLists.txt b/tokenizer/CMakeLists.txt new file mode 100644 index 000000000..39c20885d --- /dev/null +++ b/tokenizer/CMakeLists.txt @@ -0,0 +1,29 @@ +cmake_minimum_required(VERSION 3.24) +set(CMAKE_CXX_STANDARD 17) +IF(DEFINED ENV{TORCHCHAT_ROOT}) + set(TORCHCHAT_ROOT $ENV{TORCHCHAT_ROOT}) +ELSE() + set(TORCHCHAT_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/..) +ENDIF() + +# build tokenizer library +add_library( + tokenizer + tokenizer.h + sentencepiece.cpp + tiktoken.cpp) + +target_include_directories(tokenizer PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} third-party/sentencepiece/src) + +# add RE2 as subdirectory +set(ABSL_ENABLE_INSTALL ON) +set(ABSL_PROPAGATE_CXX_STD ON) +set(_pic_flag +${CMAKE_POSITION_INDEPENDENT_CODE}) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) +add_subdirectory(third-party/abseil-cpp) +add_subdirectory(third-party/re2) +add_subdirectory(third-party/sentencepiece) +set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag}) + +target_link_libraries(tokenizer PUBLIC re2::re2 sentencepiece-static) diff --git a/tokenizer/__init__.py b/tokenizer/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tokenizer/base.py b/tokenizer/base.py new file mode 100644 index 000000000..75998b32b --- /dev/null +++ b/tokenizer/base.py @@ -0,0 +1,32 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +""" +Abstract base class for all tokenizer classes in python matching c++ interface. +""" + +# Standard +from abc import ABC, abstractmethod +from typing import List + + +class TokenizerBase(ABC): + __doc__ = __doc__ + + @abstractmethod + def encode(self, s: str, *, bos: bool = False, eos: bool = False) -> List[int]: + """Encode the given string and optionally include bos/eos tokens""" + + @abstractmethod + def decode(self, ids: List[int]) -> str: + """Decode the given token ids into a string""" + + @abstractmethod + def bos_id(self) -> int: + """The id of the begin-of-string token""" + + @abstractmethod + def eos_id(self) -> int: + """The id of the end-of-string token""" diff --git a/tokenizer/base64.h b/tokenizer/base64.h new file mode 100644 index 000000000..12b8703a8 --- /dev/null +++ b/tokenizer/base64.h @@ -0,0 +1,187 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +// @lint-ignore-every LICENSELINT +/************************************************************************** + Copyright (c) 2023 sewenew + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + *************************************************************************/ + +#pragma once + +#include +#include +#include +#include + +namespace base64 { + +std::string decode(const std::string_view& input); + +namespace detail { + +constexpr uint32_t DECODE_TABLE[] = { + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255, + 255, 255, 63, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, + 255, 255, 255, 255, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 255, 255, 255, 255, 255, 255, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + 49, 50, 51, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255}; + +inline void validate(uint32_t v) { + if (v == 255) { + fprintf(stderr, "invalid char"); + exit(EXIT_FAILURE); + } +} + +inline void decode(const std::string_view& input, std::string& output) { + if (input.size() != 4) { + fprintf(stderr, "input length must be 4, got %zu", input.size()); + exit(EXIT_FAILURE); + } + + uint32_t val = 0; + + uint8_t c = input[0]; + auto v = DECODE_TABLE[c]; + validate(v); + val = v; + + c = input[1]; + v = DECODE_TABLE[c]; + validate(v); + val = (val << 6) | v; + + c = input[2]; + v = DECODE_TABLE[c]; + validate(v); + val = (val << 6) | v; + + c = input[3]; + v = DECODE_TABLE[c]; + validate(v); + val = (val << 6) | v; + + output.push_back(static_cast((val >> 16) & 0xFF)); + output.push_back(static_cast((val >> 8) & 0xFF)); + output.push_back(static_cast(val & 0xFF)); +} + +inline void decode_1_padding( + const std::string_view& input, + std::string& output) { + if (input.size() != 3) { + fprintf(stderr, "input length must be 3, got %zu", input.size()); + exit(EXIT_FAILURE); + } + + uint32_t val = 0; + + uint8_t c = input[0]; + auto v = DECODE_TABLE[c]; + validate(v); + val = v; + + c = input[1]; + v = DECODE_TABLE[c]; + validate(v); + val = (val << 6) | v; + + 
c = input[2]; + v = DECODE_TABLE[c]; + validate(v); + val = (val << 6) | v; + + output.push_back(static_cast((val >> 10) & 0xFF)); + output.push_back(static_cast((val >> 2) & 0xFF)); +} + +inline void decode_2_padding( + const std::string_view& input, + std::string& output) { + assert(input.size() == 2); + + uint32_t val = 0; + + uint8_t c = input[0]; + auto v = DECODE_TABLE[c]; + validate(v); + val = v; + + c = input[1]; + v = DECODE_TABLE[c]; + validate(v); + val = (val << 6) | v; + + output.push_back(static_cast((val >> 4) & 0xFF)); +} + +} // namespace detail + +inline std::string decode(const std::string_view& input) { + if (input.empty()) { + fprintf(stderr, "empty input"); + exit(EXIT_FAILURE); + } + + // Faster than `input.size() % 4`. + if ((input.size() & 3) != 0 || input.size() < 4) { + fprintf( + stderr, + "input length must be larger than 4 and is multiple of 4, got %zu", + input.size()); + exit(EXIT_FAILURE); + } + + std::string output; + output.reserve(input.size() / 4 * 3); + auto idx = 0U; + for (; idx < input.size() - 4; idx += 4) { + detail::decode(input.substr(idx, 4), output); + } + + // Last 4 bytes. Might contain paddings. + if (input[idx + 3] == '=') { + if (input[idx + 2] == '=') { + // Tow paddings. + detail::decode_2_padding(input.substr(idx, 2), output); + } else { + // One padding. + detail::decode_1_padding(input.substr(idx, 3), output); + } + } else { + // No padding. + detail::decode(input.substr(idx, 4), output); + } + + return output; +} +} // namespace base64 diff --git a/tokenizer/hf_tokenizer.py b/tokenizer/hf_tokenizer.py new file mode 100644 index 000000000..7ad5807d1 --- /dev/null +++ b/tokenizer/hf_tokenizer.py @@ -0,0 +1,92 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Standard +from typing import List, Optional +import json +import os + +# Third Party +from tokenizers import Tokenizer + +# Local +from .base import TokenizerBase + + +class HFTokenizer(TokenizerBase): + """ + Wrapper around the Huggingface `tokenizers` library for API compatibility + """ + + def __init__(self, file_path: str): + # If the path is a directory, look for "tokenizer.json" which is + # standard for transformers checkpoints and also look for the + # "tokenizer_config.json" file to parse eos/bos tokens + if os.path.isdir(file_path): + tokenizer_path = os.path.join(file_path, "tokenizer.json") + tokenizer_config_path = os.path.join(file_path, "tokenizer_config.json") + else: + tokenizer_path = file_path + tokenizer_config_path = os.path.join(os.path.dirname(file_path), "tokenizer_config.json") + if not os.path.isfile(tokenizer_path): + tokenizer_config_path = None + + # Load the tokenizer itself + self._tokenizer = Tokenizer.from_file(tokenizer_path) + + # If available, parse bos/eos tokens from the tokenizer config + self._bos_id, self._eos_id = None, None + if tokenizer_config_path is not None: + with open(tokenizer_config_path, "r") as handle: + tok_config = json.load(handle) + bos_token = tok_config.get("bos_token") + eos_token = tok_config.get("eos_token") + if bos_token is not None: + self._bos_id = self._tokenizer.token_to_id(bos_token) + if eos_token is not None: + self._eos_id = self._tokenizer.token_to_id(eos_token) + + # If no eos/bos tokens found, go looking for them! 
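+        # The fallback below deserializes the tokenizer model itself and looks
+        # for special tokens whose content combines "begin"/"end" with "text"
+        # (e.g. begin/end-of-text markers), reusing their ids as bos/eos.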
+ if None in [self._bos_id, self._eos_id]: + tok_content = json.loads(self._tokenizer.to_str()) + if self._bos_id is None: + self._bos_id = self._look_for_special_token(tok_content, ["begin", "text"]) + if self._eos_id is None: + self._eos_id = self._look_for_special_token(tok_content, ["end", "text"]) + + assert None not in [self._bos_id, self._eos_id], "Unable to find an BOS/EOS tokens" + + @staticmethod + def _look_for_special_token(added_tokens: dict, search_strs: List[str]) -> Optional[int]: + candidate_toks = added_tokens + for search_str in search_strs: + candidate_toks = [ + tok for tok in candidate_toks + if tok["special"] and search_str in tok["content"] + ] + if len(candidate_toks) == 1: + return candidate_toks[0]["id"] + + def encode( + self, + s: str, + *, + bos: bool = False, + eos: bool = False, + ) -> List[int]: + res = self._tokenizer.encode(s, add_special_tokens=bos).ids + if eos and (not res or res[-1] != self._eos_token): + res.append(self._eos_token) + return res + + def decode(self, ids: List[int]) -> str: + return self._tokenizer.decode(ids) + + def bos_id(self) -> int: + return self._bos_id + + def eos_id(self) -> int: + return self._eos_id diff --git a/tokenizer/sentencepiece.cpp b/tokenizer/sentencepiece.cpp new file mode 100644 index 000000000..0cdfc7e30 --- /dev/null +++ b/tokenizer/sentencepiece.cpp @@ -0,0 +1,125 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// sentencepiece tokenizer + +#include +#include +#include +#include +#include "absl/strings/str_replace.h" + +const char kSpaceSymbol[] = "\xe2\x96\x81"; + +SPTokenizer::SPTokenizer() + : Tokenizer(), + _processor(std::make_unique()) {} + +/** + * @brief Load the tokenizer from a file. The tokenizer file contains the + * vocabulary and scores. The format is: the first integer is the maximum + * token length, followed by a list of (word_len, word) pairs. Here we + * are reading all the vocabulary into memory and keep it sorted for fast + * lookup. + * + * @param tokenizer_path The path to the tokenizer file. + * @return void + */ +void SPTokenizer::load(const std::string& tokenizer_path) { + if (initialized_) { + fprintf(stderr, "Tokenizer already initialized.\n"); + return; + } + // read in the file + const auto status = _processor->Load(tokenizer_path); + if (!status.ok()) { + fprintf(stderr, "couldn't load %s\n. If this tokenizer artifact is for llama3, please pass `-l 3`.", tokenizer_path.c_str()); + exit(EXIT_FAILURE); + } + // load vocab_size, bos_tok, eos_tok + vocab_size_ = _processor->GetPieceSize(); + bos_tok_ = _processor->bos_id(); + eos_tok_ = _processor->eos_id(); + initialized_ = true; +} + +SPTokenizer::~SPTokenizer() {} + +/** + * @brief Decode a token into string. + * + * @param prev_token The previous token. + * @param token The current token. + * @return std::string A pointer to the string representation of the + * token. + */ +std::string SPTokenizer::decode(uint64_t prev_token, uint64_t token) { + if (!initialized_) { + fprintf(stderr, "Tokenizer not initialized\n"); + exit(EXIT_FAILURE); + } + // get rid of the control ids and + if (_processor->IsControl(token)) { + // NB: returning empty string doesn't work for some reason. It causes + // free(): invalid pointer error. 
+ return " "; + } + + std::string result = + absl::StrReplaceAll(_processor->IdToPiece(token), {{kSpaceSymbol, " "}}); + + // following BOS token, sentencepiece decoder strips any leading + // whitespace + if (prev_token == bos_tok_ && result[0] == ' ') { + result = result.substr(1); + } + + // handle <0x0A> + result = absl::StrReplaceAll(result, {{"<0x0A>", "\n"}}); + + return result; +} + +/** + * @brief Encode a string into a sequence of tokens. + * + * @param text The string to be encoded. + * @param bos The number of BOS to prepend to the token list. + * @param eos The number of EOS to append to the token list. + * @return std::vector + */ +std::vector +SPTokenizer::encode(const std::string& text, int8_t bos, int8_t eos) { + if (!initialized_) { + fprintf(stderr, "Tokenizer not initialized\n"); + exit(EXIT_FAILURE); + } + // workaround a weird issue that text doesn't have correct size() + std::string input(text.c_str()); + // should we reserve memory? + std::vector res; + auto status = _processor->Encode(input, &res); + if (!status.ok()) { + fprintf(stderr, "couldn't encode %s\n", text.c_str()); + exit(EXIT_FAILURE); + } + + std::vector tokens; + for (auto i = 0; i < bos; ++i) { + tokens.push_back(bos_tok_); + } + + for (auto i = 0; i < res.size(); ++i) { + tokens.push_back(res[i]); + } + + for (auto i = 0; i < eos; ++i) { + tokens.push_back(eos_tok_); + } + return tokens; +} diff --git a/tokenizer/third-party/abseil-cpp b/tokenizer/third-party/abseil-cpp new file mode 160000 index 000000000..854193071 --- /dev/null +++ b/tokenizer/third-party/abseil-cpp @@ -0,0 +1 @@ +Subproject commit 854193071498f330b71083d7e06a7cd18e02a4cc diff --git a/tokenizer/third-party/re2 b/tokenizer/third-party/re2 new file mode 160000 index 000000000..ac82d4f62 --- /dev/null +++ b/tokenizer/third-party/re2 @@ -0,0 +1 @@ +Subproject commit ac82d4f628a2045d89964ae11c48403d3b091af1 diff --git a/tokenizer/third-party/sentencepiece b/tokenizer/third-party/sentencepiece new file mode 160000 index 000000000..7dcb54145 --- /dev/null +++ b/tokenizer/third-party/sentencepiece @@ -0,0 +1 @@ +Subproject commit 7dcb541451b1862d73f473b3804ccf8f2a9e10f6 diff --git a/tokenizer/tiktoken.cpp b/tokenizer/tiktoken.cpp new file mode 100644 index 000000000..2f31f057a --- /dev/null +++ b/tokenizer/tiktoken.cpp @@ -0,0 +1,390 @@ +// @lint-ignore-every LICENSELINT +/************************************************************************** + Copyright (c) 2023 sewenew + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ *************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// ------------------------------Util start------------------------------------ + +static uint64_t _max_size() { + return std::numeric_limits::max(); +} + +static Re2UPtr _create_regex(const std::string& pattern) { + assert(!pattern.empty()); + + return std::make_unique("(" + pattern + ")"); +} + +static Re2UPtr _build_special_token_regex(const Encoder& special_encoder) { + std::string special_pattern; + for (const auto& ele : special_encoder) { + if (!special_pattern.empty()) { + special_pattern += "|"; + } + special_pattern += re2::RE2::QuoteMeta(ele.first); + } + + if (special_pattern.empty()) { + return nullptr; + } + + return _create_regex(special_pattern); +} + +static std::pair _parse(const std::string& line) { + auto pos = line.find(" "); + if (pos == std::string::npos) { + throw std::invalid_argument("invalid encoder line: " + line); + } + + auto token = base64::decode({line.data(), pos}); + uint64_t rank = 0; + try { + rank = std::stoul(line.substr(pos + 1)); + } catch (const std::exception&) { + throw std::invalid_argument("invalid encoder rank: " + line); + } + + return {std::move(token), rank}; +} + +static Encoder _load_encoder(const std::string& path) { + std::ifstream file(path); + if (!file) { + fprintf(stderr, "failed to open encoder file: %s\n", path.c_str()); + exit(EXIT_FAILURE); + } + + Encoder encoder; + std::string line; + while (std::getline(file, line)) { + auto [token, rank] = _parse(line); + + if (!encoder.emplace(std::move(token), rank).second) { + fprintf(stderr, "duplicate item: %s\n", line.c_str()); + } + } + return encoder; +} + +static Decoder _build_decoder(const Encoder& encoder) { + Decoder decoder; + for (const auto& [k, v] : encoder) { + decoder.emplace(v, k); + } + + if (encoder.size() != decoder.size()) { + fprintf(stderr, "duplicate items in encoder"); + exit(EXIT_FAILURE); + } + + return decoder; +} + +static std::vector _byte_pair_merge( + const std::string& piece, + const std::unordered_map& ranks, + std::function func) { + // This is a vector of (start, rank). + // The rank is of the byte pair starting at position start. + // The rank of the last item in the vector is not a valid value. + std::vector> parts; + parts.reserve(piece.size() + 1); + for (auto idx = 0U; idx < piece.size() + 1; ++idx) { + parts.emplace_back(idx, _max_size()); + } + + auto get_rank = [&piece, &ranks]( + const std::vector>& parts, + uint64_t start_idx, + uint64_t skip) -> std::optional { + if (start_idx + skip + 2 < parts.size()) { + auto s = parts[start_idx].first; + auto e = parts[start_idx + skip + 2].first; + auto key = piece.substr(s, e - s); + auto iter = ranks.find(key); + if (iter != ranks.end()) { + return iter->second; + } + } + return std::nullopt; + }; + + // We look up the ranks once in the beginning and iteratively update + // them during each merge, which reduces the number of rank lookups. + for (auto i = 0U; i < parts.size() - 2; ++i) { + auto rank = get_rank(parts, i, 0); + if (rank) { + // usize::MAX is a sentinel value and cannot be a valid rank + if (*rank == _max_size()) { + fprintf(stderr, "at %" PRIu32 " rank is too large\n", i); + } + parts[i].second = *rank; + } + } + + // If you have n parts and m merges, this does O(mn) work. + // We could do something with a heap and do O(m log n) work. 
+ // It is important to consider that n is often small (<100), and as such + // the cache-locality benefits outweigh the algorithmic complexity downsides + // of the `parts` vector data structure above. + + // Note that we hash bytes, not token pairs. As long as we train BPE the way + // we currently do, this is equivalent. An easy way to break this would be + // to decouple merge priority from token index or to prevent specific token + // merges. + while (true) { + if (parts.size() == 1) { + break; + } + + // usize::MAX is a sentinel rank value allowing us to + // take the min more quickly + auto min_rank = std::make_pair(_max_size(), 0); + for (auto i = 0U; i < parts.size() - 1; ++i) { + auto rank = parts[i].second; + if (rank < min_rank.first) { + min_rank.first = rank; + min_rank.second = i; + } + } + + if (min_rank.first != _max_size()) { + auto i = min_rank.second; + + // NOTE: We are about to remove parts[i + 1]. We do not do it + // yet because there are cache-locality benefits to updating + // parts[i] and parts[i-1] before removing, which could thrash + // the cache. Thus, we update the rank calculation by skipping over + // parts[i + 1], by invoking `get_rank!` with `skip = 1`. + auto rank = get_rank(parts, i, 1); + if (rank) { + parts[i].second = *rank; + } else { + parts[i].second = _max_size(); + } + if (i > 0) { + rank = get_rank(parts, i - 1, 1); + if (rank) { + parts[i - 1].second = *rank; + } else { + parts[i - 1].second = _max_size(); + } + } + + parts.erase(parts.begin() + (i + 1)); + } else { + break; + } + } + std::vector out; + out.reserve(parts.size() - 1); + for (auto i = 0U; i < parts.size() - 1; ++i) { + auto s = parts[i].first; + auto e = parts[i + 1].first; + out.push_back(func(s, e)); + } + return out; +} + +static std::vector _byte_pair_encode( + const std::string& piece, + const Encoder& encoder) { + if (piece.size() == 1) { + auto iter = encoder.find(piece); + if (iter != encoder.end()) { + return std::vector({iter->second}); + } else { + // TODO: is it possible? + return {}; + } + } + + return _byte_pair_merge( + piece, encoder, [&piece, &encoder](uint64_t start, uint64_t stop) { + std::string key = piece.substr(start, stop - start); + auto iter = encoder.find(key); + if (iter != encoder.end()) { + return iter->second; + } else { + // TODO: what if key does not exist? Should we return `unknown`? + // assert(false); // ?? + return uint64_t(0); + } + }); +} +// ------------------------------Util end------------------------------------ +// -------------------------private method start------------------------------- + +template +std::pair, re2::StringPiece> +Tiktoken::_split_with_allowed_special_token( + re2::StringPiece& input, + const T& allowed_special) { + if (!_special_token_regex) { + return std::make_pair(std::nullopt, input); + } + + auto start = input.begin(); + std::string special; + while (true) { + if (!re2::RE2::FindAndConsume(&input, *_special_token_regex, &special)) { + // No special token. + break; + } + + if (allowed_special.count(special) == 1) { + // Found an allowed special token, split the text with it. 
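+      // Note: FindAndConsume has already advanced `input` past the matched
+      // token, so the StringPiece constructed below spans exactly the text
+      // that preceded the special token.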
+ return std::make_pair( + special, + re2::StringPiece(start, input.begin() - start - special.size())); + } // else try to find the next special token + } + + return std::make_pair(std::nullopt, input); +} + +void Tiktoken::_encode( + re2::StringPiece& input, + std::vector& ret, + uint64_t& last_piece_token_len) { + std::string piece; + assert(_regex); + while (re2::RE2::FindAndConsume(&input, *_regex, &piece)) { + auto iter = _encoder.find(piece); + if (iter != _encoder.end()) { + last_piece_token_len = 1; + ret.push_back(iter->second); + continue; + } + auto tokens = _byte_pair_encode(piece, _encoder); + last_piece_token_len = tokens.size(); + ret.insert(ret.end(), tokens.begin(), tokens.end()); + } +} + +template +std::pair, uint64_t> Tiktoken::_encode_with_special_token( + const std::string& text, + const T& allowed_special) { + std::vector tokens; + uint64_t last_piece_token_len = 0; + re2::StringPiece input(text); + while (true) { + auto [special, sub_input] = + _split_with_allowed_special_token(input, allowed_special); + + _encode(sub_input, tokens, last_piece_token_len); + + if (special) { + uint64_t token = 0; + try { + token = _special_token_encoder.at(*special); + } catch (const std::out_of_range&) { + // Should never go here, since special pattern includes all special + // chars. + fprintf(stderr, "unknown special token: %s\n", special->c_str()); + exit(EXIT_FAILURE); + } + + tokens.push_back(token); + last_piece_token_len = 0; + } else { + break; + } + } + + // last_piece_token_len is how many tokens came from the last regex split. + // This is used for determining unstable tokens, since you can't merge + // across (stable) regex splits + return std::make_pair(tokens, last_piece_token_len); +} + +// -------------------------private method end------------------------------- +// -------------------------public method start------------------------------- + +Tiktoken::Tiktoken() : Tokenizer() {} + +void Tiktoken::load(const std::string& path) { + _encoder = _load_encoder(path); + _special_token_encoder = _get_special_tokens(_encoder.size()); + + _decoder = _build_decoder(_encoder); + _special_token_decoder = _build_decoder(_special_token_encoder); + + _regex = _create_regex(_pattern); + _special_token_regex = _build_special_token_regex(_special_token_encoder); + + // initialize vocab_size, bos_tok, eos_tok + vocab_size_ = _encoder.size() + _special_token_encoder.size(); + bos_tok_ = _encoder.size(); // hardcoded (see _get_special_tokens) + eos_tok_ = _encoder.size() + 1; // hardcoded (see _get_special_tokens) + initialized_ = true; +} + +std::vector +Tiktoken::encode(const std::string& text, int8_t bos, int8_t eos) { + if (!initialized_) { + exit(EXIT_FAILURE); + } + auto res = _encode_with_special_token(text, _special_token_encoder).first; + for (auto i = 0; i < bos; ++i) { + res.insert(res.begin(), bos_tok_); + } + for (auto i = 0; i < eos; ++i) { + res.push_back(eos_tok_); + } + return res; +} + +std::string Tiktoken::decode(uint64_t prev, uint64_t cur) { + (void)prev; + if (!initialized_) { + exit(EXIT_FAILURE); + } + std::string ret; + + std::string token_bytes; + auto iter = _decoder.find(cur); + if (iter != _decoder.end()) { + token_bytes = iter->second; + } else { + iter = _special_token_decoder.find(cur); + if (iter != _special_token_decoder.end()) { + token_bytes = iter->second; + } else { + fprintf(stderr, "unknown token: %" PRIu64 "\n", cur); + exit(EXIT_FAILURE); + } + } + ret += token_bytes; + + return ret; +} +// -------------------------public method 
end------------------------------- diff --git a/tokenizer/tiktoken.py b/tokenizer/tiktoken.py new file mode 100644 index 000000000..30eb98624 --- /dev/null +++ b/tokenizer/tiktoken.py @@ -0,0 +1,241 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import os +from logging import getLogger +from pathlib import Path +from typing import ( + AbstractSet, + cast, + Collection, + Dict, + Iterator, + List, + Literal, + Sequence, + TypedDict, + Union, +) + +import tiktoken +from tiktoken.load import load_tiktoken_bpe + +from .base import TokenizerBase + + +logger = getLogger(__name__) + + +Role = Literal["system", "user", "assistant"] + + +class Message(TypedDict): + role: Role + content: str + + +Dialog = Sequence[Message] + + +class Tokenizer(TokenizerBase): + """ + tokenizing and encoding/decoding text using the Tiktoken tokenizer. + """ + + special_tokens: Dict[str, int] + + num_reserved_special_tokens = 256 + + pat_str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+" # noqa: E501 + + def __init__(self, model_path: str): + """ + Initializes the Tokenizer with a Tiktoken model. + + Args: + model_path (str): The path to the Tiktoken model file. + """ + # reload tokenizer + assert os.path.isfile(model_path), model_path + + mergeable_ranks = load_tiktoken_bpe(model_path) + num_base_tokens = len(mergeable_ranks) + special_tokens = [ + "<|begin_of_text|>", + "<|end_of_text|>", + "<|reserved_special_token_0|>", + "<|reserved_special_token_1|>", + "<|reserved_special_token_2|>", + "<|reserved_special_token_3|>", + "<|start_header_id|>", + "<|end_header_id|>", + "<|reserved_special_token_4|>", + "<|eot_id|>", # end of turn + ] + [ + f"<|reserved_special_token_{i}|>" + for i in range(5, self.num_reserved_special_tokens - 5) + ] + self.special_tokens = { + token: num_base_tokens + i for i, token in enumerate(special_tokens) + } + self.model = tiktoken.Encoding( + name=Path(model_path).name, + pat_str=self.pat_str, + mergeable_ranks=mergeable_ranks, + special_tokens=self.special_tokens, + ) + logger.debug(f"Reloaded Tiktoken model from {model_path}") + + # BOS / EOS token IDs + self.n_words: int = self.model.n_vocab + self._bos_id: int = self.special_tokens["<|begin_of_text|>"] + self._eos_id: int = self.special_tokens["<|end_of_text|>"] + self.pad_id: int = -1 + self.stop_tokens = { + self.special_tokens["<|end_of_text|>"], + self.special_tokens["<|eot_id|>"], + } + logger.debug( + f"#words: {self.n_words} - BOS ID: {self._bos_id} - EOS ID: {self._eos_id}" + ) + + def encode( + self, + s: str, + *, + bos: bool = False, + eos: bool = False, + allowed_special: Union[Literal["all"], AbstractSet[str]] = set(), # noqa B006 + disallowed_special: Union[Literal["all"], Collection[str]] = (), + ) -> List[int]: + """ + Encodes a string into a list of token IDs. + + Args: + s (str): The input string to be encoded. + bos (bool): Whether to prepend the beginning-of-sequence token. + eos (bool): Whether to append the end-of-sequence token. + allowed_special ("all"|set[str]): allowed special tokens in string + disallowed_special ("all"|set[str]): special tokens that raise an error when in string + + Returns: + list[int]: A list of token IDs. + + By default, setting disallowed_special=() encodes a string by ignoring + special tokens. 
Specifically: + - Setting `disallowed_special` to () will cause all text corresponding + to special tokens to be encoded as natural text (instead of raising + an error). + - Setting `allowed_special` to "all" will treat all text corresponding + to special tokens to be encoded as special tokens. + """ + assert type(s) is str + + # The tiktoken tokenizer can handle <=400k chars without + # pyo3_runtime.PanicException (may go beyond 400k) + TIKTOKEN_MAX_ENCODE_CHARS = 400_000 + + # https://github.com/openai/tiktoken/issues/195 + # Here we iterate over subsequences and split if we exceed the limit + # of max consecutive non-whitespace or whitespace characters. + MAX_NO_WHITESPACES_CHARS = 25_000 + + substrs = ( + substr + for i in range(0, len(s), TIKTOKEN_MAX_ENCODE_CHARS) + for substr in self._split_whitespaces_or_nonwhitespaces( + s[i : i + TIKTOKEN_MAX_ENCODE_CHARS], MAX_NO_WHITESPACES_CHARS + ) + ) + t: List[int] = [] + for substr in substrs: + t.extend( + self.model.encode( + substr, + allowed_special=allowed_special, + disallowed_special=disallowed_special, + ) + ) + if bos: + t.insert(0, self._bos_id) + if eos: + t.append(self._eos_id) + return t + + def bos_id(self) -> int: + return self._bos_id + + def eos_id(self) -> int: + return self._eos_id + + def decode(self, t: Sequence[int]) -> str: + """ + Decodes a list of token IDs into a string. + + Args: + t (List[int]): The list of token IDs to be decoded. + + Returns: + str: The decoded string. + """ + # typecast is safe here, Tiktoken doesn't do anything list-related with the sequence. + return self.model.decode(cast(List[int], t)) + + @staticmethod + def _split_whitespaces_or_nonwhitespaces( + s: str, max_consecutive_slice_len: int + ) -> Iterator[str]: + """ + Split the string `s` so that each substring contains no more than `max_consecutive_slice_len` + consecutive whitespaces or consecutive non-whitespaces + """ + current_slice_len = 0 + current_slice_is_space = s[0].isspace() if len(s) > 0 else False + slice_start = 0 + + for i in range(len(s)): + is_now_space = s[i].isspace() + + if current_slice_is_space ^ is_now_space: + current_slice_len = 1 + current_slice_is_space = is_now_space + else: + current_slice_len += 1 + if current_slice_len > max_consecutive_slice_len: + yield s[slice_start:i] + slice_start = i + current_slice_len = 1 + yield s[slice_start:] + + +class ChatFormat: + def __init__(self, tokenizer: Tokenizer): + self.tokenizer = tokenizer + + def encode_header(self, message: Message) -> List[int]: + tokens = [] + tokens.append(self.tokenizer.special_tokens["<|start_header_id|>"]) + tokens.extend(self.tokenizer.encode(message["role"], bos=False, eos=False)) + tokens.append(self.tokenizer.special_tokens["<|end_header_id|>"]) + tokens.extend(self.tokenizer.encode("\n\n", bos=False, eos=False)) + return tokens + + def encode_message(self, message: Message) -> List[int]: + tokens = self.encode_header(message) + tokens.extend( + self.tokenizer.encode(message["content"].strip(), bos=False, eos=False) + ) + tokens.append(self.tokenizer.special_tokens["<|eot_id|>"]) + return tokens + + def encode_dialog_prompt(self, dialog: Dialog) -> List[int]: + tokens = [] + tokens.append(self.tokenizer.special_tokens["<|begin_of_text|>"]) + for message in dialog: + tokens.extend(self.encode_message(message)) + # Add the start of an assistant message for the model to complete + tokens.extend(self.encode_header({"role": "assistant", "content": ""})) + return tokens diff --git a/tokenizer/tokenizer.h b/tokenizer/tokenizer.h new file mode 
100644 index 000000000..9e1977b71 --- /dev/null +++ b/tokenizer/tokenizer.h @@ -0,0 +1,147 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// A simple Tokenizer interface. +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "sentencepiece_processor.h" + +class Tokenizer { + public: + explicit Tokenizer() {} + virtual ~Tokenizer() {} + + virtual void load(const std::string& tokenizer_path) = 0; + + virtual std::vector + encode(const std::string& input, int8_t bos, int8_t eos) = 0; + + virtual std::string decode(uint64_t prev_token, uint64_t token) = 0; + + // getters + int32_t vocab_size() const { + return vocab_size_; + } + + uint64_t bos_tok() const { + return bos_tok_; + } + + uint64_t eos_tok() const { + return eos_tok_; + } + + protected: + bool initialized_ = false; + int32_t vocab_size_; + uint64_t bos_tok_, eos_tok_; +}; + +// ----------------------- SPTokenizer ----------------------- +// Used by sentencepiece. Adapted from llama2.c. +struct TokenIndex { + const char* str; + int32_t id; +}; + +class SPTokenizer : public Tokenizer { + public: + explicit SPTokenizer(); + ~SPTokenizer() override; + + void load(const std::string& tokenizer_path) override; + + std::vector encode(const std::string& input, int8_t bos, int8_t eos) + override; + + std::string decode(uint64_t prev_token, uint64_t token) override; + + private: + std::unique_ptr _processor; +}; + +// ----------------------- Tiktoken ----------------------- +// Used by OpenAI, adapted from https://github.com/sewenew/tokenizer + +using Encoder = std::unordered_map; +using Decoder = std::unordered_map; +using Re2UPtr = std::unique_ptr; + +class Tiktoken : public Tokenizer { + public: + explicit Tiktoken(); + ~Tiktoken(){}; + + void load(const std::string& tokenizer_path); + + std::vector + encode(const std::string& input, int8_t bos, int8_t eos); + + std::string decode(uint64_t prev_token, uint64_t token); + + private: + static inline const Encoder _get_special_tokens(ssize_t num_base_tokens) { + Encoder special_tokens; + special_tokens.emplace("<|begin_of_text|>", num_base_tokens++); + special_tokens.emplace("<|end_of_text|>", num_base_tokens++); + special_tokens.emplace("<|reserved_special_token_0|>", num_base_tokens++); + special_tokens.emplace("<|reserved_special_token_1|>", num_base_tokens++); + special_tokens.emplace("<|reserved_special_token_2|>", num_base_tokens++); + special_tokens.emplace("<|reserved_special_token_3|>", num_base_tokens++); + special_tokens.emplace("<|start_header_id|>", num_base_tokens++); + special_tokens.emplace("<|end_header_id|>", num_base_tokens++); + special_tokens.emplace("<|reserved_special_token_4|>", num_base_tokens++); + special_tokens.emplace("<|eot_id|>", num_base_tokens++); + for (auto i = 5; i < 251; ++i) { + special_tokens.emplace( + "<|reserved_special_token_" + std::to_string(i) + "|>", + num_base_tokens++); + } + return special_tokens; + } + + template + std::pair, re2::StringPiece> + _split_with_allowed_special_token( + re2::StringPiece& input, + const T& allowed_special); + + void _encode( + re2::StringPiece& input, + std::vector& ret, + uint64_t& last_piece_token_len); + + template + std::pair, uint64_t> _encode_with_special_token( + const std::string& text, + const T& allowed_special); + + // 
Removed negative lookahead \s+(?!\S) since it's not supported by RE2. + const std::string _pattern = + R"((?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+)"; + Encoder _encoder; + Encoder _special_token_encoder; + Decoder _decoder; + Decoder _special_token_decoder; + + Re2UPtr _regex; + Re2UPtr _special_token_regex; +}; diff --git a/torchchat/utils/scripts/build_native.sh b/torchchat/utils/scripts/build_native.sh index 909fd2b97..3c2c1c846 100755 --- a/torchchat/utils/scripts/build_native.sh +++ b/torchchat/utils/scripts/build_native.sh @@ -64,7 +64,7 @@ fi pushd ${TORCHCHAT_ROOT} -git submodule update --init --recursive +git submodule update --init git submodule sync if [[ "$TARGET" == "et" ]]; then if [ ! -d "${TORCHCHAT_ROOT}/${ET_BUILD_DIR}/install" ]; then From bc0c1dc0d92cc578e86065201ca03aba81cba584 Mon Sep 17 00:00:00 2001 From: Michael Gschwind <61328285+mikekgfb@users.noreply.github.com> Date: Tue, 10 Dec 2024 13:03:08 -0800 Subject: [PATCH 32/83] Update README.md (whitespace) (#1412) Consistent spacing around punctuation --- README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 3c37edf09..f1c70acfb 100644 --- a/README.md +++ b/README.md @@ -45,16 +45,16 @@ aliases. | Model | Mobile Friendly | Notes | |------------------|---|---------------------| -|[meta-llama/Meta-Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct)|✅|Tuned for `chat` . Alias to `llama3.2-3b`.| +|[meta-llama/Meta-Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct)|✅|Tuned for `chat`. Alias to `llama3.2-3b`.| |[meta-llama/Meta-Llama-3.2-3B](https://huggingface.co/meta-llama/Llama-3.2-3B)|✅|Best for `generate`. Alias to `llama3.2-3b-base`.| -|[meta-llama/Llama-Guard-3-1B](https://huggingface.co/meta-llama/Llama-Guard-3-1B)|✅|Tuned for classification . Alias to `llama3-1b-guard`.| -|[meta-llama/Meta-Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct)|✅|Tuned for `chat` . Alias to `llama3.2-1b`.| +|[meta-llama/Llama-Guard-3-1B](https://huggingface.co/meta-llama/Llama-Guard-3-1B)|✅|Tuned for classification. Alias to `llama3-1b-guard`.| +|[meta-llama/Meta-Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct)|✅|Tuned for `chat`. Alias to `llama3.2-1b`.| |[meta-llama/Meta-Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B)|✅|Best for `generate`. Alias to `llama3.2-1b-base`.| -|[meta-llama/Llama-3.2-11B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct)||Multimodal (Image + Text). Tuned for `chat` . Alias to `llama3.2-11B`.| -|[meta-llama/Llama-3.2-11B-Vision](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision)||Multimodal (Image + Text). Tuned for `generate` . Alias to `llama3.2-11B-base`.| -|[meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct)|✅|Tuned for `chat` . Alias to `llama3.1`.| +|[meta-llama/Llama-3.2-11B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct)||Multimodal (Image + Text). Tuned for `chat`. Alias to `llama3.2-11B`.| +|[meta-llama/Llama-3.2-11B-Vision](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision)||Multimodal (Image + Text). Tuned for `generate`. Alias to `llama3.2-11B-base`.| +|[meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct)|✅|Tuned for `chat`. 
Alias to `llama3.1`.| |[meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B)|✅|Best for `generate`. Alias to `llama3.1-base`.| -|[meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)|✅|Tuned for `chat` . Alias to `llama3`.| +|[meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)|✅|Tuned for `chat`. Alias to `llama3`.| |[meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B)|✅|Best for `generate`. Alias to `llama3-base`.| |[meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)|✅|Tuned for `chat`. Alias to `llama2`.| |[meta-llama/Llama-2-13b-chat-hf](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf)||Tuned for `chat`. Alias to `llama2-13b-chat`.| From dfbd8652fb818c252cf116ea03de7df3b0c79888 Mon Sep 17 00:00:00 2001 From: Michael Gschwind <61328285+mikekgfb@users.noreply.github.com> Date: Tue, 10 Dec 2024 13:03:58 -0800 Subject: [PATCH 33/83] Update evaluation.md to include AOTI (#1411) * Update evaluation.md to include AOTI Update evaluation.md to include running tests with AOTI * Update evaluation.md Fix typo * Formatting evaluation.md Fix formatting * Formatting evaluation.md --- torchchat/utils/docs/evaluation.md | 33 +++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/torchchat/utils/docs/evaluation.md b/torchchat/utils/docs/evaluation.md index 8bc995ca7..ac2aa54d3 100644 --- a/torchchat/utils/docs/evaluation.md +++ b/torchchat/utils/docs/evaluation.md @@ -21,24 +21,51 @@ library. The evaluation mode of `torchchat.py` script can be used to evaluate your language model on various tasks available in the `lm_eval` library such as "wikitext". You can specify the task(s) you want to evaluate using the `--tasks` option, and limit the evaluation using the `--limit` option. If no task is specified, the task will default to evaluating on "wikitext". 
-**Examples** +## Examples + +### Evaluation example with model in Python Running wikitext for 10 iterations ``` python3 torchchat.py eval stories15M --tasks wikitext --limit 10 ``` -Running an exported model +Running wikitext with torch.compile for 10 iterations +``` +python3 torchchat.py eval stories15M --compile --tasks wikitext --limit 10 +``` + +Running multiple tasks and calling eval.py directly (with torch.compile): +``` +python3 torchchat.py eval stories15M --compile --tasks wikitext hellaswag +``` + +### Evaluation with model exported to PTE with ExecuTorch + +Running an exported model with ExecuTorch (as PTE) ``` python3 torchchat.py export stories15M --output-pte-path stories15M.pte python3 torchchat.py eval stories15M --pte-path stories15M.pte ``` -Running multiple tasks and calling eval.py directly: +Running multiple tasks and calling eval.py directly (with PTE): ``` python3 torchchat.py eval stories15M --pte-path stories15M.pte --tasks wikitext hellaswag ``` +### Evaluation with model exported to DSO with AOT Inductor (AOTI) + +Running an exported model with AOT Inductor (DSO model) +``` +python3 torchchat.py export stories15M --dtype fast16 --output-dso-path stories15M.so +python3 torchchat.py eval stories15M --dtype fast16 --dso-path stories15M.so +``` + +Running multiple tasks and calling eval.py directly (with AOTI): +``` +python3 torchchat.py eval stories15M --dso-path stories15M.so --tasks wikitext hellaswag +``` + For more information and a list of tasks/metrics see [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness). [end default]: end From 19ecd955828925da6947381df9e1edd7d7c9a28c Mon Sep 17 00:00:00 2001 From: Michael Gschwind <61328285+mikekgfb@users.noreply.github.com> Date: Tue, 10 Dec 2024 17:27:23 -0800 Subject: [PATCH 34/83] Update ADVANCED-USERS.md (#1396) * Update ADVANCED-USERS.md * Avoid bleed-thru of markup for test * Remove prerelease warning (consistent with other files, we have a general disclaimer in docs already * Update ADVANCED-USERS.md Fix typo --------- Co-authored-by: Jack-Khuu --- docs/ADVANCED-USERS.md | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/docs/ADVANCED-USERS.md b/docs/ADVANCED-USERS.md index b996bf202..a8d02c2f9 100644 --- a/docs/ADVANCED-USERS.md +++ b/docs/ADVANCED-USERS.md @@ -1,22 +1,20 @@ > [!WARNING] > Files in this directory may be outdated, incomplete, scratch notes, or a WIP. torchchat provides no guarantees on these files as references. Please refer to the root README for stable features and documentation. -# Torchchat is still in pre-release! - - -Torchchat is currently in a pre-release state and under extensive development. - # The Lost Manual: torchchat [**Introduction**](#introduction) | [**Installation**](#installation) | [**Get Started**](#get-started) | [**Download**](#download) | [**Chat**](#chat) | [**Generate**](#generate) | [**Eval**](#eval) | [**Export**](#export) | [**Supported Systems**](#supported-systems) | [**Contributing**](#contributing) | [**License**](#license) + This is the advanced users' guide, if you're looking to get started with LLMs, please refer to the README at the root directory of the @@ -465,7 +463,7 @@ significant impact on accuracy. ## Native (Stand-Alone) Execution of Exported Models -Refer to the [README](README.md] for an introduction to native +Refer to the [README](README.md) for an introduction to native execution on servers, desktops, and laptops. 
Mobile and Edge execution for Android and iOS are described under [torchchat/edge/docs/Android.md] and [torchchat/edge/docs/iOS.md], respectively. From 1315275f3f3580b41a4ebbdcc3118966b4609c12 Mon Sep 17 00:00:00 2001 From: Jack-Khuu Date: Thu, 12 Dec 2024 00:25:59 -0800 Subject: [PATCH 35/83] Bump PT pin to 20241028 (#1419) * Bump PT pin to 20241014 * Push bump to 1028 and add migration to export_for_training --- install/install_requirements.sh | 4 ++-- torchchat/export.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/install/install_requirements.sh b/install/install_requirements.sh index 3e1f9a655..eab92a4f1 100755 --- a/install/install_requirements.sh +++ b/install/install_requirements.sh @@ -62,10 +62,10 @@ echo "Using pip executable: $PIP_EXECUTABLE" # NOTE: If a newly-fetched version of the executorch repo changes the value of # PYTORCH_NIGHTLY_VERSION, you should re-run this script to install the necessary # package versions. -PYTORCH_NIGHTLY_VERSION=dev20241013 +PYTORCH_NIGHTLY_VERSION=dev20241028 # Nightly version for torchvision -VISION_NIGHTLY_VERSION=dev20241013 +VISION_NIGHTLY_VERSION=dev20241028 # Nightly version for torchtune TUNE_NIGHTLY_VERSION=dev20241013 diff --git a/torchchat/export.py b/torchchat/export.py index 7c5243b68..979778b7c 100644 --- a/torchchat/export.py +++ b/torchchat/export.py @@ -122,7 +122,7 @@ def export_for_server( from executorch.exir.tracer import Value from torch._export import capture_pre_autograd_graph - from torch.export import export, ExportedProgram + from torch.export import export, export_for_training, ExportedProgram from torchchat.model import apply_rotary_emb, Attention from torchchat.utils.build_utils import get_precision @@ -238,7 +238,7 @@ def _to_core_aten( raise ValueError( f"Expected passed in model to be an instance of fx.GraphModule, got {type(model)}" ) - core_aten_ep = export(model, example_inputs, dynamic_shapes=dynamic_shapes) + core_aten_ep = export_for_training(model, example_inputs, dynamic_shapes=dynamic_shapes) if verbose: logging.info(f"Core ATen graph:\n{core_aten_ep.graph}") return core_aten_ep From 1d7e71f0cde3987565335becfb6893d6300dfe6f Mon Sep 17 00:00:00 2001 From: Michael Gschwind <61328285+mikekgfb@users.noreply.github.com> Date: Thu, 12 Dec 2024 11:10:32 -0800 Subject: [PATCH 36/83] Avoid curl fails due to server startup time in CI(#1418) Add sleep after server startup to make sure server ready prior to client request via `curl` Co-authored-by: Jack-Khuu --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f1c70acfb..91f5a7e51 100644 --- a/README.md +++ b/README.md @@ -231,7 +231,7 @@ python3 torchchat.py server llama3.1 ``` [skip default]: end -[shell default]: python3 torchchat.py server llama3.1 & server_pid=$! +[shell default]: python3 torchchat.py server llama3.1 & server_pid=$! ; sleep 90 # wait for server to be ready to accept requests In another terminal, query the server using `curl`. Depending on the model configuration, this query might take a few minutes to respond. 
From 36d071245161070db36f088541fc5bb9aec677da Mon Sep 17 00:00:00 2001 From: Manuel Candales <42380156+manuelcandales@users.noreply.github.com> Date: Fri, 13 Dec 2024 15:07:24 -0500 Subject: [PATCH 37/83] Add torchao mps ops (#1415) --- .github/workflows/pull.yml | 38 ++++++++++++++++++++ docs/quantization.md | 26 ++++++++++++++ install/.pins/torchao-pin.txt | 2 +- torchchat/utils/quantize.py | 33 +++++++++-------- torchchat/utils/scripts/build_torchao_ops.sh | 7 +++- torchchat/utils/scripts/install_utils.sh | 14 ++++++-- 6 files changed, 101 insertions(+), 19 deletions(-) diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index c48436a80..d25c674dd 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -1124,3 +1124,41 @@ jobs: echo "Generate AOTI" python torchchat.py generate stories110M --aoti-package-path ./model.pt2 --prompt "${PRMT}" echo "Tests complete." + + test-torchao-experimental-mps: + strategy: + matrix: + runner: [macos-m1-stable] + runs-on: ${{matrix.runner}} + steps: + - name: Checkout repo + uses: actions/checkout@v3 + with: + submodules: true + - name: Setup Python + uses: actions/setup-python@v2 + with: + python-version: 3.10.11 + - name: Print machine info + run: | + uname -a + if [ $(uname -s) == Darwin ]; then + sysctl machdep.cpu.brand_string + sysctl machdep.cpu.core_count + fi + - name: Install torchchat + run: | + echo "Intalling pip3 packages" + ./install/install_requirements.sh + pip3 list + python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")' + - name: Install torchao-ops-mps + id: install-torchao-ops-mps + run: | + bash torchchat/utils/scripts/build_torchao_ops.sh mps + - name: Run inference + run: | + python torchchat.py download stories110M + export PRMT="Once upon a time in a land far away" + echo "Generate eager" + python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}" --device mps --dtype float32 --quantize '{"linear:afpwx": {"bitwidth": 3, "groupsize": 32}}' diff --git a/docs/quantization.md b/docs/quantization.md index 5007946bb..08086d8d1 100644 --- a/docs/quantization.md +++ b/docs/quantization.md @@ -196,6 +196,32 @@ Note: only the ExecuTorch C++ runner in torchchat when built using the instructi ./cmake-out/et_run llama3_1.pte -z $HOME/.torchchat/model-cache/meta-llama/Meta-Llama-3.1-8B-Instruct/tokenizer.model -l 3 -i "Once upon a time," ``` +## Experimental TorchAO MPS lowbit kernels + +WARNING: These kernels only work on devices with Apple Silicon. + +### Use + +#### linear:afpwx +The quantization scheme linear:afpwx quantizes only the weights in a groupwise manner with a specified bitwidth and groupsize. +It takes arguments bitwidth (1, 2, 3, 4, 5, 6, 7) and groupsize (32, 64, 128, 256). + +### Setup +To use linear:afpwx, you must set up the torchao mps experimental kernels. These will only work on device with Apple Silicon. +Currently, torchchat can only run them on Eager mode. 
+ +From the torchchat root directory, run +``` +sh torchchat/utils/scripts/build_torchao_ops.sh mps +``` + +### Examples + +#### Eager mode +``` +python3 torchchat.py generate stories110M --device mps --dtype float32 --quantize '{"linear:afpwx": {"bitwidth": 4, "groupsize": 256}}' --prompt "Once upon a time," --num-samples 5 +``` + ## Quantization Profiles Four [sample profiles](https://github.com/pytorch/torchchat/tree/main/torchchat/quant_config/) are included with the torchchat distribution: `cuda.json`, `desktop.json`, `mobile.json`, `pi5.json` diff --git a/install/.pins/torchao-pin.txt b/install/.pins/torchao-pin.txt index 40f083249..80a4751bc 100644 --- a/install/.pins/torchao-pin.txt +++ b/install/.pins/torchao-pin.txt @@ -1 +1 @@ -c8f1174a06dcc0102849c8348ca6573bde8847a9 +7d7c14e898eca3fe66138d2a9445755a9270b800 diff --git a/torchchat/utils/quantize.py b/torchchat/utils/quantize.py index bda695ae2..7f060c365 100644 --- a/torchchat/utils/quantize.py +++ b/torchchat/utils/quantize.py @@ -142,6 +142,11 @@ def quantize_model( ) set_precision(torch.float32) + if quantizer == "linear:afpwx" and device != "mps": + raise RuntimeError( + "linear:afpwx quantization can only run on mps device!" + ) + # We set global precision from quantize options if it is specified at cli.py:485 # so the precision returned by get_precision() is always the authoritative precision/dtype in torchchat precision = get_precision() @@ -813,10 +818,12 @@ def quantized_model(self) -> nn.Module: from torchao_experimental_quant_api import ( Int8DynActIntxWeightLinearQuantizer, IntxWeightEmbeddingQuantizer, + UIntxWeightOnlyLinearQuantizer, ) quantizer_class_dict["linear:a8wxdq"] = Int8DynActIntxWeightLinearQuantizer quantizer_class_dict["embedding:wx"] = IntxWeightEmbeddingQuantizer + quantizer_class_dict["linear:afpwx"] = UIntxWeightOnlyLinearQuantizer # Try loading custom op try: @@ -826,20 +833,16 @@ def quantized_model(self) -> nn.Module: libs = list(filter(lambda l: (l.endswith("so") or l.endswith("dylib")), libs)) torch.ops.load_library(libs[0]) except Exception as e: - print("Failed to torchao ops library with error: ", e) - print("Slow fallback kernels will be used.") - -except Exception as e: + print( + "Unabled to load torchao cpu ops library. Slow fallback kernels will be used." + ) - class ErrorHandler(QuantHandler): - def __init__( - self, model: Optional[nn.Module] = None, device="cpu", precision=None - ): - global torchao_experimental_load_error - raise Exception( - f"Note: Failed to load torchao experimental quantizer with error: {torchao_experimental_load_error}" - ) + try: + libname = "libtorchao_ops_mps_aten.dylib" + libpath = f"{torchao_build_path}/cmake-out/lib/{libname}" + torch.ops.load_library(libpath) + except Exception as e: + print("Unabled to load torchao mps ops library.") - torchao_experimental_load_error = e - quantizer_class_dict["linear:a8wxdq"] = ErrorHandler - quantizer_class_dict["embedding:wx"] = ErrorHandler +except Exception as e: + print("Unabled to import torchao experimental quant_api with error: ", e) diff --git a/torchchat/utils/scripts/build_torchao_ops.sh b/torchchat/utils/scripts/build_torchao_ops.sh index a8fd8bea2..46e2479ac 100644 --- a/torchchat/utils/scripts/build_torchao_ops.sh +++ b/torchchat/utils/scripts/build_torchao_ops.sh @@ -5,12 +5,17 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+device=${1:-cpu} +if [[ "$device" != "cpu" && "$device" != "mps" ]]; then + echo "Invalid argument: $device. Valid values are 'cpu' or 'mps'." >&2 + exit 1 +fi source "$(dirname "${BASH_SOURCE[0]}")/install_utils.sh" pushd ${TORCHCHAT_ROOT} find_cmake_prefix_path clone_torchao -install_torchao_aten_ops +install_torchao_aten_ops "$device" popd diff --git a/torchchat/utils/scripts/install_utils.sh b/torchchat/utils/scripts/install_utils.sh index 84966cc35..94378960a 100644 --- a/torchchat/utils/scripts/install_utils.sh +++ b/torchchat/utils/scripts/install_utils.sh @@ -184,8 +184,18 @@ clone_torchao() { } install_torchao_aten_ops() { - echo "Building torchao custom ops for ATen" - pushd ${TORCHCHAT_ROOT}/torchao-build/src/ao/torchao/experimental + local device=${1:-cpu} + + if [[ "$device" == "cpu" ]]; then + echo "Building torchao custom ops for ATen" + pushd ${TORCHCHAT_ROOT}/torchao-build/src/ao/torchao/experimental + elif [[ "$device" == "mps" ]]; then + echo "Building torchao mps custom ops for ATen" + pushd ${TORCHCHAT_ROOT}/torchao-build/src/ao/torchao/experimental/ops/mps + else + echo "Invalid argument: $device. Valid values are 'cpu' or 'mps'." >&2 + return 1 + fi CMAKE_OUT_DIR=${TORCHCHAT_ROOT}/torchao-build/cmake-out cmake -DCMAKE_PREFIX_PATH=${MY_CMAKE_PREFIX_PATH} \ From 5bc55525370dfea6af724912efe7e2e00e5980b0 Mon Sep 17 00:00:00 2001 From: Jack-Khuu Date: Fri, 13 Dec 2024 16:22:42 -0800 Subject: [PATCH 38/83] Multi Pin Bumps across PT/AO/tune/ET: pt dev20241213 (#1367) * Bump PyTorch pin to 20241111 * bump to 1112 * Update install_requirements.sh * Update install_requirements.sh * Update checkpoint.py typo * Update install_requirements.sh * Update install_requirements.sh * Update install_requirements.sh * Bump pins, waiting for nvjit fix * Update install_requirements.sh * bump tune * fix tune major version * Bump AO pin to pick up import fix * misc * Update linux_job CI to v2 * Update install_requirements.sh PT pin to 1202 * Vision nightly is delayed * Bump Cuda version; drop PT version to one with vision nightly * Bump to 1205 vision nightly * Vision nightly 1205 needs 1204 torch(?) 
* Drop PT version to 1126 (friendly vision version), update devtoolset to 11 for almalinux * Test download toolchain instead of binutils * Test removing devtoolset * Remove dep on devtoolset 11 that doesnt' exist on the new machine * Bump ET pin * Test nightly with updated vision * Attempt to account for int4wo packing pt#139611 * Naive gguf int4wo attempt * Update install_requirements.sh to 1210 * Update install_requirements.sh to 20241213 Should fix the MacOS wheel regression * Update torchvision minor version to 22 --- .github/workflows/more-tests.yml | 10 +- .github/workflows/periodic.yml | 4 +- .github/workflows/pull.yml | 42 +++------ .github/workflows/run-readme-periodic.yml | 27 ++---- .github/workflows/run-readme-pr.yml | 108 +++++----------------- .github/workflows/runner-cuda-dtype.yml | 10 +- install/.pins/torchao-pin.txt | 2 +- install/install_requirements.sh | 16 ++-- torchchat/cli/builder.py | 3 + torchchat/distributed/checkpoint.py | 1 + torchchat/utils/gguf_loader.py | 17 +++- 11 files changed, 74 insertions(+), 166 deletions(-) diff --git a/.github/workflows/more-tests.yml b/.github/workflows/more-tests.yml index 1e0652c96..f47740fe3 100644 --- a/.github/workflows/more-tests.yml +++ b/.github/workflows/more-tests.yml @@ -9,23 +9,17 @@ on: jobs: test-cuda: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" timeout: 60 script: | echo "::group::Print machine info" uname -a echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - - echo "::group::Download checkpoints" # Install requirements ./install/install_requirements.sh cuda diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml index a9561e3e8..5a0d9920b 100644 --- a/.github/workflows/periodic.yml +++ b/.github/workflows/periodic.yml @@ -108,7 +108,7 @@ jobs: set -eux PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py --event "periodic" --backend "gpu" test-gpu: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main name: test-gpu (${{ matrix.platform }}, ${{ matrix.model_name }}) needs: gather-models-gpu secrets: inherit @@ -119,7 +119,7 @@ jobs: secrets-env: "HF_TOKEN_PERIODIC" runner: ${{ matrix.runner }} gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" script: | echo "::group::Print machine info" nvidia-smi diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index d25c674dd..623b0e80f 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -215,7 +215,7 @@ jobs: set -eux PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py --event "pull_request" --backend "gpu" test-gpu-compile: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main name: test-gpu-compile (${{ matrix.platform }}, ${{ matrix.model_name }}) needs: gather-models-gpu strategy: @@ -224,7 +224,7 @@ jobs: with: runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" script: | echo "::group::Print machine info" nvidia-smi @@ -250,7 +250,7 @@ jobs: echo "::endgroup::" test-gpu-aoti-bfloat16: - uses: 
pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main name: test-gpu-aoti-bfloat16 (${{ matrix.platform }}, ${{ matrix.model_name }}) needs: gather-models-gpu strategy: @@ -259,18 +259,13 @@ jobs: with: runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" timeout: 60 script: | echo "::group::Print machine info" nvidia-smi echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - echo "::group::Install required packages" ./install/install_requirements.sh cuda pip3 list @@ -291,7 +286,7 @@ jobs: echo "::endgroup::" test-gpu-aoti-float32: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main name: test-gpu-aoti-float32 (${{ matrix.platform }}, ${{ matrix.model_name }}) needs: gather-models-gpu strategy: @@ -300,17 +295,12 @@ jobs: with: runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" script: | echo "::group::Print machine info" nvidia-smi echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - echo "::group::Install required packages" ./install/install_requirements.sh cuda pip list @@ -337,7 +327,7 @@ jobs: echo "::endgroup::" test-gpu-aoti-float16: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main name: test-gpu-aoti-float16 (${{ matrix.platform }}, ${{ matrix.model_name }}) needs: gather-models-gpu strategy: @@ -346,17 +336,12 @@ jobs: with: runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" script: | echo "::group::Print machine info" nvidia-smi echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - echo "::group::Install required packages" ./install/install_requirements.sh cuda pip list @@ -384,7 +369,7 @@ jobs: echo "::endgroup::" test-gpu-eval-sanity-check: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main name: test-gpu-eval-sanity-check (${{ matrix.platform }}, ${{ matrix.model_name }}) needs: gather-models-gpu strategy: @@ -393,17 +378,12 @@ jobs: with: runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" script: | echo "::group::Print machine info" nvidia-smi echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - echo "::group::Install required packages" ./install/install_requirements.sh cuda pip3 list @@ -1031,7 +1011,7 @@ jobs: echo "Tests complete." 
test-build-runner-et-android: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.4xlarge script: | diff --git a/.github/workflows/run-readme-periodic.yml b/.github/workflows/run-readme-periodic.yml index 6a933b5f1..61501e0c4 100644 --- a/.github/workflows/run-readme-periodic.yml +++ b/.github/workflows/run-readme-periodic.yml @@ -10,24 +10,19 @@ on: jobs: test-readme: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main secrets: inherit with: runner: linux.g5.4xlarge.nvidia.gpu secrets-env: "HF_TOKEN_PERIODIC" gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" timeout: 60 script: | echo "::group::Print machine info" uname -a echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - echo "::group::Create script to run README" python3 torchchat/utils/scripts/updown.py --create-sections --file README.md > ./run-readme.sh # for good measure, if something happened to updown processor, @@ -44,23 +39,18 @@ jobs: test-quantization-any: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu secrets: inherit gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" timeout: 60 script: | echo "::group::Print machine info" uname -a echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - echo "::group::Create script to run quantization" python3 torchchat/utils/scripts/updown.py --create-sections --file docs/quantization.md > ./run-quantization.sh # for good measure, if something happened to updown processor, @@ -76,24 +66,19 @@ jobs: echo "::endgroup::" test-gguf-any: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main secrets: inherit with: runner: linux.g5.4xlarge.nvidia.gpu secrets-env: "HF_TOKEN_PERIODIC" gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" timeout: 60 script: | echo "::group::Print machine info" uname -a echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - echo "::group::Create script to run gguf" python3 torchchat/utils/scripts/updown.py --file docs/GGUF.md > ./run-gguf.sh # for good measure, if something happened to updown processor, diff --git a/.github/workflows/run-readme-pr.yml b/.github/workflows/run-readme-pr.yml index 1dc2942ef..8694757e7 100644 --- a/.github/workflows/run-readme-pr.yml +++ b/.github/workflows/run-readme-pr.yml @@ -9,22 +9,17 @@ on: jobs: test-readme-any: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" timeout: 60 script: | echo "::group::Print machine info" uname -a echo "::endgroup::" - echo "::group::Install newer objcopy that supports 
--set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - .ci/scripts/run-docs readme echo "::group::Completion" @@ -33,22 +28,17 @@ jobs: echo "::endgroup::" test-readme-cpu: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" timeout: 60 script: | echo "::group::Print machine info" uname -a echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs readme echo "::group::Completion" @@ -57,22 +47,17 @@ jobs: echo "::endgroup::" test-quantization-any: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" timeout: 60 script: | echo "::group::Print machine info" uname -a echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - .ci/scripts/run-docs quantization echo "::group::Completion" @@ -81,41 +66,31 @@ jobs: echo "::endgroup::" test-quantization-cpu: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" timeout: 60 script: | echo "::group::Print machine info" uname -a echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs quantization test-gguf-any: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" timeout: 60 script: | echo "::group::Print machine info" uname -a echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - .ci/scripts/run-docs gguf echo "::group::Completion" @@ -124,22 +99,17 @@ jobs: echo "::endgroup::" test-gguf-cpu: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" timeout: 60 script: | echo "::group::Print machine info" uname -a echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs gguf echo "::group::Completion" @@ -149,22 +119,17 @@ jobs: test-advanced-any: - uses: 
pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" timeout: 60 script: | echo "::group::Print machine info" uname -a echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - .ci/scripts/run-docs advanced echo "::group::Completion" @@ -174,22 +139,17 @@ jobs: test-advanced-cpu: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" timeout: 60 script: | echo "::group::Print machine info" uname -a echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs advanced echo "::group::Completion" @@ -198,22 +158,17 @@ jobs: echo "::endgroup::" test-evaluation-any: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" timeout: 60 script: | echo "::group::Print machine info" uname -a echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - .ci/scripts/run-docs evaluation echo "::group::Completion" @@ -222,22 +177,17 @@ jobs: echo "::endgroup::" test-evaluation-cpu: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" timeout: 60 script: | echo "::group::Print machine info" uname -a echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs evaluation echo "::group::Completion" @@ -246,22 +196,17 @@ jobs: echo "::endgroup::" test-multimodal-any: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" timeout: 60 script: | echo "::group::Print machine info" uname -a echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - .ci/scripts/run-docs multimodal echo "::group::Completion" @@ -270,22 +215,17 @@ jobs: echo "::endgroup::" test-multimodal-cpu: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" 
timeout: 60 script: | echo "::group::Print machine info" uname -a echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs multimodal test-native-any: diff --git a/.github/workflows/runner-cuda-dtype.yml b/.github/workflows/runner-cuda-dtype.yml index b83b9904b..1813f483e 100644 --- a/.github/workflows/runner-cuda-dtype.yml +++ b/.github/workflows/runner-cuda-dtype.yml @@ -9,24 +9,18 @@ on: jobs: test-runner-aot-cuda: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu secrets-env: "HF_TOKEN_PERIODIC" gpu-arch-type: cuda - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" timeout: 60 script: | echo "::group::Print machine info" uname -a echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - - echo "::group::Download checkpoints" # Install requirements diff --git a/install/.pins/torchao-pin.txt b/install/.pins/torchao-pin.txt index 80a4751bc..c6161e78f 100644 --- a/install/.pins/torchao-pin.txt +++ b/install/.pins/torchao-pin.txt @@ -1 +1 @@ -7d7c14e898eca3fe66138d2a9445755a9270b800 +7d7c14e898eca3fe66138d2a9445755a9270b800 \ No newline at end of file diff --git a/install/install_requirements.sh b/install/install_requirements.sh index eab92a4f1..3db559dbc 100755 --- a/install/install_requirements.sh +++ b/install/install_requirements.sh @@ -62,13 +62,13 @@ echo "Using pip executable: $PIP_EXECUTABLE" # NOTE: If a newly-fetched version of the executorch repo changes the value of # PYTORCH_NIGHTLY_VERSION, you should re-run this script to install the necessary # package versions. -PYTORCH_NIGHTLY_VERSION=dev20241028 +PYTORCH_NIGHTLY_VERSION=dev20241213 # Nightly version for torchvision -VISION_NIGHTLY_VERSION=dev20241028 +VISION_NIGHTLY_VERSION=dev20241213 # Nightly version for torchtune -TUNE_NIGHTLY_VERSION=dev20241013 +TUNE_NIGHTLY_VERSION=dev20241126 # Uninstall triton, as nightly will depend on pytorch-triton, which is one and the same ( @@ -81,7 +81,7 @@ TUNE_NIGHTLY_VERSION=dev20241013 # with cuda for faster execution on cuda GPUs. if [[ -x "$(command -v nvidia-smi)" ]]; then - TORCH_NIGHTLY_URL="https://download.pytorch.org/whl/nightly/cu121" + TORCH_NIGHTLY_URL="https://download.pytorch.org/whl/nightly/cu124" elif [[ -x "$(command -v rocminfo)" ]]; then TORCH_NIGHTLY_URL="https://download.pytorch.org/whl/nightly/rocm6.2" @@ -92,8 +92,8 @@ fi # pip packages needed by exir. REQUIREMENTS_TO_INSTALL=( torch=="2.6.0.${PYTORCH_NIGHTLY_VERSION}" - torchvision=="0.20.0.${VISION_NIGHTLY_VERSION}" - torchtune=="0.4.0.${TUNE_NIGHTLY_VERSION}" + torchvision=="0.22.0.${VISION_NIGHTLY_VERSION}" + torchtune=="0.5.0.${TUNE_NIGHTLY_VERSION}" ) # Install the requirements. --extra-index-url tells pip to look for package @@ -104,9 +104,11 @@ REQUIREMENTS_TO_INSTALL=( "${REQUIREMENTS_TO_INSTALL[@]}" ) +# For torchao need to install from github since nightly build doesn't have macos build. 
+# TODO: Remove this and install nightly build, once it supports macos ( set -x - $PIP_EXECUTABLE install torchao=="0.5.0" + $PIP_EXECUTABLE install git+https://github.com/pytorch/ao.git@2f97b0955953fa1a46594a27f0df2bc48d93e79d ) if [[ -x "$(command -v nvidia-smi)" ]]; then diff --git a/torchchat/cli/builder.py b/torchchat/cli/builder.py index a39a2ed95..2773c8372 100644 --- a/torchchat/cli/builder.py +++ b/torchchat/cli/builder.py @@ -373,6 +373,8 @@ def _load_model_gguf(builder_args: BuilderArgs) -> Model: kwargs = {} else: kwargs = builder_args.gguf_kwargs + + kwargs.setdefault("device", builder_args.device) model = Model.from_gguf(builder_args.gguf_path, **kwargs) return model @@ -396,6 +398,7 @@ def _load_checkpoint(builder_args: BuilderArgs): os.path.join(builder_args.checkpoint_dir, cp_name), map_location=builder_args.device, mmap=True, + weights_only=False, ) ) checkpoint = {} diff --git a/torchchat/distributed/checkpoint.py b/torchchat/distributed/checkpoint.py index 1830e3a75..11e397469 100644 --- a/torchchat/distributed/checkpoint.py +++ b/torchchat/distributed/checkpoint.py @@ -96,6 +96,7 @@ def _load_checkpoints_from_storage( checkpoint_path, map_location=builder_args.device, mmap=True, + weights_only=False, ) diff --git a/torchchat/utils/gguf_loader.py b/torchchat/utils/gguf_loader.py index 309ff807c..c69bdf469 100644 --- a/torchchat/utils/gguf_loader.py +++ b/torchchat/utils/gguf_loader.py @@ -122,7 +122,7 @@ def linear_int4(input, weight_int4pack, scales_and_zeros, out_features, groupsiz input.dtype ) # cast back to input.dtype else: - c = torch.ops.aten._weight_int4pack_mm( + c = torch.ops.aten._weight_int4pack_mm_for_cpu( input, weight_int4pack, groupsize, @@ -570,6 +570,7 @@ def load_model_and_state_dict( load_state_dict: bool = True, load_as_quantized: bool = True, inner_k_tiles=8, + device="cpu", ) -> torch.nn.Module: """ Parses the GGUF file and returns an nn.Module on meta device along with a state_dict @@ -609,9 +610,17 @@ def load_model_and_state_dict( q, s, z = Q4_0.unpack(t) scales_and_zeros = pack_scales_and_zeros(s, z) q_uint8 = (q[::, ::2] << 4 | q[::, 1::2]).to(torch.uint8) - weight_int4pack = torch.ops.aten._convert_weight_to_int4pack( - q_uint8, inner_k_tiles - ) + + if torch.device(device).type == "cpu": + weight_int4pack = ( + torch.ops.aten._convert_weight_to_int4pack_for_cpu( + q, inner_k_tiles + ) + ) + else: + weight_int4pack = torch.ops.aten._convert_weight_to_int4pack( + q_uint8, inner_k_tiles + ) state_dict[f"{fqn}.weight"] = weight_int4pack state_dict[f"{fqn}.scales_and_zeros"] = scales_and_zeros From 902542db1d6d8ee50757f61d3edeb25cc3f85c5e Mon Sep 17 00:00:00 2001 From: YanbingJiang Date: Tue, 17 Dec 2024 11:26:10 +0800 Subject: [PATCH 39/83] Update int4pack related in torchchat gguf (#1404) * Update int4pack related for gguf * Update gguf_loader.py --------- Co-authored-by: Jack-Khuu --- torchchat/utils/gguf_loader.py | 66 +++++++++++++++++++++------------- 1 file changed, 41 insertions(+), 25 deletions(-) diff --git a/torchchat/utils/gguf_loader.py b/torchchat/utils/gguf_loader.py index c69bdf469..9e7b73b50 100644 --- a/torchchat/utils/gguf_loader.py +++ b/torchchat/utils/gguf_loader.py @@ -24,6 +24,8 @@ pack_scales_and_zeros, ) +from torchao.dtypes.utils import is_device + logger: logging.Logger = logging.getLogger(__name__) @@ -128,6 +130,7 @@ def linear_int4(input, weight_int4pack, scales_and_zeros, out_features, groupsiz groupsize, scales_and_zeros, ) + new_shape = origin_input_size[:-1] + (out_features,) c = c.reshape(new_shape) return c 
@@ -178,16 +181,27 @@ def __init__( ), "must specify both weights and scales_and_zeros, or neither" if weight is None: - weight = torch.empty( - ( - out_features // 8, - in_features // (inner_k_tiles * 16), - 32, - inner_k_tiles // 2, - ), - dtype=torch.int32, - device=device, - ) + if is_device(device, "cpu"): + weight = torch.empty( + ( + out_features, + in_features // 2, + ), + dtype=torch.uint8, + device=device, + ) + else: + weight = torch.empty( + ( + out_features // 8, + in_features // (inner_k_tiles * 16), + 32, + inner_k_tiles // 2, + ), + dtype=torch.int32, + device=device, + ) + scales_and_zeros = torch.empty( (in_features // groupsize, out_features, 2), dtype=get_precision(), @@ -223,12 +237,17 @@ def _prepare_weight_and_scales_and_zeros( weight_int32, scales_and_zeros = group_quantize_tensor( weight_bf16, n_bit=4, groupsize=groupsize ) - weight_uint8 = (weight_int32[::, ::2] << 4 | weight_int32[::, 1::2]).to( - torch.uint8 - ) - weight_int4pack = torch.ops.aten._convert_weight_to_int4pack( - weight_uint8, inner_k_tiles - ) + if is_device(weight_int32.device.type, "cpu"): + weight_int4pack = torch.ops.aten._convert_weight_to_int4pack_for_cpu( + weight_int32, inner_k_tiles + ) + else: + weight_uint8 = (weight_int32[::, ::2] << 4 | weight_int32[::, 1::2]).to( + torch.uint8 + ) + weight_int4pack = torch.ops.aten._convert_weight_to_int4pack( + weight_uint8, inner_k_tiles + ) return weight_int4pack, scales_and_zeros @classmethod @@ -609,17 +628,14 @@ def load_model_and_state_dict( if load_state_dict: q, s, z = Q4_0.unpack(t) scales_and_zeros = pack_scales_and_zeros(s, z) - q_uint8 = (q[::, ::2] << 4 | q[::, 1::2]).to(torch.uint8) - - if torch.device(device).type == "cpu": - weight_int4pack = ( - torch.ops.aten._convert_weight_to_int4pack_for_cpu( - q, inner_k_tiles - ) + if is_device(q.device.type, "cpu"): + weight_int4pack = torch.ops.aten._convert_weight_to_int4pack_for_cpu( + q, inner_k_tiles ) else: + q_tmp = (q[::, ::2] << 4 | q[::, 1::2]).to(torch.uint8) weight_int4pack = torch.ops.aten._convert_weight_to_int4pack( - q_uint8, inner_k_tiles + q_tmp, inner_k_tiles ) state_dict[f"{fqn}.weight"] = weight_int4pack state_dict[f"{fqn}.scales_and_zeros"] = scales_and_zeros @@ -632,7 +648,7 @@ def load_model_and_state_dict( in_features=in_features, out_features=out_features, bias=False, - device="meta", + device="cpu", groupsize=Q4_0.groupsize, inner_k_tiles=inner_k_tiles, ), From 6de1a01d44123ba8a226e989a6793489a34ea87f Mon Sep 17 00:00:00 2001 From: Manuel Candales <42380156+manuelcandales@users.noreply.github.com> Date: Wed, 18 Dec 2024 17:17:46 -0500 Subject: [PATCH 40/83] update torchao pin: optimized shaders (#1428) --- install/.pins/torchao-pin.txt | 2 +- torchchat/utils/quantize.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/install/.pins/torchao-pin.txt b/install/.pins/torchao-pin.txt index c6161e78f..2da70769c 100644 --- a/install/.pins/torchao-pin.txt +++ b/install/.pins/torchao-pin.txt @@ -1 +1 @@ -7d7c14e898eca3fe66138d2a9445755a9270b800 \ No newline at end of file +2e032c6b0de960dee554dcb08126ace718b14c6d diff --git a/torchchat/utils/quantize.py b/torchchat/utils/quantize.py index 7f060c365..b1dcf25f8 100644 --- a/torchchat/utils/quantize.py +++ b/torchchat/utils/quantize.py @@ -832,6 +832,7 @@ def quantized_model(self) -> nn.Module: libs = glob.glob(f"{torchao_build_path}/cmake-out/lib/libtorchao_ops_aten.*") libs = list(filter(lambda l: (l.endswith("so") or l.endswith("dylib")), libs)) torch.ops.load_library(libs[0]) + print("Loaded torchao cpu 
ops.") except Exception as e: print( "Unabled to load torchao cpu ops library. Slow fallback kernels will be used." @@ -841,6 +842,7 @@ def quantized_model(self) -> nn.Module: libname = "libtorchao_ops_mps_aten.dylib" libpath = f"{torchao_build_path}/cmake-out/lib/{libname}" torch.ops.load_library(libpath) + print("Loaded torchao mps ops.") except Exception as e: print("Unabled to load torchao mps ops library.") From ff2d53cab8583bd19e607210ed776d36f4007a58 Mon Sep 17 00:00:00 2001 From: Jack-Khuu Date: Wed, 18 Dec 2024 19:16:39 -0800 Subject: [PATCH 41/83] Update install_requirements.sh to tune + pt/pt dev20241218 (#1426) * Update install_requirements.sh to pt/pt dev20241217 * Update install_requirements.sh to 1218 * Update install_requirements.sh * Rearrange install order; previuosly inconsistent extra-index-url --- install/install_requirements.sh | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/install/install_requirements.sh b/install/install_requirements.sh index 3db559dbc..b5ac414fd 100755 --- a/install/install_requirements.sh +++ b/install/install_requirements.sh @@ -44,17 +44,6 @@ fi echo "Using pip executable: $PIP_EXECUTABLE" -# -# First install requirements in install/requirements.txt. Older torch may be -# installed from the dependency of other models. It will be overridden by -# newer version of torch nightly installed later in this script. -# - -( - set -x - $PIP_EXECUTABLE install -r install/requirements.txt --extra-index-url https://download.pytorch.org/whl/nightly/cu121 -) - # Since torchchat often uses main-branch features of pytorch, only the nightly # pip versions will have the required features. The PYTORCH_NIGHTLY_VERSION value should # agree with the third-party/pytorch pinned submodule commit. @@ -62,13 +51,13 @@ echo "Using pip executable: $PIP_EXECUTABLE" # NOTE: If a newly-fetched version of the executorch repo changes the value of # PYTORCH_NIGHTLY_VERSION, you should re-run this script to install the necessary # package versions. -PYTORCH_NIGHTLY_VERSION=dev20241213 +PYTORCH_NIGHTLY_VERSION=dev20241218 # Nightly version for torchvision -VISION_NIGHTLY_VERSION=dev20241213 +VISION_NIGHTLY_VERSION=dev20241218 # Nightly version for torchtune -TUNE_NIGHTLY_VERSION=dev20241126 +TUNE_NIGHTLY_VERSION=dev20241218 # Uninstall triton, as nightly will depend on pytorch-triton, which is one and the same ( @@ -96,6 +85,16 @@ REQUIREMENTS_TO_INSTALL=( torchtune=="0.5.0.${TUNE_NIGHTLY_VERSION}" ) +# +# First install requirements in install/requirements.txt. Older torch may be +# installed from the dependency of other models. It will be overridden by +# newer version of torch nightly installed later in this script. +# +( + set -x + $PIP_EXECUTABLE install -r install/requirements.txt --extra-index-url "${TORCH_NIGHTLY_URL}" +) + # Install the requirements. --extra-index-url tells pip to look for package # versions on the provided URL if they aren't available on the default URL. 
( From 5e16167f3c8756f80cd47674e7c171d4517f22e4 Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Thu, 19 Dec 2024 03:13:53 -0700 Subject: [PATCH 42/83] Add Granite code support (#1336) * feat(models): Add models.json blocks for Granite Code 3b and 8b Branch: GraniteCodeSupport Signed-off-by: Gabe Goodhart * feat: Initial model params for granite code 3b Branch: GraniteCodeSupport Signed-off-by: Gabe Goodhart * fix(model config): Fix model configs for Granite Code * Use the right tokenizer_file name * Use the right transformer_params_key based on the file name in model_params * Use the updated name to indicate HF tokenizers Signed-off-by: Gabe Goodhart * feat(granite): Add model params for granite-code-8b Something isn't quite working with this model yet, but the config should be accurate at this point. Branch: GraniteCodeSupport Signed-off-by: Gabe Goodhart * fix(deps): Add tokenizers to the deps explicitly It was implicitly being pulled in via lm_eval -> transformers, but it's better to have it explicit since we use it directly Branch: GraniteCodeSupport Signed-off-by: Gabe Goodhart * feat(tokenizer): Add basic support for jinja2 template rendering for HF tokenizers This is a much simplified version of the corresponding logic in transformers. I opted for this so that the full transformers dependency is not added here. CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/tokenization_utils_base.py#L1522 Branch: GraniteCodeSupport Signed-off-by: Gabe Goodhart * fix(chat): Add HFTokenizerChatFormatter and use it for HF tokenizers This will allow the jinja2 templates for HF tokenizers to be applied without needing to hard-code the formatter logic. This will likely need to be duplicated in the embedded code version of chat. Branch: GraniteCodeSupport Signed-off-by: Gabe Goodhart * fix(deps): Add jinja2 as an explicit dep It was getting pulled in implicitly via flask and lm_eval -> transformers, but better to have it explicit. Branch: GraniteCodeSupport Signed-off-by: Gabe Goodhart * feat(log): Add env-based LOG_LEVEL config to CLI Branch: GraniteCodeSupport Signed-off-by: Gabe Goodhart * feat(log): Add better logging in model and generate In generate, there were a number of commented-out log lines. These are safe to leave in as long as lazy string interpolation is used. Branch: GraniteCodeSupport Signed-off-by: Gabe Goodhart * feat(generate): Make prepending BOS model-conigurable And disable it for Granite Code models Branch: GraniteCodeSupport Signed-off-by: Gabe Goodhart * fix(chat): Refactor chat template logic to encapsulate all formatting in classes The formatted strings may not be perfectly 1:1 with the previous impl, but they should be in line with the official model guidelines: * https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3 * https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-2 Branch: GraniteCodeSupport Signed-off-by: Gabe Goodhart * fix(chat): Fix small formatting bugs in llama3 chat formatter Branch: GraniteCodeSupport Signed-off-by: Gabe Goodhart * test: Add initial unit tests for chat formatters There's no formal execution framework for pytest yet, but these were helpful in ensuring that the formatting was working correctly! 
To run them, install pytest and run `pytest tests/` Branch: GraniteCodeSupport Signed-off-by: Gabe Goodhart * fix(logging): Disable logging in generate unless set in the env There is an incompatibility with logging and torch._dynamo, so this disables it unless the developer asks for it explicitly. NOTE: The TC team has stated that they have holistic logging on the roadmap so this is a short-term solution pending a more robust approach. REF: https://github.com/pytorch/torchchat/actions/runs/11963066986/job/33493237302#step:14:3599 Branch: GraniteCodeSupport Signed-off-by: Gabe Goodhart * fix: Remove trailing n from llama3 <|eot_id|> There's inconsistency in the documentation on whether or not there should be a n after <|eot_id|>, but this maintains consistency with previous formatting Branch: GraniteCodeSupport Signed-off-by: Gabe Goodhart --------- Signed-off-by: Gabe Goodhart Co-authored-by: Jack-Khuu --- install/requirements.txt | 4 + tests/conftest.py | 12 ++ tests/test_chat_formatters.py | 216 ++++++++++++++++++++ tokenizer/hf_tokenizer.py | 28 ++- torchchat/cli/cli.py | 10 +- torchchat/generate.py | 206 ++++++++++++------- torchchat/model.py | 20 +- torchchat/model_config/models.json | 14 ++ torchchat/model_params/Granite-3B-Code.json | 17 ++ torchchat/model_params/Granite-8B-Code.json | 17 ++ 10 files changed, 469 insertions(+), 75 deletions(-) create mode 100644 tests/conftest.py create mode 100644 tests/test_chat_formatters.py create mode 100644 torchchat/model_params/Granite-3B-Code.json create mode 100644 torchchat/model_params/Granite-8B-Code.json diff --git a/install/requirements.txt b/install/requirements.txt index 8fb1832ba..457131275 100644 --- a/install/requirements.txt +++ b/install/requirements.txt @@ -9,6 +9,10 @@ gguf # Tiktoken tokenizer for Llama 3 and other advanced models tiktoken +# Tokenizers and jinja2 for other non-llama models that use HF tokenizers +tokenizers +jinja2 + # Miscellaneous snakeviz sentencepiece diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 000000000..c1580e27b --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,12 @@ +""" +Global pytest config, fixtures, and helpers go here! 
+""" + +# Standard +import os +import sys + +# Make sure tests can import torchchat +sys.path.append( + os.path.realpath(os.path.join(os.path.dirname(__file__), "..")) +) diff --git a/tests/test_chat_formatters.py b/tests/test_chat_formatters.py new file mode 100644 index 000000000..2f7f7a955 --- /dev/null +++ b/tests/test_chat_formatters.py @@ -0,0 +1,216 @@ +""" +Unit tests for chat formatters +""" + +# Third Party +import pytest + +# Local +from torchchat.generate import ( + HFTokenizerChatFormatter, + Llama2ChatFormatter, + Llama3ChatFormatter, +) + +## Helpers ##################################################################### + +class DummyTokenizer: + """Dummy tokenizer that encodes as strings so it's easy to check formatting""" + def encode(self, text, *_, **__): + return text + + +class DummySPTokenizer(DummyTokenizer): + """Emulated Sentencepiece tokenizer with bos/eos""" + bos = "" + eos = "" + + +class DummyLlama3Tokenizer(DummyTokenizer): + class _IdentityDict: + def __getitem__(self, key): + return key + special_tokens = _IdentityDict() + + +class DummyHFTokenizer(DummyTokenizer): + """Dummy made up chat template scheme""" + # Sequence + bos = "" + # Turn + bot = "" + eot = "" + # Role + bor = "" + eor = "" + def apply_chat_template(self, messages, add_generation_prompt): + out = [self.bos] + role = None + for msg in messages: + role = msg["role"] + content = msg["content"] + out.append(f"{self.bot}{self.bor}{role}{self.eor}{content}{self.eot}") + if add_generation_prompt and role != "assistant": + out.append(f"{self.bot}{self.bor}assistant{self.eor}") + return "\n".join(out) + + +def check_rendering(fmt, messages, expected, add_generation_prompt): + """Render messages and compare to expected output""" + assert "".join(fmt.encode_dialog_prompt(messages, add_generation_prompt)) == expected + + +def make_message(role, text): + return {"role": role, "content": text} + + +SYSTEM_PROMPT = "You are a helpful assistant, feel free to ask me anything." +USER1 = "Hello world!" +ASSISTANT1 = "Greetings! How can I help you?" +USER2 = "Why is the sky blue?" +ASSISTANT2 = "The sky appears blue because of a phenomenon called Rayleigh scattering." 
+ + +# Stock sets of messages to test +MSGS_NO_SYS= [ + make_message("user", USER1), +] +MSGS_SYS_USR = [ + make_message("system", SYSTEM_PROMPT), + make_message("user", USER1), +] +MSGS_SYS_USR_ASST = [ + make_message("system", SYSTEM_PROMPT), + make_message("user", USER1), + make_message("assistant", ASSISTANT1), +] +MSGS_MULTI_TURN = [ + make_message("system", SYSTEM_PROMPT), + make_message("user", USER1), + make_message("assistant", ASSISTANT1), + make_message("user", USER2), + make_message("assistant", ASSISTANT2), +] + +## Llama2ChatFormatter ######################################################### + +@pytest.mark.parametrize( + ["messages", "expected"], + [ + # single user message (no system prompt) + (MSGS_NO_SYS, f"[INST] {USER1} [/INST]"), + # sys, usr + (MSGS_SYS_USR, f"""[INST] <> +{SYSTEM_PROMPT} +<> + +{USER1} [/INST]"""), + # sys, usr, asst + (MSGS_SYS_USR_ASST, f"""[INST] <> +{SYSTEM_PROMPT} +<> + +{USER1} [/INST] {ASSISTANT1} +"""), + # sys, usr, asst, usr, asst + (MSGS_MULTI_TURN, f"""[INST] <> +{SYSTEM_PROMPT} +<> + +{USER1} [/INST] {ASSISTANT1} +[INST] {USER2} [/INST] {ASSISTANT2} +"""), + ] +) +def test_llama2_chat_formatter(messages, expected): + """Tests for Llama2 following the official guide + https://www.llama.com/docs/model-cards-and-prompt-formats/meta-llama-2/ + """ + tok = DummySPTokenizer() + fmt = Llama2ChatFormatter(tok) + # NOTE: add_generation_prompt not used by Llama2 + check_rendering(fmt, messages, expected, True) + +## Llama3ChatFormatter ######################################################### + +@pytest.mark.parametrize( + ["messages", "expected"], + [ + # single user message (no system prompt) + (MSGS_NO_SYS, f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|> + +{USER1}<|eot_id|>"""), + # sys, usr + (MSGS_SYS_USR, f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|> + +{SYSTEM_PROMPT}<|eot_id|><|start_header_id|>user<|end_header_id|> + +{USER1}<|eot_id|>"""), + # sys, usr, asst + (MSGS_SYS_USR_ASST, f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|> + +{SYSTEM_PROMPT}<|eot_id|><|start_header_id|>user<|end_header_id|> + +{USER1}<|eot_id|><|start_header_id|>assistant<|end_header_id|> + +{ASSISTANT1}<|eot_id|>"""), + # sys, usr, asst, usr, asst + (MSGS_MULTI_TURN, f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|> + +{SYSTEM_PROMPT}<|eot_id|><|start_header_id|>user<|end_header_id|> + +{USER1}<|eot_id|><|start_header_id|>assistant<|end_header_id|> + +{ASSISTANT1}<|eot_id|><|start_header_id|>user<|end_header_id|> + +{USER2}<|eot_id|><|start_header_id|>assistant<|end_header_id|> + +{ASSISTANT2}<|eot_id|>"""), + ] +) +@pytest.mark.parametrize("add_generation_prompt", [True, False]) +def test_llama3_chat_formatter(messages, expected, add_generation_prompt): + """Tests for Llama3 following the official guide + https://www.llama.com/docs/model-cards-and-prompt-formats/meta-llama-3/ + """ + tok = DummyLlama3Tokenizer() + fmt = Llama3ChatFormatter(tok) + # No assistant prompt added if the last message is from the assistant + if add_generation_prompt and messages[-1]["role"] != "assistant": + expected += "<|start_header_id|>assistant<|end_header_id|>\n\n" + check_rendering(fmt, messages, expected, add_generation_prompt) + +## HFTokenizerChatFormatter #################################################### + +@pytest.mark.parametrize( + ["messages", "expected"], + [ + # single user message (no system prompt) + (MSGS_NO_SYS, f""" +user{USER1}"""), + # sys, usr + (MSGS_SYS_USR, f""" +system{SYSTEM_PROMPT} 
+user{USER1}"""), + # sys, usr, asst + (MSGS_SYS_USR_ASST, f""" +system{SYSTEM_PROMPT} +user{USER1} +assistant{ASSISTANT1}"""), + # sys, usr, asst, usr, asst + (MSGS_MULTI_TURN, f""" +system{SYSTEM_PROMPT} +user{USER1} +assistant{ASSISTANT1} +user{USER2} +assistant{ASSISTANT2}"""), + ] +) +@pytest.mark.parametrize("add_generation_prompt", [True, False]) +def test_hf_chat_formatter(messages, expected, add_generation_prompt): + tok = DummyHFTokenizer() + fmt = HFTokenizerChatFormatter(tok) + # No assistant prompt added if the last message is from the assistant + if add_generation_prompt and messages[-1]["role"] != "assistant": + expected += f"\n{tok.bot}{tok.bor}assistant{tok.eor}" + check_rendering(fmt, messages, expected, add_generation_prompt) diff --git a/tokenizer/hf_tokenizer.py b/tokenizer/hf_tokenizer.py index 7ad5807d1..d10ecb076 100644 --- a/tokenizer/hf_tokenizer.py +++ b/tokenizer/hf_tokenizer.py @@ -5,11 +5,12 @@ # LICENSE file in the root directory of this source tree. # Standard -from typing import List, Optional +from typing import Dict, List, Optional import json import os # Third Party +import jinja2 from tokenizers import Tokenizer # Local @@ -37,6 +38,9 @@ def __init__(self, file_path: str): # Load the tokenizer itself self._tokenizer = Tokenizer.from_file(tokenizer_path) + # Load the chat template if we have a config path + self._chat_template: Optional[jinja2.Template] = None + # If available, parse bos/eos tokens from the tokenizer config self._bos_id, self._eos_id = None, None if tokenizer_config_path is not None: @@ -48,6 +52,8 @@ def __init__(self, file_path: str): self._bos_id = self._tokenizer.token_to_id(bos_token) if eos_token is not None: self._eos_id = self._tokenizer.token_to_id(eos_token) + if chat_template_str := tok_config.get("chat_template"): + self._chat_template = jinja2.Template(chat_template_str) # If no eos/bos tokens found, go looking for them! 
if None in [self._bos_id, self._eos_id]: @@ -70,6 +76,8 @@ def _look_for_special_token(added_tokens: dict, search_strs: List[str]) -> Optio if len(candidate_toks) == 1: return candidate_toks[0]["id"] + ## Interface ## + def encode( self, s: str, @@ -90,3 +98,21 @@ def bos_id(self) -> int: def eos_id(self) -> int: return self._eos_id + + ## Additional Public Methods ## + + def has_chat_template(self) -> bool: + return bool(self._chat_template) + + def apply_chat_template( + self, + dialog: List[Dict[str, str]], + add_generation_prompt: bool = False, + ) -> str: + """If configured with a chat template, apply it to the list of messages + """ + if not self._chat_template: + raise ValueError("No chat template configured!") + return self._chat_template.render( + messages=dialog, add_generation_prompt=add_generation_prompt + ) diff --git a/torchchat/cli/cli.py b/torchchat/cli/cli.py index a7f7bbba2..91bdcaf26 100644 --- a/torchchat/cli/cli.py +++ b/torchchat/cli/cli.py @@ -17,7 +17,15 @@ allowable_params_table, ) -logging.basicConfig(level=logging.INFO, format="%(message)s") +_log_level_env = os.getenv("LOG_LEVEL", "INFO") +try: + _log_level = getattr(logging, _log_level_env.upper()) +except AttributeError: + print(f"Invalid log level: {_log_level_env}", file=sys.stderr) + _log_level = logging.INFO + + +logging.basicConfig(level=_log_level, format="%(message)s") logger = logging.getLogger(__name__) default_device = os.getenv("TORCHCHAT_DEVICE", "fast") diff --git a/torchchat/generate.py b/torchchat/generate.py index 9b4c6430a..4d2439d2f 100644 --- a/torchchat/generate.py +++ b/torchchat/generate.py @@ -45,13 +45,52 @@ from torchchat.utils.device_info import get_device_info +# NOTE: Logging disabled by default here due to conflicts with torch._dynamo +class NoOpLogger: + def __no_op(self, *_, **__): + pass + def __getattr__(self, name): + return self.__no_op + + +logger = ( + NoOpLogger() if os.getenv("LOG_LEVEL") is None + else logging.getLogger(__name__) +) + +## Chat Formatters ############################################################# + class _ChatFormatter(ABC): + + # Messages can arrive as a standard dict with "role" and "content" as + # strings, or where "content" is a list of objects with "text" fields. + MESSAGE_TYPE = Dict[str, Union[str, List[Dict[str, str]]]] + + # A dialog is a sequence of messages + DIALOG_TYPE = List[MESSAGE_TYPE] + def __init__(self, tokenizer): self.tokenizer = tokenizer @abstractmethod - def encode_dialog_prompt(self, dialog) -> List[int]: - raise NotImplementedError() + def encode_dialog_prompt( + self, + dialog: DIALOG_TYPE, + add_generation_prompt: bool, + ) -> List[int]: + """Encode a sequence of messages into a sequence of token IDs, including + the chat template + + Args: + dialog (DIALOG_TYPE): The sequence of dialog messages to encode. + This will be the additional messages on top of those that have + already been processed. + add_generation_prompt (bool): Whether to include a generation prompt + at the end of the encoded sequence. + + Returns: + List[int]: A list of token IDs representing the encoded prompt. 
+ """ class Llama3ChatFormatter(_ChatFormatter): @@ -61,7 +100,7 @@ class Llama3ChatFormatter(_ChatFormatter): """ - def encode_header(self, role) -> List[int]: + def _encode_header(self, role) -> List[int]: tokens = [] tokens.append(self.tokenizer.special_tokens["<|start_header_id|>"]) tokens.extend(self.tokenizer.encode(role, bos=False, eos=False)) @@ -69,8 +108,8 @@ def encode_header(self, role) -> List[int]: tokens.extend(self.tokenizer.encode("\n\n", bos=False, eos=False)) return tokens - def encode_message(self, message) -> List[int]: - tokens = self.encode_header(message["role"]) + def _encode_message(self, message: _ChatFormatter.MESSAGE_TYPE) -> List[int]: + tokens = self._encode_header(message["role"]) if isinstance(message["content"], str): tokens.extend( self.tokenizer.encode(message["content"], bos=False, eos=False) @@ -85,46 +124,80 @@ def encode_message(self, message) -> List[int]: tokens.append(self.tokenizer.special_tokens["<|eot_id|>"]) return tokens - def encode_dialog_prompt(self, dialog) -> List[int]: + def encode_dialog_prompt( + self, + dialog: _ChatFormatter.DIALOG_TYPE, + add_generation_prompt: bool, + ) -> List[int]: tokens = [] tokens.append(self.tokenizer.special_tokens["<|begin_of_text|>"]) for message in dialog: - tokens.extend(self.encode_message(message)) + tokens.extend(self._encode_message(message)) # Add the start of an assistant message for the model to complete. - tokens.extend(self.encode_header("assistant")) # Pass role directly as a string + if add_generation_prompt and dialog and dialog[-1]["role"] != "assistant": + tokens.extend(self._encode_header("assistant")) # Pass role directly as a string return tokens -B_INST, E_INST = "[INST]", "[/INST]" -B_SYS, E_SYS = "<>", "<>" +class Llama2ChatFormatter(_ChatFormatter): + """ + Chat formatting for Llama2 + CITE: https://www.llama.com/docs/model-cards-and-prompt-formats/meta-llama-2/ + """ + + B_INST, E_INST = "[INST] ", " [/INST]" + B_SYS, E_SYS = "<>\n", "\n<>\n\n" + @staticmethod + def _get_content_str(message: _ChatFormatter.MESSAGE_TYPE) -> str: + if isinstance(message["content"], list): + return message["content"][0]["text"] + return message["content"] -class Llama2ChatFormatter(_ChatFormatter): - def encode_dialog_prompt(self, dialog) -> List[int]: - tokens = self.tokenizer.encode(f"{B_INST} ") - first_message = True # Bool to handle placing the B_INST token. Behavior is weird - the system prompt should have the B_INST, but not the first user message. All following user messages *should* have it. Also, if there is no system prompt, then the user message should have it. 
+ def encode_dialog_prompt( + self, + dialog: _ChatFormatter.DIALOG_TYPE, + add_generation_prompt: bool, # UNUSED + ) -> List[int]: + new_turn = True + tokens = [] for message in dialog: - if isinstance(message["content"], list): - content = message["content"][0]["text"] + if new_turn: + tokens += self.tokenizer.encode(f"{self.tokenizer.bos}{self.B_INST}") + content = self._get_content_str(message).strip() + role = message["role"] + if role == "system": + tokens += self.tokenizer.encode(f"{self.B_SYS}{content}{self.E_SYS}") + new_turn = False + elif role == "user": + tokens += self.tokenizer.encode(f"{content}{self.E_INST}") + new_turn = False + elif role == "assistant": + tokens += self.tokenizer.encode(f" {content} {self.tokenizer.eos}\n") + new_turn = True else: - content = message["content"] - content = content.strip() - if message["role"] == "system": - encoded = self.tokenizer.encode(f"{B_SYS}\n{content}\n{E_SYS}") - first_message = False - elif message["role"] == "user": - encoded = [self.tokenizer.bos_id()] + self.tokenizer.encode( - f"{B_INST if first_message else ''} {content} {E_INST} " - ) - first_message = True - elif message["role"] == "assistant": - encoded = self.tokenizer.encode(f"{content}\n\n") + [ - self.tokenizer.eos_id() - ] - tokens += encoded + raise ValueError("Invalid role in dialog.") return tokens + +class HFTokenizerChatFormatter(_ChatFormatter): + """Chat formatter that uses the built-in formatting capabilities of an HF + tokenizer instance + """ + def encode_dialog_prompt( + self, + dialog: _ChatFormatter.DIALOG_TYPE, + add_generation_prompt: bool, + ) -> List[int]: + rendered = self.tokenizer.apply_chat_template( + dialog, add_generation_prompt=add_generation_prompt + ) + logger.debug("Formatted chat prompt:\n%s", rendered) + return self.tokenizer.encode(rendered) + +## Generation ################################################################## + @dataclass class GeneratorArgs: prompt: Optional[str] = ( @@ -283,9 +356,13 @@ def __init__( if self.is_llama3_model: self.chat_formatter = Llama3ChatFormatter(self.tokenizer) if generator_args.chat_mode: - logging.debug( + logger.debug( "Llama3 model detected in chat mode. 
Using updated sentence schemas" ) + elif self.tokenizer_args.is_hf_tokenizer: + if not self.tokenizer.has_chat_template(): + raise ValueError("Tokenizer must have a chat template") + self.chat_formatter = HFTokenizerChatFormatter(self.tokenizer) else: self.chat_formatter = Llama2ChatFormatter(self.tokenizer) @@ -341,10 +418,12 @@ def sample( temperature: float = 0, top_k: Optional[int] = None, ): + logits = logits[0, -1] + logger.debug("Logits: %s", logits) if temperature == 0 and not need_probs: - _, idx_next = torch.topk(logits[0, -1], k=1, dim=-1) + _, idx_next = torch.topk(logits, k=1, dim=-1) return (idx_next, None) - probs = self.logits_to_probs(logits[0, -1], temperature, top_k) + probs = self.logits_to_probs(logits, temperature, top_k) idx_next = self.multinomial_sample_one_no_sync(probs) return idx_next, probs @@ -358,7 +437,7 @@ def prefill( sequential_prefill=True, **sampling_kwargs, ) -> torch.Tensor: - # logging.debug(f"x: {x}, input_pos: {input_pos}") + logger.debug("x: %s, input_pos: %s", x, input_pos) width = x.size(1) assert input_pos.size(0) == width @@ -394,7 +473,7 @@ def prefill( elif sequential_prefill: for i in range(width): x_sliced, ip_sliced = x[:, i].view(-1, 1), input_pos[i].view(-1) - # logging.debug(f" x: {x_sliced}, input_pos: {ip_sliced}") + logger.debug(" x: %s, input_pos: %s", x_sliced, ip_sliced) logits = model(x_sliced, ip_sliced) # (x[:, i], input_pos[i])da else: # input_pos: [B, S] @@ -727,7 +806,8 @@ def encode_tokens(self, string, bos=True, device="cpu"): tokens = self.tokenizer.encode(string) if bos: tokens = [self.tokenizer.bos_id()] + tokens - logging.debug(f"Size after encode_tokens: {len(tokens)}") + logger.debug("Size after encode_tokens: %d", len(tokens)) + logger.debug("Token IDs: %s", tokens) return torch.tensor(tokens, dtype=torch.int, device=device) def _callback(self, x, *, buffer, done_generating): @@ -776,7 +856,7 @@ def _gen_model_input( # Single String prompt if isinstance(prompt, str): encoded = self.encode_tokens( - prompt, bos=True, device=self.builder_args.device + prompt, bos=self.model.config.tokenizer_prepend_bos, device=self.builder_args.device ) # List of dialog else: @@ -785,7 +865,7 @@ def _gen_model_input( tokens, dtype=torch.int, device=self.builder_args.device ) - logging.debug(encoded) + logger.debug(encoded) return encoded, None # Llama 3.2 11B @@ -900,7 +980,7 @@ def _gen_model_input( value=0, ) - logging.debug(encoded) + logger.debug(encoded) return encoded, batch def chat( @@ -1021,38 +1101,21 @@ def chat( if prompt == "/bye": print("Exiting Chat.\n") break - if not self.is_llama3_model: - if self.system_prompt: - prompt = f"{B_INST} {B_SYS}\n{self.system_prompt.strip()}\n{E_SYS}\n\n{prompt.strip()} {E_INST}" - self.system_prompt = ( - None # can only provide system prompt on first interaction - ) - else: - prompt = f"{B_INST} {prompt.strip()} {E_INST}" - encoded = self.encode_tokens( - prompt, bos=True, device=self.builder_args.device - ) - else: - if self.system_prompt: - encoded = self.chat_formatter.encode_dialog_prompt( - [ - {"role": "system", "content": self.system_prompt}, - {"role": "user", "content": prompt}, - ] - ) - self.system_prompt = None - elif is_first_sample: - encoded = self.chat_formatter.encode_dialog_prompt( - [{"role": "user", "content": prompt}] - ) - else: - encoded = self.chat_formatter.encode_message( - {"role": "user", "content": prompt} - ) - encoded.extend(self.chat_formatter.encode_header("assistant")) - encoded = torch.tensor( - encoded, dtype=torch.int, 
device=self.builder_args.device + + # Encode the additional messages added in this dialog turn. If + # this is the first turn, that includes any system prompt. + messages_to_encode = [] + if is_first_sample and self.system_prompt: + messages_to_encode.append( + {"role": "system", "content": self.system_prompt} ) + messages_to_encode.append({"role": "user", "content": prompt}) + encoded = self.chat_formatter.encode_dialog_prompt( + messages_to_encode, add_generation_prompt=True, + ) + encoded = torch.tensor( + encoded, dtype=torch.int, device=self.builder_args.device + ) if encoded.size(0) + start_pos > max_seq_length: print( "This prompt would take us past the max_seq_length. Ending Conversation." ) @@ -1231,6 +1294,7 @@ def main(args): speculative_builder_args = BuilderArgs.from_speculative_args(args) tokenizer_args = TokenizerArgs.from_args(args) generator_args = GeneratorArgs.from_args(args) + logger.debug("GeneratorArgs: %s", generator_args) if not builder_args.distributed: gen = Generator( builder_args, diff --git a/torchchat/model.py b/torchchat/model.py index 2a3b9f12f..1c78d4c63 100644 --- a/torchchat/model.py +++ b/torchchat/model.py @@ -4,6 +4,7 @@ # This source code is licensed under the license found in the # LICENSE file in the root directory of this source tree. import json +import logging import os import warnings from abc import ABC, abstractmethod @@ -48,6 +49,8 @@ config_path = Path(f"{str(Path(__file__).parent)}/model_params") +logger = logging.getLogger(__name__) + class QuickGELUActivation(nn.Module): """ @@ -273,6 +276,7 @@ class TransformerArgs: # Select the desired tokenizer. Defaults to sentencepiece use_tiktoken: bool = False use_hf_tokenizer: bool = False + tokenizer_prepend_bos: bool = True max_seq_length: int = 8192 rope_scaling: Optional[Dict[str, Any]] = None # For pipeline parallel @@ -330,6 +334,7 @@ class ModelArgs: transformer_args: Dict[str, Dict[str, Any]] use_tiktoken: bool use_hf_tokenizer: bool + tokenizer_prepend_bos: bool def __init__( self, @@ -337,6 +342,7 @@ def __init__( model_type: ModelType = ModelType.TextOnly, use_tiktoken: bool = False, use_hf_tokenizer: bool = False, + tokenizer_prepend_bos: bool = True, ) -> None: self._sanity_check(transformer_args, model_type) @@ -346,6 +352,7 @@ def __init__( # Model-level attributes self.use_tiktoken = use_tiktoken self.use_hf_tokenizer = use_hf_tokenizer + self.tokenizer_prepend_bos = tokenizer_prepend_bos def _sanity_check( self, @@ -373,7 +380,14 @@ def from_params(cls, params_path): use_tiktoken = loaded_params.get("use_tiktoken", False) use_hf_tokenizer = loaded_params.get("use_hf_tokenizer", False) - return cls(transformer_args, model_type, use_tiktoken, use_hf_tokenizer) + tokenizer_prepend_bos = loaded_params.get("tokenizer_prepend_bos", True) + return cls( + transformer_args=transformer_args, + model_type=model_type, + use_tiktoken=use_tiktoken, + use_hf_tokenizer=use_hf_tokenizer, + tokenizer_prepend_bos=tokenizer_prepend_bos, + ) @classmethod def from_table(cls, name: str): @@ -477,7 +491,9 @@ def build_model(self) -> nn.Module: for name, module_class in recipe.modules.items(): config_args = self.config.transformer_args[name] if module_class == Transformer: - modules[name] = module_class(TransformerArgs.from_params(config_args)) + transformer_args = TransformerArgs.from_params(config_args) + logger.debug("Transformer Args: %s", transformer_args) + modules[name] = module_class(transformer_args) else: modules[name] = module_class(**config_args) diff --git a/torchchat/model_config/models.json
b/torchchat/model_config/models.json index 2d3dfcbeb..8791601fb 100644 --- a/torchchat/model_config/models.json +++ b/torchchat/model_config/models.json @@ -164,5 +164,19 @@ "https://github.com/karpathy/llama2.c/raw/master/tokenizer.model" ], "checkpoint_file": "stories110M.pt" + }, + "ibm-granite/granite-3b-code-instruct-128k": { + "aliases": ["granite-code", "granite-code-3b"], + "distribution_channel": "HuggingFaceSnapshot", + "distribution_path": "ibm-granite/granite-3b-code-instruct-128k", + "transformer_params_key": "Granite-3B-Code", + "tokenizer_file": "tokenizer.json" + }, + "ibm-granite/granite-8b-code-instruct-128k": { + "aliases": ["granite-code-8b"], + "distribution_channel": "HuggingFaceSnapshot", + "distribution_path": "ibm-granite/granite-8b-code-instruct-128k", + "transformer_params_key": "Granite-8B-Code", + "tokenizer_file": "tokenizer.json" } } diff --git a/torchchat/model_params/Granite-3B-Code.json b/torchchat/model_params/Granite-3B-Code.json new file mode 100644 index 000000000..0654a8f2c --- /dev/null +++ b/torchchat/model_params/Granite-3B-Code.json @@ -0,0 +1,17 @@ +{ + "block_size": 128000, + "dim": 2560, + "hidden_dim": 10240, + "n_heads": 32, + "n_local_heads": 32, + "n_layers": 32, + "rope_base": 10000000, + "vocab_size": 49152, + "use_hf_tokenizer": true, + "tokenizer_prepend_bos": false, + "norm_eps": 0.00001, + "rope_scaling": null, + "attention_bias": true, + "feed_forward_bias": true, + "tie_word_embeddings": true +} \ No newline at end of file diff --git a/torchchat/model_params/Granite-8B-Code.json b/torchchat/model_params/Granite-8B-Code.json new file mode 100644 index 000000000..079a32070 --- /dev/null +++ b/torchchat/model_params/Granite-8B-Code.json @@ -0,0 +1,17 @@ +{ + "block_size": 128000, + "dim": 4096, + "hidden_dim": 14336, + "n_heads": 32, + "n_local_heads": 8, + "n_layers": 36, + "rope_base": 10000000, + "vocab_size": 49152, + "use_hf_tokenizer": true, + "tokenizer_prepend_bos": false, + "norm_eps": 0.00001, + "rope_scaling": null, + "attention_bias": true, + "feed_forward_bias": true, + "tie_word_embeddings": true +} \ No newline at end of file From 582e5582db5361134e2c5c822e4d9ed4d11d6472 Mon Sep 17 00:00:00 2001 From: Jack-Khuu Date: Thu, 19 Dec 2024 10:16:45 -0800 Subject: [PATCH 43/83] Fix 3.2 11B inference, by updating padded_collate_tiled_images_and_mask args (#1431) --- torchchat/generate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchchat/generate.py b/torchchat/generate.py index 4d2439d2f..987fb3e44 100644 --- a/torchchat/generate.py +++ b/torchchat/generate.py @@ -949,7 +949,7 @@ def _gen_model_input( if image_found: batch = padded_collate_tiled_images_and_mask( - [data], pad_direction="left", pad_max_images=1 + [data], pad_direction="left", pad_max_images=1, pad_max_tiles=transform.max_num_tiles ) encoded = batch.pop("tokens").to(device).view(-1) seq_len = encoded.size(0) From 7dad56feb14b890fa1e999cd1a18d37d4ce0bbf7 Mon Sep 17 00:00:00 2001 From: Matthias Reso <13337103+mreso@users.noreply.github.com> Date: Thu, 19 Dec 2024 13:33:24 -0800 Subject: [PATCH 44/83] Integrate distributed inference with chat/server (#1381) * Integrate distributed inference without introducing abstraction * Cleanup old distributed inference integration * Read distribution from model_config * Declare distribution_path if args.model is not given * Address some nits from PR review * Added comment on model size all reduce + type hint * Apply suggestions from code review Co-authored-by: Jack-Khuu * Make sure speculative decoding 
is disable for pp >1 and remark this in the comments as well * Refactor conditions in pp * Rename and alter signature of setup_env to reflect that it also runs the target * Rename setup_env in server + fix condition * Update generate.py * Add default value to add_generation_prompt to preserve bc --------- Co-authored-by: Jack-Khuu --- torchchat/cli/builder.py | 111 +++- torchchat/distributed/checkpoint_utils.py | 32 ++ torchchat/distributed/dist_run.py | 629 ---------------------- torchchat/distributed/generate.py | 271 ---------- torchchat/distributed/utils.py | 14 +- torchchat/generate.py | 394 ++++++++++++-- torchchat/usages/openai_api.py | 15 +- torchchat/usages/server.py | 86 ++- 8 files changed, 596 insertions(+), 956 deletions(-) delete mode 100644 torchchat/distributed/dist_run.py delete mode 100644 torchchat/distributed/generate.py diff --git a/torchchat/cli/builder.py b/torchchat/cli/builder.py index 2773c8372..38d0e33b2 100644 --- a/torchchat/cli/builder.py +++ b/torchchat/cli/builder.py @@ -14,10 +14,17 @@ import torch import torch._dynamo.config import torch._inductor.config -import torch.nn as nn +import torch.distributed as dist -from torchchat.model import Model, ModelArgs, ModelType +from torchchat.distributed.utils import( + Color as color, + CUDATrackTime, + init_distributed, + GPUMemoryMonitor, +) +from torchchat.distributed.logging_utils import SingletonLogger +from torchchat.model import Model, ModelArgs, ModelType, Transformer, TransformerArgs from torchchat.model_config.model_config import resolve_model_config from torchchat.utils.build_utils import ( device_sync, @@ -28,6 +35,7 @@ from torchchat.utils.measure_time import measure_time from torchchat.utils.quantize import quantize_model + from torchtune.models.convert_weights import meta_to_tune from torchtune.models.llama3_1._position_embeddings import Llama3ScaledRoPE @@ -56,6 +64,7 @@ class BuilderArgs: pp: int = 1 tp: int = 1 chpt_from: str = "hf" + distribution_path: Optional[str] = None is_chat_model: bool = False prefill_possible: bool = False dynamic_shapes: bool = False @@ -107,6 +116,7 @@ def from_args(cls, args: argparse.Namespace) -> "BuilderArgs": checkpoint_path = args.checkpoint_path params_table = args.params_table + distribution_path = None if args.model: # Using a named, well-known model model_config = resolve_model_config(args.model) @@ -121,6 +131,8 @@ def from_args(cls, args: argparse.Namespace) -> "BuilderArgs": model_config.transformer_params_key or model_config.name.split("/")[-1] ) + distribution_path = model_config.distribution_path + dso_path = getattr(args, "dso_path", None) pte_path = getattr(args, "pte_path", None) aoti_package_path = getattr(args, "aoti_package_path", None) @@ -186,6 +198,7 @@ def from_args(cls, args: argparse.Namespace) -> "BuilderArgs": pp=pp, tp=tp, chpt_from=chpt_from, + distribution_path=distribution_path, is_chat_model=is_chat_model, dynamic_shapes=getattr(args, "dynamic_shapes", False), max_seq_length=getattr(args, "max_seq_length", None), @@ -601,6 +614,100 @@ def do_nothing(max_batch_size, max_seq_length): model = PTEModel(config, builder_args.pte_path) except Exception: raise RuntimeError(f"Failed to load ET compiled {builder_args.pte_path}") + elif builder_args.distributed: + pp_degree = builder_args.pp + tp_degree = builder_args.tp + + init_distributed() + rank = dist.get_rank() + torch.cuda.set_device(rank % torch.cuda.device_count()) + + logger = SingletonLogger.get_logger() + + gpu_memory_monitor = GPUMemoryMonitor("cuda") + logger.info(f"{color.yellow} 
{gpu_memory_monitor.get_device_info()}{color.reset}") + + # Model-level config + if builder_args.params_table: + model_config = ModelArgs.from_table(builder_args.params_table) + else: + raise NotImplementedError() + # Transformer-level config + config = TransformerArgs.from_params(model_config.transformer_args["text"]) + logger.info(f"Transformer Config: {config}") + + #TODO: Move into head of file after solving circular import + from torchchat.distributed.checkpoint_utils import ( + load_model_weights, + ) + + # Validate pipeline degree + assert config.n_layers % pp_degree == 0 + + # Create device mesh + device_mesh = dist.init_device_mesh( + "cuda", + (pp_degree, tp_degree), + mesh_dim_names=("pp", "tp") + ) + tp_mesh = device_mesh["tp"] + pp_mesh = device_mesh["pp"] + logger.info(f"Created device mesh: {device_mesh}\n{tp_mesh=}, {pp_mesh=}") + + pp_rank = pp_mesh.get_local_rank() + logger.info(f"{pp_degree=}, {tp_degree=}") + + # Assuming same number of GPUs per node + device = torch.device(f"cuda:{rank % torch.cuda.device_count()}") + + # Fill in PP configs + config.stage_idx = pp_rank + config.n_stages = pp_degree + + with torch.device("meta"): + # TODO: we should create model instead of Transformer + model = Transformer(config) + + # Distribute model on TP mesh + # (Surprisingly, this works even though model is on meta device and mesh is of + # cuda devices) + model.distribute(tp_mesh) + if rank == 0: + logger.info(f"Model: {model}") + + # Load weights + logger.info(f"Loading weights for {pp_rank=} on {device=}") + with CUDATrackTime() as timer: + load_model_weights(model, builder_args.distribution_path, device, config, builder_args.chpt_from) + + logger.info( + f"{color.green}Total weight loading time: {timer.get_time()} {timer.unit} for rank {rank}{color.reset}" + ) + + # Setup KV caches (after model distribution) + # The number of cache lanes is the same as the maximum number of + # micro-batches that can be "in flight" in parallel -- imagine each + # micro-batch takes 1 "pipeline lane," they need distinct KV cache spaces. + # When decoding is done for certain micro-batches, we can reuse the KV cache + # lanes. 
+ # TODO: bump up the lane count + pipeline_lanes = 1 + seqlen_prefill=1024 + with device: + model.setup_caches(1, seqlen_prefill, cache_lanes=pipeline_lanes) + + # info on stage size and params + # stage_size = get_module_size(model) + # stage_size_formatted = bytes_to_readable(stage_size) + # stage_num_params = get_num_params(model) + # logger.info( + # f"Stage {rank} has {color.blue}{stage_num_params} params{color.reset}, Size: {color.blue}{stage_size_formatted}{color.reset}" + # ) + model.eval() + + model.text_transformer_args = None + model.config.model_type = model_config.model_type + model.device_mesh = device_mesh else: with measure_time("Time to load model: {time:.02f} seconds"): model = _load_model(builder_args) diff --git a/torchchat/distributed/checkpoint_utils.py b/torchchat/distributed/checkpoint_utils.py index cf3206e4e..806855c4b 100644 --- a/torchchat/distributed/checkpoint_utils.py +++ b/torchchat/distributed/checkpoint_utils.py @@ -17,6 +17,7 @@ from torch.distributed._tensor import DTensor from torchchat.distributed.dtensor_utils import convert_to_dtensor from torchchat.cli.builder import BuilderArgs, _load_checkpoint +from torchchat.model import ModelArgs _DEFAULT_SAFETENSOR_FILE_NAME = "model.safetensors.index.json" @@ -450,3 +451,34 @@ def load_weights_from_torchchat_format(stage_module, distribution, device, model # Fill state dict into stage module stage_module.load_state_dict(stage_state_dict, strict=False, assign=True) logger.info(f"Successfully loaded {len(updated_states)} weights into stage module") + + +def load_model_weights( + stage_module: torch.nn.Module, + distribution: str, + device: torch.device, + model_config: ModelArgs, + chpt_from: str, +): + """Load the weights from the safetensor file(s) into the model stage. + Model config is needed b/c we permute wq and wk weights based on attn heads. + + Args: + stage_module (torch.nn.Module): The model stage to load the weights into. + distribution (str): The distribution name, e.g. "meta-llama/Meta-Llama-3-8B-Instruct". + device (torch.device): The device to load the weights onto. + model_config (ModelArgs): The model config. + chpt_from (str): The checkpoint format to load the weights from, e.g. "torchchat" or "hf". + """ + if chpt_from == "hf": + # This format stands for: index file + multiple binary files + load_weights_from_hf_format(stage_module, distribution, device, model_config) + elif chpt_from == "torchchat": + # This format stands for: + # single binary file, OR + # multiple binary files without index files. + load_weights_from_torchchat_format( + stage_module, distribution, device, model_config + ) + else: + raise ValueError(f"Unknown checkpoint format: {chpt_from}") diff --git a/torchchat/distributed/dist_run.py b/torchchat/distributed/dist_run.py deleted file mode 100644 index 389ae41c1..000000000 --- a/torchchat/distributed/dist_run.py +++ /dev/null @@ -1,629 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. - -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. 
- -# Example run command: -# torchrun --nproc-per-node 4 dist_run.py llama2-7b-chat --pp 2 -# torchrun --nproc-per-node 4 dist_run.py llama3 --pp 2 - -import argparse -import os -from enum import auto, Enum -from pathlib import Path -from types import MethodType, SimpleNamespace -from typing import Any, Dict, List, Optional, Tuple - -import torch -import torch.distributed as dist -from torch.distributed.pipelining import PipelineStage, ScheduleGPipe -from torchchat.cli.builder import TokenizerArgs - -# TODO - these are not distributed specific, consider moving to new package -from torchchat.distributed.checkpoint_utils import ( - get_hf_config_file, - load_weights_from_hf_format, - load_weights_from_torchchat_format, -) - -from torchchat.distributed.logging_utils import SingletonLogger -from torchchat.distributed.utils import ( - bytes_to_readable, - Color as color, - CUDATrackTime, - get_module_size, - get_num_params, - GPUMemoryMonitor, -) -from torchchat.model import ModelArgs, Transformer, TransformerArgs -from torchchat.utils.build_utils import set_precision - -try: - from tokenizer.tiktoken import Tokenizer as TiktokenTokenizer -except ImportError: - TiktokenTokenizer = None -try: - from sentencepiece import SentencePieceProcessor -except ImportError: - SentencePieceProcessor = None - - -logger = SingletonLogger.get_logger() - -# Using model name to identify the model to load, for example "llama2-7b-chat". -# You can change it to other values listed below. -# For details on the name-to-distribution mapping, see README.md or models.json. -NAME_TO_DISTRIBUTION_AND_DTYPE = { - "llama2-7b-chat": ("meta-llama/Llama-2-7b-chat-hf", torch.float16), - "llama3": ("meta-llama/Meta-Llama-3-8B-Instruct", torch.bfloat16), - "llama3.1": ("meta-llama/Meta-Llama-3.1-8B-Instruct", torch.bfloat16), -} - - -def _init_distributed(): - dist.init_process_group("nccl") - rank = dist.get_rank() - world_size = dist.get_world_size() - # Assuming same number of GPUs per node - torch.cuda.set_device(rank % torch.cuda.device_count()) - return rank, world_size - - -def _create_device_mesh(pp_degree, tp_degree): - return dist.init_device_mesh( - "cuda", (pp_degree, tp_degree), mesh_dim_names=("pp", "tp") - ) - - -def dict_to_args(dictionary: Dict[str, Any]) -> SimpleNamespace: - return SimpleNamespace(**dictionary) - - -def _patch_tokenizer(tokenizer): - """Patch the tokenizer to support decoding of token ids.""" - if isinstance(tokenizer, TiktokenTokenizer): - # Patch tiktokenizer to allow a list of sequences. 
- # TODO: Upstream to tokenizer modules - old_decode = tokenizer.decode - - def decode( - self, token_ids: List[int | List[int]], *args, **kwargs - ) -> str | List[str]: - if len(token_ids) < 1: - return "" - if isinstance(token_ids[0], list): - return [old_decode(t, *args, **kwargs) for t in token_ids] - else: - return old_decode(token_ids, *args, **kwargs) - - tokenizer.decode = MethodType(decode, tokenizer) - return tokenizer - - -def _build_chat_tokenizer( - tokenizer_args: TokenizerArgs, -) -> SentencePieceProcessor | TiktokenTokenizer: - """Builds a tokenizer for the given model name""" - - tokenizer_args = TokenizerArgs.from_args(tokenizer_args) - tokenizer = tokenizer_args.t - assert tokenizer is not None, f"Failed to get tokenizer using {tokenconfig=}" - logger.info( - f"using tokenizer = {tokenizer.__class__.__module__}.{tokenizer.__class__.__name__}" - ) - - tokenizer = _patch_tokenizer(tokenizer) - - return tokenizer - - -def _load_model_weights( - stage_module: torch.nn.Module, - distribution: str, - device: torch.device, - model_config: ModelArgs, - chpt_from: str, -): - """Load the weights from the safetensor file(s) into the model stage. - Model config is needed b/c we permute wq and wk weights based on attn heads. - - Args: - stage_module (torch.nn.Module): The model stage to load the weights into. - distribution (str): The distribution name, e.g. "meta-llama/Meta-Llama-3-8B-Instruct". - device (torch.device): The device to load the weights onto. - model_config (ModelArgs): The model config. - chpt_from (str): The checkpoint format to load the weights from, e.g. "torchchat" or "hf". - """ - if chpt_from == "hf": - # This format stands for: index file + multiple binary files - load_weights_from_hf_format(stage_module, distribution, device, model_config) - elif chpt_from == "torchchat": - # This format stands for: - # single binary file, OR - # multiple binary files without index files. - load_weights_from_torchchat_format( - stage_module, distribution, device, model_config - ) - else: - raise ValueError(f"Unknown checkpoint format: {chpt_from}") - - -def _encode_strings( - strings: List[str], - tokenizer, - bos: bool, - device: torch.device, - dtype=torch.int64, -) -> List[torch.Tensor]: - """Encode a list of prompt strings into a list of tensor token ids.""" - encoded_list = [] - for string in strings: - tokens = tokenizer.encode(string) - if bos: - tokens = [tokenizer.bos_id()] + tokens - encoded_list.append(torch.tensor(tokens, dtype=dtype, device=device)) - return encoded_list - - -def _create_padded_prompts( - input_ids_list: List[torch.Tensor], - tokenizer, - seqlen: int, - start_pos: int, - device: torch.device, - pad_token_id: Optional[int] = None, -) -> Tuple[torch.Tensor, List[int]]: - """ - Create a padded tensor for multiple encoded input prompts. - - Returns: - Tuple[torch.Tensor, List[int]]: A tuple containing the padded tensor and a list of prompt lengths. 
- """ - pad_token_id = pad_token_id if pad_token_id is not None else tokenizer.eos_id() - - # Find the maximum prompt length - max_prompt_len = max(ids.size(0) for ids in input_ids_list) - - # Calculate the buffer size - max_new_tokens = max(0, min(seqlen - start_pos, seqlen - max_prompt_len)) - token_buffer_size = max_prompt_len + max_new_tokens - - # Create the padded batch tensor - batch_size = len(input_ids_list) - batch_seq = torch.full( - (batch_size, token_buffer_size), pad_token_id, dtype=torch.int64, device=device - ) - - prompt_lengths = [] - for i, input_ids in enumerate(input_ids_list): - prompt_len = input_ids.size(0) - batch_seq[i, :prompt_len] = input_ids - prompt_lengths.append(prompt_len) - - return batch_seq, prompt_lengths - - -def _batch_decode_next_tokens( - output: torch.Tensor, - pos: List[int] = None, - temperature: float = 1.0, - topk: int = 10, -) -> torch.Tensor: - """ - Decode the next token for each prompt in the batch. Adds temperature option for non-deterministic decoding. - - Args: - output (torch.Tensor): The output tensor to decode. - pos (List[int]): The positions of the `output` to decode in the sequence length dimension. - step (int): Step indicator. If -1, use positions from `pos`. Otherwise, use the first token. - temperature (float): Sampling temperature for non-deterministic decoding. - - Returns: - torch.Tensor: Decoded token ids. - """ - batch_size, seq_len, vocab_size = output.shape - - if pos is None: - # `pos` is not provided, so we can use the first token - next_token_logits = output[:, 0, :] - else: - # get the logits for each prompt at the specified positions - next_token_logits = output[torch.arange(batch_size), torch.tensor(pos) - 1] - - if temperature != 1.0: - next_token_logits = next_token_logits / temperature - - # Uses top-k sampling if temperature is not 1.0, otherwise use argmax - if temperature != 1.0: - top_k = min(topk, vocab_size) # Ensure top-k is not greater than vocab size - top_k_logits, top_k_indices = torch.topk(next_token_logits, k=top_k, dim=-1) - probs = torch.softmax(top_k_logits, dim=-1) - next_token_indices = torch.multinomial(probs, num_samples=1).squeeze(-1) - next_tokens = top_k_indices.gather( - -1, next_token_indices.unsqueeze(-1) - ).squeeze(-1) - else: - # Argmax (deterministic) - next_tokens = torch.argmax(next_token_logits, dim=-1, keepdim=True) - - # Token ids in int tensor form - return next_tokens - - -def _update_padded_sequence( - padded_sequence: torch.Tensor, - new_token: torch.Tensor, - prompt_lengths: List[int], -) -> None: - for i in range(len(prompt_lengths)): - padded_sequence[i, prompt_lengths[i]] = new_token[i, 0] - # logger.info(f"updated prompt {i} with new token {new_token[i, 0]}") - - -# Decode token id into string and print it -def _decode_in_flight(token, tokenizer, tp_rank): - """decode token ids for all prompts in the batch and log them""" - # `token` is a tensor of shape (batch_size, 1). - # For TiktokenTokenizer, we need to squeeze it to 1D. - # For SentencePieceProcessor, we don't. 
- token_str = tokenizer.decode(token.tolist()) - # print the token string on tp rank 0 - if tp_rank == 0: - logger.info( - f"{color.green} responses ====>>>> " - f"{color.blue} {token_str} {color.reset}" - ) - return token_str - - -def _cleanup(): - dist.barrier() - dist.destroy_process_group() - - -prompts = [ - "What is Snow?", - # "Can you explain what is the purpose of back propagation in neural networks?", - "Who is Santa Claus?", - "Where does Santa live?", - "Who is Abraham Lincoln?", - # "How are models trained?", -] - - -def main( - model_name, - builder_args, - tokenizer_args, - pipe, -): - pp_degree = builder_args.pp - - rank, world_size = _init_distributed() - logger.info(f"Worker started: {rank=}, {world_size=}") - - gpu_memory_monitor = GPUMemoryMonitor("cuda") - logger.info(f"{color.yellow} {gpu_memory_monitor.get_device_info()}{color.reset}") - - distribution, model_dtype = NAME_TO_DISTRIBUTION_AND_DTYPE[model_name] - logger.info(f"Using model weights from {distribution} and dtype {model_dtype}") - - # Model-level config - model_config = ModelArgs.from_name(distribution) - # Transformer-level config - config = TransformerArgs.from_params(model_config.transformer_args["text"]) - logger.info(f"Transformer Config: {config}") - - tokenizer = _build_chat_tokenizer(tokenizer_args) - - set_precision(model_dtype) - logger.info(f"Using cache precision {model_dtype}") - - hf_config = get_hf_config_file(distribution) - if hf_config is None: - raise ValueError(f"Config file not found for model id {distribution}") - - # Validate pipeline degree - assert world_size % pp_degree == 0 - assert config.n_layers % pp_degree == 0 - - # Tensor parallel is enabled in this program - tp_degree = world_size // pp_degree - - # Create device mesh - device_mesh = _create_device_mesh(pp_degree, tp_degree) - tp_mesh = device_mesh["tp"] - pp_mesh = device_mesh["pp"] - logger.info(f"Created device mesh: {device_mesh}\n{tp_mesh=}, {pp_mesh=}") - - tp_rank = tp_mesh.get_local_rank() - pp_rank = pp_mesh.get_local_rank() - tp_group = tp_mesh.get_group() - pp_group = pp_mesh.get_group() - logger.info(f"{pp_degree=}, {tp_degree=}") - - # Convenience variables - first_pp_rank = 0 - last_pp_rank = pp_degree - 1 - - # Assuming same number of GPUs per node - device = torch.device(f"cuda:{rank % torch.cuda.device_count()}") - - # Fill in PP configs - config.stage_idx = pp_rank - config.n_stages = pp_degree - - with torch.device("meta"): - # TODO: we should create model instead of Transformer - model = Transformer(config) - - # Distribute model on TP mesh - # (Surprisingly, this works even though model is on meta device and mesh is of - # cuda devices) - model.distribute(tp_mesh) - if rank == 0: - logger.info(f"Model: {model}") - - # Load weights - logger.info(f"Loading weights for {pp_rank=} on {device=}") - with CUDATrackTime() as timer: - _load_model_weights(model, distribution, device, config, builder_args.chpt_from) - - logger.info( - f"{color.green}Total weight loading time: {timer.get_time()} {timer.unit} for rank {rank}{color.reset}" - ) - - # Batch size. Since we push batches dynamically through the pipeline rather - # than chunking them, this is effectively micro-batch size in pipeline - # sense. Thus it is interchangeable with micro-batch size below. 
- batch_size = 1 # len(prompt) - seqlen_prefill = 1024 # sequence length - dim = 4096 # embedding dimension - - # Setup KV caches (after model distribution) - # The number of cache lanes is the same as the maximum number of - # micro-batches that can be "in flight" in parallel -- imagine each - # micro-batch takes 1 "pipeline lane," they need distinct KV cache spaces. - # When decoding is done for certain micro-batches, we can reuse the KV cache - # lanes. - # TODO: bump up the lane count - pipeline_lanes = 1 - with device: - model.setup_caches(batch_size, seqlen_prefill, cache_lanes=pipeline_lanes) - - # info on stage size and params - stage_size = get_module_size(model) - stage_size_formatted = bytes_to_readable(stage_size) - stage_num_params = get_num_params(model) - logger.info( - f"Stage {rank} has {color.blue}{stage_num_params} params{color.reset}, Size: {color.blue}{stage_size_formatted}{color.reset}" - ) - model.eval() - - # Helper function to get example inputs and outputs for the stages. - def get_example_ins_outs(seqlen: int) -> Tuple[torch.Tensor, torch.Tensor]: - mb_ids = torch.randint( - 0, config.vocab_size, (batch_size, seqlen), device=device - ) - activation = torch.rand( - batch_size, seqlen, dim, device=device, dtype=model_dtype - ) - logits = torch.rand( - batch_size, seqlen, config.vocab_size, device=device, dtype=model_dtype - ) - example_inputs = (mb_ids if pp_rank == first_pp_rank else activation,) - example_outputs = (logits if pp_rank == last_pp_rank else activation,) - return example_inputs, example_outputs - - # Create prefill stage - logger.info(f"Creating pipeline stage for prefill {pp_rank=}, {pp_degree=}") - example_inputs, example_outputs = get_example_ins_outs(seqlen_prefill) - prefill_stage = PipelineStage( - model, - pp_rank, - pp_degree, - device, - input_args=example_inputs, - output_args=example_outputs, - group=pp_group, - ) - - # Create schedule - # Number of micro-batches for the schedule is 1, because each step() call we - # only push 1 micro-batch into the pipeline. But we can continuously push - # new micro-batches into the pipeline as they arrive, achieving same - # pipelining effect. 
- prefiller = ScheduleGPipe(prefill_stage, 1) - - # Need these global ids due to the API definition of dist.send and recv - first_pp_rank_global_id = dist.get_global_rank(pp_group, first_pp_rank) - last_pp_rank_global_id = dist.get_global_rank(pp_group, last_pp_rank) - - pipe.send("ready") - - while True: - command = pipe.recv() - assert isinstance(command, (str, list)) - if isinstance(command, str): - if command == "stop": - break - else: - raise ValueError(f"Unknown command: {command}") - else: - prompt = command - assert ( - len(prompt) == batch_size - ), f"Expecting {batch_size=} prompts but got {len(prompt)=}" - logger.info(f"{color.green}Prompt: {prompt}{color.reset}") - - start_pos = 0 - # Setup input position (input_pos) for prefill: a list of increasing integers from 0 to seqlen - input_pos = torch.arange(seqlen_prefill, device=device) - - # encode the prompt - input_ids = _encode_strings( - prompt, tokenizer, bos=True, device=device, dtype=torch.int64 - ) - - # create a padded tensor for the input prompt - padded_sequence, prompt_lengths = _create_padded_prompts( - input_ids, tokenizer, seqlen_prefill, start_pos, device - ) - - # New token generated each iteration - # need a row dimension for each prompt in the batch - new_token = torch.zeros(batch_size, 1, device=device, dtype=torch.int64) - # Store the generated tokens - res = [] - - # Prefill phase - # Run context input through pipeline - # TODO: we need to pass `input_pos` and `cache_lane` to each stage. - lane = 0 - kwargs = {"input_pos": input_pos, "cache_lane": lane} - with torch.no_grad(), CUDATrackTime() as timer: - if pp_rank == first_pp_rank: - output = prefiller.step(padded_sequence, **kwargs) - elif pp_rank == last_pp_rank: - output = prefiller.step(**kwargs) - else: # middle pp ranks - prefiller.step(**kwargs) - - logger.info( - f"{color.green}Prefilling time: {timer.get_time()} {timer.unit} for rank {rank}{color.reset}" - ) - - # Decode the output -- first generated token - if pp_rank == last_pp_rank: - logger.info(f"{color.green}Decoding...{prompt_lengths=}{color.reset}") - new_token = _batch_decode_next_tokens(output, prompt_lengths) - res.append(new_token) - # TODO: Move to a separate decoding thread - resp = _decode_in_flight(new_token, tokenizer, tp_rank) - pipe.send((resp, new_token.tolist())) - else: - pipe.send(None) - - # seqlen = 1 now - seqlen_decode = 1 - input_pos = torch.tensor([prompt_lengths[0]], device=device) - - # Create decode stage - logger.info(f"Creating pipeline stage for decode {pp_rank=}, {pp_degree=}") - example_inputs, example_outputs = get_example_ins_outs(seqlen_decode) - decode_stage = PipelineStage( - model, - pp_rank, - pp_degree, - device, - input_args=example_inputs, - output_args=example_outputs, - group=pp_group, - ) - # create schedule - decoder = ScheduleGPipe(decode_stage, 1) - - # Decoding - with torch.no_grad(), CUDATrackTime() as timer: - while True: - command = pipe.recv() - assert isinstance(command, str) - if command == "stop": - break - elif command == "step": - pass - else: - raise ValueError(f"Unknown command: {command}") - - kwargs = {"input_pos": input_pos, "cache_lane": lane} - # sendrecv between last and first ranks, only if: - # first_pp_rank != last_pp_rank. 
- if pp_rank == last_pp_rank and pp_rank != first_pp_rank: - dist.send( - new_token, - dst=first_pp_rank_global_id, - group=pp_group, - ) - elif pp_rank == first_pp_rank and pp_rank != last_pp_rank: - dist.recv( - new_token, - src=last_pp_rank_global_id, - group=pp_group, - ) - - # Run data through pipeline - if pp_rank == first_pp_rank: - output = decoder.step(new_token, **kwargs) - elif pp_rank == last_pp_rank: - output = decoder.step(**kwargs) - else: # middle pp ranks - decoder.step(**kwargs) - - # Decode the output - if pp_rank == last_pp_rank: - new_token = _batch_decode_next_tokens(output) - res.append(new_token) - # TODO: Move to a separate decoding thread - resp = _decode_in_flight(new_token, tokenizer, tp_rank) - pipe.send((resp, new_token)) - else: - pipe.send(None) - - # Increment input position - input_pos += 1 - - logger.info( - f"{color.green}Decoding time: {timer.get_time()} {timer.unit} for rank {rank}{color.reset}" - ) - - # Display the decoding results - - # output formatted response via last pp group and tp rank 0 - if pp_rank == last_pp_rank and tp_rank == 0: - # `res` is a list of tensors, each being a batch of generated token ids. - # We need to concatenate them to get the full sequence of generated - # token ids. Thus cat'ing along dim 1. - res = torch.cat(res, dim=1) - res_list = res.tolist() - - responses = tokenizer.decode(res_list) - - # Show prompts and responses - for prompt_text, response_text in zip(prompt, responses): - logger.info(f"Prompt: {color.green}{prompt_text} {color.reset}") - logger.info(f"Response: {color.red}{response_text} {color.reset}") - - # Cleanup - _cleanup() - logger.info( - f"{color.green}Success{color.white} - {color.blue}Rank {rank} has completed.{color.reset}" - ) - -# TODO: remove or make it work again -# if __name__ == "__main__": -# parser = argparse.ArgumentParser() -# parser.add_argument( -# "model_name", -# type=str, -# default="llama3", -# help="Name of the model to load", -# choices=NAME_TO_DISTRIBUTION_AND_DTYPE.keys(), -# ) -# parser.add_argument("--pp", type=int, default=1, help="Pipeline parallel degree") -# parser.add_argument( -# "--ntokens", -# type=int, -# default=40, -# help="Number of tokens to generate", -# ) -# parser.add_argument( -# "--chpt-from", -# type=str, -# default="hf", # TODO: change to torchchat once we support it well -# help="Checkpoint format to load from", -# choices=["hf", "torchchat"], -# ) -# args = parser.parse_args() - -# main() diff --git a/torchchat/distributed/generate.py b/torchchat/distributed/generate.py deleted file mode 100644 index 51c472e4a..000000000 --- a/torchchat/distributed/generate.py +++ /dev/null @@ -1,271 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. - -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. 
-import asyncio -import atexit -import importlib.util -import subprocess -import threading -from abc import abstractmethod -from collections import deque -from dataclasses import dataclass -from functools import partial -from os import environ -from pathlib import Path -from typing import List, Optional -from uuid import uuid4 - -import torch.multiprocessing as mp -from torchchat.cli.builder import BuilderArgs, TokenizerArgs -from torchchat.distributed.dist_run import NAME_TO_DISTRIBUTION_AND_DTYPE -from torchchat.distributed.logging_utils import SingletonLogger - -logger = SingletonLogger.get_logger() - - -def _setup_env(world_size: int, rank: int, target: callable, *args, **kwargs): - environ["MASTER_ADDR"] = "localhost" - environ["MASTER_PORT"] = "29500" - environ["RDZV_BACKEND"] = "c10d" - environ["WORLD_SIZE"] = str(world_size) - environ["RANK"] = str(rank) - environ["LOCALRANK"] = str(rank) - - return target(*args, **kwargs) - - -def _launch_distributed_inference( - model_name: str, builder_args: BuilderArgs, tokenizer_args: TokenizerArgs -) -> tuple[List]: - # launch distributed inference worker, each worker gets a pipe to communicate with the main process - logger.info("Launching distributed inference ...") - - num_processes_per_node = builder_args.pp * builder_args.tp - - from torchchat.distributed.dist_run import main - - mp.set_start_method("spawn") - - pipes = [] - procs = [] - try: - for rank in range(num_processes_per_node): - server_pipe, client_pipe = mp.Pipe(duplex=True) - pipes.append(server_pipe) - procs.append( - mp.Process( - target=partial(_setup_env, num_processes_per_node, rank, main), - args=(model_name, builder_args, tokenizer_args, client_pipe), - ) - ) - procs[-1].start() - - for pipe in pipes: - assert pipe.recv() == "ready", "Starting the worker failed" - except Exception as e: - logger.error(f"Error during distributed inference: {str(e)}") - for p in procs: - p.kill() - raise e - - logger.info( - f"Done launching distributed inference on {num_processes_per_node} GPUs." 
- ) - return procs, pipes - - -@dataclass -class Output: - is_finished: bool = False - text: Optional[str] = None - token: Optional[list] = None - - -@dataclass -class Request: - request_id: int - prompt: str - - @classmethod - def new_request(cls, prompt): - return cls(request_id=uuid4().int, prompt=prompt) - - -class Scheduler(object): - def __init__( - self, - builder_args, - generator_args, - pipes, - loop, - ): - self.builder_args = builder_args - self.generator_args = generator_args - self.requests = {} - self.in_flight_requests = {} - self.in_flight_batch_order = [] - self.pipes = pipes - self.req_to_states = {} - self.req_to_results = {} - self.request_queue = mp.Queue() - self.loop = loop - - def schedule_request(self, req: Request): - # add request to queue and create deque and async event for response - self.req_to_states[req.request_id] = asyncio.Event() - self.req_to_results[req.request_id] = deque() - self.request_queue.put(req) - - def process_requests_loop(self): - # Continuously process requests (one at a time for now), results are routed into the requests deque - while True: - req = self.request_queue.get() - if req == "stop": - break - self.requests = {req.request_id: req.prompt} - - responses = {} - running = True - while running: - outputs = self.step() - self.req_to_results[req.request_id].append(outputs[0]) - - self.loop.call_soon_threadsafe(self.req_to_states[req.request_id].set) - - running &= not outputs[0].is_finished - - async def wait_for_request(self, req: Request) -> Output: - # Wait for request to deliver result, uses event to trigger and reads from left side of deque - is_finished = False - while not is_finished: - await self.req_to_states[req.request_id].wait() - while len(self.req_to_results[req.request_id]): - output = self.req_to_results[req.request_id].popleft() - is_finished |= output.is_finished - yield output - del self.req_to_states[req.request_id] - del self.req_to_results[req.request_id] - - def step(self) -> List[Output]: - # Make a prefill or decoding step and receive results - responses = [] - # TODO: Implement a scheduler to handle the requests - if len(self.in_flight_requests) > 0: - # Receive decoded token - for p in self.pipes: - p.send("step") - for p in self.pipes: - responses.append(p.recv()) - - else: - # Send requests to backend - self.in_flight_batch_order = list(self.requests.keys()) - prompts = [self.requests[k] for k in self.in_flight_batch_order] - for p in self.pipes: - p.send(prompts) - self.in_flight_requests = self.requests - self.requests = {} - self.current_step = 0 - # Receive first token - for p in self.pipes: - responses.append(p.recv()) - # Filter out None responses from in-between stages - responses = [r for r in responses if r is not None][0] - outputs = [] - for k, v in zip(self.in_flight_batch_order, zip(responses[0], responses[1])): - text, token_ids = v - outputs.append( - Output( - # TODO: Look for tokenizer.eos_id as well - is_finished=self.current_step >= self.generator_args.max_new_tokens, - text=text, - token=token_ids, - ) - ) - if self.current_step >= self.generator_args.max_new_tokens: - for p in self.pipes: - p.send("stop") - self.in_flight_requests = [] - - self.current_step += 1 - - return outputs - - -class DistributedGenerator(object): - def __init__( - self, - # TODO: switch this to torchchat method - model_name: str, - builder_args: BuilderArgs, - tokenizer_args: TokenizerArgs, - # TODO: move GeneratorArgs into a different module - generator_args, - profile: Optional[Path], - quantize: bool, - 
draft_quantize: bool, - ): - self.model_name = model_name - self.builder_args = builder_args - self.generate_args = generator_args - - self.check_args() - - self.procs, self.pipes = _launch_distributed_inference( - model_name, builder_args, tokenizer_args - ) - - self.loop = asyncio.new_event_loop() - asyncio.set_event_loop(self.loop) - - self.scheduler = Scheduler(builder_args, generator_args, self.pipes, self.loop) - - # TODO: Mode into process and use pipe or queue for comm - self.scheduler_thread = threading.Thread( - target=self.scheduler.process_requests_loop - ) - self.scheduler_thread.start() - - atexit.register(self.shutdown) - - def shutdown(self): - # Stop all processes and threads - self.scheduler.request_queue.put("stop") - self.scheduler_thread.join() - - for p in self.pipes: - p.send("stop") - for p in self.procs: - p.kill() - - def generate(self, text): - # Function to generate text from prompt - req = Request.new_request(text) - self.scheduler.schedule_request(req) - - generator = self.scheduler.wait_for_request(req) - - running = True - while running: - output = self.loop.run_until_complete(generator.__anext__()) - running &= not output.is_finished - - yield output - - def check_args(self): - if self.generate_args.chat_mode: - raise NotImplementedError( - "Currently we only support generate with --distributed" - ) - elif self.builder_args.tp < 2: - raise ValueError("TP degree must be at least 2 for distributed inference") - elif self.model_name not in NAME_TO_DISTRIBUTION_AND_DTYPE.keys(): - raise ValueError( - f"Distributed inference currently only supports then following models: {list(NAME_TO_DISTRIBUTION_AND_DTYPE.keys())}" - ) - elif self.builder_args.chpt_from == "torchchat": - raise ValueError( - f"Distributed inference currently only supports HF checkpoints" - ) diff --git a/torchchat/distributed/utils.py b/torchchat/distributed/utils.py index 46ea5d9a1..85bfe04fc 100644 --- a/torchchat/distributed/utils.py +++ b/torchchat/distributed/utils.py @@ -6,15 +6,15 @@ import itertools import os +import time from dataclasses import dataclass from datetime import timedelta -import time +from os import environ from typing import Optional import torch - from torchchat.distributed.logging_utils import SingletonLogger logger = SingletonLogger.get_logger() @@ -257,3 +257,13 @@ def get_device_info( f"with {self.device_capacity_gib:.2f}GiB memory" ) return device_info + +def run_in_dist_env(world_size: int, rank: int, target: callable): + environ["MASTER_ADDR"] = "localhost" + environ["MASTER_PORT"] = "29500" + environ["RDZV_BACKEND"] = "c10d" + environ["WORLD_SIZE"] = str(world_size) + environ["RANK"] = str(rank) + environ["LOCALRANK"] = str(rank) + + return target() diff --git a/torchchat/generate.py b/torchchat/generate.py index 987fb3e44..e271f5027 100644 --- a/torchchat/generate.py +++ b/torchchat/generate.py @@ -3,13 +3,15 @@ # This source code is licensed under the license found in the # LICENSE file in the root directory of this source tree. 
-import argparse import base64 +import contextlib import itertools import logging import os import textwrap import time +from concurrent import futures +from functools import partial from abc import ABC, abstractmethod from dataclasses import dataclass @@ -21,6 +23,9 @@ import torch import torch._dynamo.config import torch._inductor.config +import torch.distributed as dist +import torch.multiprocessing as mp +from torch.distributed.pipelining import PipelineStage, ScheduleGPipe from PIL import Image @@ -28,7 +33,6 @@ from torchtune.data import Message, padded_collate_tiled_images_and_mask from torchtune.generation import sample as tune_sample -from torchtune.models.llama3 import llama3_tokenizer from torchtune.models.llama3_2_vision._model_builders import llama3_2_vision_transform from torchtune.training import set_default_dtype @@ -39,11 +43,16 @@ BuilderArgs, TokenizerArgs, ) -from torchchat.distributed.generate import DistributedGenerator +from torchchat.distributed.utils import ( + Color as color, + run_in_dist_env, +) from torchchat.model import Model, ModelType from torchchat.utils.build_utils import device_sync, set_precision from torchchat.utils.device_info import get_device_info +logger = logging.getLogger(__name__) + # NOTE: Logging disabled by default here due to conflicts with torch._dynamo class NoOpLogger: @@ -76,7 +85,7 @@ def __init__(self, tokenizer): def encode_dialog_prompt( self, dialog: DIALOG_TYPE, - add_generation_prompt: bool, + add_generation_prompt: bool = True, ) -> List[int]: """Encode a sequence of messages into a sequence of token IDs, including the chat template @@ -127,7 +136,7 @@ def _encode_message(self, message: _ChatFormatter.MESSAGE_TYPE) -> List[int]: def encode_dialog_prompt( self, dialog: _ChatFormatter.DIALOG_TYPE, - add_generation_prompt: bool, + add_generation_prompt: bool = True, ) -> List[int]: tokens = [] tokens.append(self.tokenizer.special_tokens["<|begin_of_text|>"]) @@ -157,7 +166,7 @@ def _get_content_str(message: _ChatFormatter.MESSAGE_TYPE) -> str: def encode_dialog_prompt( self, dialog: _ChatFormatter.DIALOG_TYPE, - add_generation_prompt: bool, # UNUSED + add_generation_prompt: bool = True, # UNUSED ) -> List[int]: new_turn = True tokens = [] @@ -188,7 +197,7 @@ class HFTokenizerChatFormatter(_ChatFormatter): def encode_dialog_prompt( self, dialog: _ChatFormatter.DIALOG_TYPE, - add_generation_prompt: bool, + add_generation_prompt: bool = True, ) -> List[int]: rendered = self.tokenizer.apply_chat_template( dialog, add_generation_prompt=add_generation_prompt @@ -287,7 +296,7 @@ def from_args(cls, args): ) -class Generator: +class LocalGenerator: """ Generates text samples based on a pre-trained Transformer model and tokenizer. 
Args: @@ -324,6 +333,7 @@ def __init__( self.draft_quantize = draft_quantize self.is_torchtune_model = generator_args.is_torchtune_model self.dtype = builder_args.precision + self.get_user_input : Callable = input self.rank: Optional[int] = None @@ -478,9 +488,7 @@ def prefill( else: # input_pos: [B, S] logits = model(x, input_pos) - # print(f"logits {logits.shape}") - # print(f"x: {x},\n input_pos: {input_pos}\n") return self.sample(logits, need_probs=False, **sampling_kwargs)[0] def decode_one_token( @@ -504,7 +512,6 @@ def decode_one_token( )[:, -1:] else: logits = model(x, input_pos) - # print(f"x: {x},\n input_pos: {input_pos}\n") return self.sample(logits, need_probs=need_probs, **sampling_kwargs) """ @@ -827,7 +834,6 @@ def _callback(self, x, *, buffer, done_generating): if len(buffer) == 4 or done_generating: print("".join(buffer), end="", flush=True) buffer.clear() - # print(, end='', flush=True) def _gen_model_input( self, @@ -996,6 +1002,13 @@ def chat( for p in itertools.chain(self.model.parameters(), self.model.buffers()) ] ) + if self.builder_args.distributed: + # During distributed inference the model gets sharded among the ranks + # So we need to all reduce the model size to get the total model size + model_size = torch.tensor(model_size, dtype=torch.int64, device=self.device) + dist.all_reduce(model_size) + model_size = model_size.item() + if generator_args.compile: if self.builder_args.device == "cpu": if generator_args.max_autotune: @@ -1054,11 +1067,11 @@ def chat( print( f"Entering Chat Mode. Will continue chatting back and forth with the language model until the models max context length of {max_seq_length} tokens is hit or until the user says /bye" ) - get_system_prompt = input( + get_system_prompt = self.get_user_input( "Do you want to enter a system prompt? Enter y for yes and anything else for no. \n" ) if get_system_prompt == "y" or get_system_prompt == "Y": - self.system_prompt = input("What is your system prompt? \n") + self.system_prompt = self.get_user_input("What is your system prompt? \n") # `is_torchtune_model` is a misnomer since it doesn't capture all # torchtune models (i.e. Flamingo) @@ -1097,7 +1110,7 @@ def chat( device_sync(device=self.builder_args.device) is_first_sample: bool = i == 0 if generator_args.chat_mode: - prompt = input("User: ") + prompt = self.get_user_input("User: ") if prompt == "/bye": print("Exiting Chat.\n") break @@ -1151,8 +1164,6 @@ def callback(x, *, done_generating=False): torch._inductor.config.profiler_mark_wrapper_call = True torch._inductor.config.cpp.enable_kernel_profile = True if i != generator_args.num_samples - 1 or not self.profile: - import contextlib - prof = contextlib.nullcontext() else: torch.profiler._utils._init_for_cuda_graphs() @@ -1280,22 +1291,319 @@ def callback(x, *, done_generating=False): print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB") -def _launch_distributed_inference( - builder_args: BuilderArgs, -): - from torch.distributed import launcher - from torch.distributed.elastic.utils.distributed import get_free_port +class DistributedGenerator(LocalGenerator): + def __init__( + self, + builder_args: BuilderArgs, + speculative_builder_args: BuilderArgs, + tokenizer_args: TokenizerArgs, + generator_args: GeneratorArgs, + profile: Optional[Path], + quantize: bool, + draft_quantize: bool, + ): + + is_speculative = speculative_builder_args.checkpoint_path is not None + assert is_speculative == False, "Distributed inference with pp > 1 does not support speculative inference yet." 
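+        # super().__init__ builds the model through the builder; for distributed runs the
+        # builder also initializes the process group and shards the model across ranks,
+        # so the rank/device bookkeeping below can rely on torch.distributed being ready.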
+ super().__init__( + builder_args, + speculative_builder_args, + tokenizer_args, + generator_args, + profile, + quantize, + draft_quantize, + ) + self.rank = dist.get_rank() + # Assuming same number of GPUs per node + self.device = torch.device(f"cuda:{self.rank % torch.cuda.device_count()}") - print("Launching distributed inference within generator") + def distributed_input(prompt: str) -> str: + if dist.get_rank() == 0: + text = [input(prompt)] + else: + text = [None] + + dist.broadcast_object_list(text) + return text[0] + self.get_user_input: Callable = distributed_input -def main(args): + if builder_args.pp > 1: + self.seqlen_prefill = 1024 # sequence length for prefill stage + + logger.warn(f"{color.yellow}Pipeline parallelism is still experimental and might be slow{color.reset}") + pp_mesh = self.model.device_mesh["pp"] + + self.pp_rank = pp_mesh.get_local_rank() + self.pp_group = pp_mesh.get_group() + + self.pp_degree = pp_mesh.size() + + # Convenience variables + self.first_pp_rank = 0 + self.last_pp_rank = self.pp_degree - 1 + + + self.first_pp_rank_global_id = dist.get_global_rank(self.pp_group, self.first_pp_rank) + self.last_pp_rank_global_id = dist.get_global_rank(self.pp_group, self.last_pp_rank) + + self.prefiller = self.create_prefill_stage() + self.decoder = self.create_decode_stage() + + def __del__(self): + dist.destroy_process_group() + + # Helper function to get example inputs and outputs for the stages. + def get_example_ins_outs(self, batch_size: int , seqlen: int) -> Tuple[torch.Tensor, torch.Tensor]: + """ + This function generates example inputs and outputs for the prefill and decode stages. + + Returns: + Tuple[torch.Tensor, torch.Tensor]: A tuple containing the example inputs and outputs. + """ + model_dtype = torch.bfloat16 + mb_ids = torch.randint( + 0, self.model.config.vocab_size, (batch_size, seqlen), device=self.device + ) + activation = torch.rand( + batch_size, seqlen, self.model.config.dim, device=self.device, dtype=model_dtype + ) + logits = torch.rand( + batch_size, seqlen, self.model.config.vocab_size, device=self.device, dtype=model_dtype + ) + example_inputs = (mb_ids if self.pp_rank == self.first_pp_rank else activation,) + example_outputs = (logits if self.pp_rank == self.last_pp_rank else activation,) + return example_inputs, example_outputs + + def create_prefill_stage(self): + """ + Creates a pipeline stage for prefilling. + + Returns: + PipelineStage: The created pipeline stage. + """ + batch_size = 1 + + # Create prefill stage + logger.debug(f"Creating pipeline stage for prefill {self.pp_rank=}, {self.pp_degree=}") + example_inputs, example_outputs = self.get_example_ins_outs(batch_size, self.seqlen_prefill) + prefill_stage = PipelineStage( + self.model, + self.pp_rank, + self.pp_degree, + self.device, + input_args=example_inputs, + output_args=example_outputs, + group=self.pp_group, + ) + + # Create schedule + # Number of micro-batches for the schedule is 1, because each step() call we + # only push 1 micro-batch into the pipeline. But we can continuously push + # new micro-batches into the pipeline as they arrive, achieving same + # pipelining effect. + prefiller = ScheduleGPipe(prefill_stage, 1) + return prefiller + + def create_decode_stage(self): + """ + Creates a decode stage for the pipeline parallelism. + + Returns: + ScheduleGPipe: The decode stage. 
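+
+        The stage is constructed with example inputs of batch size 1 and sequence
+        length 1, since decoding pushes a single new token through the pipeline per
+        step() call.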
+ """ + # seqlen = 1 now + seqlen_decode = 1 + batch_size = 1 + + # Create decode stage + # logger.info(f"Creating pipeline stage for decode {self.pp_rank=}, {self.pp_degree=}") + example_inputs, example_outputs = self.get_example_ins_outs(batch_size, seqlen_decode) + decode_stage = PipelineStage( + self.model, + self.pp_rank, + self.pp_degree, + self.device, + input_args=example_inputs, + output_args=example_outputs, + group=self.pp_group, + ) + # create schedule + decoder = ScheduleGPipe(decode_stage, 1) + + return decoder + + def prefill( + self, + model: Model, + x: torch.Tensor, + input_pos: torch.Tensor, + batch: Optional[Dict[str, Any]] = None, # Inputs for multimodal models + *, + sequential_prefill=True, + **sampling_kwargs, + ) -> torch.Tensor: + """ + This function is used to prefill the model with a given prompt. For pipeline parallelism we need to pad the input. + + Returns: + torch.Tensor: The prefilled tensor. + """ + if self.builder_args.pp == 1: + return super().prefill( + model, + x, + input_pos, + batch, + sequential_prefill=sequential_prefill, + **sampling_kwargs, + ) + + pad_token_id = self.tokenizer.pad_id if self.tokenizer.pad_id is not None else self.tokenizer.eos_id + prompt_length = x.size(1) + + padded_seq = torch.full( + (1, self.seqlen_prefill), pad_token_id, dtype=torch.int64, device=self.device + ) + padded_seq[:,:prompt_length] = x + input_pos = torch.arange( + self.seqlen_prefill, + device=self.device, + dtype=torch.int, + ) + + # Prefill phase + # Run context input through pipeline + # TODO: we need to pass `input_pos` and `cache_lane` to each stage. + lane = 0 + kwargs = {"input_pos": input_pos, "cache_lane": lane} + + if self.pp_rank == self.first_pp_rank: + logits = self.prefiller.step(padded_seq, **kwargs) + elif self.pp_rank == self.last_pp_rank: + logits = self.prefiller.step(**kwargs) + else: # middle pp ranks + self.prefiller.step(**kwargs) + + if self.pp_rank == self.last_pp_rank: + new_token = self.sample(logits[:,:prompt_length], need_probs=False, **sampling_kwargs)[0] + if self.pp_rank != self.first_pp_rank: + dist.send( + new_token, + dst=self.first_pp_rank_global_id, + group=self.pp_group, + ) + else: + new_token = torch.zeros(1, 1, device=self.device, dtype=torch.int64) + if self.pp_rank == self.first_pp_rank: + dist.recv( + new_token, + src=self.last_pp_rank_global_id, + group=self.pp_group, + ) + + return new_token + + def decode_one_token( + self, + model: Model, + x: torch.Tensor, + input_pos: torch.Tensor, + need_probs: bool, + batch: Optional[Dict[str, Any]] = None, # Inputs for multimodal models + **sampling_kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + """ + Decodes a single token. + + # TODO: implement speculative decoding with pp>1 + Returns: + Tuple[torch.Tensor, None]: A tuple containing the decoded token and None. 
+ """ + if self.builder_args.pp == 1: + return super().decode_one_token( + model, + x, + input_pos, + need_probs, + batch=batch, + **sampling_kwargs, + ) + + # input_pos: [B, 1] + assert input_pos.shape[-1] == 1 + + new_token = x.view(1, -1) + + lane = 0 + kwargs = {"input_pos": input_pos, "cache_lane": lane} + # Run data through pipeline + if self.pp_rank == self.first_pp_rank: + logits = self.decoder.step(new_token, **kwargs) + elif self.pp_rank == self.last_pp_rank: + logits = self.decoder.step(**kwargs) + else: # middle pp ranks + self.decoder.step(**kwargs) + + # Decode the output + if self.pp_rank == self.last_pp_rank: + new_token, _ = self.sample(logits, need_probs=need_probs, **sampling_kwargs) + if self.pp_rank != self.first_pp_rank: + dist.send( + new_token, + dst=self.first_pp_rank_global_id, + group=self.pp_group, + ) + else: + new_token = torch.zeros(1, 1, device=self.device, dtype=torch.int64) + if self.pp_rank == self.first_pp_rank: + dist.recv( + new_token, + src=self.last_pp_rank_global_id, + group=self.pp_group, + ) + #TODO: Why do we get 2d tensor here? + new_token=new_token[0] + return new_token, None + + def sample( + self, + logits, + need_probs: bool, + temperature: float = 0, + top_k: Optional[int] = None, + ): + if temperature == 0 and not need_probs: + _, idx_next = torch.topk(logits[0, -1], k=1, dim=-1) + return (idx_next, None) + probs = self.logits_to_probs(logits[0, -1], temperature, top_k) + idx_next = self.multinomial_sample_one_no_sync(probs) + + return idx_next, probs + + +def run_generator( + args, + rank: Optional[int] =None + ): + """ + This function creates and executes a generator + """ builder_args = BuilderArgs.from_args(args) speculative_builder_args = BuilderArgs.from_speculative_args(args) tokenizer_args = TokenizerArgs.from_args(args) - generator_args = GeneratorArgs.from_args(args) - logger.debug("GeneratorArgs: %s", generator_args) - if not builder_args.distributed: + generator_args = GeneratorArgs.from_args(args) + #Setup rank 1 and up to suppress log messages and print messages + if builder_args.distributed and rank != 0: + logger.setLevel(logging.CRITICAL) + context = contextlib.redirect_stdout(None) + else: + context = contextlib.nullcontext() + + with context: + Generator = DistributedGenerator if builder_args.distributed else LocalGenerator + logger.debug("GeneratorArgs: %s", generator_args) gen = Generator( builder_args, speculative_builder_args, @@ -1310,20 +1618,20 @@ def main(args): for _ in gen.chat(generator_args): pass - else: - dist_gen = DistributedGenerator( - args.model, - builder_args, - tokenizer_args, - generator_args, - args.profile, - args.quantize, - args.draft_quantize, - ) - response = "" - for output in dist_gen.generate(generator_args.prompt): - response += output.text - - print(f"Model output: {response}") - dist_gen.shutdown() +def main(args): + builder_args = BuilderArgs.from_args(args) + + if builder_args.distributed: + world_size = builder_args.tp * builder_args.pp + + ctx = mp.get_context('spawn') + with futures.ProcessPoolExecutor(max_workers=world_size-1, mp_context=ctx) as executor: + for i in range(1,world_size): + fn = partial(run_generator, args, i) + executor.submit(run_in_dist_env, world_size, i, fn) + #Starting rank 0 + fn = partial(run_generator, args, 0) + run_in_dist_env(world_size, 0, fn) + else: + run_generator(args) diff --git a/torchchat/usages/openai_api.py b/torchchat/usages/openai_api.py index 99fd82fe8..b67cd0eac 100644 --- a/torchchat/usages/openai_api.py +++ 
b/torchchat/usages/openai_api.py @@ -13,7 +13,7 @@ from dataclasses import dataclass from io import BytesIO from pwd import getpwuid -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Union, Type import torch @@ -24,7 +24,7 @@ from torchtune.models.llama3_2_vision._model_builders import llama3_2_vision_transform from torchchat.cli.download import is_model_downloaded, load_model_configs -from torchchat.generate import Generator, GeneratorArgs +from torchchat.generate import LocalGenerator, DistributedGenerator, GeneratorArgs from torchchat.model import FlamingoModel from torchchat.utils.build_utils import device_sync @@ -267,7 +267,7 @@ class CompletionResponseChunk: usage: Optional[UsageStats] = None -class OpenAiApiGenerator(Generator): +class OpenAiApiGeneratorMixin: """A wrapper over the Generator class to interface with the OpenAI API. Implements endpoints for completion requests, both chunked and non-chunked using the dataclasses @@ -486,6 +486,15 @@ def _callback(self, x, *, buffer, done_generating): pass +def create_openai_api_generator(distributed: bool) -> Type: + """ + Factory method to create an OpenAiApiGenerator + """ + + # Base class order matters to make sure OpenAiApiGeneratorMixin overrides methods in DistributedGenerator and Generator + return type('OpenAiApiGenerator', (OpenAiApiGeneratorMixin, DistributedGenerator if distributed else LocalGenerator), {}) + + """ Helper functions for the OpenAI API Models endpoint. diff --git a/torchchat/usages/server.py b/torchchat/usages/server.py index 1fb76953b..550539a88 100644 --- a/torchchat/usages/server.py +++ b/torchchat/usages/server.py @@ -4,38 +4,89 @@ # This source code is licensed under the license found in the # LICENSE file in the root directory of this source tree. +import atexit import json import logging logger = logging.getLogger(__name__) +from contextlib import nullcontext from dataclasses import asdict +from functools import partial +from os import environ from typing import Dict, List, Union import torch +import torch.distributed as dist +import torch.multiprocessing as mp +from concurrent import futures from flask import Flask, request, Response from torchchat.cli.builder import BuilderArgs, TokenizerArgs +from torchchat.distributed.utils import run_in_dist_env from torchchat.generate import GeneratorArgs from torchchat.usages.openai_api import ( CompletionRequest, get_model_info_list, - OpenAiApiGenerator, + create_openai_api_generator, retrieve_model_info, ) OPENAI_API_VERSION = "v1" +def run_worker( + args, + rank, + queue, + ): + """ + This function creates and executes a generator + """ + gen = initialize_generator(args) + + while True: + try: + req = queue.get() + except KeyboardInterrupt: + break + + if req == "stop": + break + + for _ in gen.chunked_completion(req): + pass + def create_app(args): # noqa: C901 """ Creates a flask app that can be used to serve the model as a chat API. 
""" app = Flask(__name__) - gen: OpenAiApiGenerator = initialize_generator(args) + builder_args = BuilderArgs.from_args(args) + procs = [] + queue = None + if builder_args.distributed: + world_size = builder_args.tp * builder_args.pp + mp_context = mp.get_context('spawn') + queue = mp_context.Queue() + + for i in range(1, world_size): + fn = partial(run_worker, args, i, queue) + mp_context = mp.get_context('spawn') + procs.append(mp_context.Process(target=run_in_dist_env, args=(world_size, i, fn))) + procs[-1].start() + + environ["MASTER_ADDR"] = "localhost" + environ["MASTER_PORT"] = "29500" + environ["RDZV_BACKEND"] = "c10d" + environ["WORLD_SIZE"] = str(world_size) + environ["RANK"] = str(0) + environ["LOCALRANK"] = str(0) + + gen = initialize_generator(args) def _del_none(d: Union[Dict, List]) -> Union[Dict, List]: """Recursively delete None values from a dictionary.""" @@ -69,6 +120,10 @@ def chat_endpoint(): if req.stream: + if builder_args.distributed: + for _ in range(world_size-1): + queue.put(req) + def chunk_processor(chunked_completion_generator): """Inline function for postprocessing CompletionResponseChunk objects. @@ -86,8 +141,11 @@ def chunk_processor(chunked_completion_generator): ) return resp else: + if builder_args.distributed: + for _ in range(world_size-1): + queue.put(req) + response = gen.sync_completion(req) - print(response.choices[0].message.content) return json.dumps(_del_none(asdict(response))) @@ -102,16 +160,18 @@ def models_retrieve_endpoint(model_id): else: return "Model not found", 404 - return app + return app, (procs, queue) -def initialize_generator(args) -> OpenAiApiGenerator: +def initialize_generator(args) -> type: builder_args = BuilderArgs.from_args(args) speculative_builder_args = BuilderArgs.from_speculative_args(args) tokenizer_args = TokenizerArgs.from_args(args) generator_args = GeneratorArgs.from_args(args) generator_args.chat_mode = False + OpenAiApiGenerator = create_openai_api_generator(builder_args.distributed) + return OpenAiApiGenerator( builder_args=builder_args, speculative_builder_args=speculative_builder_args, @@ -124,5 +184,19 @@ def initialize_generator(args) -> OpenAiApiGenerator: def main(args): - app = create_app(args) + app, (procs, queue) = create_app(args) + + def shutdown_worker(): + while not queue.empty(): + queue.get(block=False) + for p in procs: + queue.put("stop") + for p in procs: + p.join(timeout=0.5) + for p in procs: + if p.is_alive(): + p.kill() + + atexit.register(shutdown_worker) + app.run() From 155bd4b347a77cdb8cd63c4acee19905f8280019 Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Fri, 20 Dec 2024 14:16:13 -0700 Subject: [PATCH 45/83] Granite 3.0 / 3.1 dense support (#1432) * feat(granite3): Add config plumbing for granite3-2b This does not yet implement the usage of the new multipliers in the architecture, so the output is garbage at the moment. NOTE: There is currently a bug where this model is missing tokenizer.json in HF, but that should be resolved soon. 
Branch: GraniteThreeDenseSupport Signed-off-by: Gabe Goodhart * feat: Use multipliers conditionally in the model architecture Signed-off-by: Gabe Goodhart * feat: Add plumbing for Granite 3.0 8b and 3.1 2b/8b Branch: GraniteThreeDenseSupport Signed-off-by: Gabe Goodhart --------- Signed-off-by: Gabe Goodhart --- torchchat/model.py | 34 +++++++++++++++++-- torchchat/model_config/models.json | 28 +++++++++++++++ .../model_params/Granite-3.0-2B-Instruct.json | 21 ++++++++++++ .../model_params/Granite-3.0-8B-Instruct.json | 20 +++++++++++ .../model_params/Granite-3.1-2B-Instruct.json | 20 +++++++++++ .../model_params/Granite-3.1-8B-Instruct.json | 20 +++++++++++ 6 files changed, 140 insertions(+), 3 deletions(-) create mode 100644 torchchat/model_params/Granite-3.0-2B-Instruct.json create mode 100644 torchchat/model_params/Granite-3.0-8B-Instruct.json create mode 100644 torchchat/model_params/Granite-3.1-2B-Instruct.json create mode 100644 torchchat/model_params/Granite-3.1-8B-Instruct.json diff --git a/torchchat/model.py b/torchchat/model.py index 1c78d4c63..f50d2a8be 100644 --- a/torchchat/model.py +++ b/torchchat/model.py @@ -287,6 +287,11 @@ class TransformerArgs: feed_forward_bias: bool = False # Whether or not to tie the input word embeddings to the output tie_word_embeddings: bool = False + # Granite architecture multipliers + embedding_multiplier: Optional[float] = None + attention_multiplier: Optional[float] = None + residual_multiplier: Optional[float] = None + logits_scaling: Optional[float] = None def __post_init__(self): if self.n_local_heads == -1: @@ -723,6 +728,10 @@ def forward(self, x: Tensor, input_pos: Optional[Tensor] = None, cache_lane: int if self.tok_embeddings: x = self.tok_embeddings(x) + # For Granite architectures + if self.config.embedding_multiplier: + x = x * self.config.embedding_multiplier + for _, layer in self.layers.items(): x = layer(x, input_pos, freqs_cis, mask, cache_lane=cache_lane) @@ -730,6 +739,9 @@ def forward(self, x: Tensor, input_pos: Optional[Tensor] = None, cache_lane: int x = self.norm(x) if self.output: x = self.output(x) + # For granite architectures + if self.config.logits_scaling: + x = x / self.config.logits_scaling # print(f"output shape: {x.shape}") return x @@ -741,6 +753,12 @@ def __init__(self, config: TransformerArgs) -> None: self.feed_forward = FeedForward(config) self.ffn_norm = RMSNorm(config.dim, config.norm_eps) self.attention_norm = RMSNorm(config.dim, config.norm_eps) + # None for llama architecture, set for granite architectures + self.residual_multiplier = ( + config.residual_multiplier + if config.residual_multiplier is not None + else 1.0 + ) def distribute(self, device_mesh: DeviceMesh): self.attention.distribute(device_mesh) @@ -751,8 +769,8 @@ def forward( ) -> Tensor: h = x + self.attention( self.attention_norm(x), freqs_cis, mask, input_pos, cache_lane=cache_lane - ) - out = h + self.feed_forward(self.ffn_norm(h)) + ) * self.residual_multiplier + out = h + self.feed_forward(self.ffn_norm(h)) * self.residual_multiplier return out @@ -779,6 +797,7 @@ def __init__(self, config: TransformerArgs): self.head_dim = config.head_dim self.n_local_heads = config.n_local_heads self.dim = config.dim + self.attention_scale = config.attention_multiplier self._register_load_state_dict_pre_hook(self.load_hook) def setup_cache(self, max_batch_size, max_seq_length, cache_lanes: int = 1): @@ -875,7 +894,16 @@ def forward( k = k.repeat_interleave(self.n_heads // self.n_local_heads, dim=1) v = v.repeat_interleave(self.n_heads // 
self.n_local_heads, dim=1) - y = F.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0) + y = F.scaled_dot_product_attention( + query=q, + key=k, + value=v, + attn_mask=mask, + dropout_p=0.0, + # This is None (default) for llama architecture and set for granite + # architectures + scale=self.attention_scale, + ) # -1 = self.dim y = y.transpose(1, 2).contiguous().view(bsz, seqlen, -1) diff --git a/torchchat/model_config/models.json b/torchchat/model_config/models.json index 8791601fb..d2252e6dd 100644 --- a/torchchat/model_config/models.json +++ b/torchchat/model_config/models.json @@ -178,5 +178,33 @@ "distribution_path": "ibm-granite/granite-8b-code-instruct-128k", "transformer_params_key": "Granite-8B-Code", "tokenizer_file": "tokenizer.json" + }, + "ibm-granite/granite-3.0-2b-instruct": { + "aliases": ["granite3-2b", "granite3"], + "distribution_channel": "HuggingFaceSnapshot", + "distribution_path": "ibm-granite/granite-3.0-2b-instruct", + "transformer_params_key": "Granite-3.0-2B-Instruct", + "tokenizer_file": "tokenizer.json" + }, + "ibm-granite/granite-3.0-8b-instruct": { + "aliases": ["granite3-8b"], + "distribution_channel": "HuggingFaceSnapshot", + "distribution_path": "ibm-granite/granite-3.0-8b-instruct", + "transformer_params_key": "Granite-3.0-8B-Instruct", + "tokenizer_file": "tokenizer.json" + }, + "ibm-granite/granite-3.1-2b-instruct": { + "aliases": ["granite3.1-2b", "granite3.1"], + "distribution_channel": "HuggingFaceSnapshot", + "distribution_path": "ibm-granite/granite-3.1-2b-instruct", + "transformer_params_key": "Granite-3.1-2B-Instruct", + "tokenizer_file": "tokenizer.json" + }, + "ibm-granite/granite-3.1-8b-instruct": { + "aliases": ["granite3.1-8b"], + "distribution_channel": "HuggingFaceSnapshot", + "distribution_path": "ibm-granite/granite-3.1-8b-instruct", + "transformer_params_key": "Granite-3.1-8B-Instruct", + "tokenizer_file": "tokenizer.json" } } diff --git a/torchchat/model_params/Granite-3.0-2B-Instruct.json b/torchchat/model_params/Granite-3.0-2B-Instruct.json new file mode 100644 index 000000000..1e9779cb3 --- /dev/null +++ b/torchchat/model_params/Granite-3.0-2B-Instruct.json @@ -0,0 +1,21 @@ +{ + "block_size": 8192, + "dim": 2048, + "hidden_dim": 8192, + "n_heads": 32, + "n_local_heads": 8, + "n_layers": 40, + "rope_base": 10000, + "vocab_size": 49155, + "use_hf_tokenizer": true, + "tokenizer_prepend_bos": false, + "norm_eps": 0.00001, + "rope_scaling": null, + "attention_bias": false, + "feed_forward_bias": false, + "tie_word_embeddings": true, + "embedding_multiplier": 12.0, + "attention_multiplier": 0.015625, + "residual_multiplier": 0.22, + "logits_scaling": 8.0 +} diff --git a/torchchat/model_params/Granite-3.0-8B-Instruct.json b/torchchat/model_params/Granite-3.0-8B-Instruct.json new file mode 100644 index 000000000..35db0f90d --- /dev/null +++ b/torchchat/model_params/Granite-3.0-8B-Instruct.json @@ -0,0 +1,20 @@ +{ + "attention_multiplier": 0.0078125, + "embedding_multiplier": 12.0, + "dim": 4096, + "block_size": 12800, + "hidden_dim": 12800, + "logits_scaling": 16.0, + "n_heads": 32, + "n_layers": 40, + "n_local_heads": 8, + "residual_multiplier": 0.22, + "norm_eps": 1e-05, + "rope_base": 10000, + "tie_word_embeddings": true, + "vocab_size": 49155, + "use_hf_tokenizer": true, + "tokenizer_prepend_bos": false, + "attention_bias": false, + "feed_forward_bias": false +} diff --git a/torchchat/model_params/Granite-3.1-2B-Instruct.json b/torchchat/model_params/Granite-3.1-2B-Instruct.json new file mode 100644 index 
000000000..1e82036ab --- /dev/null +++ b/torchchat/model_params/Granite-3.1-2B-Instruct.json @@ -0,0 +1,20 @@ +{ + "attention_multiplier": 0.015625, + "embedding_multiplier": 12.0, + "dim": 2048, + "block_size": 8192, + "hidden_dim": 8192, + "logits_scaling": 8.0, + "n_heads": 32, + "n_layers": 40, + "n_local_heads": 8, + "residual_multiplier": 0.22, + "norm_eps": 1e-05, + "rope_base": 5000000.0, + "tie_word_embeddings": true, + "vocab_size": 49155, + "use_hf_tokenizer": true, + "tokenizer_prepend_bos": false, + "attention_bias": false, + "feed_forward_bias": false +} diff --git a/torchchat/model_params/Granite-3.1-8B-Instruct.json b/torchchat/model_params/Granite-3.1-8B-Instruct.json new file mode 100644 index 000000000..646340580 --- /dev/null +++ b/torchchat/model_params/Granite-3.1-8B-Instruct.json @@ -0,0 +1,20 @@ +{ + "attention_multiplier": 0.0078125, + "embedding_multiplier": 12.0, + "dim": 4096, + "block_size": 12800, + "hidden_dim": 12800, + "logits_scaling": 16.0, + "n_heads": 32, + "n_layers": 40, + "n_local_heads": 8, + "residual_multiplier": 0.22, + "norm_eps": 1e-05, + "rope_base": 10000000.0, + "tie_word_embeddings": true, + "vocab_size": 49155, + "use_hf_tokenizer": true, + "tokenizer_prepend_bos": false, + "attention_bias": false, + "feed_forward_bias": false +} From a325191ceb8b02477ef0920fddb4db0d1aff0b35 Mon Sep 17 00:00:00 2001 From: Michael Gschwind <61328285+mikekgfb@users.noreply.github.com> Date: Sun, 22 Dec 2024 17:12:21 -0800 Subject: [PATCH 46/83] Fix typo in quantize.py (#1434) Typos --- torchchat/utils/quantize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchchat/utils/quantize.py b/torchchat/utils/quantize.py index b1dcf25f8..f1ebf2902 100644 --- a/torchchat/utils/quantize.py +++ b/torchchat/utils/quantize.py @@ -844,7 +844,7 @@ def quantized_model(self) -> nn.Module: torch.ops.load_library(libpath) print("Loaded torchao mps ops.") except Exception as e: - print("Unabled to load torchao mps ops library.") + print("Unable to load torchao mps ops library.") except Exception as e: print("Unabled to import torchao experimental quant_api with error: ", e) From 86efcd381ff4490be80c26734ba1d17fcce4137b Mon Sep 17 00:00:00 2001 From: Michael Gschwind <61328285+mikekgfb@users.noreply.github.com> Date: Sun, 22 Dec 2024 22:49:09 -0800 Subject: [PATCH 47/83] Update sh -> bash in quantization.md (#1437) Resolve one instance of #1436 where we say sh but mean bash (sh is not bash on every system). --- docs/quantization.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/quantization.md b/docs/quantization.md index 08086d8d1..704a7ed6a 100644 --- a/docs/quantization.md +++ b/docs/quantization.md @@ -142,7 +142,7 @@ To use linear:a8wxdq and embedding:wx, you must set up the torchao experimental From the torchchat root directory, run ``` -sh torchchat/utils/scripts/build_torchao_ops.sh +bash torchchat/utils/scripts/build_torchao_ops.sh ``` This should take about 10 seconds to complete. @@ -150,14 +150,14 @@ This should take about 10 seconds to complete. Note: if you want to use the new kernels in the AOTI and C++ runners, you must pass the flag link_torchao_ops when running the scripts the build the runners. 
``` -sh torchchat/utils/scripts/build_native.sh aoti link_torchao_ops +bash torchchat/utils/scripts/build_native.sh aoti link_torchao_ops ``` ``` -sh torchchat/utils/scripts/build_native.sh et link_torchao_ops +bash torchchat/utils/scripts/build_native.sh et link_torchao_ops ``` -Note before running `sh torchchat/utils/scripts/build_native.sh et link_torchao_ops`, you must first install executorch with `sh torchchat/utils/scripts/install_et.sh` if you have not done so already. +Note before running `bash torchchat/utils/scripts/build_native.sh et link_torchao_ops`, you must first install executorch with `bash torchchat/utils/scripts/install_et.sh` if you have not done so already. ### Examples @@ -212,7 +212,7 @@ Currently, torchchat can only run them on Eager mode. From the torchchat root directory, run ``` -sh torchchat/utils/scripts/build_torchao_ops.sh mps +bash torchchat/utils/scripts/build_torchao_ops.sh mps ``` ### Examples From a1ba6a1cb0d04c314fd23475bf16371effbca7e2 Mon Sep 17 00:00:00 2001 From: Michael Gschwind <61328285+mikekgfb@users.noreply.github.com> Date: Mon, 23 Dec 2024 08:49:03 -0800 Subject: [PATCH 48/83] Output explicit selection of /bin/bash as interpreter for test scripts generated from .md with updown.py (#1421) Output explicit selection of /bin/bash as interpreter for generated script. Co-authored-by: Jack-Khuu --- torchchat/utils/scripts/updown.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/torchchat/utils/scripts/updown.py b/torchchat/utils/scripts/updown.py index 86ebf803f..306e5855b 100644 --- a/torchchat/utils/scripts/updown.py +++ b/torchchat/utils/scripts/updown.py @@ -267,6 +267,8 @@ def updown_processor( lines = file.readlines() print_flag = False + # Use bash; set it to fail on the first failing command + output("#! 
/bin/bash", replace_list=None, suppress_list=None) output("set -eou pipefail", replace_list=None, suppress_list=None) if create_sections: From 490ad3921394b95a3bf107e7156c6261196b2a22 Mon Sep 17 00:00:00 2001 From: Matthias Reso <13337103+mreso@users.noreply.github.com> Date: Wed, 25 Dec 2024 12:30:08 -0800 Subject: [PATCH 49/83] Fix how stream flag is read from request (#1441) --- torchchat/usages/openai_api.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/torchchat/usages/openai_api.py b/torchchat/usages/openai_api.py index b67cd0eac..0d1d3dce7 100644 --- a/torchchat/usages/openai_api.py +++ b/torchchat/usages/openai_api.py @@ -180,7 +180,10 @@ class CompletionRequest: user: Optional[str] = None # unimplemented def __post_init__(self): - self.stream = bool(self.stream) + if isinstance(self.stream, str): + self.stream = self.stream.lower() != "false" + else: + self.stream = bool(self.stream) @dataclass From b95074bb69ebb650c656b908d3b99669ce39c51c Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Thu, 2 Jan 2025 19:32:35 -0800 Subject: [PATCH 50/83] [retry] Use pytorch-labs/tokenizers and remove tokenizer/ (#1401) (#1443) * [retry] Use pytorch-labs/tokenizers and remove tokenizer/ (#1401) Summary: Retry of #1401 Test Plan: Re-run the repro command in #1413: ``` python3 torchchat.py generate llama3.2-1b-base --prompt "write me a story about a boy and his bear" ``` Reviewers: Subscribers: Tasks: Tags: * Use latest commit Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags: * Try enabling _GLIBCXX_USE_CXX11_ABI Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags: * Fix PUBLIC issue Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags: --- .github/workflows/pull.yml | 4 +- .gitmodules | 12 +- CMakeLists.txt | 9 +- runner/run.cpp | 517 +++++++++++------------- runner/third-party/tokenizers | 1 + tokenizer/CMakeLists.txt | 29 -- tokenizer/base64.h | 187 --------- tokenizer/sentencepiece.cpp | 125 ------ tokenizer/third-party/abseil-cpp | 1 - tokenizer/third-party/re2 | 1 - tokenizer/third-party/sentencepiece | 1 - tokenizer/tiktoken.cpp | 390 ------------------ tokenizer/tokenizer.h | 147 ------- torchchat/utils/scripts/build_native.sh | 4 +- 14 files changed, 245 insertions(+), 1183 deletions(-) create mode 160000 runner/third-party/tokenizers delete mode 100644 tokenizer/CMakeLists.txt delete mode 100644 tokenizer/base64.h delete mode 100644 tokenizer/sentencepiece.cpp delete mode 160000 tokenizer/third-party/abseil-cpp delete mode 160000 tokenizer/third-party/re2 delete mode 160000 tokenizer/third-party/sentencepiece delete mode 100644 tokenizer/tiktoken.cpp delete mode 100644 tokenizer/tokenizer.h diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 623b0e80f..670c0205a 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -922,7 +922,7 @@ jobs: path: | ./et-build ./torchchat/utils/scripts - key: et-build-${{runner.os}}-${{runner.arch}}-${{env.et-git-hash}}-${{ hashFiles('**/install_et.sh') }} + key: et-build-${{runner.os}}-${{runner.arch}}-${{env.et-git-hash}}-${{ hashFiles('**/install_et.sh', '**/build_native.sh') }} - if: ${{ steps.install-et.outputs.cache-hit != 'true' }} continue-on-error: true run: | @@ -1033,7 +1033,7 @@ jobs: # Pull submodules (re2, abseil) for Tiktoken git submodule sync - git submodule update --init + git submodule update --init --recursive ./runner/build_android.sh echo "Tests complete." 
diff --git a/.gitmodules b/.gitmodules index 7681823df..76bc1b9fd 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,9 +1,3 @@ -[submodule "tokenizer/third-party/abseil-cpp"] - path = tokenizer/third-party/abseil-cpp - url = https://github.com/abseil/abseil-cpp.git -[submodule "tokenizer/third-party/re2"] - path = tokenizer/third-party/re2 - url = https://github.com/google/re2.git -[submodule "tokenizer/third-party/sentencepiece"] - path = tokenizer/third-party/sentencepiece - url = https://github.com/google/sentencepiece.git +[submodule "runner/third-party/tokenizers"] + path = runner/third-party/tokenizers + url = https://github.com/pytorch-labs/tokenizers diff --git a/CMakeLists.txt b/CMakeLists.txt index 61fd4d5a6..e004dbfcb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,18 +7,21 @@ ELSE() ENDIF() project(Torchchat) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-attributes") # include tokenizer -add_subdirectory(tokenizer) +add_subdirectory(runner/third-party/tokenizers) # include et_run executable include(runner/et.cmake) if(TARGET et_run) - target_link_libraries(et_run PUBLIC tokenizer microkernels-prod) + target_link_libraries(et_run PUBLIC tokenizers microkernels-prod) + target_include_directories(et_run PUBLIC runner/third-party/tokenizers/include) endif() # include aoti_run executable include(runner/aoti.cmake) if(TARGET aoti_run) - target_link_libraries(aoti_run tokenizer) + target_link_libraries(aoti_run tokenizers) + target_include_directories(aoti_run PUBLIC runner/third-party/tokenizers/include) endif() diff --git a/runner/run.cpp b/runner/run.cpp index abfbb4584..f2b8e8e6b 100644 --- a/runner/run.cpp +++ b/runner/run.cpp @@ -7,20 +7,21 @@ LICENSE file in the root directory of this source tree. */ /* Inference for Llama-2 Transformer model in pure C++ */ +#include "sentencepiece.h" +#include "tiktoken.h" +#include +#include +#include +#include #include +#include #include #include #include #include #include -#include -#include -#include -#include -#include -#include #include - +#include #ifdef DEBUG #include #include @@ -47,13 +48,25 @@ torch::Device aoti_device(torch::kCPU); #endif using exec_aten::ScalarType; -using torch::executor::EValue; -using executorch::extension::TensorPtr; using executorch::extension::make_tensor_ptr; +using executorch::extension::TensorPtr; +using torch::executor::EValue; using torch::executor::Module; using torch::executor::Result; #endif +using tokenizers::SPTokenizer; +using tokenizers::Tiktoken; +using tokenizers::Tokenizer; + +#define UNWRAP(x) \ + ({ \ + if (!(x).ok()) { \ + fprintf(stderr, "Got error code % " PRIu32, x.error()); \ + exit(EXIT_FAILURE); \ + } \ + std::move(x.get()); \ + }) // ---------------------------------------------------------------------------- // Transformer model @@ -65,56 +78,56 @@ enum ModelType { ModelType get_model_type(int model_int) { switch (model_int) { - case 2: - return LLAMA2_MODEL; - break; - case 3: - return LLAMA3_MODEL; - break; - default: - return UNKNOWN_MODEL; + case 2: + return LLAMA2_MODEL; + break; + case 3: + return LLAMA3_MODEL; + break; + default: + return UNKNOWN_MODEL; } } typedef struct { int vocab_size; // vocabulary size, usually 256 (byte-level) - int seq_len; // max sequence length + int seq_len; // max sequence length } Config; typedef struct { - float* logits; // output logits - int64_t* toks; // tokens seen so far; no kv-cache :( + float *logits; // output logits + int64_t *toks; // tokens seen so far; no kv-cache :( } RunState; typedef struct { - Config config; // the 
hyperparameters of the architecture (the blueprint) + Config config; // the hyperparameters of the architecture (the blueprint) RunState state; // buffers for the "wave" of activations in the forward pass #ifdef __AOTI_MODEL__ - torch::inductor::AOTIModelPackageLoader* runner; + torch::inductor::AOTIModelPackageLoader *runner; #else // __ET_MODEL__ - Module* runner; + Module *runner; #endif } Transformer; -void malloc_run_state(RunState* s, Config* p) { +void malloc_run_state(RunState *s, Config *p) { // we calloc instead of malloc to keep valgrind happy - s->logits = (float*)calloc(p->vocab_size, sizeof(float)); - s->toks = (int64_t*)calloc(p->seq_len, sizeof(int64_t)); + s->logits = (float *)calloc(p->vocab_size, sizeof(float)); + s->toks = (int64_t *)calloc(p->seq_len, sizeof(int64_t)); if (!s->logits || !s->toks) { fprintf(stderr, "malloc failed!\n"); exit(EXIT_FAILURE); } } -void free_run_state(RunState* s) { +void free_run_state(RunState *s) { free(s->logits); free(s->toks); } -void read_checkpoint(char* checkpoint, Config* config) { - FILE* file = fopen(checkpoint, "rb"); +void read_checkpoint(char *checkpoint, Config *config) { + FILE *file = fopen(checkpoint, "rb"); if (!file) { fprintf(stderr, "Couldn't open file %s\n", checkpoint); exit(EXIT_FAILURE); @@ -128,11 +141,8 @@ void read_checkpoint(char* checkpoint, Config* config) { config->vocab_size = abs(config->vocab_size); } -void build_transformer( - Transformer* t, - char* model_path, - int vocab_size, - int seq_len) { +void build_transformer(Transformer *t, char *model_path, int vocab_size, + int seq_len) { // read in the Config and the Weights from the model // read_checkpoint(model_path, &t->config); // allocate the RunState buffers @@ -142,7 +152,9 @@ void build_transformer( #ifdef __AOTI_MODEL__ t->runner = new torch::inductor::AOTIModelPackageLoader(model_path); - aoti_device = t->runner->get_metadata()["AOTI_DEVICE_KEY"] == "cpu" ? torch::Device(torch::kCPU) : torch::Device(torch::kCUDA); + aoti_device = t->runner->get_metadata()["AOTI_DEVICE_KEY"] == "cpu" + ? 
torch::Device(torch::kCPU) + : torch::Device(torch::kCUDA); #else //__ET_MODEL__ t->runner = new Module( /* path to PTE model */ model_path, @@ -150,7 +162,7 @@ void build_transformer( #endif } -void free_transformer(Transformer* t) { +void free_transformer(Transformer *t) { // free the RunState buffers free_run_state(&t->state); delete t->runner; @@ -159,7 +171,7 @@ void free_transformer(Transformer* t) { // ---------------------------------------------------------------------------- // neural net blocks; the dynamics of the Transformer -void softmax(float* x, int size) { +void softmax(float *x, int size) { // find max value (for numerical stability) float max_val = x[0]; for (int i = 1; i < size; i++) { @@ -179,9 +191,9 @@ void softmax(float* x, int size) { } } -float* forward(Transformer* transformer, int token, int pos) { - Config* p = &transformer->config; - RunState* s = &transformer->state; +float *forward(Transformer *transformer, int token, int pos) { + Config *p = &transformer->config; + RunState *s = &transformer->state; s->toks[pos] = token; long token_buffer[1] = {token}; long pos_buffer[1] = {pos}; @@ -194,8 +206,8 @@ float* forward(Transformer* transformer, int token, int pos) { torch::Tensor token_tensor = torch::from_blob(token_buffer, {1, 1}, torch::kLong); torch::Tensor pos_tensor = torch::from_blob(pos_buffer, {1}, torch::kLong); - std::vector inputs{ - token_tensor.to(aoti_device), pos_tensor.to(aoti_device)}; + std::vector inputs{token_tensor.to(aoti_device), + pos_tensor.to(aoti_device)}; torch::Tensor result = transformer->runner->run(inputs)[0] .to(torch::dtype(torch::kFloat32)) @@ -204,7 +216,8 @@ float* forward(Transformer* transformer, int token, int pos) { memcpy(s->logits, logits, p->vocab_size * sizeof(float)); #else // __ET_MODEL__ TensorPtr pos_managed = make_tensor_ptr({1}, pos_buffer, ScalarType::Long); - TensorPtr tokens_managed = make_tensor_ptr({1, 1}, token_buffer, ScalarType::Long); + TensorPtr tokens_managed = + make_tensor_ptr({1, 1}, token_buffer, ScalarType::Long); std::vector inputs; auto tmp1 = EValue(tokens_managed); auto tmp2 = EValue(pos_managed); @@ -221,17 +234,12 @@ float* forward(Transformer* transformer, int token, int pos) { // HACK: the rest of this runner assumes that logits must be float, // so we simply convert them rather than plumbing // templating/switch-on-type through the rest of this file. 
- const auto& result_tensor = result[0].toTensor(); + const auto &result_tensor = result[0].toTensor(); ET_SWITCH_REALHBBF16_TYPES( - result_tensor.scalar_type(), - unused, - "forward", - CTYPE, - [&]() { - const CTYPE* logits = result_tensor.const_data_ptr(); - std::transform(logits, logits + p->vocab_size, s->logits, [](auto x) { - return static_cast(x); - }); + result_tensor.scalar_type(), unused, "forward", CTYPE, [&]() { + const CTYPE *logits = result_tensor.const_data_ptr(); + std::transform(logits, logits + p->vocab_size, s->logits, + [](auto x) { return static_cast(x); }); }); #endif @@ -249,13 +257,13 @@ typedef struct { typedef struct { int vocab_size; - ProbIndex* probindex; // buffer used in top-p sampling + ProbIndex *probindex; // buffer used in top-p sampling float temperature; float topp; unsigned long long rng_state; } Sampler; -int sample_argmax(float* probabilities, int n) { +int sample_argmax(float *probabilities, int n) { // return the index that has the highest probability int max_i = 0; float max_p = probabilities[0]; @@ -268,7 +276,7 @@ int sample_argmax(float* probabilities, int n) { return max_i; } -int sample_mult(float* probabilities, int n, float coin) { +int sample_mult(float *probabilities, int n, float coin) { // sample index from probabilities (they must sum to 1!) // coin is a random number in [0, 1), usually from random_f32() float cdf = 0.0f; @@ -281,9 +289,9 @@ int sample_mult(float* probabilities, int n, float coin) { return n - 1; // in case of rounding errors } -int compare(const void* a, const void* b) { - ProbIndex* a_ = (ProbIndex*)a; - ProbIndex* b_ = (ProbIndex*)b; +int compare(const void *a, const void *b) { + ProbIndex *a_ = (ProbIndex *)a; + ProbIndex *b_ = (ProbIndex *)b; if (a_->prob > b_->prob) return -1; if (a_->prob < b_->prob) @@ -291,12 +299,8 @@ int compare(const void* a, const void* b) { return 0; } -int sample_topp( - float* probabilities, - int n, - float topp, - ProbIndex* probindex, - float coin) { +int sample_topp(float *probabilities, int n, float topp, ProbIndex *probindex, + float coin) { // top-p sampling (or "nucleus sampling") samples from the smallest set of // tokens that exceed probability topp. This way we never sample tokens that // have very low probabilities and are less likely to go "off the rails". 
@@ -339,37 +343,31 @@ int sample_topp( return probindex[last_idx].index; // in case of rounding errors } -void build_sampler( - Sampler* sampler, - int vocab_size, - float temperature, - float topp, - unsigned long long rng_seed) { +void build_sampler(Sampler *sampler, int vocab_size, float temperature, + float topp, unsigned long long rng_seed) { sampler->vocab_size = vocab_size; sampler->temperature = temperature; sampler->topp = topp; sampler->rng_state = rng_seed; // buffer only used with nucleus sampling; may not need but it's ~small sampler->probindex = - (ProbIndex*)malloc(sampler->vocab_size * sizeof(ProbIndex)); + (ProbIndex *)malloc(sampler->vocab_size * sizeof(ProbIndex)); } -void free_sampler(Sampler* sampler) { - free(sampler->probindex); -} +void free_sampler(Sampler *sampler) { free(sampler->probindex); } -unsigned int random_u32(unsigned long long* state) { +unsigned int random_u32(unsigned long long *state) { // xorshift rng: https://en.wikipedia.org/wiki/Xorshift#xorshift.2A *state ^= *state >> 12; *state ^= *state << 25; *state ^= *state >> 27; return (*state * 0x2545F4914F6CDD1Dull) >> 32; } -float random_f32(unsigned long long* state) { // random float32 in [0,1) +float random_f32(unsigned long long *state) { // random float32 in [0,1) return (random_u32(state) >> 8) / 16777216.0f; } -int sample(Sampler* sampler, float* logits) { +int sample(Sampler *sampler, float *logits) { // sample the token given the logits and some hyperparameters int next; if (sampler->temperature == 0.0f) { @@ -390,39 +388,37 @@ int sample(Sampler* sampler, float* logits) { next = sample_mult(logits, sampler->vocab_size, coin); } else { // top-p (nucleus) sampling, clamping the least likely tokens to zero - next = sample_topp( - logits, sampler->vocab_size, sampler->topp, sampler->probindex, coin); + next = sample_topp(logits, sampler->vocab_size, sampler->topp, + sampler->probindex, coin); } } return next; } -Tokenizer* build_tokenizer(const char* tokenizer_path, ModelType model_type) { - Tokenizer* tokenizer = NULL; +Tokenizer *build_tokenizer(const char *tokenizer_path, ModelType model_type) { + Tokenizer *tokenizer = NULL; switch (model_type) { - case LLAMA2_MODEL: - tokenizer = new SPTokenizer(); - tokenizer->load(tokenizer_path); - break; - case LLAMA3_MODEL: - tokenizer = new Tiktoken(); - tokenizer->load(tokenizer_path); - break; - default: - fprintf(stderr, "No tokenizer defined for model type %d.\n", model_type); - exit(EXIT_FAILURE); + case LLAMA2_MODEL: + tokenizer = new SPTokenizer(); + tokenizer->load(tokenizer_path); + break; + case LLAMA3_MODEL: + tokenizer = new Tiktoken(); + tokenizer->load(tokenizer_path); + break; + default: + fprintf(stderr, "No tokenizer defined for model type %d.\n", model_type); + exit(EXIT_FAILURE); } return tokenizer; } -void free_tokenizer(Tokenizer* tokenizer) { - delete tokenizer; -} +void free_tokenizer(Tokenizer *tokenizer) { delete tokenizer; } // ---------------------------------------------------------------------------- // utilities: time -void safe_printf(const char* piece) { +void safe_printf(const char *piece) { // piece might be a raw byte token, and we only want to print printable chars // or whitespace because some of the other bytes can be various control codes, // backspace, etc. @@ -454,21 +450,18 @@ long time_in_ms() { // Prints decoded tokens generated from the transformer. 
// The first token is not printed and is assumed to be a BOS or other similar // token -unsigned generate_from_prompt_tokens( - Transformer* transformer, - Tokenizer* tokenizer, - Sampler* sampler, - const std::vector& prompt_tokens, - unsigned pos, - const std::vector& stop_tokens, - int stop_pos, - bool print_prompt, - bool print_tok_per_sec) { +unsigned generate_from_prompt_tokens(Transformer *transformer, + Tokenizer *tokenizer, Sampler *sampler, + const std::vector &prompt_tokens, + unsigned pos, + const std::vector &stop_tokens, + int stop_pos, bool print_prompt, + bool print_tok_per_sec) { if (prompt_tokens.size() == 0) { return pos; } - uint64_t next; // will store the next token in the sequence + uint64_t next; // will store the next token in the sequence uint64_t token; // stores the current token to feed into the transformer bool done_with_prompt; // whether we are done processing prompt @@ -486,7 +479,7 @@ unsigned generate_from_prompt_tokens( if (pos_in_prompt < prompt_tokens.size()) { // Token comes from prompt token = prompt_tokens[pos_in_prompt++]; - float* logits = forward(transformer, token, pos); + float *logits = forward(transformer, token, pos); // Next token is either from prompt or if on last // prompt token, next is sampled @@ -498,29 +491,27 @@ unsigned generate_from_prompt_tokens( } else { // Token comes from next sampled from previous round. token = next; - float* logits = forward(transformer, token, pos); + float *logits = forward(transformer, token, pos); next = sample(sampler, logits); } done_with_prompt = (pos_in_prompt >= prompt_tokens.size()); // we terminate on finding the stop_token if we are done processing the // prompt (stop_tokens in the prompt do not terminate the loop) - if (done_with_prompt && - (std::find(stop_tokens.begin(), stop_tokens.end(), token) != - stop_tokens.end())) { + if (done_with_prompt && (std::find(stop_tokens.begin(), stop_tokens.end(), + token) != stop_tokens.end())) { found_stop_token = true; } // We print next in each iteration of the loop, not token if (!found_stop_token && (print_prompt || done_with_prompt)) { // The stop_token is printed as newline - bool next_is_stop = - std::find(stop_tokens.begin(), stop_tokens.end(), next) != - stop_tokens.end(); + bool next_is_stop = std::find(stop_tokens.begin(), stop_tokens.end(), + next) != stop_tokens.end(); if (next_is_stop) { printf("\n"); } else { - std::string piece = tokenizer->decode(token, next); + std::string piece = UNWRAP(tokenizer->decode(token, next)); safe_printf(piece.c_str()); // same as printf("%s", piece), but skips // "unsafe" bytes fflush(stdout); @@ -538,23 +529,16 @@ unsigned generate_from_prompt_tokens( // iteration) if (print_tok_per_sec && pos > 1) { long end = time_in_ms(); - fprintf( - stderr, - "\n\nachieved tok/s: %f\n", - (pos - 1) / (double)(end - start) * 1000); + fprintf(stderr, "\n\nachieved tok/s: %f\n", + (pos - 1) / (double)(end - start) * 1000); } return pos; } -void generate( - Transformer* transformer, - Tokenizer* tokenizer, - Sampler* sampler, - const char* prompt, - int steps, - ModelType model_type) { - const char* default_prompt = "Once upon a time"; +void generate(Transformer *transformer, Tokenizer *tokenizer, Sampler *sampler, + const char *prompt, int steps, ModelType model_type) { + const char *default_prompt = "Once upon a time"; if (prompt == NULL) { prompt = default_prompt; } @@ -566,33 +550,30 @@ void generate( std::vector prompt_tokens; std::vector stop_tokens; switch (model_type) { - case LLAMA2_MODEL: - prompt_tokens = 
tokenizer->encode(prompt, 1, 0); - stop_tokens.push_back(tokenizer->eos_tok()); - break; - case LLAMA3_MODEL: - prompt_tokens = tokenizer->encode(prompt, 1, 0); - stop_tokens.push_back(tokenizer->encode("<|end_of_text|>", 0, 0)[0]); - stop_tokens.push_back(tokenizer->encode("<|eot_id|>", 0, 0)[0]); - break; - default: - fprintf(stderr, "Generate does not support model type %d.\n", model_type); - exit(EXIT_FAILURE); - } - - generate_from_prompt_tokens( - transformer, - tokenizer, - sampler, - prompt_tokens, - /*pos=*/0, - /*stop_tokens=*/stop_tokens, - /*stop_pos=*/steps - 1, - /*print_prompt=*/true, - /*print_tok_per_sec=*/true); + case LLAMA2_MODEL: + prompt_tokens = UNWRAP(tokenizer->encode(prompt, 1, 0)); + stop_tokens.push_back(tokenizer->eos_tok()); + break; + case LLAMA3_MODEL: + prompt_tokens = UNWRAP(tokenizer->encode(prompt, 1, 0)); + stop_tokens.push_back( + UNWRAP(tokenizer->encode("<|end_of_text|>", 0, 0))[0]); + stop_tokens.push_back(UNWRAP(tokenizer->encode("<|eot_id|>", 0, 0))[0]); + break; + default: + fprintf(stderr, "Generate does not support model type %d.\n", model_type); + exit(EXIT_FAILURE); + } + + generate_from_prompt_tokens(transformer, tokenizer, sampler, prompt_tokens, + /*pos=*/0, + /*stop_tokens=*/stop_tokens, + /*stop_pos=*/steps - 1, + /*print_prompt=*/true, + /*print_tok_per_sec=*/true); } -void read_stdin(const char* guide, char* buffer, size_t bufsize) { +void read_stdin(const char *guide, char *buffer, size_t bufsize) { // read a line from stdin, up to but not including \n printf("%s", guide); if (fgets(buffer, bufsize, stdin) != NULL) { @@ -609,11 +590,10 @@ void read_stdin(const char* guide, char* buffer, size_t bufsize) { // python reference and that seemed ok, but this was not thoroughly tested and // is not safely implemented, it's more a proof of concept atm. 
-std::vector get_initial_prompt_tokens( - const char* cli_system_prompt, - const char* cli_user_prompt, - Tokenizer* tokenizer, - ModelType model_type) { +std::vector get_initial_prompt_tokens(const char *cli_system_prompt, + const char *cli_user_prompt, + Tokenizer *tokenizer, + ModelType model_type) { char system_prompt[512]; char user_prompt[512]; char rendered_prompt[512 * 2 + 200]; // the prompt template is ~170 @@ -622,10 +602,8 @@ std::vector get_initial_prompt_tokens( if (cli_system_prompt != NULL) { strcpy(system_prompt, cli_system_prompt); } else { - read_stdin( - "Enter system prompt (optional): ", - system_prompt, - sizeof(system_prompt)); + read_stdin("Enter system prompt (optional): ", system_prompt, + sizeof(system_prompt)); } if (cli_user_prompt != NULL) { @@ -637,48 +615,40 @@ std::vector get_initial_prompt_tokens( std::vector tokens; switch (model_type) { - case LLAMA2_MODEL: - if (system_prompt[0] != '\0') { - snprintf( - rendered_prompt, - sizeof(rendered_prompt) - 1, - "[INST] <>\n%s\n<>\n\n%s [/INST]", - system_prompt, - user_prompt); - } else { - snprintf( - rendered_prompt, - sizeof(rendered_prompt) - 1, - "[INST] %s [/INST]", - user_prompt); - } + case LLAMA2_MODEL: + if (system_prompt[0] != '\0') { + snprintf(rendered_prompt, sizeof(rendered_prompt) - 1, + "[INST] <>\n%s\n<>\n\n%s [/INST]", system_prompt, + user_prompt); + } else { + snprintf(rendered_prompt, sizeof(rendered_prompt) - 1, + "[INST] %s [/INST]", user_prompt); + } - // We need to add BOS token here and not in template because llama2 - // tokenizer does not pattern match special tokens - tokens = tokenizer->encode(rendered_prompt, 1, 0); - break; - - case LLAMA3_MODEL: - if (system_prompt[0] != '\0') { - snprintf( - rendered_prompt, - sizeof(rendered_prompt) - 1, - "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n%s<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n%s<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", - system_prompt, - user_prompt); - } else { - snprintf( - rendered_prompt, - sizeof(rendered_prompt) - 1, - "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n%s<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", - user_prompt); - } - tokens = tokenizer->encode(rendered_prompt, 0, 0); - break; + // We need to add BOS token here and not in template because llama2 + // tokenizer does not pattern match special tokens + tokens = UNWRAP(tokenizer->encode(rendered_prompt, 1, 0)); + break; + + case LLAMA3_MODEL: + if (system_prompt[0] != '\0') { + snprintf(rendered_prompt, sizeof(rendered_prompt) - 1, + "<|begin_of_text|><|start_header_id|>system<|end_header_id|>" + "\n\n%s<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n%s<" + "|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", + system_prompt, user_prompt); + } else { + snprintf(rendered_prompt, sizeof(rendered_prompt) - 1, + "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n%" + "s<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", + user_prompt); + } + tokens = UNWRAP(tokenizer->encode(rendered_prompt, 0, 0)); + break; - default: - fprintf(stderr, "Chat does not support model type %d.\n", model_type); - exit(EXIT_FAILURE); + default: + fprintf(stderr, "Chat does not support model type %d.\n", model_type); + exit(EXIT_FAILURE); } #ifdef DEBUG @@ -695,9 +665,8 @@ std::vector get_initial_prompt_tokens( return tokens; } -std::vector get_next_user_prompt_tokens( - Tokenizer* tokenizer, - ModelType model_type) { +std::vector get_next_user_prompt_tokens(Tokenizer 
*tokenizer, + ModelType model_type) { char user_prompt[512]; char rendered_prompt[512 + 150]; // the prompt template is ~100 characters. We // use 150 to be safe. @@ -706,30 +675,26 @@ std::vector get_next_user_prompt_tokens( std::vector tokens; switch (model_type) { - case LLAMA2_MODEL: - snprintf( - rendered_prompt, - sizeof(rendered_prompt) - 1, - "[INST] %s [/INST]", - user_prompt); - - // We need to add BOS token here and not in template because llama2 - // tokenizer does not pattern match special tokens - tokens = tokenizer->encode(rendered_prompt, /*bos*/ 1, /*eos*/ 0); - break; - - case LLAMA3_MODEL: - snprintf( - rendered_prompt, - sizeof(rendered_prompt) - 1, - "<|start_header_id|>user<|end_header_id|>\n\n%s<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", - user_prompt); - tokens = tokenizer->encode(rendered_prompt, 0, 0); - break; - - default: - fprintf(stderr, "Chat does not support model type %d.\n", model_type); - exit(EXIT_FAILURE); + case LLAMA2_MODEL: + snprintf(rendered_prompt, sizeof(rendered_prompt) - 1, "[INST] %s [/INST]", + user_prompt); + + // We need to add BOS token here and not in template because llama2 + // tokenizer does not pattern match special tokens + tokens = UNWRAP(tokenizer->encode(rendered_prompt, /*bos*/ 1, /*eos*/ 0)); + break; + + case LLAMA3_MODEL: + snprintf(rendered_prompt, sizeof(rendered_prompt) - 1, + "<|start_header_id|>user<|end_header_id|>\n\n%s<|eot_id|><|start_" + "header_id|>assistant<|end_header_id|>\n\n", + user_prompt); + tokens = UNWRAP(tokenizer->encode(rendered_prompt, 0, 0)); + break; + + default: + fprintf(stderr, "Chat does not support model type %d.\n", model_type); + exit(EXIT_FAILURE); } #ifdef DEBUG @@ -746,14 +711,9 @@ std::vector get_next_user_prompt_tokens( return tokens; } -void chat( - Transformer* transformer, - Tokenizer* tokenizer, - Sampler* sampler, - const char* cli_user_prompt, - const char* cli_system_prompt, - unsigned steps, - ModelType model_type) { +void chat(Transformer *transformer, Tokenizer *tokenizer, Sampler *sampler, + const char *cli_user_prompt, const char *cli_system_prompt, + unsigned steps, ModelType model_type) { if (steps == 0) { return; } @@ -761,16 +721,16 @@ void chat( uint64_t eot_token; std::vector prompt_tokens; switch (model_type) { - case LLAMA2_MODEL: - // llama2 uses EOS as EOT token - eot_token = tokenizer->eos_tok(); - break; - case LLAMA3_MODEL: - eot_token = tokenizer->encode("<|eot_id|>", 0, 0)[0]; - break; - default: - fprintf(stderr, "Chat does not support model type %d.\n", model_type); - exit(EXIT_FAILURE); + case LLAMA2_MODEL: + // llama2 uses EOS as EOT token + eot_token = tokenizer->eos_tok(); + break; + case LLAMA3_MODEL: + eot_token = UNWRAP(tokenizer->encode("<|eot_id|>", 0, 0))[0]; + break; + default: + fprintf(stderr, "Chat does not support model type %d.\n", model_type); + exit(EXIT_FAILURE); } std::vector stop_tokens{eot_token}; @@ -784,11 +744,7 @@ void chat( } printf("Assistant: "); pos = generate_from_prompt_tokens( - transformer, - tokenizer, - sampler, - prompt_tokens, - pos, + transformer, tokenizer, sampler, prompt_tokens, pos, /*stop_tokens=*/stop_tokens, /*stop_pos=*/steps - 1, // We could pass in -1 here if we do not want // the model to stop mid-reply @@ -803,46 +759,43 @@ void chat( void error_usage() { fprintf(stderr, "Usage: run [options]\n"); - fprintf( - stderr, "Example: run model.{so,pte} -n 256 -i \"Once upon a time\"\n"); + fprintf(stderr, + "Example: run model.{so,pte} -n 256 -i \"Once upon a time\"\n"); fprintf(stderr, 
"Options:\n"); fprintf(stderr, " -t temperature in [0,inf], default 1.0\n"); - fprintf( - stderr, - " -p p value in top-p (nucleus) sampling in [0,1], default 0.9\n"); + fprintf(stderr, " -p p value in top-p (nucleus) sampling in [0,1], " + "default 0.9\n"); fprintf(stderr, " -s random seed, default time(NULL)\n"); - fprintf( - stderr, - " -n number of steps to run for, default 256. 0 = max_seq_len\n"); + fprintf(stderr, " -n number of steps to run for, default 256. 0 = " + "max_seq_len\n"); fprintf(stderr, " -i input prompt\n"); fprintf(stderr, " -z path to tokenizer\n"); fprintf(stderr, " -m mode: generate|chat, default: generate\n"); fprintf(stderr, " -y (optional) system prompt in chat mode\n"); - fprintf( - stderr, - " -v (optional) vocab size, default is model-specific.\n"); - fprintf( - stderr, " -l (optional) llama version (2 or 3), default 2.\n"); + fprintf(stderr, + " -v (optional) vocab size, default is model-specific.\n"); + fprintf(stderr, + " -l (optional) llama version (2 or 3), default 2.\n"); fprintf( stderr, " -d (optional) device(CUDA or CPU) model was exported for\n"); exit(EXIT_FAILURE); } -int main(int argc, char* argv[]) { +int main(int argc, char *argv[]) { // default parameters - char* model_path = NULL; - char* tokenizer_path = NULL; + char *model_path = NULL; + char *tokenizer_path = NULL; float temperature = 1.0f; // 0.0 = greedy deterministic. 1.0 = original. don't set higher float topp = 0.9f; // top-p in nucleus sampling. 1.0 = off. 0.9 works well, // but slower - int steps = 128; // number of steps to run for - const char* prompt = NULL; // prompt string + int steps = 128; // number of steps to run for + const char *prompt = NULL; // prompt string unsigned long long rng_seed = 0; // seed rng with time by default - const char* mode = "generate"; // generate|chat - char* system_prompt = + const char *mode = "generate"; // generate|chat + char *system_prompt = NULL; // the (optional) system prompt to use in chat mode int vocab_size = -1; @@ -916,10 +869,8 @@ int main(int argc, char* argv[]) { ModelType model_type = get_model_type(llama_ver); if (model_type == UNKNOWN_MODEL) { - fprintf( - stderr, - "Unknown model type passed by -l argument. Received l=%d.", - llama_ver); + fprintf(stderr, "Unknown model type passed by -l argument. 
Received l=%d.", + llama_ver); error_usage(); } @@ -943,7 +894,7 @@ int main(int argc, char* argv[]) { if (steps < 0) steps = 0; - Tokenizer* tokenizer = build_tokenizer(tokenizer_path, model_type); + Tokenizer *tokenizer = build_tokenizer(tokenizer_path, model_type); // If no tokenizer path provided, get default for model_type if (vocab_size == -1) { @@ -959,14 +910,8 @@ int main(int argc, char* argv[]) { if (strcmp(mode, "generate") == 0) { generate(&transformer, tokenizer, &sampler, prompt, steps, model_type); } else if (strcmp(mode, "chat") == 0) { - chat( - &transformer, - tokenizer, - &sampler, - prompt, - system_prompt, - steps, - model_type); + chat(&transformer, tokenizer, &sampler, prompt, system_prompt, steps, + model_type); } else { fprintf(stderr, "unknown mode: %s\n", mode); error_usage(); diff --git a/runner/third-party/tokenizers b/runner/third-party/tokenizers new file mode 160000 index 000000000..3f536fc01 --- /dev/null +++ b/runner/third-party/tokenizers @@ -0,0 +1 @@ +Subproject commit 3f536fc0139f7987940f69de2aef58eec1794f6a diff --git a/tokenizer/CMakeLists.txt b/tokenizer/CMakeLists.txt deleted file mode 100644 index 39c20885d..000000000 --- a/tokenizer/CMakeLists.txt +++ /dev/null @@ -1,29 +0,0 @@ -cmake_minimum_required(VERSION 3.24) -set(CMAKE_CXX_STANDARD 17) -IF(DEFINED ENV{TORCHCHAT_ROOT}) - set(TORCHCHAT_ROOT $ENV{TORCHCHAT_ROOT}) -ELSE() - set(TORCHCHAT_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/..) -ENDIF() - -# build tokenizer library -add_library( - tokenizer - tokenizer.h - sentencepiece.cpp - tiktoken.cpp) - -target_include_directories(tokenizer PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} third-party/sentencepiece/src) - -# add RE2 as subdirectory -set(ABSL_ENABLE_INSTALL ON) -set(ABSL_PROPAGATE_CXX_STD ON) -set(_pic_flag -${CMAKE_POSITION_INDEPENDENT_CODE}) -set(CMAKE_POSITION_INDEPENDENT_CODE ON) -add_subdirectory(third-party/abseil-cpp) -add_subdirectory(third-party/re2) -add_subdirectory(third-party/sentencepiece) -set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag}) - -target_link_libraries(tokenizer PUBLIC re2::re2 sentencepiece-static) diff --git a/tokenizer/base64.h b/tokenizer/base64.h deleted file mode 100644 index 12b8703a8..000000000 --- a/tokenizer/base64.h +++ /dev/null @@ -1,187 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ -// @lint-ignore-every LICENSELINT -/************************************************************************** - Copyright (c) 2023 sewenew - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
- *************************************************************************/ - -#pragma once - -#include -#include -#include -#include - -namespace base64 { - -std::string decode(const std::string_view& input); - -namespace detail { - -constexpr uint32_t DECODE_TABLE[] = { - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255, - 255, 255, 63, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, - 255, 255, 255, 255, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, - 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 25, 255, 255, 255, 255, 255, 255, 26, 27, 28, 29, 30, 31, 32, 33, - 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, - 49, 50, 51, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255}; - -inline void validate(uint32_t v) { - if (v == 255) { - fprintf(stderr, "invalid char"); - exit(EXIT_FAILURE); - } -} - -inline void decode(const std::string_view& input, std::string& output) { - if (input.size() != 4) { - fprintf(stderr, "input length must be 4, got %zu", input.size()); - exit(EXIT_FAILURE); - } - - uint32_t val = 0; - - uint8_t c = input[0]; - auto v = DECODE_TABLE[c]; - validate(v); - val = v; - - c = input[1]; - v = DECODE_TABLE[c]; - validate(v); - val = (val << 6) | v; - - c = input[2]; - v = DECODE_TABLE[c]; - validate(v); - val = (val << 6) | v; - - c = input[3]; - v = DECODE_TABLE[c]; - validate(v); - val = (val << 6) | v; - - output.push_back(static_cast((val >> 16) & 0xFF)); - output.push_back(static_cast((val >> 8) & 0xFF)); - output.push_back(static_cast(val & 0xFF)); -} - -inline void decode_1_padding( - const std::string_view& input, - std::string& output) { - if (input.size() != 3) { - fprintf(stderr, "input length must be 3, got %zu", input.size()); - exit(EXIT_FAILURE); - } - - uint32_t val = 0; - - uint8_t c = input[0]; - auto v = DECODE_TABLE[c]; - validate(v); - val = v; - - c = input[1]; - v = DECODE_TABLE[c]; - validate(v); - val = (val << 6) | v; - - c = input[2]; - v = DECODE_TABLE[c]; - validate(v); - val = (val << 6) | v; - - output.push_back(static_cast((val >> 10) & 0xFF)); - output.push_back(static_cast((val >> 2) & 0xFF)); -} - -inline void decode_2_padding( - const std::string_view& input, - std::string& output) { - assert(input.size() == 2); - - uint32_t val = 0; - - uint8_t c = input[0]; - auto v = DECODE_TABLE[c]; - validate(v); - val = v; - - c = input[1]; - v = DECODE_TABLE[c]; - validate(v); - val = (val << 6) | v; - - output.push_back(static_cast((val >> 4) & 0xFF)); -} - -} // namespace detail - -inline std::string decode(const std::string_view& input) { - if (input.empty()) { - fprintf(stderr, "empty input"); - exit(EXIT_FAILURE); - } - - // Faster than `input.size() % 4`. 
- if ((input.size() & 3) != 0 || input.size() < 4) { - fprintf( - stderr, - "input length must be larger than 4 and is multiple of 4, got %zu", - input.size()); - exit(EXIT_FAILURE); - } - - std::string output; - output.reserve(input.size() / 4 * 3); - auto idx = 0U; - for (; idx < input.size() - 4; idx += 4) { - detail::decode(input.substr(idx, 4), output); - } - - // Last 4 bytes. Might contain paddings. - if (input[idx + 3] == '=') { - if (input[idx + 2] == '=') { - // Tow paddings. - detail::decode_2_padding(input.substr(idx, 2), output); - } else { - // One padding. - detail::decode_1_padding(input.substr(idx, 3), output); - } - } else { - // No padding. - detail::decode(input.substr(idx, 4), output); - } - - return output; -} -} // namespace base64 diff --git a/tokenizer/sentencepiece.cpp b/tokenizer/sentencepiece.cpp deleted file mode 100644 index 0cdfc7e30..000000000 --- a/tokenizer/sentencepiece.cpp +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -// sentencepiece tokenizer - -#include -#include -#include -#include -#include "absl/strings/str_replace.h" - -const char kSpaceSymbol[] = "\xe2\x96\x81"; - -SPTokenizer::SPTokenizer() - : Tokenizer(), - _processor(std::make_unique()) {} - -/** - * @brief Load the tokenizer from a file. The tokenizer file contains the - * vocabulary and scores. The format is: the first integer is the maximum - * token length, followed by a list of (word_len, word) pairs. Here we - * are reading all the vocabulary into memory and keep it sorted for fast - * lookup. - * - * @param tokenizer_path The path to the tokenizer file. - * @return void - */ -void SPTokenizer::load(const std::string& tokenizer_path) { - if (initialized_) { - fprintf(stderr, "Tokenizer already initialized.\n"); - return; - } - // read in the file - const auto status = _processor->Load(tokenizer_path); - if (!status.ok()) { - fprintf(stderr, "couldn't load %s\n. If this tokenizer artifact is for llama3, please pass `-l 3`.", tokenizer_path.c_str()); - exit(EXIT_FAILURE); - } - // load vocab_size, bos_tok, eos_tok - vocab_size_ = _processor->GetPieceSize(); - bos_tok_ = _processor->bos_id(); - eos_tok_ = _processor->eos_id(); - initialized_ = true; -} - -SPTokenizer::~SPTokenizer() {} - -/** - * @brief Decode a token into string. - * - * @param prev_token The previous token. - * @param token The current token. - * @return std::string A pointer to the string representation of the - * token. - */ -std::string SPTokenizer::decode(uint64_t prev_token, uint64_t token) { - if (!initialized_) { - fprintf(stderr, "Tokenizer not initialized\n"); - exit(EXIT_FAILURE); - } - // get rid of the control ids and - if (_processor->IsControl(token)) { - // NB: returning empty string doesn't work for some reason. It causes - // free(): invalid pointer error. - return " "; - } - - std::string result = - absl::StrReplaceAll(_processor->IdToPiece(token), {{kSpaceSymbol, " "}}); - - // following BOS token, sentencepiece decoder strips any leading - // whitespace - if (prev_token == bos_tok_ && result[0] == ' ') { - result = result.substr(1); - } - - // handle <0x0A> - result = absl::StrReplaceAll(result, {{"<0x0A>", "\n"}}); - - return result; -} - -/** - * @brief Encode a string into a sequence of tokens. - * - * @param text The string to be encoded. 
- * @param bos The number of BOS to prepend to the token list. - * @param eos The number of EOS to append to the token list. - * @return std::vector - */ -std::vector -SPTokenizer::encode(const std::string& text, int8_t bos, int8_t eos) { - if (!initialized_) { - fprintf(stderr, "Tokenizer not initialized\n"); - exit(EXIT_FAILURE); - } - // workaround a weird issue that text doesn't have correct size() - std::string input(text.c_str()); - // should we reserve memory? - std::vector res; - auto status = _processor->Encode(input, &res); - if (!status.ok()) { - fprintf(stderr, "couldn't encode %s\n", text.c_str()); - exit(EXIT_FAILURE); - } - - std::vector tokens; - for (auto i = 0; i < bos; ++i) { - tokens.push_back(bos_tok_); - } - - for (auto i = 0; i < res.size(); ++i) { - tokens.push_back(res[i]); - } - - for (auto i = 0; i < eos; ++i) { - tokens.push_back(eos_tok_); - } - return tokens; -} diff --git a/tokenizer/third-party/abseil-cpp b/tokenizer/third-party/abseil-cpp deleted file mode 160000 index 854193071..000000000 --- a/tokenizer/third-party/abseil-cpp +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 854193071498f330b71083d7e06a7cd18e02a4cc diff --git a/tokenizer/third-party/re2 b/tokenizer/third-party/re2 deleted file mode 160000 index ac82d4f62..000000000 --- a/tokenizer/third-party/re2 +++ /dev/null @@ -1 +0,0 @@ -Subproject commit ac82d4f628a2045d89964ae11c48403d3b091af1 diff --git a/tokenizer/third-party/sentencepiece b/tokenizer/third-party/sentencepiece deleted file mode 160000 index 7dcb54145..000000000 --- a/tokenizer/third-party/sentencepiece +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 7dcb541451b1862d73f473b3804ccf8f2a9e10f6 diff --git a/tokenizer/tiktoken.cpp b/tokenizer/tiktoken.cpp deleted file mode 100644 index 2f31f057a..000000000 --- a/tokenizer/tiktoken.cpp +++ /dev/null @@ -1,390 +0,0 @@ -// @lint-ignore-every LICENSELINT -/************************************************************************** - Copyright (c) 2023 sewenew - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
- *************************************************************************/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// ------------------------------Util start------------------------------------ - -static uint64_t _max_size() { - return std::numeric_limits::max(); -} - -static Re2UPtr _create_regex(const std::string& pattern) { - assert(!pattern.empty()); - - return std::make_unique("(" + pattern + ")"); -} - -static Re2UPtr _build_special_token_regex(const Encoder& special_encoder) { - std::string special_pattern; - for (const auto& ele : special_encoder) { - if (!special_pattern.empty()) { - special_pattern += "|"; - } - special_pattern += re2::RE2::QuoteMeta(ele.first); - } - - if (special_pattern.empty()) { - return nullptr; - } - - return _create_regex(special_pattern); -} - -static std::pair _parse(const std::string& line) { - auto pos = line.find(" "); - if (pos == std::string::npos) { - throw std::invalid_argument("invalid encoder line: " + line); - } - - auto token = base64::decode({line.data(), pos}); - uint64_t rank = 0; - try { - rank = std::stoul(line.substr(pos + 1)); - } catch (const std::exception&) { - throw std::invalid_argument("invalid encoder rank: " + line); - } - - return {std::move(token), rank}; -} - -static Encoder _load_encoder(const std::string& path) { - std::ifstream file(path); - if (!file) { - fprintf(stderr, "failed to open encoder file: %s\n", path.c_str()); - exit(EXIT_FAILURE); - } - - Encoder encoder; - std::string line; - while (std::getline(file, line)) { - auto [token, rank] = _parse(line); - - if (!encoder.emplace(std::move(token), rank).second) { - fprintf(stderr, "duplicate item: %s\n", line.c_str()); - } - } - return encoder; -} - -static Decoder _build_decoder(const Encoder& encoder) { - Decoder decoder; - for (const auto& [k, v] : encoder) { - decoder.emplace(v, k); - } - - if (encoder.size() != decoder.size()) { - fprintf(stderr, "duplicate items in encoder"); - exit(EXIT_FAILURE); - } - - return decoder; -} - -static std::vector _byte_pair_merge( - const std::string& piece, - const std::unordered_map& ranks, - std::function func) { - // This is a vector of (start, rank). - // The rank is of the byte pair starting at position start. - // The rank of the last item in the vector is not a valid value. - std::vector> parts; - parts.reserve(piece.size() + 1); - for (auto idx = 0U; idx < piece.size() + 1; ++idx) { - parts.emplace_back(idx, _max_size()); - } - - auto get_rank = [&piece, &ranks]( - const std::vector>& parts, - uint64_t start_idx, - uint64_t skip) -> std::optional { - if (start_idx + skip + 2 < parts.size()) { - auto s = parts[start_idx].first; - auto e = parts[start_idx + skip + 2].first; - auto key = piece.substr(s, e - s); - auto iter = ranks.find(key); - if (iter != ranks.end()) { - return iter->second; - } - } - return std::nullopt; - }; - - // We look up the ranks once in the beginning and iteratively update - // them during each merge, which reduces the number of rank lookups. - for (auto i = 0U; i < parts.size() - 2; ++i) { - auto rank = get_rank(parts, i, 0); - if (rank) { - // usize::MAX is a sentinel value and cannot be a valid rank - if (*rank == _max_size()) { - fprintf(stderr, "at %" PRIu32 " rank is too large\n", i); - } - parts[i].second = *rank; - } - } - - // If you have n parts and m merges, this does O(mn) work. - // We could do something with a heap and do O(m log n) work. 
- // It is important to consider that n is often small (<100), and as such - // the cache-locality benefits outweigh the algorithmic complexity downsides - // of the `parts` vector data structure above. - - // Note that we hash bytes, not token pairs. As long as we train BPE the way - // we currently do, this is equivalent. An easy way to break this would be - // to decouple merge priority from token index or to prevent specific token - // merges. - while (true) { - if (parts.size() == 1) { - break; - } - - // usize::MAX is a sentinel rank value allowing us to - // take the min more quickly - auto min_rank = std::make_pair(_max_size(), 0); - for (auto i = 0U; i < parts.size() - 1; ++i) { - auto rank = parts[i].second; - if (rank < min_rank.first) { - min_rank.first = rank; - min_rank.second = i; - } - } - - if (min_rank.first != _max_size()) { - auto i = min_rank.second; - - // NOTE: We are about to remove parts[i + 1]. We do not do it - // yet because there are cache-locality benefits to updating - // parts[i] and parts[i-1] before removing, which could thrash - // the cache. Thus, we update the rank calculation by skipping over - // parts[i + 1], by invoking `get_rank!` with `skip = 1`. - auto rank = get_rank(parts, i, 1); - if (rank) { - parts[i].second = *rank; - } else { - parts[i].second = _max_size(); - } - if (i > 0) { - rank = get_rank(parts, i - 1, 1); - if (rank) { - parts[i - 1].second = *rank; - } else { - parts[i - 1].second = _max_size(); - } - } - - parts.erase(parts.begin() + (i + 1)); - } else { - break; - } - } - std::vector out; - out.reserve(parts.size() - 1); - for (auto i = 0U; i < parts.size() - 1; ++i) { - auto s = parts[i].first; - auto e = parts[i + 1].first; - out.push_back(func(s, e)); - } - return out; -} - -static std::vector _byte_pair_encode( - const std::string& piece, - const Encoder& encoder) { - if (piece.size() == 1) { - auto iter = encoder.find(piece); - if (iter != encoder.end()) { - return std::vector({iter->second}); - } else { - // TODO: is it possible? - return {}; - } - } - - return _byte_pair_merge( - piece, encoder, [&piece, &encoder](uint64_t start, uint64_t stop) { - std::string key = piece.substr(start, stop - start); - auto iter = encoder.find(key); - if (iter != encoder.end()) { - return iter->second; - } else { - // TODO: what if key does not exist? Should we return `unknown`? - // assert(false); // ?? - return uint64_t(0); - } - }); -} -// ------------------------------Util end------------------------------------ -// -------------------------private method start------------------------------- - -template -std::pair, re2::StringPiece> -Tiktoken::_split_with_allowed_special_token( - re2::StringPiece& input, - const T& allowed_special) { - if (!_special_token_regex) { - return std::make_pair(std::nullopt, input); - } - - auto start = input.begin(); - std::string special; - while (true) { - if (!re2::RE2::FindAndConsume(&input, *_special_token_regex, &special)) { - // No special token. - break; - } - - if (allowed_special.count(special) == 1) { - // Found an allowed special token, split the text with it. 
- return std::make_pair( - special, - re2::StringPiece(start, input.begin() - start - special.size())); - } // else try to find the next special token - } - - return std::make_pair(std::nullopt, input); -} - -void Tiktoken::_encode( - re2::StringPiece& input, - std::vector& ret, - uint64_t& last_piece_token_len) { - std::string piece; - assert(_regex); - while (re2::RE2::FindAndConsume(&input, *_regex, &piece)) { - auto iter = _encoder.find(piece); - if (iter != _encoder.end()) { - last_piece_token_len = 1; - ret.push_back(iter->second); - continue; - } - auto tokens = _byte_pair_encode(piece, _encoder); - last_piece_token_len = tokens.size(); - ret.insert(ret.end(), tokens.begin(), tokens.end()); - } -} - -template -std::pair, uint64_t> Tiktoken::_encode_with_special_token( - const std::string& text, - const T& allowed_special) { - std::vector tokens; - uint64_t last_piece_token_len = 0; - re2::StringPiece input(text); - while (true) { - auto [special, sub_input] = - _split_with_allowed_special_token(input, allowed_special); - - _encode(sub_input, tokens, last_piece_token_len); - - if (special) { - uint64_t token = 0; - try { - token = _special_token_encoder.at(*special); - } catch (const std::out_of_range&) { - // Should never go here, since special pattern includes all special - // chars. - fprintf(stderr, "unknown special token: %s\n", special->c_str()); - exit(EXIT_FAILURE); - } - - tokens.push_back(token); - last_piece_token_len = 0; - } else { - break; - } - } - - // last_piece_token_len is how many tokens came from the last regex split. - // This is used for determining unstable tokens, since you can't merge - // across (stable) regex splits - return std::make_pair(tokens, last_piece_token_len); -} - -// -------------------------private method end------------------------------- -// -------------------------public method start------------------------------- - -Tiktoken::Tiktoken() : Tokenizer() {} - -void Tiktoken::load(const std::string& path) { - _encoder = _load_encoder(path); - _special_token_encoder = _get_special_tokens(_encoder.size()); - - _decoder = _build_decoder(_encoder); - _special_token_decoder = _build_decoder(_special_token_encoder); - - _regex = _create_regex(_pattern); - _special_token_regex = _build_special_token_regex(_special_token_encoder); - - // initialize vocab_size, bos_tok, eos_tok - vocab_size_ = _encoder.size() + _special_token_encoder.size(); - bos_tok_ = _encoder.size(); // hardcoded (see _get_special_tokens) - eos_tok_ = _encoder.size() + 1; // hardcoded (see _get_special_tokens) - initialized_ = true; -} - -std::vector -Tiktoken::encode(const std::string& text, int8_t bos, int8_t eos) { - if (!initialized_) { - exit(EXIT_FAILURE); - } - auto res = _encode_with_special_token(text, _special_token_encoder).first; - for (auto i = 0; i < bos; ++i) { - res.insert(res.begin(), bos_tok_); - } - for (auto i = 0; i < eos; ++i) { - res.push_back(eos_tok_); - } - return res; -} - -std::string Tiktoken::decode(uint64_t prev, uint64_t cur) { - (void)prev; - if (!initialized_) { - exit(EXIT_FAILURE); - } - std::string ret; - - std::string token_bytes; - auto iter = _decoder.find(cur); - if (iter != _decoder.end()) { - token_bytes = iter->second; - } else { - iter = _special_token_decoder.find(cur); - if (iter != _special_token_decoder.end()) { - token_bytes = iter->second; - } else { - fprintf(stderr, "unknown token: %" PRIu64 "\n", cur); - exit(EXIT_FAILURE); - } - } - ret += token_bytes; - - return ret; -} -// -------------------------public method 
end------------------------------- diff --git a/tokenizer/tokenizer.h b/tokenizer/tokenizer.h deleted file mode 100644 index 9e1977b71..000000000 --- a/tokenizer/tokenizer.h +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -// A simple Tokenizer interface. -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "sentencepiece_processor.h" - -class Tokenizer { - public: - explicit Tokenizer() {} - virtual ~Tokenizer() {} - - virtual void load(const std::string& tokenizer_path) = 0; - - virtual std::vector - encode(const std::string& input, int8_t bos, int8_t eos) = 0; - - virtual std::string decode(uint64_t prev_token, uint64_t token) = 0; - - // getters - int32_t vocab_size() const { - return vocab_size_; - } - - uint64_t bos_tok() const { - return bos_tok_; - } - - uint64_t eos_tok() const { - return eos_tok_; - } - - protected: - bool initialized_ = false; - int32_t vocab_size_; - uint64_t bos_tok_, eos_tok_; -}; - -// ----------------------- SPTokenizer ----------------------- -// Used by sentencepiece. Adapted from llama2.c. -struct TokenIndex { - const char* str; - int32_t id; -}; - -class SPTokenizer : public Tokenizer { - public: - explicit SPTokenizer(); - ~SPTokenizer() override; - - void load(const std::string& tokenizer_path) override; - - std::vector encode(const std::string& input, int8_t bos, int8_t eos) - override; - - std::string decode(uint64_t prev_token, uint64_t token) override; - - private: - std::unique_ptr _processor; -}; - -// ----------------------- Tiktoken ----------------------- -// Used by OpenAI, adapted from https://github.com/sewenew/tokenizer - -using Encoder = std::unordered_map; -using Decoder = std::unordered_map; -using Re2UPtr = std::unique_ptr; - -class Tiktoken : public Tokenizer { - public: - explicit Tiktoken(); - ~Tiktoken(){}; - - void load(const std::string& tokenizer_path); - - std::vector - encode(const std::string& input, int8_t bos, int8_t eos); - - std::string decode(uint64_t prev_token, uint64_t token); - - private: - static inline const Encoder _get_special_tokens(ssize_t num_base_tokens) { - Encoder special_tokens; - special_tokens.emplace("<|begin_of_text|>", num_base_tokens++); - special_tokens.emplace("<|end_of_text|>", num_base_tokens++); - special_tokens.emplace("<|reserved_special_token_0|>", num_base_tokens++); - special_tokens.emplace("<|reserved_special_token_1|>", num_base_tokens++); - special_tokens.emplace("<|reserved_special_token_2|>", num_base_tokens++); - special_tokens.emplace("<|reserved_special_token_3|>", num_base_tokens++); - special_tokens.emplace("<|start_header_id|>", num_base_tokens++); - special_tokens.emplace("<|end_header_id|>", num_base_tokens++); - special_tokens.emplace("<|reserved_special_token_4|>", num_base_tokens++); - special_tokens.emplace("<|eot_id|>", num_base_tokens++); - for (auto i = 5; i < 251; ++i) { - special_tokens.emplace( - "<|reserved_special_token_" + std::to_string(i) + "|>", - num_base_tokens++); - } - return special_tokens; - } - - template - std::pair, re2::StringPiece> - _split_with_allowed_special_token( - re2::StringPiece& input, - const T& allowed_special); - - void _encode( - re2::StringPiece& input, - std::vector& ret, - uint64_t& last_piece_token_len); - - template - 
std::pair, uint64_t> _encode_with_special_token( - const std::string& text, - const T& allowed_special); - - // Removed negative lookahead \s+(?!\S) since it's not supported by RE2. - const std::string _pattern = - R"((?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+)"; - Encoder _encoder; - Encoder _special_token_encoder; - Decoder _decoder; - Decoder _special_token_decoder; - - Re2UPtr _regex; - Re2UPtr _special_token_regex; -}; diff --git a/torchchat/utils/scripts/build_native.sh b/torchchat/utils/scripts/build_native.sh index 3c2c1c846..e2b8b4fc0 100755 --- a/torchchat/utils/scripts/build_native.sh +++ b/torchchat/utils/scripts/build_native.sh @@ -64,7 +64,7 @@ fi pushd ${TORCHCHAT_ROOT} -git submodule update --init +git submodule update --init --recursive git submodule sync if [[ "$TARGET" == "et" ]]; then if [ ! -d "${TORCHCHAT_ROOT}/${ET_BUILD_DIR}/install" ]; then @@ -93,7 +93,7 @@ popd if [[ "$TARGET" == "et" ]]; then cmake -S . -B ./cmake-out -DCMAKE_PREFIX_PATH=`python3 -c 'import torch;print(torch.utils.cmake_prefix_path)'` -DLINK_TORCHAO_OPS="${LINK_TORCHAO_OPS}" -DET_USE_ADAPTIVE_THREADS=ON -DCMAKE_CXX_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=1" -G Ninja else - cmake -S . -B ./cmake-out -DCMAKE_PREFIX_PATH=`python3 -c 'import torch;print(torch.utils.cmake_prefix_path)'` -DLINK_TORCHAO_OPS="${LINK_TORCHAO_OPS}" -DCMAKE_CXX_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=0" -G Ninja + cmake -S . -B ./cmake-out -DCMAKE_PREFIX_PATH=`python3 -c 'import torch;print(torch.utils.cmake_prefix_path)'` -DLINK_TORCHAO_OPS="${LINK_TORCHAO_OPS}" -DCMAKE_CXX_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=1" -G Ninja fi cmake --build ./cmake-out --target "${TARGET}"_run From 3f0fec386b41f5b5b112fd9a23c25f0603b23645 Mon Sep 17 00:00:00 2001 From: Michael Gschwind <61328285+mikekgfb@users.noreply.github.com> Date: Sun, 5 Jan 2025 08:50:34 -0800 Subject: [PATCH 51/83] Update README.md to include granite (#1445) * Update README.md to include granite @gabe-l-hart @Jack-Khuu adding to table since #1262 is complete. Does torchchat need anything else before it's appropriate to say granite is supported? * Update README.md Remove extraneous white space * Update README.md Typos * Update build_native.sh Use new ABI --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index 91f5a7e51..1cfb06e22 100644 --- a/README.md +++ b/README.md @@ -69,6 +69,13 @@ aliases. |[tinyllamas/stories42M](https://huggingface.co/karpathy/tinyllamas/tree/main)|✅|Toy model for `generate`. Alias to `stories42M`.| |[tinyllamas/stories110M](https://huggingface.co/karpathy/tinyllamas/tree/main)|✅|Toy model for `generate`. Alias to `stories110M`.| |[openlm-research/open_llama_7b](https://huggingface.co/openlm-research/open_llama_7b)|✅|Best for `generate`. 
Alias to `open-llama`.|
+| [ibm-granite/granite-3b-code-instruct-128k](https://huggingface.co/ibm-granite/granite-3b-code-instruct-128k) |✅| Alias to `granite-code` and `granite-code-3b`.|
+| [ibm-granite/granite-8b-code-instruct-128k](https://huggingface.co/ibm-granite/granite-8b-code-instruct-128k) |✅| Alias to `granite-code-8b`.|
+| [ibm-granite/granite-3.0-2b-instruct](https://huggingface.co/ibm-granite/granite-3.0-2b-instruct) |✅| Alias to `granite3-2b` and `granite3`.|
+| [ibm-granite/granite-3.0-8b-instruct](https://huggingface.co/ibm-granite/granite-3.0-8b-instruct) |✅| Alias to `granite3-8b`.|
+| [ibm-granite/granite-3.1-2b-instruct](https://huggingface.co/ibm-granite/granite-3.1-2b-instruct) |✅| Alias to `granite3.1-2b` and `granite3.1`.|
+| [ibm-granite/granite-3.1-8b-instruct](https://huggingface.co/ibm-granite/granite-3.1-8b-instruct) |✅| Alias to `granite3.1-8b`.|
+
 ## Installation
 The following steps require that you have [Python 3.10](https://www.python.org/downloads/release/python-3100/) installed.
 
From c121ed2ace246e5922dcd05e687b783033b63b30 Mon Sep 17 00:00:00 2001
From: Michael Gschwind <61328285+mikekgfb@users.noreply.github.com>
Date: Sun, 5 Jan 2025 17:54:04 -0800
Subject: [PATCH 52/83] Create local-model.md (#1448)

Initial documentation on how to use local checkpoints with torchchat.

xref: #1446
---
 docs/local-model.md | 138 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 138 insertions(+)
 create mode 100644 docs/local-model.md

diff --git a/docs/local-model.md b/docs/local-model.md
new file mode 100644
index 000000000..2d48e2438
--- /dev/null
+++ b/docs/local-model.md
@@ -0,0 +1,138 @@
+# Using Local Models in Torchchat
+Torchchat provides powerful capabilities for running large language models (LLMs) locally. This guide focuses on utilizing local copies of
+model checkpoints or models in GGUF format to create a chat application. It also highlights relevant options for advanced users.
+
+## Prerequisites
+To work with local models, you need:
+1. **Model Weights**: A checkpoint file (e.g., `.pth`, `.pt`) or a GGUF file (e.g., `.gguf`).
+2. **Tokenizer**: A tokenizer model file. This can be in either SentencePiece or TikToken format, depending on the tokenizer used with the model.
+3. **Parameter File**: (a) A custom parameter file in JSON format, or (b) a pre-existing parameter file with `--params-path`
+   or `--params-table`, or (c) a pathname that’s matched against known models by longest substring in configuration name, using the same algorithm as GPT-fast.
+
+Ensure the tokenizer and parameter files are in the same directory as the checkpoint or GGUF file for automatic detection.
+Let’s use a local download of the stories15M tinyllama model as an example:
+
+```
+mkdir stories15M
+cd stories15M
+wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
+wget https://github.com/karpathy/llama2.c/raw/refs/heads/master/tokenizer.model
+cp ../torchchat/model_params/stories15M.json model.json
+cd ..
+```
+
+
+## Using Local Checkpoints
+Torchchat provides the CLI flag `--checkpoint-path` for specifying local model weights. The tokenizer is
+loaded from the same directory as the checkpoint with the name ‘tokenizer.model’ unless separately specified.
+
+This example obtains the model parameters by name matching to known models because ‘stories15M’ is one of the
+models known to torchchat with a configuration stored in ‘torchchat/model_params’:
+
+
+### Example 1: Basic Text Generation
+
+
+```
+python3 torchchat.py generate \
+  --checkpoint-path stories15M/stories15M.pt \
+  --prompt "Hello, my name is"
+```
+
+
+### Example 2: Providing Additional Artifacts
+The following is an example of how to specify a local model checkpoint, the model architecture, and a tokenizer file:
+```
+python3 torchchat.py generate \
+  --prompt "Once upon a time" \
+  --checkpoint-path stories15M/stories15M.pt \
+  --params-path stories15M/model.json \
+  --tokenizer-path stories15M/tokenizer.model
+```
+
+
+Alternatively, we can specify the known architecture configuration for known models using ‘--params-table’
+to select a particular architecture in ‘torchchat/model_params’:
+
+```
+python3 torchchat.py generate \
+  --prompt "Once upon a time" \
+  --checkpoint-path stories15M/stories15M.pt \
+  --params-table stories15M \
+  --tokenizer-path stories15M/tokenizer.model
+```
+
+
+## Using GGUF Models
+Torchchat supports loading models in GGUF format using the `--gguf-file` flag. Refer to GGUF.md for additional
+documentation about using GGUF files in torchchat.
+
+The GGUF format is compatible with several quantization levels such as F16, F32, Q4_0, and Q6_K. Model
+configuration information is obtained directly from the GGUF file, simplifying setup and obviating the
+need for a separate `model.json` model architecture specification.
+
+
+## Using Local Models
+Torchchat supports all commands such as chat, browser, server and export using local models. (In fact,
+known models simply download and populate the parameters specified for local models.)
+Here is an example setup for running a server with a local model:
+
+
+[skip default]: begin
+```
+python3 torchchat.py server --checkpoint-path stories15M/stories15M.pt
+```
+[skip default]: end
+
+
+[shell default]: python3 torchchat.py server --checkpoint-path stories15M/stories15M.pt & server_pid=$! ; sleep 90 # wait for server to be ready to accept requests
+
+
+In another terminal, query the server using `curl`. Depending on the model configuration, this query might take a few minutes to respond.
+
+
+> [!NOTE]
+> Since this feature is under active development, not every parameter is consumed. See `#api/api.pyi` for details on
+> which request parameters are implemented. If you encounter any issues, please comment on the [tracking Github issue](https://github.com/pytorch/torchchat/issues/973).
+
+
+
+
+
+**Example Query**
+Setting `stream` to "true" in the request emits a response in chunks. If `stream` is unset or not "true", then the client will
+await the full response from the server.
+
+
+**Example: using the server**
+A model server used with a local model works like any other torchchat server. You can test it by sending a request with ‘curl’:
+```
+curl http://127.0.0.1:5000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "llama3.1",
+    "stream": "true",
+    "max_tokens": 200,
+    "messages": [
+      {
+        "role": "system",
+        "content": "You are a helpful assistant."
+      },
+      {
+        "role": "user",
+        "content": "Hello!"
+      }
+    ]
+  }'
+```
+
+
+[shell default]: kill ${server_pid}
+
+
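+If you prefer to script the request instead of using `curl`, the same OpenAI-style endpoint can be queried from
+Python. The snippet below is an illustrative sketch only, not part of the torchchat API surface: it assumes a
+torchchat server like the one above is running on port 5000 and that the third-party `requests` package is
+installed, and it omits `stream` so the full response is returned as a single JSON body.
+
+[skip default]: begin
+```
+# Sketch: query the torchchat OpenAI-compatible endpoint from Python.
+# Assumes a server is listening on http://127.0.0.1:5000 and `requests` is installed.
+import requests
+
+payload = {
+    "model": "llama3.1",
+    "max_tokens": 200,
+    "messages": [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "Hello!"},
+    ],
+}
+
+response = requests.post(
+    "http://127.0.0.1:5000/v1/chat/completions", json=payload, timeout=300
+)
+response.raise_for_status()  # surface HTTP errors early
+print(response.json())       # full (non-streamed) chat completion
+```
+[skip default]: end
+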
+ + +For more information about using different commands, see the root README.md and refer to the Advanced Users Guide for further details on advanced configurations and parameter tuning. + + +[end default]: end From e60680b67104e77b32d4363e508064d42607b2da Mon Sep 17 00:00:00 2001 From: Michael Gschwind <61328285+mikekgfb@users.noreply.github.com> Date: Sun, 5 Jan 2025 17:55:29 -0800 Subject: [PATCH 53/83] Update evaluation.md (#1442) * Update evaluation.md 1 - Remove outdated reference to running eval.py directly 2 - explain how we run ET/AOTI models with eval. 3 - Add an example with quantization to show how we can use eval to determine how to process models. * Update evaluation.md * Update evaluation.md Highlight ability to use different options and encourage users to experiment with them. * Update evaluation.md Wording corrections * Update build_native.sh Update to C++11 ABI for AOTI, similar to ET --- torchchat/utils/docs/evaluation.md | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/torchchat/utils/docs/evaluation.md b/torchchat/utils/docs/evaluation.md index ac2aa54d3..77414eeb4 100644 --- a/torchchat/utils/docs/evaluation.md +++ b/torchchat/utils/docs/evaluation.md @@ -23,7 +23,7 @@ The evaluation mode of `torchchat.py` script can be used to evaluate your langua ## Examples -### Evaluation example with model in Python +### Evaluation example with model in Python environment Running wikitext for 10 iterations ``` @@ -35,33 +35,45 @@ Running wikitext with torch.compile for 10 iterations python3 torchchat.py eval stories15M --compile --tasks wikitext --limit 10 ``` -Running multiple tasks and calling eval.py directly (with torch.compile): +Running multiple tasks with torch.compile for evaluation and prefill: ``` -python3 torchchat.py eval stories15M --compile --tasks wikitext hellaswag +python3 torchchat.py eval stories15M --compile --compile-prefill --tasks wikitext hellaswag ``` ### Evaluation with model exported to PTE with ExecuTorch -Running an exported model with ExecuTorch (as PTE) +Running an exported model with ExecuTorch (as PTE). Advantageously, because you can +load an exported PTE model back into the Python environment with torchchat, +you can run evaluation on the exported model! ``` python3 torchchat.py export stories15M --output-pte-path stories15M.pte python3 torchchat.py eval stories15M --pte-path stories15M.pte ``` -Running multiple tasks and calling eval.py directly (with PTE): +Running multiple tasks directly on the created PTE mobile model: ``` python3 torchchat.py eval stories15M --pte-path stories15M.pte --tasks wikitext hellaswag ``` +Now let's evaluate the effect of quantization on evaluation results by exporting with quantization using `--quantize` and an exemplary quantization configuration: +``` +python3 torchchat.py export stories15M --output-pte-path stories15M.pte --quantize torchchat/quant_config/mobile.json +python3 torchchat.py eval stories15M --pte-path stories15M.pte --tasks wikitext hellaswag +``` + +Now try your own export options to explore different trade-offs between model size, evaluation speed and accuracy using model quantization! + ### Evaluation with model exported to DSO with AOT Inductor (AOTI) -Running an exported model with AOT Inductor (DSO model) +Running an exported model with AOT Inductor (DSO model). Advantageously, because you can +load an exported DSO model back into the Python environment with torchchat, +you can run evaluation on the exported model! 
``` python3 torchchat.py export stories15M --dtype fast16 --output-dso-path stories15M.so python3 torchchat.py eval stories15M --dtype fast16 --dso-path stories15M.so ``` -Running multiple tasks and calling eval.py directly (with AOTI): +Running multiple tasks with AOTI: ``` python3 torchchat.py eval stories15M --dso-path stories15M.so --tasks wikitext hellaswag ``` From 1ba40d732629b00ba2e3afa95a3b89e8f8267f3c Mon Sep 17 00:00:00 2001 From: Michael Gschwind <61328285+mikekgfb@users.noreply.github.com> Date: Mon, 6 Jan 2025 07:04:28 -0800 Subject: [PATCH 54/83] Create distributed.md (#1438) * Create distributed.md Initial documentation for use of distributed inference w/ torchchat. @mreso please review and update as appropriate. * Add support for extracting distributed inference tests in run-docs Add support for extracting distributed inference tests in run-docs * Update distributed.md * Update distributed.md * Update distributed.md * Update docs/distributed.md Co-authored-by: Matthias Reso <13337103+mreso@users.noreply.github.com> * Update docs/distributed.md Co-authored-by: Matthias Reso <13337103+mreso@users.noreply.github.com> * Update distributed.md Uncommenting section about generate subcommand w/ distributed inference after review by @mreso Also, Added HF login to make this fully self-contained * Update distributed.md Wording * Update distributed.md Wording and formatting * Update build_native.sh Update to C++11 ABI for AOTI, similar to ET --------- Co-authored-by: Matthias Reso <13337103+mreso@users.noreply.github.com> --- .ci/scripts/run-docs | 17 ++++++ docs/distributed.md | 125 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 142 insertions(+) create mode 100644 docs/distributed.md diff --git a/.ci/scripts/run-docs b/.ci/scripts/run-docs index 6f5ee46c7..521cfa811 100755 --- a/.ci/scripts/run-docs +++ b/.ci/scripts/run-docs @@ -125,3 +125,20 @@ if [ "$1" == "native" ]; then bash -x ./run-native.sh echo "::endgroup::" fi + +if [ "$1" == "distributed" ]; then + + echo "::group::Create script to run distributed" + python3 torchchat/utils/scripts/updown.py --file docs/distributed.md > ./run-distributed.sh + # for good measure, if something happened to updown processor, + # and it did not error out, fail with an exit 1 + echo "exit 1" >> ./run-distributed.sh + echo "::endgroup::" + + echo "::group::Run distributed" + echo "*******************************************" + cat ./run-distributed.sh + echo "*******************************************" + bash -x ./run-distributed.sh + echo "::endgroup::" +fi diff --git a/docs/distributed.md b/docs/distributed.md new file mode 100644 index 000000000..3d34d7672 --- /dev/null +++ b/docs/distributed.md @@ -0,0 +1,125 @@ +# Distributed Inference with torchchat + +torchchat supports distributed inference for large language models (LLMs) on GPUs seamlessly. +At present, torchchat supports distributed inference using Python only. + +## Installation +The following steps require that you have [Python 3.10](https://www.python.org/downloads/release/python-3100/) installed. + +> [!TIP] +> torchchat uses the latest changes from various PyTorch projects so it's highly recommended that you use a venv (by using the commands below) or CONDA. 
+ +[skip default]: begin +```bash +git clone https://github.com/pytorch/torchchat.git +cd torchchat +python3 -m venv .venv +source .venv/bin/activate +./install/install_requirements.sh +``` +[skip default]: end + +[shell default]: ./install/install_requirements.sh + +## Login to HF for Downloading Weights +Most models use Hugging Face as the distribution channel, so you will need to create a Hugging Face account. Create a Hugging Face user access token as documented here with the write role. + +Log into Hugging Face: + +[prefix default]: HF_TOKEN="${SECRET_HF_TOKEN_PERIODIC}" + +``` +huggingface-cli login +``` + +## Enabling Distributed torchchat Inference + +To enable distributed inference, use the option `--distributed`. In addition, `--tp ` and `--pp ` +allow users to specify the types of parallelism to use where tp refers to tensor parallelism and pp to pipeline parallelism. + + +## Generate Output with Distributed torchchat Inference + +To generate output using distributed inference with 4 GPUs, you can use: +``` +python3 torchchat.py generate llama3.1 --distributed --tp 2 --pp 2 --prompt "write me a story about a boy and his bear" +``` + + +## Chat with Distributed torchchat Inference + +This mode allows you to chat with an LLM in an interactive fashion with distributed Inference. The following example uses 4 GPUs: + +[skip default]: begin +```bash +python3 torchchat.py chat llama3.1 --max-new-tokens 10 --distributed --tp 2 --pp 2 +``` +[skip default]: end + + +## A Server with Distributed torchchat Inference + +This mode exposes a REST API for interacting with a model. +The server follows the [OpenAI API specification](https://platform.openai.com/docs/api-reference/chat) for chat completions. + +To test out the REST API, **you'll need 2 terminals**: one to host the server, and one to send the request. + +In one terminal, start the server to run with 4 GPUs: + +[skip default]: begin + +```bash +python3 torchchat.py server llama3.1 --distributed --tp 2 --pp 2 +``` +[skip default]: end + + + +In another terminal, query the server using `curl`. Depending on the model configuration, this query might take a few minutes to respond. + +> [!NOTE] +> Since this feature is under active development, not every parameter is consumed. See api/api.py for details on +> which request parameters are implemented. If you encounter any issues, please comment on the [tracking Github issue](https://github.com/pytorch/torchchat/issues/973). + +
+Example Query + +Setting `stream` to "true" in the request emits a response in chunks. If `stream` is unset or not "true", then the client will await the full response from the server. + +**Example Input + Output** + +``` +curl http://127.0.0.1:5000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "llama3.1", + "stream": "true", + "max_tokens": 200, + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "Hello!" + } + ] + }' +``` +[skip default]: begin +``` +{"response":" I'm a software developer with a passion for building innovative and user-friendly applications. I have experience in developing web and mobile applications using various technologies such as Java, Python, and JavaScript. I'm always looking for new challenges and opportunities to learn and grow as a developer.\n\nIn my free time, I enjoy reading books on computer science and programming, as well as experimenting with new technologies and techniques. I'm also interested in machine learning and artificial intelligence, and I'm always looking for ways to apply these concepts to real-world problems.\n\nI'm excited to be a part of the developer community and to have the opportunity to share my knowledge and experience with others. I'm always happy to help with any questions or problems you may have, and I'm looking forward to learning from you as well.\n\nThank you for visiting my profile! I hope you find my information helpful and interesting. If you have any questions or would like to discuss any topics, please feel free to reach out to me. I"} +``` + +[skip default]: end + + + +
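+When `stream` is set to "true", the completion arrives as a sequence of chunks rather than one JSON body. The
+sketch below shows how a Python client might consume such a stream; it is illustrative only and assumes the
+server above is running on port 5000, that the `requests` package is installed, and that each chunk is emitted
+on its own line (the exact chunk framing may differ, so adapt the parsing to what your server actually sends).
+
+[skip default]: begin
+```
+# Sketch: read a streamed chat completion chunk by chunk.
+import requests
+
+payload = {
+    "model": "llama3.1",
+    "stream": "true",
+    "max_tokens": 200,
+    "messages": [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "Hello!"},
+    ],
+}
+
+with requests.post(
+    "http://127.0.0.1:5000/v1/chat/completions",
+    json=payload,
+    stream=True,
+    timeout=300,
+) as response:
+    response.raise_for_status()
+    for line in response.iter_lines(decode_unicode=True):
+        if line:  # skip keep-alive blank lines
+            print(line)
+```
+[skip default]: end
+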
+ +[end default]: end From 06e78ce3f14ff142c5271423f7d622fd8961bb54 Mon Sep 17 00:00:00 2001 From: Angela Yi Date: Wed, 15 Jan 2025 11:54:28 -0800 Subject: [PATCH 55/83] [aoti] Remove need for -l in cmake (#1159) --- .github/workflows/runner-cuda-dtype.yml | 2 +- README.md | 2 +- runner/run.cpp | 61 ++++++++------------ torchchat/export.py | 77 +++++++++++++++++-------- 4 files changed, 79 insertions(+), 63 deletions(-) diff --git a/.github/workflows/runner-cuda-dtype.yml b/.github/workflows/runner-cuda-dtype.yml index 1813f483e..4cfb9ff09 100644 --- a/.github/workflows/runner-cuda-dtype.yml +++ b/.github/workflows/runner-cuda-dtype.yml @@ -52,7 +52,7 @@ jobs: python torchchat.py export --checkpoint-path ${MODEL_DIR}/stories15M.pt --output-aoti-package-path /tmp/model.pt2 - ./cmake-out/aoti_run /tmp/model.pt2 -d CUDA -z ${MODEL_DIR}/tokenizer.model -i "${PROMPT}" + ./cmake-out/aoti_run /tmp/model.pt2 -z ${MODEL_DIR}/tokenizer.model -i "${PROMPT}" done diff --git a/README.md b/README.md index 1cfb06e22..2448b0b72 100644 --- a/README.md +++ b/README.md @@ -341,7 +341,7 @@ torchchat/utils/scripts/build_native.sh aoti Then run the compiled executable, with the pt2. ```bash -cmake-out/aoti_run exportedModels/llama3_1_artifacts.pt2 -z `python3 torchchat.py where llama3.1`/tokenizer.model -l 3 -i "Once upon a time" +cmake-out/aoti_run exportedModels/llama3_1_artifacts.pt2 -z `python3 torchchat.py where llama3.1`/tokenizer.model -i "Once upon a time" ``` ## Mobile Execution diff --git a/runner/run.cpp b/runner/run.cpp index f2b8e8e6b..e5c818cfa 100644 --- a/runner/run.cpp +++ b/runner/run.cpp @@ -102,6 +102,7 @@ typedef struct { typedef struct { Config config; // the hyperparameters of the architecture (the blueprint) RunState state; // buffers for the "wave" of activations in the forward pass + std::unordered_map metadata; #ifdef __AOTI_MODEL__ torch::inductor::AOTIModelPackageLoader *runner; @@ -141,20 +142,9 @@ void read_checkpoint(char *checkpoint, Config *config) { config->vocab_size = abs(config->vocab_size); } -void build_transformer(Transformer *t, char *model_path, int vocab_size, - int seq_len) { - // read in the Config and the Weights from the model - // read_checkpoint(model_path, &t->config); - // allocate the RunState buffers - t->config.vocab_size = vocab_size; - t->config.seq_len = seq_len; - malloc_run_state(&t->state, &t->config); - +void build_transformer(Transformer *t, char *model_path) { #ifdef __AOTI_MODEL__ t->runner = new torch::inductor::AOTIModelPackageLoader(model_path); - aoti_device = t->runner->get_metadata()["AOTI_DEVICE_KEY"] == "cpu" - ? 
torch::Device(torch::kCPU) - : torch::Device(torch::kCUDA); #else //__ET_MODEL__ t->runner = new Module( /* path to PTE model */ model_path, @@ -776,9 +766,6 @@ void error_usage() { " -v (optional) vocab size, default is model-specific.\n"); fprintf(stderr, " -l (optional) llama version (2 or 3), default 2.\n"); - fprintf( - stderr, - " -d (optional) device(CUDA or CPU) model was exported for\n"); exit(EXIT_FAILURE); } @@ -848,37 +835,35 @@ int main(int argc, char *argv[]) { system_prompt = argv[i + 1]; } else if (argv[i][1] == 'l') { llama_ver = atoi(argv[i + 1]); -#ifdef __AOTI_MODEL__ - } else if (argv[i][1] == 'd') { -#ifdef USE_CUDA - if (strcasecmp(argv[i + 1], "CUDA") == 0) { - aoti_device = torch::Device(torch::kCUDA); - } else -#endif - if (strcasecmp(argv[i + 1], "CPU") == 0) { - aoti_device = torch::Device(torch::kCPU); - } else { - fprintf(stderr, "Unknown device %s", argv[i + 1]); - exit(1); - } -#endif } else { error_usage(); } } + if (model_path == NULL) { + fprintf(stderr, "No model_path provided."); + error_usage(); + } + + Transformer transformer; + build_transformer(&transformer, model_path); + +#ifdef __AOTI_MODEL__ + auto aoti_metadata = transformer.runner->get_metadata(); + aoti_device = aoti_metadata["AOTI_DEVICE_KEY"] == "cpu" + ? torch::Device(torch::kCPU) + : torch::Device(torch::kCUDA); + ModelType model_type = get_model_type(std::stoi(aoti_metadata["tokenizer_type"])); +#else // __ET_MODEL__ ModelType model_type = get_model_type(llama_ver); +#endif + if (model_type == UNKNOWN_MODEL) { fprintf(stderr, "Unknown model type passed by -l argument. Received l=%d.", llama_ver); error_usage(); } - if (model_path == NULL) { - fprintf(stderr, "No model_path provided."); - error_usage(); - } - if (tokenizer_path == NULL) { fprintf(stderr, "No tokenizer_path provided."); error_usage(); @@ -901,8 +886,12 @@ int main(int argc, char *argv[]) { vocab_size = tokenizer->vocab_size(); } - Transformer transformer; - build_transformer(&transformer, model_path, vocab_size, steps); + // read in the Config and the Weights from the model + // read_checkpoint(model_path, &t->config); + // allocate the RunState buffers + transformer.config.vocab_size = vocab_size; + transformer.config.seq_len = steps; + malloc_run_state(&transformer.state, &transformer.config); Sampler sampler; build_sampler(&sampler, vocab_size, temperature, topp, rng_seed); diff --git a/torchchat/export.py b/torchchat/export.py index 979778b7c..e84a344bd 100644 --- a/torchchat/export.py +++ b/torchchat/export.py @@ -5,13 +5,13 @@ # LICENSE file in the root directory of this source tree. import os -from typing import Optional +from typing import Dict, Optional import torch +import torch._inductor import torch.nn as nn from torch.export import Dim -import torch._inductor from torchchat.cli.builder import ( _initialize_model, @@ -39,6 +39,7 @@ def export_for_server( output_path: str = "model.pt2", dynamic_shapes: bool = False, package: bool = True, + metadata: Optional[Dict[str, str]] = None, ) -> str: """ Export the model using AOT Compile to get a .dso for server use cases. 
@@ -67,8 +68,10 @@ def export_for_server( dynamic_shapes = None with torch.nn.attention.sdpa_kernel([torch.nn.attention.SDPBackend.MATH]): - metadata = {} # TODO: put more metadata here - options = {"aot_inductor.package": package, "aot_inductor.metadata": metadata} + options = { + "aot_inductor.package": package, + "aot_inductor.metadata": metadata or {}, + } if not package: options = {"aot_inductor.output_path": output_path} @@ -81,6 +84,7 @@ def export_for_server( if package: from torch._inductor.package import package_aoti + path = package_aoti(output_path, path) print(f"The generated packaged model can be found at: {path}") @@ -102,13 +106,13 @@ def export_for_server( from typing import Any, Dict, Tuple, Union import executorch.exir as exir + from executorch.backends.xnnpack._passes.convert_to_linear import ( + ConvertToLinearPass, + ) from executorch.backends.xnnpack.partition.xnnpack_partitioner import ( XnnpackDynamicallyQuantizedPartitioner, ) - from executorch.backends.xnnpack._passes.convert_to_linear import ( - ConvertToLinearPass, - ) from executorch.exir import EdgeProgramManager, to_edge from executorch.exir.capture._config import ( @@ -166,18 +170,22 @@ def __init__(self, attention: Attention): self.wo = attention.wo - max_batch_size, n_heads, max_seq_length, head_dim = ( - attention.kv_cache[0].k_cache.shape - ) + max_batch_size, n_heads, max_seq_length, head_dim = attention.kv_cache[ + 0 + ].k_cache.shape cache_dtype = attention.kv_cache[0].k_cache.dtype # The `Attention` module being replaced can have multiple KV caches # (denoted by `cache_lanes`). Thus we follow the same setup format # as in `Attention.setup_cache`. cache_lanes = len(attention.kv_cache) - self.kv_cache = nn.ModuleList([ - CustomKVCache(max_batch_size, max_seq_length, n_heads, head_dim, cache_dtype) - for _ in range(cache_lanes) - ]) + self.kv_cache = nn.ModuleList( + [ + CustomKVCache( + max_batch_size, max_seq_length, n_heads, head_dim, cache_dtype + ) + for _ in range(cache_lanes) + ] + ) self.n_heads = attention.n_heads self.head_dim = attention.head_dim @@ -215,9 +223,7 @@ def forward(self, x, freqs_cis, mask, input_pos=None, cache_lane: int = 0): return self.wo(output) def replace_attention_with_custom_sdpa_attention(module: nn.Module): - from executorch.extension.llm.custom_ops import ( # noqa - sdpa_with_kv_cache, - ) + from executorch.extension.llm.custom_ops import sdpa_with_kv_cache # noqa for name, child in module.named_children(): if isinstance(child, Attention): @@ -238,7 +244,9 @@ def _to_core_aten( raise ValueError( f"Expected passed in model to be an instance of fx.GraphModule, got {type(model)}" ) - core_aten_ep = export_for_training(model, example_inputs, dynamic_shapes=dynamic_shapes) + core_aten_ep = export_for_training( + model, example_inputs, dynamic_shapes=dynamic_shapes + ) if verbose: logging.info(f"Core ATen graph:\n{core_aten_ep.graph}") return core_aten_ep @@ -350,7 +358,11 @@ def main(args): print(f"Using device={builder_args.device}") set_precision(builder_args.precision) - set_backend(dso=args.output_dso_path, pte=args.output_pte_path, aoti_package=args.output_aoti_package_path) + set_backend( + dso=args.output_dso_path, + pte=args.output_pte_path, + aoti_package=args.output_aoti_package_path, + ) builder_args.dso_path = None builder_args.pte_path = None @@ -372,6 +384,7 @@ def main(args): # TODO: clean this up # This mess is because ET does not support _weight_int4pack_mm right now + tokenizer_args = None if not builder_args.gguf_path: # tokenizer needed for 
quantization so get that here, try: @@ -382,9 +395,8 @@ def main(args): if builder_args.max_seq_length is None: if ( - (output_dso_path is not None or output_aoti_package_path is not None) - and not builder_args.dynamic_shapes - ): + output_dso_path is not None or output_aoti_package_path is not None + ) and not builder_args.dynamic_shapes: print("Setting max_seq_length to 300 for DSO export.") builder_args.max_seq_length = 300 elif output_pte_path is not None: @@ -397,7 +409,8 @@ def main(args): quantize, tokenizer, max_seq_length=builder_args.max_seq_length, - support_tensor_subclass=output_dso_path is None and output_aoti_package_path is None, + support_tensor_subclass=output_dso_path is None + and output_aoti_package_path is None, ) model_to_pte = model model_to_dso = model @@ -435,7 +448,9 @@ def main(args): if output_dso_path: output_dso_path = str(os.path.abspath(output_dso_path)) print(f"Exporting model using AOT Inductor to {output_dso_path}") - print("WARNING!! The path of compiling a dso is deprecated. Please use --output-aoti-package-path to create a .pt2 artifact instead.") + print( + "WARNING!! The path of compiling a dso is deprecated. Please use --output-aoti-package-path to create a .pt2 artifact instead." + ) export_for_server( model_to_dso, builder_args.device, @@ -446,11 +461,23 @@ def main(args): if output_aoti_package_path: output_aoti_package_path = str(os.path.abspath(output_aoti_package_path)) - print(f"Exporting model using AOT Inductor to {output_aoti_package_path}") + + if tokenizer_args is None: + tokenizer_type = "0" + elif tokenizer_args.is_sentencepiece: + tokenizer_type = "2" # Corresponding to llama2 + else: + tokenizer_type = "3" # Corresponding to llama3 + + metadata = {"tokenizer_type": tokenizer_type} + print( + "Exporting model using AOT Inductor to " f"{output_aoti_package_path}." 
+ ) export_for_server( model_to_aoti_package, builder_args.device, output_aoti_package_path, builder_args.dynamic_shapes, package=True, + metadata=metadata, ) From 6bfc5c8cb1e8b8485ba408cbb6167e59acd7a854 Mon Sep 17 00:00:00 2001 From: Jack-Khuu Date: Fri, 17 Jan 2025 13:48:43 -0800 Subject: [PATCH 56/83] Bumping ET Pin to Jan16 2025 (#1459) * Bumping ET Pin to Jan15 2025 https://github.com/pytorch/executorch/commit/d596cd78cf2280c9c01adbfc95b54a29865f3fe5 * Remove call to capture_pre_auto_grad_graph * Update naming for sdpa to custom ops * Fix export_for_train * Update et-pin.txt * Update Test perms --- .github/workflows/more-tests.yml | 3 ++ .github/workflows/periodic.yml | 3 ++ .github/workflows/pull.yml | 18 +++++++++++ .github/workflows/run-readme-periodic.yml | 9 ++++++ .github/workflows/run-readme-pr-mps.yml | 4 +-- .github/workflows/run-readme-pr.yml | 38 ++++++++++++++++++++++- .github/workflows/runner-cuda-dtype.yml | 3 ++ install/.pins/et-pin.txt | 2 +- torchchat/export.py | 5 ++- 9 files changed, 78 insertions(+), 7 deletions(-) diff --git a/.github/workflows/more-tests.yml b/.github/workflows/more-tests.yml index f47740fe3..f772382d1 100644 --- a/.github/workflows/more-tests.yml +++ b/.github/workflows/more-tests.yml @@ -9,6 +9,9 @@ on: jobs: test-cuda: + permissions: + id-token: write + contents: read uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml index 5a0d9920b..2e264e6cf 100644 --- a/.github/workflows/periodic.yml +++ b/.github/workflows/periodic.yml @@ -108,6 +108,9 @@ jobs: set -eux PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py --event "periodic" --backend "gpu" test-gpu: + permissions: + id-token: write + contents: read uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main name: test-gpu (${{ matrix.platform }}, ${{ matrix.model_name }}) needs: gather-models-gpu diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 670c0205a..5dbafee9f 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -215,6 +215,9 @@ jobs: set -eux PYTHONPATH="${PWD}" python .ci/scripts/gather_test_models.py --event "pull_request" --backend "gpu" test-gpu-compile: + permissions: + id-token: write + contents: read uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main name: test-gpu-compile (${{ matrix.platform }}, ${{ matrix.model_name }}) needs: gather-models-gpu @@ -250,6 +253,9 @@ jobs: echo "::endgroup::" test-gpu-aoti-bfloat16: + permissions: + id-token: write + contents: read uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main name: test-gpu-aoti-bfloat16 (${{ matrix.platform }}, ${{ matrix.model_name }}) needs: gather-models-gpu @@ -286,6 +292,9 @@ jobs: echo "::endgroup::" test-gpu-aoti-float32: + permissions: + id-token: write + contents: read uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main name: test-gpu-aoti-float32 (${{ matrix.platform }}, ${{ matrix.model_name }}) needs: gather-models-gpu @@ -327,6 +336,9 @@ jobs: echo "::endgroup::" test-gpu-aoti-float16: + permissions: + id-token: write + contents: read uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main name: test-gpu-aoti-float16 (${{ matrix.platform }}, ${{ matrix.model_name }}) needs: gather-models-gpu @@ -369,6 +381,9 @@ jobs: echo "::endgroup::" test-gpu-eval-sanity-check: + permissions: + id-token: write + contents: read uses: 
pytorch/test-infra/.github/workflows/linux_job_v2.yml@main name: test-gpu-eval-sanity-check (${{ matrix.platform }}, ${{ matrix.model_name }}) needs: gather-models-gpu @@ -1011,6 +1026,9 @@ jobs: echo "Tests complete." test-build-runner-et-android: + permissions: + id-token: write + contents: read uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.4xlarge diff --git a/.github/workflows/run-readme-periodic.yml b/.github/workflows/run-readme-periodic.yml index 61501e0c4..2c49a975f 100644 --- a/.github/workflows/run-readme-periodic.yml +++ b/.github/workflows/run-readme-periodic.yml @@ -10,6 +10,9 @@ on: jobs: test-readme: + permissions: + id-token: write + contents: read uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main secrets: inherit with: @@ -39,6 +42,9 @@ jobs: test-quantization-any: + permissions: + id-token: write + contents: read uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu @@ -66,6 +72,9 @@ jobs: echo "::endgroup::" test-gguf-any: + permissions: + id-token: write + contents: read uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main secrets: inherit with: diff --git a/.github/workflows/run-readme-pr-mps.yml b/.github/workflows/run-readme-pr-mps.yml index 3e90265f5..bf1587896 100644 --- a/.github/workflows/run-readme-pr-mps.yml +++ b/.github/workflows/run-readme-pr-mps.yml @@ -10,7 +10,7 @@ jobs: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: runner: macos-m1-14 - timeout-minutes: 50 + timeout: 50 script: | conda create -y -n test-readme-mps-macos python=3.10.11 llvm-openmp conda activate test-readme-mps-macos @@ -36,7 +36,7 @@ jobs: test-quantization-mps-macos: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: - runner: macos-m1-14 + runner: macos-m1-14 script: | set -x conda create -y -n test-quantization-mps-macos python=3.10.11 diff --git a/.github/workflows/run-readme-pr.yml b/.github/workflows/run-readme-pr.yml index 8694757e7..f32473435 100644 --- a/.github/workflows/run-readme-pr.yml +++ b/.github/workflows/run-readme-pr.yml @@ -9,6 +9,9 @@ on: jobs: test-readme-any: + permissions: + id-token: write + contents: read uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu @@ -28,6 +31,9 @@ jobs: echo "::endgroup::" test-readme-cpu: + permissions: + id-token: write + contents: read uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu @@ -47,6 +53,9 @@ jobs: echo "::endgroup::" test-quantization-any: + permissions: + id-token: write + contents: read uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu @@ -66,6 +75,9 @@ jobs: echo "::endgroup::" test-quantization-cpu: + permissions: + id-token: write + contents: read uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu @@ -80,6 +92,9 @@ jobs: TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs quantization test-gguf-any: + permissions: + id-token: write + contents: read uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu @@ -99,6 +114,9 @@ jobs: echo "::endgroup::" test-gguf-cpu: + permissions: + id-token: write + contents: read uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu @@ -119,6 +137,9 @@ jobs: test-advanced-any: + permissions: + id-token: write + 
contents: read uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu @@ -139,6 +160,9 @@ jobs: test-advanced-cpu: + permissions: + id-token: write + contents: read uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu @@ -158,6 +182,9 @@ jobs: echo "::endgroup::" test-evaluation-any: + permissions: + id-token: write + contents: read uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu @@ -177,6 +204,9 @@ jobs: echo "::endgroup::" test-evaluation-cpu: + permissions: + id-token: write + contents: read uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu @@ -196,6 +226,9 @@ jobs: echo "::endgroup::" test-multimodal-any: + permissions: + id-token: write + contents: read uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu @@ -215,6 +248,9 @@ jobs: echo "::endgroup::" test-multimodal-cpu: + permissions: + id-token: write + contents: read uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu @@ -269,4 +305,4 @@ jobs: export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH echo "::endgroup::" - TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs native + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs native diff --git a/.github/workflows/runner-cuda-dtype.yml b/.github/workflows/runner-cuda-dtype.yml index 4cfb9ff09..0b4597942 100644 --- a/.github/workflows/runner-cuda-dtype.yml +++ b/.github/workflows/runner-cuda-dtype.yml @@ -9,6 +9,9 @@ on: jobs: test-runner-aot-cuda: + permissions: + id-token: write + contents: read uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.g5.4xlarge.nvidia.gpu diff --git a/install/.pins/et-pin.txt b/install/.pins/et-pin.txt index bb70ed39d..e79e9c341 100644 --- a/install/.pins/et-pin.txt +++ b/install/.pins/et-pin.txt @@ -1 +1 @@ -98e4dd524f2cb08414ee015b27616229cabc06ba +9c043290ad3944268290e015c3063bc411e6ef6b diff --git a/torchchat/export.py b/torchchat/export.py index e84a344bd..37f0b056e 100644 --- a/torchchat/export.py +++ b/torchchat/export.py @@ -125,7 +125,6 @@ def export_for_server( ) from executorch.exir.tracer import Value - from torch._export import capture_pre_autograd_graph from torch.export import export, export_for_training, ExportedProgram from torchchat.model import apply_rotary_emb, Attention @@ -223,7 +222,7 @@ def forward(self, x, freqs_cis, mask, input_pos=None, cache_lane: int = 0): return self.wo(output) def replace_attention_with_custom_sdpa_attention(module: nn.Module): - from executorch.extension.llm.custom_ops import sdpa_with_kv_cache # noqa + from executorch.extension.llm.custom_ops import custom_ops # noqa for name, child in module.named_children(): if isinstance(child, Attention): @@ -316,7 +315,7 @@ def export_for_et(model, device, output_path) -> str: with torch.nn.attention.sdpa_kernel( [torch.nn.attention.SDPBackend.MATH] ), torch.no_grad(): - m = capture_pre_autograd_graph(model, input, dynamic_shapes=dynamic_shapes) + m = export_for_training(model, input, dynamic_shapes=dynamic_shapes).module() edge_manager = export_to_edge( m, From d625f72840fb66004d3d313331e84db87da9d7a4 Mon Sep 17 00:00:00 2001 From: Michael Gschwind <61328285+mikekgfb@users.noreply.github.com> Date: Fri, 17 Jan 2025 15:17:10 -0800 Subject: [PATCH 57/83] Fix typo in quantize.py (#1461) Fix typo Co-authored-by: Jack-Khuu --- 
torchchat/utils/quantize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchchat/utils/quantize.py b/torchchat/utils/quantize.py index f1ebf2902..3d1d57b86 100644 --- a/torchchat/utils/quantize.py +++ b/torchchat/utils/quantize.py @@ -847,4 +847,4 @@ def quantized_model(self) -> nn.Module: print("Unable to load torchao mps ops library.") except Exception as e: - print("Unabled to import torchao experimental quant_api with error: ", e) + print("Unable to import torchao experimental quant_api with error: ", e) From e5543e2f20fdf4b3034761b0e336201769a41bb1 Mon Sep 17 00:00:00 2001 From: Michael Gschwind <61328285+mikekgfb@users.noreply.github.com> Date: Fri, 17 Jan 2025 15:17:33 -0800 Subject: [PATCH 58/83] Update run-readme-pr-mps.yml for typo (#1460) fiux typo as separate PR, as per @malfet Co-authored-by: Jack-Khuu --- .github/workflows/run-readme-pr-mps.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/run-readme-pr-mps.yml b/.github/workflows/run-readme-pr-mps.yml index bf1587896..0d70a4c1d 100644 --- a/.github/workflows/run-readme-pr-mps.yml +++ b/.github/workflows/run-readme-pr-mps.yml @@ -63,7 +63,7 @@ jobs: test-gguf-mps-macos: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: - runner: macos-m1-14 # neeps MPS, was macos-m1-stable + runner: macos-m1-14 # needs MPS, was macos-m1-stable script: | set -x conda create -y -n test-quantization-mps-macos python=3.10.11 @@ -90,7 +90,7 @@ jobs: test-advanced-mps-macos: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: - runner: macos-m1-14 # neeps MPS, was macos-m1-stable + runner: macos-m1-14 # needs MPS, was macos-m1-stable script: | set -x conda create -y -n test-quantization-mps-macos python=3.10.11 From 2d96e480aa0eaee744febc4dc483a51b4ea23d16 Mon Sep 17 00:00:00 2001 From: Jiao Wang Date: Fri, 17 Jan 2025 19:23:04 -0800 Subject: [PATCH 59/83] Add Intel XPU device support to generate and serve (#1361) * add xpu * add xpu device * update * profile * update install * update * update * update --------- Co-authored-by: Jack-Khuu Co-authored-by: Guoqiong --- install/install_requirements.sh | 36 +++++++++++++++++++++------------ torchchat/cli/builder.py | 7 ++++++- torchchat/cli/cli.py | 4 ++-- torchchat/generate.py | 9 ++++++++- torchchat/utils/build_utils.py | 8 ++++++-- torchchat/utils/device_info.py | 11 +++++++++- torchchat/utils/quantize.py | 2 +- 7 files changed, 56 insertions(+), 21 deletions(-) diff --git a/install/install_requirements.sh b/install/install_requirements.sh index b5ac414fd..146e11096 100755 --- a/install/install_requirements.sh +++ b/install/install_requirements.sh @@ -59,12 +59,6 @@ VISION_NIGHTLY_VERSION=dev20241218 # Nightly version for torchtune TUNE_NIGHTLY_VERSION=dev20241218 -# Uninstall triton, as nightly will depend on pytorch-triton, which is one and the same -( - set -x - $PIP_EXECUTABLE uninstall -y triton -) - # The pip repository that hosts nightly torch packages. cpu by default. # If cuda is available, based on presence of nvidia-smi, install the pytorch nightly # with cuda for faster execution on cuda GPUs. @@ -74,16 +68,28 @@ then elif [[ -x "$(command -v rocminfo)" ]]; then TORCH_NIGHTLY_URL="https://download.pytorch.org/whl/nightly/rocm6.2" +elif [[ -x "$(command -v xpu-smi)" ]]; +then + TORCH_NIGHTLY_URL="https://download.pytorch.org/whl/nightly/xpu" else TORCH_NIGHTLY_URL="https://download.pytorch.org/whl/nightly/cpu" fi # pip packages needed by exir. 
-REQUIREMENTS_TO_INSTALL=( - torch=="2.6.0.${PYTORCH_NIGHTLY_VERSION}" - torchvision=="0.22.0.${VISION_NIGHTLY_VERSION}" - torchtune=="0.5.0.${TUNE_NIGHTLY_VERSION}" -) +if [[ -x "$(command -v xpu-smi)" ]]; +then + REQUIREMENTS_TO_INSTALL=( + torch=="2.6.0.${PYTORCH_NIGHTLY_VERSION}" + torchvision=="0.22.0.${VISION_NIGHTLY_VERSION}" + torchtune=="0.5.0" + ) +else + REQUIREMENTS_TO_INSTALL=( + torch=="2.6.0.${PYTORCH_NIGHTLY_VERSION}" + torchvision=="0.22.0.${VISION_NIGHTLY_VERSION}" + torchtune=="0.5.0.${TUNE_NIGHTLY_VERSION}" + ) +fi # # First install requirements in install/requirements.txt. Older torch may be @@ -95,6 +101,12 @@ REQUIREMENTS_TO_INSTALL=( $PIP_EXECUTABLE install -r install/requirements.txt --extra-index-url "${TORCH_NIGHTLY_URL}" ) +# Uninstall triton, as nightly will depend on pytorch-triton, which is one and the same +( + set -x + $PIP_EXECUTABLE uninstall -y triton +) + # Install the requirements. --extra-index-url tells pip to look for package # versions on the provided URL if they aren't available on the default URL. ( @@ -116,8 +128,6 @@ if [[ -x "$(command -v nvidia-smi)" ]]; then $PYTHON_EXECUTABLE torchchat/utils/scripts/patch_triton.py ) fi - - ( set -x $PIP_EXECUTABLE install evaluate=="0.4.3" lm-eval=="0.4.2" psutil=="6.0.0" diff --git a/torchchat/cli/builder.py b/torchchat/cli/builder.py index 38d0e33b2..69db14a4b 100644 --- a/torchchat/cli/builder.py +++ b/torchchat/cli/builder.py @@ -72,7 +72,12 @@ class BuilderArgs: def __post_init__(self): if self.device is None: - self.device = "cuda" if torch.cuda.is_available() else "cpu" + if torch.cuda.is_available(): + self.device = "cuda" + elif torch.xpu.is_available(): + self.device = "xpu" + else: + self.device = "cpu" if not ( (self.checkpoint_path and self.checkpoint_path.is_file()) diff --git a/torchchat/cli/cli.py b/torchchat/cli/cli.py index 91bdcaf26..723f25ea4 100644 --- a/torchchat/cli/cli.py +++ b/torchchat/cli/cli.py @@ -176,8 +176,8 @@ def _add_model_config_args(parser, verb: str) -> None: "--device", type=str, default=None, - choices=["fast", "cpu", "cuda", "mps"], - help="Hardware device to use. Options: fast, cpu, cuda, mps", + choices=["fast", "cpu", "cuda", "mps", "xpu"], + help="Hardware device to use. 
Options: fast, cpu, cuda, mps, xpu", ) diff --git a/torchchat/generate.py b/torchchat/generate.py index e271f5027..8ec4d4d5d 100644 --- a/torchchat/generate.py +++ b/torchchat/generate.py @@ -1203,8 +1203,10 @@ def callback(x, *, done_generating=False): if hasattr(prof, "export_chrome_trace"): if self.builder_args.device == "cpu": print(prof.key_averages().table(sort_by="self_cpu_time_total")) - else: + elif self.builder_args.device == "cuda": print(prof.key_averages().table(sort_by="self_cuda_time_total")) + else: + print(prof.key_averages().table(sort_by="self_xpu_time_total")) prof.export_chrome_trace(f"{self.profile}.json") if start_pos >= max_seq_length: @@ -1289,6 +1291,9 @@ def callback(x, *, done_generating=False): ) if torch.cuda.is_available(): print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB") + if torch.xpu.is_available(): + print(f"Memory used: {torch.xpu.max_memory_reserved() / 1e9:.02f} GB") + class DistributedGenerator(LocalGenerator): @@ -1615,6 +1620,8 @@ def run_generator( ) if torch.cuda.is_available(): torch.cuda.reset_peak_memory_stats() + if torch.xpu.is_available(): + torch.xpu.reset_peak_memory_stats() for _ in gen.chat(generator_args): pass diff --git a/torchchat/utils/build_utils.py b/torchchat/utils/build_utils.py index 2685ec2f3..a0862ff94 100644 --- a/torchchat/utils/build_utils.py +++ b/torchchat/utils/build_utils.py @@ -231,6 +231,8 @@ def find_multiple(n: int, k: int) -> int: def device_sync(device="cpu"): if "cuda" in device: torch.cuda.synchronize(device) + elif "xpu" in device: + torch.xpu.synchronize(device) elif ("cpu" in device) or ("mps" in device): pass else: @@ -279,7 +281,8 @@ def get_device_str(device) -> str: device = ( "cuda" if torch.cuda.is_available() - else "mps" if is_mps_available() else "cpu" + else "mps" if is_mps_available() + else "xpu" if torch.xpu.is_available() else "cpu" ) return device else: @@ -291,7 +294,8 @@ def get_device(device) -> str: device = ( "cuda" if torch.cuda.is_available() - else "mps" if is_mps_available() else "cpu" + else "mps" if is_mps_available() + else "xpu" if torch.xpu.is_available() else "cpu" ) return torch.device(device) diff --git a/torchchat/utils/device_info.py b/torchchat/utils/device_info.py index 9c5953944..950c03002 100644 --- a/torchchat/utils/device_info.py +++ b/torchchat/utils/device_info.py @@ -14,7 +14,7 @@ def get_device_info(device: str) -> str: """Returns a human-readable description of the hardware based on a torch.device.type Args: - device: A torch.device.type string: one of {"cpu", "cuda"}. + device: A torch.device.type string: one of {"cpu", "cuda", "xpu"}. Returns: str: A human-readable description of the hardware or an empty string if the device type is unhandled. @@ -37,4 +37,13 @@ def get_device_info(device: str) -> str: ) if device == "cuda": return torch.cuda.get_device_name(0) + if device == "xpu": + return ( + check_output( + ["xpu-smi discovery |grep 'Device Name:'"], shell=True + ) + .decode("utf-8") + .split("\n")[0] + .split("Device Name:")[1] + ) return "" diff --git a/torchchat/utils/quantize.py b/torchchat/utils/quantize.py index 3d1d57b86..e6f08d9a9 100644 --- a/torchchat/utils/quantize.py +++ b/torchchat/utils/quantize.py @@ -121,7 +121,7 @@ def quantize_model( else: ao_quant = True # Use tensor subclass API for int4 weight only. 
- if device == "cuda" and quantizer == "linear:int4": + if (device == "cuda" or device == "xpu") and quantizer == "linear:int4": quantize_(model, int4_weight_only(q_kwargs["groupsize"])) elif quantizer == "linear:int8": print("quantizer is linear int8") From defc225bbac754655dcc59bd24e6c28837d5c2ef Mon Sep 17 00:00:00 2001 From: Michael Gschwind <61328285+mikekgfb@users.noreply.github.com> Date: Tue, 21 Jan 2025 15:10:31 -0800 Subject: [PATCH 60/83] Create run-readme-pr-linuxaarch64 (#1350) * Create run-readme-pr-linuxaarch64 Test torchchat on aarch64 linux * Rename run-readme-pr-linuxaarch64 to run-readme-pr-linuxaarch64.yml add yml extension. * Update ADVANCED-USERS.md Update doc to indicate testing for ARMv8/aarch64 on Linux/raspbian is introduced by this PR --------- Co-authored-by: Jack-Khuu --- .../workflows/run-readme-pr-linuxaarch64.yml | 124 ++++++++++++++++++ docs/ADVANCED-USERS.md | 4 +- 2 files changed, 126 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/run-readme-pr-linuxaarch64.yml diff --git a/.github/workflows/run-readme-pr-linuxaarch64.yml b/.github/workflows/run-readme-pr-linuxaarch64.yml new file mode 100644 index 000000000..1f920a12f --- /dev/null +++ b/.github/workflows/run-readme-pr-linuxaarch64.yml @@ -0,0 +1,124 @@ +name: Run the README instructions - with stories - on Linux aarch64 + +on: + pull_request: + push: + branches: + - main + workflow_dispatch: + +jobs: + test-readme-cpu: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + with: + runner: linux-aarch64 + gpu-arch-type: cuda + gpu-arch-version: "12.1" + timeout: 60 + script: | + echo "::group::Print machine info" + uname -a + echo "::endgroup::" + + echo "::group::Install newer objcopy that supports --set-section-alignment" + yum install -y devtoolset-10-binutils + export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH + echo "::endgroup::" + + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs readme + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" + + test-quantization-cpu: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + with: + runner: linux-aarch64 + gpu-arch-type: cuda + gpu-arch-version: "12.1" + timeout: 60 + script: | + echo "::group::Print machine info" + uname -a + echo "::endgroup::" + + echo "::group::Install newer objcopy that supports --set-section-alignment" + yum install -y devtoolset-10-binutils + export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH + echo "::endgroup::" + + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs quantization + + test-gguf-cpu: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + with: + runner: linux-aarch64 + gpu-arch-type: cuda + gpu-arch-version: "12.1" + timeout: 60 + script: | + echo "::group::Print machine info" + uname -a + echo "::endgroup::" + + echo "::group::Install newer objcopy that supports --set-section-alignment" + yum install -y devtoolset-10-binutils + export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH + echo "::endgroup::" + + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs gguf + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" + + test-advanced-cpu: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + with: + runner: linux-aarch64 + gpu-arch-type: cuda + gpu-arch-version: "12.1" + timeout: 60 + script: | + echo "::group::Print machine info" + uname -a + echo "::endgroup::" + + echo "::group::Install newer objcopy that supports 
--set-section-alignment" + yum install -y devtoolset-10-binutils + export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH + echo "::endgroup::" + + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs advanced + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" + + test-evaluation-cpu: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + with: + runner: linux-aarch64 + gpu-arch-type: cuda + gpu-arch-version: "12.1" + timeout: 60 + script: | + echo "::group::Print machine info" + uname -a + echo "::endgroup::" + + echo "::group::Install newer objcopy that supports --set-section-alignment" + yum install -y devtoolset-10-binutils + export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH + echo "::endgroup::" + + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs evaluation + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" diff --git a/docs/ADVANCED-USERS.md b/docs/ADVANCED-USERS.md index a8d02c2f9..17958e790 100644 --- a/docs/ADVANCED-USERS.md +++ b/docs/ADVANCED-USERS.md @@ -479,7 +479,7 @@ in a Python-free environment with AOT Inductor and ExecuTorch. | Hardware | OS | Eager | Eager + Compile | AOT Compile | ET Runtime | |-----|------|-----|-----|-----|-----| | x86 | Linux | ✅ | ✅ | ✅ | ✅ | -| aarch64 | Linux | n/t | n/t | n/t | n/t | +| aarch64 | Linux | ✅ | ✅ | ✅ | n/t | | aarch64 | macOS | ✅ | ✅ | ✅ | ✅ | | AMD GPU | Linux | ✅ | ✅ | ✅ | ❌| | Nvidia GPU | Linux | ✅ | ✅ | ✅ | ❌| @@ -490,7 +490,7 @@ in a Python-free environment with AOT Inductor and ExecuTorch. | Mobile GPU (Vulkan) | Android | ❌|❌|❌| ✅ | | CoreML | iOS | ❌|❌|❌| ✅ | | Hexagon DSP | Android | ❌|❌|❌| ✅ | -| Raspberry Pi 4/5 | Raspbian | n/t | n/t | n/t | ✅ | +| Raspberry Pi 4/5 | Raspbian | ✅ | ✅ | ✅ | ✅ | | Raspberry Pi 4/5 | Android | ❌ | ❌ | ❌ | n/t | | ARM 32b (up to v7) | any | ❌|❌|❌|❌| From 2227014eb08117f073ac1acd519387c19a0ff079 Mon Sep 17 00:00:00 2001 From: Michael Gschwind <61328285+mikekgfb@users.noreply.github.com> Date: Tue, 21 Jan 2025 15:26:57 -0800 Subject: [PATCH 61/83] Bump test-readme-mps-macos timeout (#1451) --- .github/workflows/run-readme-pr-mps.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run-readme-pr-mps.yml b/.github/workflows/run-readme-pr-mps.yml index 0d70a4c1d..7ab5b1558 100644 --- a/.github/workflows/run-readme-pr-mps.yml +++ b/.github/workflows/run-readme-pr-mps.yml @@ -10,7 +10,7 @@ jobs: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: runner: macos-m1-14 - timeout: 50 + timeout: 60 script: | conda create -y -n test-readme-mps-macos python=3.10.11 llvm-openmp conda activate test-readme-mps-macos From bc0f93ad704ae03abb16a84cd3cf1cd5b31eb40e Mon Sep 17 00:00:00 2001 From: Jack-Khuu Date: Tue, 21 Jan 2025 17:02:49 -0800 Subject: [PATCH 62/83] Update torch/tune/vision pins to 1/19/25 (#1467) * Update install_requirements.sh * Update pytorch minor version * Update install_requirements.sh --- install/install_requirements.sh | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/install/install_requirements.sh b/install/install_requirements.sh index 146e11096..264c3496d 100755 --- a/install/install_requirements.sh +++ b/install/install_requirements.sh @@ -51,13 +51,13 @@ echo "Using pip executable: $PIP_EXECUTABLE" # NOTE: If a newly-fetched version of the executorch repo changes the value of # PYTORCH_NIGHTLY_VERSION, you should re-run this script to install the 
necessary # package versions. -PYTORCH_NIGHTLY_VERSION=dev20241218 +PYTORCH_NIGHTLY_VERSION=dev20250119 # Nightly version for torchvision -VISION_NIGHTLY_VERSION=dev20241218 +VISION_NIGHTLY_VERSION=dev20250119 # Nightly version for torchtune -TUNE_NIGHTLY_VERSION=dev20241218 +TUNE_NIGHTLY_VERSION=dev20250119 # The pip repository that hosts nightly torch packages. cpu by default. # If cuda is available, based on presence of nvidia-smi, install the pytorch nightly @@ -79,15 +79,15 @@ fi if [[ -x "$(command -v xpu-smi)" ]]; then REQUIREMENTS_TO_INSTALL=( - torch=="2.6.0.${PYTORCH_NIGHTLY_VERSION}" + torch=="2.7.0.${PYTORCH_NIGHTLY_VERSION}" torchvision=="0.22.0.${VISION_NIGHTLY_VERSION}" - torchtune=="0.5.0" + torchtune=="0.6.0" ) else REQUIREMENTS_TO_INSTALL=( - torch=="2.6.0.${PYTORCH_NIGHTLY_VERSION}" + torch=="2.7.0.${PYTORCH_NIGHTLY_VERSION}" torchvision=="0.22.0.${VISION_NIGHTLY_VERSION}" - torchtune=="0.5.0.${TUNE_NIGHTLY_VERSION}" + torchtune=="0.6.0.${TUNE_NIGHTLY_VERSION}" ) fi From cd10377df5515ae70e33da3948cb5c280145e188 Mon Sep 17 00:00:00 2001 From: Jack-Khuu Date: Tue, 21 Jan 2025 17:21:58 -0800 Subject: [PATCH 63/83] Add warning in PTEModel when not defined (#1468) * Add warning in PTEModel when not defined * Add missing parans --- torchchat/model.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/torchchat/model.py b/torchchat/model.py index f50d2a8be..28429370c 100644 --- a/torchchat/model.py +++ b/torchchat/model.py @@ -1062,5 +1062,6 @@ def forward(self, x, input_pos): def setup_caches(self, max_batch_size, max_seq_length): pass -except: +except Exception as e: + print(f"Warning: PTEModel (ExecuTorch) not available with exception: {e}") pass From ef58fce4aee7f94ca442de0d498ae6894eea942d Mon Sep 17 00:00:00 2001 From: YanbingJiang Date: Wed, 22 Jan 2025 09:55:30 +0800 Subject: [PATCH 64/83] Add attention_backend as a configurable option (#1456) bump this into the constructor of BuilderArgs Co-authored-by: Jack-Khuu --- torchchat/cli/builder.py | 13 +++++++++++++ torchchat/cli/cli.py | 7 +++++++ torchchat/generate.py | 7 ++++++- 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/torchchat/cli/builder.py b/torchchat/cli/builder.py index 69db14a4b..755817d1e 100644 --- a/torchchat/cli/builder.py +++ b/torchchat/cli/builder.py @@ -69,6 +69,7 @@ class BuilderArgs: prefill_possible: bool = False dynamic_shapes: bool = False max_seq_length: Optional[int] = None + attention_backend: str = "math" def __post_init__(self): if self.device is None: @@ -183,6 +184,17 @@ def from_args(cls, args: argparse.Namespace) -> "BuilderArgs": pp = getattr(args, "pp", 1) tp = getattr(args, "tp", 1) chpt_from = getattr(args, "chpt_from", "hf") + sdp_backend_dict = { + 'math': torch.nn.attention.SDPBackend.MATH, + 'flash_attention': torch.nn.attention.SDPBackend.FLASH_ATTENTION, + 'efficient_attention': torch.nn.attention.SDPBackend.EFFICIENT_ATTENTION, + 'cudnn_attention': torch.nn.attention.SDPBackend.CUDNN_ATTENTION, + } + attention_backend = sdp_backend_dict[args.attention_backend] + if args.device == "cpu" and (args.attention_backend == "efficient_attention" + or args.attention_backend == "cudnn_attention"): + print(f"Warning: {args.attention_backend} is not supported on CPU. 
Using math instead.") + attention_backend = torch.nn.attention.SDPBackend.MATH return cls( checkpoint_dir=checkpoint_dir, checkpoint_path=checkpoint_path, @@ -207,6 +219,7 @@ def from_args(cls, args: argparse.Namespace) -> "BuilderArgs": is_chat_model=is_chat_model, dynamic_shapes=getattr(args, "dynamic_shapes", False), max_seq_length=getattr(args, "max_seq_length", None), + attention_backend=attention_backend, ) @classmethod diff --git a/torchchat/cli/cli.py b/torchchat/cli/cli.py index 723f25ea4..70f404635 100644 --- a/torchchat/cli/cli.py +++ b/torchchat/cli/cli.py @@ -179,6 +179,13 @@ def _add_model_config_args(parser, verb: str) -> None: choices=["fast", "cpu", "cuda", "mps", "xpu"], help="Hardware device to use. Options: fast, cpu, cuda, mps, xpu", ) + model_config_parser.add_argument( + "--attention-backend", + type=str, + default="math", + choices=["math", "flash_attention", "efficient_attention", "cudnn_attention"], + help="SDPBackend to use. Options: MATH, FLASH_ATTENTION, EFFICIENT_ATTENTION, CUDNN_ATTENTION", + ) # Add CLI Args representing output paths of exported model files diff --git a/torchchat/generate.py b/torchchat/generate.py index 8ec4d4d5d..a596187f5 100644 --- a/torchchat/generate.py +++ b/torchchat/generate.py @@ -26,6 +26,7 @@ import torch.distributed as dist import torch.multiprocessing as mp from torch.distributed.pipelining import PipelineStage, ScheduleGPipe +from torch._C import _SDPBackend as SDPBackend from PIL import Image @@ -531,6 +532,7 @@ def decode_n_tokens( callback=lambda _: _, eos_token_id: int = 2, eot_id: Optional[int] = None, + attention_backend: SDPBackend = torch.nn.attention.SDPBackend.MATH, **sampling_kwargs, ): new_tokens, new_probs = [], [] @@ -539,7 +541,7 @@ def decode_n_tokens( num_new_tokens - 1 ): # -1 to save space to run an EoS if dont generate it naturally # Actually better for Inductor to codegen attention here - with torch.nn.attention.sdpa_kernel([torch.nn.attention.SDPBackend.MATH]): + with torch.nn.attention.sdpa_kernel([attention_backend]): out_token = cur_token.clone() next_token, next_prob = self.decode_one_token( @@ -683,6 +685,7 @@ def generate( sequential_prefill=True, callback=lambda x: x, max_seq_length: int, + attention_backend: str = "math", seed: Optional[int] = None, **sampling_kwargs, ) -> torch.Tensor: @@ -799,6 +802,7 @@ def generate( if self.is_llama3_model else None ), + attention_backend=attention_backend, **sampling_kwargs, ): generated_tokens.append(generated_token.view(-1)) @@ -1186,6 +1190,7 @@ def callback(x, *, done_generating=False): start_pos=start_pos, skip_cache_setup=not is_first_sample, max_seq_length=max_seq_length, + attention_backend=self.builder_args.attention_backend, ) for token_tensor, metrics in generator_func: if token_tensor is not None: From 601f2d178e5cd5b30935af2f8f0075f50c7676dc Mon Sep 17 00:00:00 2001 From: Jack-Khuu Date: Tue, 21 Jan 2025 17:58:34 -0800 Subject: [PATCH 65/83] Update import of sdpa_with_kv_cache to custom_ops (#1470) --- torchchat/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchchat/model.py b/torchchat/model.py index 28429370c..c01ff1262 100644 --- a/torchchat/model.py +++ b/torchchat/model.py @@ -1025,7 +1025,7 @@ def apply_rotary_emb(x: Tensor, freqs_cis: Tensor) -> Tensor: # For quantized_decomposed ops from executorch.kernels import quantized # no-qa # For llama::sdpa_with_kv_cache.out, preprocess ops - from executorch.extension.llm.custom_ops import sdpa_with_kv_cache # no-qa + from executorch.extension.llm.custom_ops import 
custom_ops # no-qa class PTEModel(nn.Module): def __init__(self, config, path) -> None: From 083960b671a56665dcb299030c88a0d387724b43 Mon Sep 17 00:00:00 2001 From: Jack-Khuu Date: Tue, 21 Jan 2025 18:56:16 -0800 Subject: [PATCH 66/83] Typo: Fix generate signature type hint for attention_backend (#1471) `attention_backend` is a SDPBackend, not a string --- torchchat/generate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchchat/generate.py b/torchchat/generate.py index a596187f5..a06e215f4 100644 --- a/torchchat/generate.py +++ b/torchchat/generate.py @@ -685,7 +685,7 @@ def generate( sequential_prefill=True, callback=lambda x: x, max_seq_length: int, - attention_backend: str = "math", + attention_backend: SDPBackend = torch.nn.attention.SDPBackend.MATH, seed: Optional[int] = None, **sampling_kwargs, ) -> torch.Tensor: From a942c1660fdd54dabe9d0bf713ef6acd4311ac62 Mon Sep 17 00:00:00 2001 From: vlado Date: Wed, 22 Jan 2025 15:37:28 -0500 Subject: [PATCH 67/83] chat: Change role to user for user prompts (#1447) Co-authored-by: Jack-Khuu --- torchchat/generate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchchat/generate.py b/torchchat/generate.py index a06e215f4..ad933687d 100644 --- a/torchchat/generate.py +++ b/torchchat/generate.py @@ -1126,7 +1126,7 @@ def chat( messages_to_encode.append( {"role": "system", "content": self.system_prompt} ) - messages_to_encode.append({"role": "system", "content": prompt}) + messages_to_encode.append({"role": "user", "content": prompt}) encoded = self.chat_formatter.encode_dialog_prompt( messages_to_encode, add_generation_prompt=True, ) From f514b3581ab5fc7cf7d864d953b4197ffe6cfbf0 Mon Sep 17 00:00:00 2001 From: Jack-Khuu Date: Thu, 23 Jan 2025 10:03:10 -0800 Subject: [PATCH 68/83] Update run-readme-pr-linuxaarch64.yml to use correct runner (#1469) * Update run-readme-pr-linuxaarch64.yml to use correct runner * Move to linux.arm64.m7g.4xlarge * Explicitly overriding the docker-image * Bumping Cuda version to 12.6 * Updating GPU Arch type * Testing various linux_job combos: v2 cuda, v2 cpu, v1 cpu * Adding permissions to linux job v2 * Switch everything to CPU linux v2 * Test with devtoolset-11 * Remove devtoolset install * Removing devtoolset from commands --- .../workflows/run-readme-pr-linuxaarch64.yml | 74 ++++++++----------- 1 file changed, 32 insertions(+), 42 deletions(-) diff --git a/.github/workflows/run-readme-pr-linuxaarch64.yml b/.github/workflows/run-readme-pr-linuxaarch64.yml index 1f920a12f..1f22c4f2e 100644 --- a/.github/workflows/run-readme-pr-linuxaarch64.yml +++ b/.github/workflows/run-readme-pr-linuxaarch64.yml @@ -9,22 +9,20 @@ on: jobs: test-readme-cpu: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read with: - runner: linux-aarch64 - gpu-arch-type: cuda - gpu-arch-version: "12.1" + runner: linux.arm64.2xlarge + docker-image: "pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-main" + gpu-arch-type: cpu-aarch64 timeout: 60 script: | echo "::group::Print machine info" uname -a echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs readme echo "::group::Completion" @@ -33,41 +31,37 @@ jobs: echo "::endgroup::" test-quantization-cpu: - uses: 
pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read with: - runner: linux-aarch64 - gpu-arch-type: cuda - gpu-arch-version: "12.1" + runner: linux.arm64.2xlarge + docker-image: "pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-main" + gpu-arch-type: cpu-aarch64 timeout: 60 script: | echo "::group::Print machine info" uname -a echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs quantization test-gguf-cpu: uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read with: - runner: linux-aarch64 - gpu-arch-type: cuda - gpu-arch-version: "12.1" + runner: linux.arm64.2xlarge + docker-image: "pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-main" + gpu-arch-type: cpu-aarch64 timeout: 60 script: | echo "::group::Print machine info" uname -a echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs gguf echo "::group::Completion" @@ -77,21 +71,19 @@ jobs: test-advanced-cpu: uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read with: - runner: linux-aarch64 - gpu-arch-type: cuda - gpu-arch-version: "12.1" + runner: linux.arm64.2xlarge + docker-image: "pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-main" + gpu-arch-type: cpu-aarch64 timeout: 60 script: | echo "::group::Print machine info" uname -a echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs advanced echo "::group::Completion" @@ -101,21 +93,19 @@ jobs: test-evaluation-cpu: uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + permissions: + id-token: write + contents: read with: - runner: linux-aarch64 - gpu-arch-type: cuda - gpu-arch-version: "12.1" + runner: linux.arm64.2xlarge + docker-image: "pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-main" + gpu-arch-type: cpu-aarch64 timeout: 60 script: | echo "::group::Print machine info" uname -a echo "::endgroup::" - echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH - echo "::endgroup::" - TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs evaluation echo "::group::Completion" From c536da40bf35cb7efa222be214b36e4f54d95d71 Mon Sep 17 00:00:00 2001 From: nlpfollower Date: Thu, 23 Jan 2025 15:03:14 -0800 Subject: [PATCH 69/83] Increment start_pos by encoded size in generate (#1462) * Add encoded size to start_pos * Only in chat mode --------- Co-authored-by: nlpfollower Co-authored-by: Jack-Khuu --- torchchat/generate.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/torchchat/generate.py b/torchchat/generate.py index ad933687d..7f37386ac 100644 --- a/torchchat/generate.py +++ b/torchchat/generate.py @@ -1192,6 +1192,8 @@ def callback(x, *, done_generating=False): max_seq_length=max_seq_length, 
attention_backend=self.builder_args.attention_backend, ) + if generator_args.chat_mode: + start_pos += encoded.size(0) for token_tensor, metrics in generator_func: if token_tensor is not None: start_pos += token_tensor.size(0) From 8662471d3082689696b830814100e8b4d9c05cf2 Mon Sep 17 00:00:00 2001 From: Jack-Khuu Date: Thu, 23 Jan 2025 18:45:24 -0800 Subject: [PATCH 70/83] Explicitly turning off pybindings for ExecuTorch unless requested (#1475) ExecuTorch now has XNN pybinding built by default https://github.com/pytorch/executorch/pull/7473 Previously it was not built by default --- torchchat/utils/scripts/install_utils.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchchat/utils/scripts/install_utils.sh b/torchchat/utils/scripts/install_utils.sh index 94378960a..57dcc77bf 100644 --- a/torchchat/utils/scripts/install_utils.sh +++ b/torchchat/utils/scripts/install_utils.sh @@ -88,7 +88,7 @@ install_executorch_python_libs() { echo "Building and installing python libraries" if [ "${ENABLE_ET_PYBIND}" = false ]; then echo "Not installing pybind" - bash ./install_requirements.sh + bash ./install_requirements.sh --pybind off else echo "Installing pybind" bash ./install_requirements.sh --pybind xnnpack From a64b9e3251111d1b2139754c7517a8ed0919410e Mon Sep 17 00:00:00 2001 From: Manuel Candales <42380156+manuelcandales@users.noreply.github.com> Date: Fri, 24 Jan 2025 12:51:30 -0500 Subject: [PATCH 71/83] Replace RMSNorm by nn.RMSNorm (#1464) In this PR we replace torchchat's own [RMSNorm](https://github.com/pytorch/torchchat/blob/f4ae60fc936328c7ebd4551019733dc0942c42f9/torchchat/model.py#L931-L942) implementation by nn.RMSNorm, and we bump the PyTorch pin to capture the massive speed up (30x-40x) to RMSNorm on MPS backend introduced in https://github.com/pytorch/pytorch/pull/145301 Preliminary benchmarks on an M1 Pro with 16GB RAM, show a 33% speed up on token generation when running Llama 3.2 1B with 4-bit quantization Motivation: Token generation on MPS backend is currently CPU bound, because of MPSGraph overhead. Surprisingly, the ops that are impacting performance the most are simple ones: mul, copy_, add, where, mean, rsqrt, sub, cat, stack. Experiments on an M1 Pro show that each of those op calls on the MPS backend, has at least 20us of CPU overhead. Also, these ops dominate the graph. For example, in aggregate, these ops are called 770 times for each token, when running Llama 3.2 1B. Compare that to SDPA which is called only 33 times, and linear which is called 113 times. - mul is called 275 times per token - copy_ is called 202 times per token - add is called 97 times per token - where is called 34 times per token - mean is called 33 times per token - rsqrt is called 33 times per token - sub is called 32 times per token - cat is called 32 times per token - stack is called 32 times per token Currently, torchchat's own [RMSNorm](https://github.com/pytorch/torchchat/blob/f4ae60fc936328c7ebd4551019733dc0942c42f9/torchchat/model.py#L931-L942) operation is basically implemented like this: ``` norm = x * torch.rsqrt(torch.mean(x * x, dim=-1, keepdim=True) + self.eps) output = norm(x.float()).type_as(x) * weight ``` This means that a single call to torchchat's RMSNorm involves 3 calls to `aten::mul` and calls to `aten::rsqrt`, `aten::mean` and `aten::add`. RMSNorm is called 33 times for each token. Hence, RMSNorm contributes 5 * 33 = 165 of those 770 op calls. 
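For readers comparing the two implementations, here is a small standalone sketch (not part of this patch; it assumes PyTorch 2.4 or newer, where `nn.RMSNorm` is available, and a float32 input) checking that the expression removed from torchchat/model.py agrees numerically with `nn.RMSNorm` constructed with the same `eps`. For lower-precision inputs the old code forced a float32 upcast before normalizing, so small rounding differences are possible in that case.

```
import torch
import torch.nn as nn

dim, eps = 64, 1e-5
x = torch.randn(2, 8, dim)

# What the removed torchchat RMSNorm computed: upcast to float32, scale by the
# reciprocal root-mean-square, cast back, then apply the learned weight
# (initialized to ones, as in the module).
weight = torch.ones(dim)
ref = (
    x.float() * torch.rsqrt(torch.mean(x.float() * x.float(), dim=-1, keepdim=True) + eps)
).type_as(x) * weight

# What this patch switches to; the elementwise affine weight also starts at ones.
rms_norm = nn.RMSNorm(dim, eps=eps)
out = rms_norm(x)

print(torch.allclose(ref, out, atol=1e-6))  # expected: True
```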
--- install/install_requirements.sh | 6 +++--- torchchat/model.py | 20 +++----------------- 2 files changed, 6 insertions(+), 20 deletions(-) diff --git a/install/install_requirements.sh b/install/install_requirements.sh index 264c3496d..360ba1801 100755 --- a/install/install_requirements.sh +++ b/install/install_requirements.sh @@ -51,13 +51,13 @@ echo "Using pip executable: $PIP_EXECUTABLE" # NOTE: If a newly-fetched version of the executorch repo changes the value of # PYTORCH_NIGHTLY_VERSION, you should re-run this script to install the necessary # package versions. -PYTORCH_NIGHTLY_VERSION=dev20250119 +PYTORCH_NIGHTLY_VERSION=dev20250124 # Nightly version for torchvision -VISION_NIGHTLY_VERSION=dev20250119 +VISION_NIGHTLY_VERSION=dev20250124 # Nightly version for torchtune -TUNE_NIGHTLY_VERSION=dev20250119 +TUNE_NIGHTLY_VERSION=dev20250124 # The pip repository that hosts nightly torch packages. cpu by default. # If cuda is available, based on presence of nvidia-smi, install the pytorch nightly diff --git a/torchchat/model.py b/torchchat/model.py index c01ff1262..ce7dcb5e4 100644 --- a/torchchat/model.py +++ b/torchchat/model.py @@ -657,7 +657,7 @@ def __init__(self, config: TransformerArgs) -> None: self.layers[str(layer_id)] = TransformerBlock(config) if config.stage_idx == config.n_stages - 1: - self.norm = RMSNorm(config.dim, eps=config.norm_eps) + self.norm = nn.RMSNorm(config.dim, eps=config.norm_eps) self.output = nn.Linear(config.dim, config.vocab_size, bias=False) if config.tie_word_embeddings: self.output.weight = self.tok_embeddings.weight @@ -751,8 +751,8 @@ def __init__(self, config: TransformerArgs) -> None: super().__init__() self.attention = Attention(config) self.feed_forward = FeedForward(config) - self.ffn_norm = RMSNorm(config.dim, config.norm_eps) - self.attention_norm = RMSNorm(config.dim, config.norm_eps) + self.ffn_norm = nn.RMSNorm(config.dim, config.norm_eps) + self.attention_norm = nn.RMSNorm(config.dim, config.norm_eps) # None for llama architecture, set for granite architectures self.residual_multiplier = ( config.residual_multiplier @@ -928,20 +928,6 @@ def forward(self, x: Tensor) -> Tensor: return self.w2(F.silu(self.w1(x)) * self.w3(x)) -class RMSNorm(nn.Module): - def __init__(self, dim: int, eps: float = 1e-5): - super().__init__() - self.eps = eps - self.weight = nn.Parameter(torch.ones(dim)) - - def _norm(self, x): - return x * torch.rsqrt(torch.mean(x * x, dim=-1, keepdim=True) + self.eps) - - def forward(self, x: Tensor) -> Tensor: - output = self._norm(x.float()).type_as(x) - return output * self.weight - - def apply_scaling(freqs: torch.Tensor, rope_scaling: Dict[str, Any]): # Check for the presence of the required keys required_keys = { From 84d223202f5e62fcdd08460246745ea8eed7bb11 Mon Sep 17 00:00:00 2001 From: Angela Yi Date: Fri, 24 Jan 2025 10:28:06 -0800 Subject: [PATCH 72/83] Update aoti calls to utilize new export and packaging APIs (#1455) Co-authored-by: Jack-Khuu --- torchchat/cli/builder.py | 3 +-- torchchat/export.py | 13 ++++++++----- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/torchchat/cli/builder.py b/torchchat/cli/builder.py index 755817d1e..a5b23dfe3 100644 --- a/torchchat/cli/builder.py +++ b/torchchat/cli/builder.py @@ -589,9 +589,8 @@ def do_nothing(max_batch_size, max_seq_length): # attributes will NOT be seen on by AOTI-compiled forward # function, e.g. calling model.setup_cache will NOT touch # AOTI compiled and maintained model buffers such as kv_cache. 
- from torch._inductor.package import load_package - aoti_compiled_model = load_package( + aoti_compiled_model = torch._inductor.aoti_load_package( str(builder_args.aoti_package_path.absolute()) ) diff --git a/torchchat/export.py b/torchchat/export.py index 37f0b056e..829bd47db 100644 --- a/torchchat/export.py +++ b/torchchat/export.py @@ -75,17 +75,20 @@ def export_for_server( if not package: options = {"aot_inductor.output_path": output_path} - path = torch._export.aot_compile( + ep = torch.export.export( model, example_inputs, dynamic_shapes=dynamic_shapes, - options=options, ) if package: - from torch._inductor.package import package_aoti - - path = package_aoti(output_path, path) + path = torch._inductor.aoti_compile_and_package( + ep, package_path=output_path, inductor_configs=options + ) + else: + path = torch._inductor.aot_compile( + ep.module(), example_inputs, options=options + ) print(f"The generated packaged model can be found at: {path}") return path From 1c2f5aa9160c45ef653cba39945608c1c4385906 Mon Sep 17 00:00:00 2001 From: Jack-Khuu Date: Fri, 24 Jan 2025 12:15:15 -0800 Subject: [PATCH 73/83] Update numpy requirements to no longer upper bound on 2.0 (#1479) --- install/requirements.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/install/requirements.txt b/install/requirements.txt index 457131275..bd1e09174 100644 --- a/install/requirements.txt +++ b/install/requirements.txt @@ -16,8 +16,7 @@ jinja2 # Miscellaneous snakeviz sentencepiece -# numpy version range required by GGUF util -numpy >= 1.17, < 2.0 +numpy >= 1.17 blobfile tomli >= 1.1.0 ; python_version < "3.11" openai From 59e168e083786bec08fb41bb46db540c8efbf0db Mon Sep 17 00:00:00 2001 From: Michael Gschwind <61328285+mikekgfb@users.noreply.github.com> Date: Fri, 24 Jan 2025 14:00:25 -0800 Subject: [PATCH 74/83] Add evaluation, multimodal, native tests to run-readme-pr-macos.yml (#1409) * Add evaluation, multimodal, native tests to run-readme-pr-macos.yml Add evaluation, multimodal, native tests to run-readme-pr-macos.yml * Update run-readme-pr-mps.yml * Update build_native.sh Update to C++11 ABI for AOTI, similar to ET * Update run-readme-pr-macos.yml fix typo --------- Co-authored-by: Jack-Khuu --- .github/workflows/run-readme-pr-macos.yml | 103 ++++++++++++++++++++++ 1 file changed, 103 insertions(+) diff --git a/.github/workflows/run-readme-pr-macos.yml b/.github/workflows/run-readme-pr-macos.yml index 64afe2247..c6f3d99f6 100644 --- a/.github/workflows/run-readme-pr-macos.yml +++ b/.github/workflows/run-readme-pr-macos.yml @@ -143,3 +143,106 @@ jobs: echo "tests complete" echo "*******************************************" echo "::endgroup::" + + test-eval-macos: + runs-on: macos-14-xlarge + steps: + - name: Checkout code + uses: actions/checkout@v2 + - uses: actions/setup-python@v4 + with: + python-version: '3.10.11' + - name: Setup Xcode + if: runner.os == 'macOS' + uses: maxim-lobanov/setup-xcode@v1 + with: + xcode-version: '15.3' + - name: Run script + run: | + set -x + # NS: Remove previous installation of torch first + # as this script does not isntall anything into conda env but rather as system dep + pip3 uninstall -y torch || true + set -eou pipefail + + echo "::group::Print machine info" + uname -a + sysctl machdep.cpu.brand_string + sysctl machdep.cpu.core_count + echo "::endgroup::" + + .ci/scripts/run-docs evaluation + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" + + test-multimodal-macos: + 
runs-on: macos-14-xlarge + steps: + - name: Checkout code + uses: actions/checkout@v2 + - uses: actions/setup-python@v4 + with: + python-version: '3.10.11' + - name: Setup Xcode + if: runner.os == 'macOS' + uses: maxim-lobanov/setup-xcode@v1 + with: + xcode-version: '15.3' + - name: Run script + run: | + set -x + # NS: Remove previous installation of torch first + # as this script does not isntall anything into conda env but rather as system dep + pip3 uninstall -y torch || true + set -eou pipefail + + echo "::group::Print machine info" + uname -a + sysctl machdep.cpu.brand_string + sysctl machdep.cpu.core_count + echo "::endgroup::" + + .ci/scripts/run-docs multimodal + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" + + test-native-macos: + runs-on: macos-14-xlarge + steps: + - name: Checkout code + uses: actions/checkout@v2 + - uses: actions/setup-python@v4 + with: + python-version: '3.10.11' + - name: Setup Xcode + if: runner.os == 'macOS' + uses: maxim-lobanov/setup-xcode@v1 + with: + xcode-version: '15.3' + - name: Run script + run: | + set -x + # NS: Remove previous installation of torch first + # as this script does not isntall anything into conda env but rather as system dep + pip3 uninstall -y torch || true + set -eou pipefail + + echo "::group::Print machine info" + uname -a + sysctl machdep.cpu.brand_string + sysctl machdep.cpu.core_count + echo "::endgroup::" + + .ci/scripts/run-docs native + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" + From 7b3a5fd18b4138b915870c7da48a6249d6de8525 Mon Sep 17 00:00:00 2001 From: Michael Gschwind <61328285+mikekgfb@users.noreply.github.com> Date: Fri, 24 Jan 2025 14:03:10 -0800 Subject: [PATCH 75/83] Add evaluation, multimodal, native tests to run-readme-pr-mps.yml (#1410) * Add evaluation, multimodal, native tests to run-readme-pr-mps.yml Add evaluation, multimodal, native tests to run-readme-pr-mps.yml * Update run-readme-pr-mps.yml Typos * Update run-readme-pr-mps.yml * Update run-readme-pr-mps.yml Extend timeout for test-readme-mps to avoid test failing from timeout. 
* Update build_native.sh Update to C++11 ABI for AOTI, similar to ET --------- Co-authored-by: Jack-Khuu --- .github/workflows/run-readme-pr-mps.yml | 81 +++++++++++++++++++++++++ 1 file changed, 81 insertions(+) diff --git a/.github/workflows/run-readme-pr-mps.yml b/.github/workflows/run-readme-pr-mps.yml index 7ab5b1558..4d5cd7e14 100644 --- a/.github/workflows/run-readme-pr-mps.yml +++ b/.github/workflows/run-readme-pr-mps.yml @@ -113,3 +113,84 @@ jobs: echo "tests complete" echo "*******************************************" echo "::endgroup::" + + test-evaluation-mps-macos: + uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + with: + runner: macos-m1-14 # needs MPS, was macos-m1-stable + script: | + set -x + conda create -y -n test-evaluation-mps-macos python=3.10.11 + conda activate test-evaluation-mps-macos + # NS: Remove previous installation of torch first + # as this script does not isntall anything into conda env + # but rather system dep + pip3 uninstall -y torch || true + set -eou pipefail + + echo "::group::Print machine info" + uname -a + sysctl machdep.cpu.brand_string + sysctl machdep.cpu.core_count + echo "::endgroup::" + + .ci/scripts/run-docs evaluation + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" + + test-multimodal-mps-macos: + uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + with: + runner: macos-m1-14 # needs MPS, was macos-m1-stable + script: | + set -x + conda create -y -n test-multimodal-mps-macos python=3.10.11 + conda activate test-multimodal-mps-macos + # NS: Remove previous installation of torch first + # as this script does not isntall anything into conda env + # but rather system dep + pip3 uninstall -y torch || true + set -eou pipefail + + echo "::group::Print machine info" + uname -a + sysctl machdep.cpu.brand_string + sysctl machdep.cpu.core_count + echo "::endgroup::" + + .ci/scripts/run-docs multimodal + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" + + test-native-mps-macos: + uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + with: + runner: macos-m1-14 # needs MPS, was macos-m1-stable + script: | + set -x + conda create -y -n test-native-mps-macos python=3.10.11 + conda activate test-native-mps-macos + # NS: Remove previous installation of torch first + # as this script does not isntall anything into conda env + # but rather system dep + pip3 uninstall -y torch || true + set -eou pipefail + + echo "::group::Print machine info" + uname -a + sysctl machdep.cpu.brand_string + sysctl machdep.cpu.core_count + echo "::endgroup::" + + .ci/scripts/run-docs native + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" From 4e2c3841e9353ce463526bf5e4241ce6f4e40e5b Mon Sep 17 00:00:00 2001 From: Michael Gschwind <61328285+mikekgfb@users.noreply.github.com> Date: Fri, 24 Jan 2025 14:07:36 -0800 Subject: [PATCH 76/83] Force run-readme-pr-macos.yml to use CPU instead of incorrectly loading to MPS (#1417) * bandaid for run-readme-pr-macos.yml incorrectly loading to MPS as per #1416 torchchat on hosts without MPS (which is all github hosts which use kvm to virtualize MacOS, but not MPS) should choose CPU as "fast" device. 
The logic is present (see discussion in #1416 ), but either not fully functional (that would be the easier one to fix, just print the result of get_device_str and fix the code!) or specifically ignored on load in torch/serialization.py (If this is the case, we're effectively looking at a core PyTorch issue....) In the meantime, this bandaid just forces the use of CPU on MacOS tests, to make MacOS tests run on CPU -- labeit hsortcircuiting test/execution of the "fast" device logic. Not ideal, but some testing beats no testing. * Update run-readme-pr-macos.yml Add informational message to MacOS CPU tests * Update build_native.sh Update to C++11 ABI for AOTI, similar to ET --------- Co-authored-by: Jack-Khuu --- .github/workflows/run-readme-pr-macos.yml | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/.github/workflows/run-readme-pr-macos.yml b/.github/workflows/run-readme-pr-macos.yml index c6f3d99f6..ce84d3b50 100644 --- a/.github/workflows/run-readme-pr-macos.yml +++ b/.github/workflows/run-readme-pr-macos.yml @@ -33,7 +33,8 @@ jobs: sysctl machdep.cpu.core_count echo "::endgroup::" - .ci/scripts/run-docs readme + echo "using workaround for #1416 and #1315 by setting torchchat device explicitly" + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs readme echo "::group::Completion" echo "tests complete" @@ -68,7 +69,8 @@ jobs: sysctl machdep.cpu.core_count echo "::endgroup::" - .ci/scripts/run-docs quantization + echo "using workaround for #1416 and #1315 by setting torchchat device explicitly" + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs quantization echo "::group::Completion" echo "tests complete" @@ -103,7 +105,8 @@ jobs: sysctl machdep.cpu.core_count echo "::endgroup::" - .ci/scripts/run-docs gguf + echo "using workaround for #1416 and #1315 by setting torchchat device explicitly" + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs gguf echo "::group::Completion" echo "tests complete" @@ -137,7 +140,8 @@ jobs: sysctl machdep.cpu.core_count echo "::endgroup::" - .ci/scripts/run-docs advanced + echo "using workaround for #1416 and #1315 by setting torchchat device explicitly" + TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs advanced echo "::group::Completion" echo "tests complete" From 8bae5478a14f7b695980a950a3b0dd618d74f231 Mon Sep 17 00:00:00 2001 From: Michael Gschwind <61328285+mikekgfb@users.noreply.github.com> Date: Mon, 27 Jan 2025 10:34:49 -0800 Subject: [PATCH 77/83] Add distributed tests to run-readme-pr.yml (#1466) * Add distributed tests to run-readme-pr.yml Need to ensure this is the right runner, @lessw2020 can you please have a look -- torchchat uses the same runners as pytorch. * Update run-docs Remove HF login because tokens not available as git secret * Update run-docs Replace llama3.1 with open-llama to avoid need for token. If this turns out running too long, then we can switch to stories110M * Update run-docs open-llama -> stories. 
--- .ci/scripts/run-docs | 3 ++- .github/workflows/run-readme-pr.yml | 22 ++++++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/.ci/scripts/run-docs b/.ci/scripts/run-docs index 521cfa811..3ca460cd2 100755 --- a/.ci/scripts/run-docs +++ b/.ci/scripts/run-docs @@ -129,7 +129,8 @@ fi if [ "$1" == "distributed" ]; then echo "::group::Create script to run distributed" - python3 torchchat/utils/scripts/updown.py --file docs/distributed.md > ./run-distributed.sh + python3 torchchat/utils/scripts/updown.py --file docs/distributed.md --replace 'llama3.1:stories110M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-distributed.sh + python3 torchchat/utils/scripts/updown.py --file docs/distributed.md --suppress huggingface-cli,HF_TOKEN > ./run-distributed.sh # for good measure, if something happened to updown processor, # and it did not error out, fail with an exit 1 echo "exit 1" >> ./run-distributed.sh diff --git a/.github/workflows/run-readme-pr.yml b/.github/workflows/run-readme-pr.yml index f32473435..37c27822b 100644 --- a/.github/workflows/run-readme-pr.yml +++ b/.github/workflows/run-readme-pr.yml @@ -306,3 +306,25 @@ jobs: echo "::endgroup::" TORCHCHAT_DEVICE=cpu .ci/scripts/run-docs native + + test-distributed-cuda: + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + with: + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "12.4" + timeout: 60 + script: | + echo "::group::Print machine info" + uname -a + echo "::endgroup::" + + .ci/scripts/run-docs distributed + + echo "::group::Completion" + echo "tests complete" + echo "*******************************************" + echo "::endgroup::" From eba2b078b1f1dd3e0366817417d5685f8107f51e Mon Sep 17 00:00:00 2001 From: Michael Gschwind <61328285+mikekgfb@users.noreply.github.com> Date: Thu, 30 Jan 2025 15:07:37 -0800 Subject: [PATCH 78/83] Update run-docs to avoid code duplication (#1439) * Update run-docs to avoid duplicate code Update run-docs to avoid duplicate code * Update run-docs Add back command explaining seemingly extraneous `echo exit 1` * Update build_native.sh Update to C++11 ABI for AOTI, similar to ET * Update run-docs * Update run-docs Update to run distributed inference test with open-llama instead of llama3.1 * Update run-docs Open-llama -> stories to avoid tokens. * Update README.md Remove -l 3 since no longer necessary after Angea's change * Update quantization.md remove -l 3 from aoti run , and write -l3 for et_run * Update run-docs -l 3:-l 2 -> -l3:-l2 after modifying the command lines. 
Hopefully this is legal for et_run * Update run.cpp Update to support non-space-separated args * Update run.cpp typo * Create cuda-32.json Add a gs=32 cuda.json for test runs with stories15M * Create mobile-32.json add gs=32 variant of mobile for tests * Update run-docs Use gs=32 variants with stories models * Update run-docs undo gs32 * Update run-readme-pr-mps.yml Extend timeout to avoid timeout of mps quantization test * Update run.cpp enforce that an argument must have at least length 2, and refine the check for uniarg (i.e., arg plus flag value in one option) to be args with more than 2 characters * Update run.cpp typos --------- Co-authored-by: Jack-Khuu --- .ci/scripts/run-docs | 202 ++++++++---------------- .github/workflows/run-readme-pr-mps.yml | 5 +- README.md | 2 +- docs/quantization.md | 4 +- runner/run.cpp | 44 ++++-- torchchat/quant_config/cuda-32.json | 5 + torchchat/quant_config/mobile-32.json | 4 + 7 files changed, 105 insertions(+), 161 deletions(-) create mode 100644 torchchat/quant_config/cuda-32.json create mode 100644 torchchat/quant_config/mobile-32.json diff --git a/.ci/scripts/run-docs b/.ci/scripts/run-docs index 3ca460cd2..71f074cef 100755 --- a/.ci/scripts/run-docs +++ b/.ci/scripts/run-docs @@ -1,145 +1,67 @@ -# /bin/bash -x +#!/bin/bash -x -if [ "X$1" == "X" ]; then +# Check if an argument was provided +if [ -z "$1" ]; then echo "Must specify document to run" exit 1 fi -if [ "$1" == "readme" ]; then - echo "::group::Create script to run README" - python3 torchchat/utils/scripts/updown.py --create-sections --file README.md --replace 'llama3.1:stories15M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-readme.sh - # for good measure, if something happened to updown processor, - # and it did not error out, fail with an exit 1 - echo "exit 1" >> ./run-readme.sh - echo "::endgroup::" - - echo "::group::Run README" - echo "*******************************************" - cat ./run-readme.sh - echo "*******************************************" - bash -x ./run-readme.sh - echo "::endgroup::" - - exit 0 -fi - -if [ "$1" == "quantization" ]; then - echo "::group::Create script to run quantization" - python3 torchchat/utils/scripts/updown.py --create-sections --file docs/quantization.md --replace llama3:stories15M --suppress huggingface-cli,HF_TOKEN > ./run-quantization.sh - # for good measure, if something happened to updown processor, - # and it did not error out, fail with an exit 1 - echo "exit 1" >> ./run-quantization.sh - echo "::endgroup::" - - echo "::group::Run quantization" - echo "*******************************************" - cat ./run-quantization.sh - echo "*******************************************" - bash -x ./run-quantization.sh - echo "::endgroup::" - - exit 0 -fi - -if [ "$1" == "gguf" ]; then - echo "::group::Create script to run gguf" - python3 torchchat/utils/scripts/updown.py --file docs/GGUF.md --replace 'llama3:stories15M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-gguf.sh - # for good measure, if something happened to updown processor, - # and it did not error out, fail with an exit 1 - echo "exit 1" >> ./run-gguf.sh - echo "::endgroup::" - - echo "::group::Run gguf" - echo "*******************************************" - cat ./run-gguf.sh - echo "*******************************************" - bash -x ./run-gguf.sh - echo "::endgroup::" -fi - - -if [ "$1" == "advanced" ]; then - echo "::group::Create script to run advanced" - python3 torchchat/utils/scripts/updown.py --file docs/ADVANCED-USERS.md --replace 'llama3:stories15M,-l 3:-l 2'
--suppress huggingface-cli,HF_TOKEN > ./run-advanced.sh - # for good measure, if something happened to updown processor, - # and it did not error out, fail with an exit 1 - echo "exit 1" >> ./run-advanced.sh - echo "::endgroup::" - - echo "::group::Run advanced" - echo "*******************************************" - cat ./run-advanced.sh - echo "*******************************************" - bash -x ./run-advanced.sh - echo "::endgroup::" -fi - -if [ "$1" == "evaluation" ]; then - echo "::group::Create script to run evaluation" - python3 torchchat/utils/scripts/updown.py --file torchchat/utils/docs/evaluation.md --replace 'llama3:stories15M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-evaluation.sh - # for good measure, if something happened to updown processor, - # and it did not error out, fail with an exit 1 - echo "exit 1" >> ./run-evaluation.sh - echo "::endgroup::" - - echo "::group::Run evaluation" - echo "*******************************************" - cat ./run-evaluation.sh - echo "*******************************************" - bash -x ./run-evaluation.sh -fi - -if [ "$1" == "multimodal" ]; then - - # Expecting that this might fail this test as-is, because - # it's the first on-pr test depending on github secrets for access with HF token access - - echo "::group::Create script to run multimodal" - python3 torchchat/utils/scripts/updown.py --file docs/multimodal.md > ./run-multimodal.sh - # for good measure, if something happened to updown processor, - # and it did not error out, fail with an exit 1 - echo "exit 1" >> ./run-multimodal.sh - echo "::endgroup::" - - echo "::group::Run multimodal" - echo "*******************************************" - cat ./run-multimodal.sh - echo "*******************************************" - bash -x ./run-multimodal.sh - echo "::endgroup::" -fi - -if [ "$1" == "native" ]; then - - echo "::group::Create script to run native-execution" - python3 torchchat/utils/scripts/updown.py --file docs/native-execution.md > ./run-native.sh - # for good measure, if something happened to updown processor, - # and it did not error out, fail with an exit 1 - echo "exit 1" >> ./run-native.sh - echo "::endgroup::" - - echo "::group::Run native-execution" - echo "*******************************************" - cat ./run-native.sh - echo "*******************************************" - bash -x ./run-native.sh - echo "::endgroup::" -fi - -if [ "$1" == "distributed" ]; then - - echo "::group::Create script to run distributed" - python3 torchchat/utils/scripts/updown.py --file docs/distributed.md --replace 'llama3.1:stories110M,-l 3:-l 2' --suppress huggingface-cli,HF_TOKEN > ./run-distributed.sh - python3 torchchat/utils/scripts/updown.py --file docs/distributed.md --suppress huggingface-cli,HF_TOKEN > ./run-distributed.sh - # for good measure, if something happened to updown processor, - # and it did not error out, fail with an exit 1 - echo "exit 1" >> ./run-distributed.sh - echo "::endgroup::" - - echo "::group::Run distributed" - echo "*******************************************" - cat ./run-distributed.sh - echo "*******************************************" - bash -x ./run-distributed.sh - echo "::endgroup::" -fi +# Pre-initialize variables +filepath="" +parameters="--replace 'llama3:stories15M,-l3:-l2' --suppress huggingface-cli,HF_TOKEN" +script_name="./run-${1}.sh" # Dynamically initialize script name + +# Use a case statement to handle the $1 argument +case "$1" in + "readme") + filepath="README.md" + ;; + "quantization") + filepath="docs/quantization.md" 
+ ;; + "gguf") + filepath="docs/GGUF.md" + ;; + "advanced") + filepath="docs/ADVANCED-USERS.md" + ;; + "evaluation") + filepath="torchchat/utils/docs/evaluation.md" + ;; + "multimodal") + filepath="docs/multimodal.md" + parameters="" # Clear parameters + ;; + "native") + filepath="docs/native-execution.md" + parameters="" # Clear parameters + ;; + "distributed") + filepath="docs/distributed.md" + parameters="--replace 'llama3.1:stories110M,-l3:-l2' --suppress huggingface-cli,HF_TOKEN" # Use stories110M to avoid need for authentication + ;; + "local") + filepath="docs/local-model.md" + parameters="" # Clear parameters + ;; + + *) + echo "Unknown option: $1" + exit 1 + ;; +esac + +# Generate the script +echo "::group::Create script to run $1" +python3 torchchat/utils/scripts/updown.py --file "$filepath" $parameters > "$script_name" +# if something happened to updown processor, and it did not error out, fail with an exit 1 +echo "exit 1" >> "$script_name" +echo "::endgroup::" + +# Run the script +echo "::group::Run $1" +echo "*******************************************" +cat "$script_name" +echo "*******************************************" +bash -x "$script_name" +echo "::endgroup::" diff --git a/.github/workflows/run-readme-pr-mps.yml b/.github/workflows/run-readme-pr-mps.yml index 4d5cd7e14..db16bc80e 100644 --- a/.github/workflows/run-readme-pr-mps.yml +++ b/.github/workflows/run-readme-pr-mps.yml @@ -15,8 +15,8 @@ jobs: conda create -y -n test-readme-mps-macos python=3.10.11 llvm-openmp conda activate test-readme-mps-macos set -x - # NS: Remove previous installation of torch first - # as this script does not isntall anything into conda env but rather as system dep + # NS: Remove previous installation of torch first + # as this script does not install anything into conda env but rather as system dep pip3 uninstall -y torch || true set -eou pipefail @@ -37,6 +37,7 @@ jobs: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: runner: macos-m1-14 + timeout: 60 script: | set -x conda create -y -n test-quantization-mps-macos python=3.10.11 diff --git a/README.md b/README.md index 2448b0b72..04fb4789e 100644 --- a/README.md +++ b/README.md @@ -413,7 +413,7 @@ torchchat/utils/scripts/build_native.sh et Execute using the runner ```bash -cmake-out/et_run llama3.1.pte -z `python3 torchchat.py where llama3.1`/tokenizer.model -l 3 -i "Once upon a time" +cmake-out/et_run llama3.1.pte -z `python3 torchchat.py where llama3.1`/tokenizer.model -i "Once upon a time" ``` diff --git a/docs/quantization.md b/docs/quantization.md index 704a7ed6a..56fd2182e 100644 --- a/docs/quantization.md +++ b/docs/quantization.md @@ -182,7 +182,7 @@ OMP_NUM_THREADS=6 python3 torchchat.py generate llama3.1 --dso-path llama3_1.so If you built the AOTI runner with link_torchao_ops as discussed in the setup section, you can also use the C++ runner: ``` -OMP_NUM_THREADS=6 ./cmake-out/aoti_run llama3_1.so -z $HOME/.torchchat/model-cache/meta-llama/Meta-Llama-3.1-8B-Instruct/tokenizer.model -l 3 -i "Once upon a time," +OMP_NUM_THREADS=6 ./cmake-out/aoti_run llama3_1.so -z $HOME/.torchchat/model-cache/meta-llama/Meta-Llama-3.1-8B-Instruct/tokenizer.model -i "Once upon a time," # -l 3 ``` #### ExecuTorch @@ -193,7 +193,7 @@ python torchchat.py export llama3.1 --device cpu --dtype float32 --quantize '{"e Note: only the ExecuTorch C++ runner in torchchat when built using the instructions in the setup can run the exported *.pte file. It will not work with the `python torchchat.py generate` command. 
``` -./cmake-out/et_run llama3_1.pte -z $HOME/.torchchat/model-cache/meta-llama/Meta-Llama-3.1-8B-Instruct/tokenizer.model -l 3 -i "Once upon a time," +./cmake-out/et_run llama3_1.pte -z $HOME/.torchchat/model-cache/meta-llama/Meta-Llama-3.1-8B-Instruct/tokenizer.model -l3 -i "Once upon a time," ``` ## Experimental TorchAO MPS lowbit kernels diff --git a/runner/run.cpp b/runner/run.cpp index e5c818cfa..d64c636bb 100644 --- a/runner/run.cpp +++ b/runner/run.cpp @@ -803,41 +803,53 @@ int main(int argc, char *argv[]) { } else { error_usage(); } - for (int i = 2; i < argc; i += 2) { + for (int i = 2; i < argc; i += 1) { // do some basic validation - if (i + 1 >= argc) { - error_usage(); - } // must have arg after flag + char *parm = argv[i+1]; + // uniarg means the arg comes right after the letter in accordance with posix + int uniarg = strlen(argv[i]) > 2; + if (argv[i][0] != '-') { error_usage(); } // must start with dash - if (strlen(argv[i]) != 2) { + + if (strlen(argv[i]) < 2) { error_usage(); - } // must be -x (one dash, one letter) + } // must have at least dash '-' and option letter + + if (uniarg) { + parm=&argv[i][2]; + } else if (i + 1 >= argc) { + error_usage(); + } // must have arg after option if flag is not contiguous to option + // read in the args if (argv[i][1] == 't') { - temperature = atof(argv[i + 1]); + temperature = atof(parm); } else if (argv[i][1] == 'p') { - topp = atof(argv[i + 1]); + topp = atof(parm); } else if (argv[i][1] == 's') { - rng_seed = atoi(argv[i + 1]); + rng_seed = atoi(parm); } else if (argv[i][1] == 'n') { - steps = atoi(argv[i + 1]); + steps = atoi(parm); } else if (argv[i][1] == 'v') { - vocab_size = atoi(argv[i + 1]); + vocab_size = atoi(parm); } else if (argv[i][1] == 'i') { - prompt = argv[i + 1]; + prompt = parm; } else if (argv[i][1] == 'z') { - tokenizer_path = argv[i + 1]; + tokenizer_path = parm; } else if (argv[i][1] == 'm') { - mode = argv[i + 1]; + mode = parm; } else if (argv[i][1] == 'y') { - system_prompt = argv[i + 1]; + system_prompt = parm; } else if (argv[i][1] == 'l') { - llama_ver = atoi(argv[i + 1]); + llama_ver = atoi(parm); } else { error_usage(); } + + // account for parameter + i += (uniarg)?0:1; } if (model_path == NULL) { diff --git a/torchchat/quant_config/cuda-32.json b/torchchat/quant_config/cuda-32.json new file mode 100644 index 000000000..90c37250a --- /dev/null +++ b/torchchat/quant_config/cuda-32.json @@ -0,0 +1,5 @@ +{ + "executor": {"accelerator": "cuda"}, + "precision": {"dtype": "bf16"}, + "linear:int4": {"groupsize" : 32} +} diff --git a/torchchat/quant_config/mobile-32.json b/torchchat/quant_config/mobile-32.json new file mode 100644 index 000000000..3afaa7542 --- /dev/null +++ b/torchchat/quant_config/mobile-32.json @@ -0,0 +1,4 @@ +{ + "embedding": {"bitwidth": 4, "groupsize" : 32}, + "linear:a8w4dq": {"groupsize" : 32} +} From 2f34fee72c39f2acf126f4ae91e8a3ccab2482c6 Mon Sep 17 00:00:00 2001 From: Michael Gschwind <61328285+mikekgfb@users.noreply.github.com> Date: Thu, 30 Jan 2025 16:16:55 -0800 Subject: [PATCH 79/83] Add `export --output-snapshot-path snap.tc`, and `--snapshot-path snap.tc` (#1465) * support model snapshots to save quantized models * import set backend --------- Co-authored-by: Michael Gschwind --- torchchat/cli/builder.py | 33 +++++++++++++++++++++++++++ torchchat/cli/cli.py | 14 +++++++++++- torchchat/export.py | 49 +++++++++++++++++++++++++++++++++++++++- 3 files changed, 94 insertions(+), 2 deletions(-) diff --git a/torchchat/cli/builder.py b/torchchat/cli/builder.py index 
a5b23dfe3..1e04800ab 100644 --- a/torchchat/cli/builder.py +++ b/torchchat/cli/builder.py @@ -56,6 +56,7 @@ class BuilderArgs: gguf_kwargs: Optional[Dict[str, Any]] = None dso_path: Optional[Union[Path, str]] = None aoti_package_path: Optional[Union[Path, str]] = None + snapshot_path: Optional[Union[Path, str]] = None pte_path: Optional[Union[Path, str]] = None device: Optional[str] = None precision: torch.dtype = torch.float32 @@ -87,6 +88,7 @@ def __post_init__(self): or (self.dso_path and Path(self.dso_path).is_file()) or (self.aoti_package_path and Path(self.aoti_package_path).is_file()) or (self.pte_path and Path(self.pte_path).is_file()) + or (self.snapshot_path and Path(self.snapshot_path).is_file()) ): raise RuntimeError( "need to specify a valid checkpoint path, checkpoint dir, gguf path, DSO path, AOTI PACKAGE or PTE path" @@ -142,6 +144,7 @@ def from_args(cls, args: argparse.Namespace) -> "BuilderArgs": dso_path = getattr(args, "dso_path", None) pte_path = getattr(args, "pte_path", None) aoti_package_path = getattr(args, "aoti_package_path", None) + snapshot_path = getattr(args, "snapshot_path", None) is_chat_model = False if args.is_chat_model: @@ -169,6 +172,7 @@ def from_args(cls, args: argparse.Namespace) -> "BuilderArgs": output_pte_path = getattr(args, "output_pte_path", None) output_aoti_package_path = getattr(args, "output_aoti_package_path", None) output_dso_path = getattr(args, "output_dso_path", None) + output_snapshot_path = getattr(args, "output_snapshot_path", None) if output_pte_path and args.dtype.startswith("fast"): if args.dtype == "fast": # As per Kimish, float32 should be faster on ET XNNPACK @@ -206,6 +210,7 @@ def from_args(cls, args: argparse.Namespace) -> "BuilderArgs": dso_path=dso_path, aoti_package_path=aoti_package_path, pte_path=pte_path, + snapshot_path=snapshot_path, device=args.device, precision=dtype, setup_caches=( @@ -631,6 +636,34 @@ def do_nothing(max_batch_size, max_seq_length): model = PTEModel(config, builder_args.pte_path) except Exception: raise RuntimeError(f"Failed to load ET compiled {builder_args.pte_path}") + elif builder_args.snapshot_path: + # Resolve ModelArgs for constructing the PTEModel + # If a manual params_path is provided, use that + if builder_args.params_path: + config: ModelArgs = ModelArgs.from_params(builder_args.params_path) + else: + # TODO: Instead of loading the whole model, refactor to call a + # helper that generate just model.config + with measure_time("Time to load model: {time:.02f} seconds"): + model = _load_model(builder_args) + device_sync(device=builder_args.device) + config = model.config + model = None + try: + model = torch.load(builder_args.snapshot_path, weights_only=False) + except Exception: + raise RuntimeError(f"Failed to load torchchat snapshot {builder_args.snapshot_path}") + # _active_backend() does not allow DSO & AOTI to be true. + # Choose either. 
+ from torchchat.utils.build_utils import set_backend + set_backend (dso=True, pte=False, aoti_package=False) + if (model.config != config): + raise RuntimeError("loaded model architecture mismatch") + ## + ## import all libraries with custom kernels ans custom operators + ## that quantize may be pulling in + ## + elif builder_args.distributed: pp_degree = builder_args.pp tp_degree = builder_args.tp diff --git a/torchchat/cli/cli.py b/torchchat/cli/cli.py index 70f404635..1d531c709 100644 --- a/torchchat/cli/cli.py +++ b/torchchat/cli/cli.py @@ -207,6 +207,12 @@ def _add_export_output_path_args(parser) -> None: default=None, help="Output to the specified AOT Inductor .dso model file", ) + exclusive_parser.add_argument( + "--output-snapshot-path", + type=str, + default=None, + help="Output to the specified PyTorch model and sha256 file", + ) exclusive_parser.add_argument( "--output-aoti-package-path", type=str, @@ -254,7 +260,13 @@ def _add_exported_input_path_args(parser) -> None: default=None, help="Use the specified ExecuTorch .pte model file", ) - + exclusive_parser.add_argument( + "--snapshot-path", + type=Path, + default=None, + help="Use the specified torchchat snaphot .tc model file", + ) + # Add CLI Args related to JIT downloading of model artifacts def _add_jit_downloading_args(parser) -> None: diff --git a/torchchat/export.py b/torchchat/export.py index 829bd47db..e7cb32309 100644 --- a/torchchat/export.py +++ b/torchchat/export.py @@ -28,6 +28,31 @@ default_device = "cpu" +""" +Export Snapshot +""" + + +def export_snapshot( + model: nn.Module, + device: Optional[str] = None, + output_path: str = "model-snapshot.tc", +) -> str: + """ + Export the model as snapshot. + + Args: + model: The model to be exported. + device: The device to run the model on. + output_path: The path to save the exported model. + Returns: + The path to the exported model. + """ + assert output_path.endswith(".tc"), "use .tc extension for snapshots" + torch.save(model, output_path) + return output_path + + """ Export for Server """ @@ -72,6 +97,7 @@ def export_for_server( "aot_inductor.package": package, "aot_inductor.metadata": metadata or {}, } + if not package: options = {"aot_inductor.output_path": output_path} @@ -373,6 +399,7 @@ def main(args): output_pte_path = args.output_pte_path output_dso_path = args.output_dso_path + output_snapshot_path = args.output_snapshot_path output_aoti_package_path = args.output_aoti_package_path if output_pte_path and builder_args.device != "cpu": @@ -380,7 +407,7 @@ def main(args): f"Warning! ExecuTorch export target is controlled by export recipe, not device setting. Ignoring device={builder_args.device} setting." ) builder_args.device = "cpu" - elif "mps" in builder_args.device: + elif (output_pte_path or output_dso_path or output_aoti_package_path) and "mps" in builder_args.device: print("Warning! Device MPS not supported for export. 
Exporting for device CPU.") builder_args.device = "cpu" @@ -417,6 +444,7 @@ def main(args): model_to_pte = model model_to_dso = model model_to_aoti_package = model + model_to_snapshot = model else: if output_pte_path: _set_gguf_kwargs(builder_args, is_et=True, context="export") @@ -436,6 +464,15 @@ def main(args): model_to_dso = model_to_aoti_package _unset_gguf_kwargs(builder_args) + if output_snapshot_path: + _set_gguf_kwargs(builder_args, is_et=False, context="export") + model_to_snapshot = _initialize_model( + builder_args, + quantize, + support_tensor_subclass=False, + ) + _unset_gguf_kwargs(builder_args) + with torch.no_grad(): if output_pte_path: output_pte_path = str(os.path.abspath(output_pte_path)) @@ -483,3 +520,13 @@ def main(args): package=True, metadata=metadata, ) + + if output_snapshot_path: + output_snapshot_path = str(os.path.abspath(output_snapshot_path)) + print(f"Exporting model using Snapshot to {output_snapshot_path}") + export_snapshot( + model_to_snapshot, + builder_args.device, + output_snapshot_path, + ) + From ad7f85a531e72dce1cbd380761d2f3219029ca7e Mon Sep 17 00:00:00 2001 From: Michael Gschwind <61328285+mikekgfb@users.noreply.github.com> Date: Fri, 31 Jan 2025 12:48:47 -0800 Subject: [PATCH 80/83] Update check_gibberish to check for aspell availability (#1487) Handle situation where aspell is not available --- .ci/scripts/check_gibberish | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.ci/scripts/check_gibberish b/.ci/scripts/check_gibberish index 5d9783b3b..912020a5a 100755 --- a/.ci/scripts/check_gibberish +++ b/.ci/scripts/check_gibberish @@ -24,6 +24,18 @@ else fi fi +####################################################################### +# +# check whether aspell spell check evailable + +if command -v aspell &> /dev/null; then + echo "Checking $TMPFILE for gibberish" +else + echo "Aspell is not installed or not in PATH." + echo "Gibberish unchecked in $TMPFILE" + exit 0 +fi + ####################################################################### # # run spell check on the extracted sequence From 31ecb188cf614c13e63748b702fc5b4cf7131d52 Mon Sep 17 00:00:00 2001 From: Jack-Khuu Date: Mon, 3 Feb 2025 10:15:00 -0800 Subject: [PATCH 81/83] Add DeepSeek R1 Distill 8B (#1488) * Add DeepSeek R1 Distill 8B * Update aliases to match Ollama * Update README --- README.md | 7 ++++++- tokenizer/hf_tokenizer.py | 10 ++++++++-- torchchat/model_config/models.json | 6 ++++++ .../model_params/DeepSeek-R1-Distill-Llama-8B.json | 1 + 4 files changed, 21 insertions(+), 3 deletions(-) create mode 100644 torchchat/model_params/DeepSeek-R1-Distill-Llama-8B.json diff --git a/README.md b/README.md index 04fb4789e..51db1bfca 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,11 @@ torchchat is a small codebase showcasing the ability to run large language models (LLMs) seamlessly. With torchchat, you can run LLMs using Python, within your own (C/C++) application (desktop or server) and on iOS and Android. > [!IMPORTANT] -> Update September 25, 2024: torchchat has multimodal support for **Llama3.2 11B**!! +> Update +> +> **February 3, 2025**: torchchat has support for [**DeepSeek R1 Distill: 8B**]( https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B)! +> +> **September 25, 2024**: torchchat has multimodal support for **Llama3.2 11B**! > > To try it out, finish the [Installation](#Installation) section below, then hop > over to our [multimodal guide](docs/multimodal.md) to learn more. @@ -75,6 +79,7 @@ aliases.
| [ibm-granite/granite-3.0-8b-instruct](https://huggingface.co/ibm-granite/granite-3.0-8b-instruct) |✅| Alias to `granite3-8b`.| | [ibm-granite/granite-3.1-2b-instruct](https://huggingface.co/ibm-granite/granite-3.1-2b-instruct) |✅| Alias to `granite3.1-2b` and `granite3.1`.| | [ibm-granite/granite-3.1-8b-instruct](https://huggingface.co/ibm-granite/granite-3.1-8b-instruct) |✅| Alias to `granite3.1-8b`.| +| [deepseek-ai/DeepSeek-R1-Distill-Llama-8B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B) |✅| Alias to `deepseek-r1:8b`.| ## Installation diff --git a/tokenizer/hf_tokenizer.py b/tokenizer/hf_tokenizer.py index d10ecb076..b77ee43ea 100644 --- a/tokenizer/hf_tokenizer.py +++ b/tokenizer/hf_tokenizer.py @@ -46,8 +46,14 @@ def __init__(self, file_path: str): if tokenizer_config_path is not None: with open(tokenizer_config_path, "r") as handle: tok_config = json.load(handle) - bos_token = tok_config.get("bos_token") - eos_token = tok_config.get("eos_token") + + def _extract_token(identifier: str) -> Optional[str]: + entry: Optional[Union[str, dict]] = tok_config.get(identifier) + return entry.get("content") if isinstance(entry, dict) else entry + + bos_token = _extract_token("bos_token") + eos_token = _extract_token("eos_token") + if bos_token is not None: self._bos_id = self._tokenizer.token_to_id(bos_token) if eos_token is not None: diff --git a/torchchat/model_config/models.json b/torchchat/model_config/models.json index d2252e6dd..3c2161b9b 100644 --- a/torchchat/model_config/models.json +++ b/torchchat/model_config/models.json @@ -51,6 +51,12 @@ "distribution_path": "meta-llama/Meta-Llama-3.1-8B-Instruct", "transformer_params_key": "Meta-Llama-3.1-8B" }, + "deepseek-ai/DeepSeek-R1-Distill-Llama-8B": { + "aliases": ["deepseek-r1:8b"], + "distribution_channel": "HuggingFaceSnapshot", + "distribution_path": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", + "tokenizer_file": "tokenizer.json" + }, "meta-llama/Meta-Llama-3.1-70B-Instruct": { "aliases": ["llama3.1-70b"], "distribution_channel": "HuggingFaceSnapshot", diff --git a/torchchat/model_params/DeepSeek-R1-Distill-Llama-8B.json b/torchchat/model_params/DeepSeek-R1-Distill-Llama-8B.json new file mode 100644 index 000000000..b9fa79cd2 --- /dev/null +++ b/torchchat/model_params/DeepSeek-R1-Distill-Llama-8B.json @@ -0,0 +1 @@ +{"block_size": 131072, "dim": 4096, "ffn_dim_multiplier": 1.3, "multiple_of": 1024, "n_heads": 32, "n_local_heads": 8, "n_layers": 32, "rope_base": 500000.0, "vocab_size": 128256, "use_tiktoken": true, "use_hf_tokenizer": true, "norm_eps": 1e-05, "rope_scaling": {"factor": 8.0, "low_freq_factor": 1.0, "high_freq_factor": 4.0, "original_max_position_embeddings": 8192}} From 5f9b3475aa8baa7df279d9d9c4ec1376fc66c9e2 Mon Sep 17 00:00:00 2001 From: vmpuri Date: Thu, 24 Oct 2024 12:43:48 -0700 Subject: [PATCH 82/83] Replace WeightOnlyInt8Linear with TorchAO int8_weight_only quantization --- torchchat/utils/quantize.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/torchchat/utils/quantize.py b/torchchat/utils/quantize.py index e6f08d9a9..a3e5b06ed 100644 --- a/torchchat/utils/quantize.py +++ b/torchchat/utils/quantize.py @@ -66,9 +66,11 @@ def get_named_parameters(func: Callable) -> List[str]: # Get the signature of the function signature = inspect.signature(func) + # Extract the parameters from the signature parameters = signature.parameters + # Filter and return named parameters named_params = [ name @@ -91,6 +93,8 @@ def validate_args( return q_kwargs + + 
######################################################################### ### torchchat quantization API ### @@ -133,6 +137,7 @@ def quantize_model( unwrap_tensor_subclass(model) continue + if quantizer in ["linear:a8wxdq", "embedding:wx"]: # These quantizers require float32 input weights. Note that after quantization, # the weights will no longer be float32, but lowbit integers From 8b1af3f1a874e43398d000cad971390db9534e20 Mon Sep 17 00:00:00 2001 From: vmpuri Date: Tue, 4 Feb 2025 13:48:33 -0800 Subject: [PATCH 83/83] Fallback to original quantization if float16 --- torchchat/utils/quantize.py | 160 ++++++++++++++++++++++++++++++++++-- 1 file changed, 152 insertions(+), 8 deletions(-) diff --git a/torchchat/utils/quantize.py b/torchchat/utils/quantize.py index a3e5b06ed..933bc1b9e 100644 --- a/torchchat/utils/quantize.py +++ b/torchchat/utils/quantize.py @@ -66,11 +66,9 @@ def get_named_parameters(func: Callable) -> List[str]: # Get the signature of the function signature = inspect.signature(func) - # Extract the parameters from the signature parameters = signature.parameters - # Filter and return named parameters named_params = [ name @@ -93,8 +91,6 @@ def validate_args( return q_kwargs - - ######################################################################### ### torchchat quantization API ### @@ -129,7 +125,15 @@ def quantize_model( quantize_(model, int4_weight_only(q_kwargs["groupsize"])) elif quantizer == "linear:int8": print("quantizer is linear int8") - quantize_(model, int8_weight_only()) + + # TODO: float16 quant via the AO quantize_() API seems broken. Remove this once the issue is resolved https://github.com/pytorch/ao/issues/1662 + if get_precision() == torch.float16: + print( + "model is float16 dtype - fallback to native implementation (see https://github.com/pytorch/ao/issues/1662)" + ) + ao_quant = False + else: + quantize_(model, int8_weight_only()) else: ao_quant = False if ao_quant: @@ -137,7 +141,6 @@ def quantize_model( unwrap_tensor_subclass(model) continue - if quantizer in ["linear:a8wxdq", "embedding:wx"]: # These quantizers require float32 input weights. 
Note that after quantization, # the weights will no longer be float32, but lowbit integers @@ -578,6 +581,147 @@ def linear_int8_et(input, weight, scales): ) +class WeightOnlyInt8Linear(nn.Module): + __constants__ = ["in_features", "out_features"] + in_features: int + out_features: int + weight: torch.Tensor + scales: torch.Tensor + + def __init__( + self, + in_features, + out_features, + bias=None, + device=None, + dtype=None, + *, + weight: Optional[torch.Tensor] = None, + scales: Optional[torch.Tensor] = None, + groupsize: Optional[int] = None, + ): + super().__init__() + if dtype is None: + dtype = torch.get_default_dtype() + + if device is None: + device = "cpu" + + assert not bias, "Bias is not supported by LinearInt8" + self.in_features = in_features + self.out_features = out_features + + assert (weight is None) == bool( + scales is None + ), "must specify both weights and scales, or neither" + if weight is None: + weight = torch.empty( + (out_features, in_features), + dtype=torch.int8, + device=device, + ) + if groupsize is None or (groupsize == 0): + scales = torch.empty(out_features, dtype=dtype, device=device) + else: + n_groups = (in_features + groupsize - 1) // groupsize + scales = torch.empty(out_features, n_groups, dtype=dtype, device=device) + + self.register_buffer("weight", weight.to(device)) + self.register_buffer("scales", scales.to(device)) + + if use_et_backend(): + self.forward = self.et_forward + else: + self.forward = self.aoti_forward + + def aoti_forward(self, input: torch.Tensor) -> torch.Tensor: + return linear_int8_aoti(input, self.weight, self.scales) + + def et_forward(self, input: torch.Tensor) -> torch.Tensor: + return linear_int8_et(input, self.weight, self.scales) + + +class WeightOnlyInt8QuantHandler(QuantHandler): + def __init__( + self, + model: Optional[nn.Module] = None, + device=None, + precision=None, + tokenizer=None, + *, + node_type: str = "*", + bitwidth: Optional[int] = None, + groupsize: Optional[int] = None, + ): + self.model_ = model + self.device = device + self.groupsize = groupsize + self.node_type = node_type + if bitwidth is None: + self.bitwidth = 8 + else: + self.bitwidth = bitwidth + + @torch.no_grad() + def quantize(self, module): + # cur_state_dict = state_dict_device(self.model_.state_dict()) + # dict_device = "cpu" # self.device + + if self.bitwidth == 4: + range_min = -8 + range_max = 7 + elif self.bitwidth == 8: + range_min = -128 + range_max = 127 + else: + raise ValueError(f"Unsupported bitwidth {self.bitwidth}") + + for name, child in module.named_children(): + # print(f"name: {name}") + if isinstance(child, nn.Linear): + if ( + (self.node_type == "*") + or (self.node_type == "output" and name == "output") + or (self.node_type == "!output" and name != "output") + ): + # print(f"{name, child}") + input_weight = child.weight.float() + # print(f"{name, child}") + # print(f"in_features: {child.in_features}") + # print(f"out_features: {child.out_features}") + + # print(f"expanded weight shape {input_weight.shape}") + weight, scales, _ = dynamically_quantize_per_channel( + input_weight, + range_min, + range_max, + torch.int8, + self.groupsize, + scales_dtype=child.weight.dtype, + ) + + setattr( + module, + name, + WeightOnlyInt8Linear( + in_features=child.in_features, + out_features=child.out_features, + device=self.device, + # update variables from quantization + weight=weight, + scales=scales, + groupsize=self.groupsize, + ), + ) + else: + self.quantize(child) + + return module + + def quantized_model(self) -> nn.Module: + 
return self.quantize(self.model_) + + ######################################################################### ##### embedding table quantization ###### ### (unify with torchao in future) ### @@ -797,7 +941,7 @@ def quantized_model(self) -> nn.Module: "precision": PrecisionHandler, "executor": ExecutorHandler, "linear:int4": Int4WeightOnlyQuantizer, - "linear:int8": int8_weight_only, + "linear:int8": WeightOnlyInt8QuantHandler, "linear:a8w4dq": Int8DynActInt4WeightQuantizer, } @@ -840,7 +984,7 @@ def quantized_model(self) -> nn.Module: print("Loaded torchao cpu ops.") except Exception as e: print( - "Unabled to load torchao cpu ops library. Slow fallback kernels will be used." + "Unable to load torchao cpu ops library. Slow fallback kernels will be used." ) try:
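As a usage sketch of the fallback introduced in this patch (assuming the `--quantize`, `--dtype`, and `--prompt` flags behave as documented in docs/quantization.md, and using the stories15M test model that appears throughout this series; the config keys shown are illustrative): requesting `linear:int8` on a float16 model now takes the native WeightOnlyInt8QuantHandler path, while bfloat16/float32 models continue through torchao's `int8_weight_only()` via `quantize_()`.

```bash
# Hypothetical invocations showing the two linear:int8 paths; flags and config
# keys are assumptions based on docs/quantization.md, not part of this patch.

# float16 -> falls back to the native WeightOnlyInt8QuantHandler
# (torchao int8_weight_only is skipped, see pytorch/ao issue #1662)
python3 torchchat.py generate stories15M --dtype float16 \
  --quantize '{"linear:int8": {"groupsize": 0}}' --prompt "Once upon a time"

# bf16 -> handled by torchao quantize_(model, int8_weight_only())
python3 torchchat.py generate stories15M --dtype bf16 \
  --quantize '{"linear:int8": {"groupsize": 0}}' --prompt "Once upon a time"
```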