Commit 6e7e360

Merge pull request #21 from andrea-fasoli/int8_llm
INT8 LLM support
2 parents d9d130b + e19b055 commit 6e7e360

File tree

1 file changed: +91 -25 lines changed


scripts/inference.py

@@ -1,22 +1,24 @@
+# Standard
 import argparse
+from functools import partial
 import itertools
 import json
 import os
+from pathlib import Path
 import random
-import sys
 import time
-from pathlib import Path

+# Third Party
 from aiu_fms_testing_utils.utils import aiu_setup
 from aiu_fms_testing_utils.utils.aiu_setup import dprint, rank, local_rank, world_size
 import numpy as np
 import torch
-import torch._inductor.config
+from torch import distributed as dist
 from fms.models import get_model, register_model
 from fms.models.llama import LLaMAConfig, _llama_factory_factory
-from fms.utils import fusion, generation, tokenizers
+from fms.utils import generation, tokenizers
 from fms.utils.generation import generate, pad_input_ids
-from torch import distributed as dist
+

 # This example script validates the LLaMA implementation by running inference on a couple of prompts.
 #
@@ -59,10 +61,27 @@
 parser.add_argument(
     "--quantization",
     type=str,
-    choices=["gptq"],
+    choices=["gptq", "int8"],
     default=None,
     help="Type of quantization of the model checkpoint",
 )
+parser.add_argument(
+    "--int8_weight_per_channel",
+    action="store_true",
+    help="Enable per-channel weight quantization in INT8 quantized model",
+)
+parser.add_argument(
+    "--int8_activ_quant_type",
+    default="per_token",
+    choices=["per_token", "per_tensor_symm", "per_tensor_asymm"],
+    type=str,
+    help="Define strategy for activation quantization in INT8 quantized model",
+)
+parser.add_argument(
+    "--int8_smoothquant",
+    action="store_true",
+    help="Enable smoothquant in INT8 quantized model",
+)
 parser.add_argument(
     "--tokenizer",
     type=str,
@@ -196,19 +215,18 @@
 args = parser.parse_args()

 if args.quantization == "gptq":
-    GPTQ_ENABLED = True
-    try:
-        if "aiu" in args.device_type:
+    if "aiu" in args.device_type:
+        try:
             from fms_mo.aiu_addons.gptq import gptq_aiu_adapter, gptq_aiu_linear
             print("Loaded `aiu_addons` functionalities")
-        elif args.device_type != "cpu":
-            raise ValueError(f"Device {args.device_type} unsupported for GPTQ run")
-    except ImportError as e:
-        print(f"Failed to import addon packages: {e}")
-        GPTQ_ENABLED = False
-
-    if not GPTQ_ENABLED:
-        raise Exception("GPTQ not enabled")
+        except:
+            raise ImportError("Failed to import GPTQ addons from fms-mo.")
+elif args.quantization == "int8":
+    try:
+        from fms_mo.aiu_addons.i8i8 import i8i8_aiu_adapter, i8i8_aiu_linear
+        print("Loaded `aiu_addons` functionalities")
+    except:
+        raise ImportError("Failed to import INT8 addons from fms-mo.")

 # this is a test model config
 config = LLaMAConfig(
@@ -319,6 +337,13 @@

 fused_weights = not args.unfuse_weights
 if args.quantization == "gptq":
+    if fused_weights and is_aiu_backend:
+        raise ValueError("GPTQ checkpoints on AIU must always run with --unfuse_weights")
+    if default_dtype is not None:
+        raise ValueError(
+            "GPTQ default_dtype must be None to preserve the checkpoint data types."
+        )
+
     if "aiu" in args.device_type:
         linear_type = "gptq_aiu"
     elif args.device_type == "cpu":
@@ -352,12 +377,51 @@
         "group_size": group_size,
         "desc_act": desc_act,
     }
-    # [ATTENTION] for GPTQ on AIU, we must always instantiate an unfused
-    # model, the adapter will take care of converting key/values from
-    # ckpt into the appropriate form for the model
-    if fused_weights:
-        raise ValueError("GPTQ checkpoints on AIU must always run with --unfuse_weights")
-    default_dtype = None # GPTQ dtype always comes from ckpt, can't be enforced
+elif args.quantization == "int8":
+    if fused_weights and is_aiu_backend:
+        raise ValueError("INT8 checkpoints on AIU must always run with --unfuse_weights")
+    if default_dtype is not None:
+        raise ValueError(
+            "INT8 default_dtype must be None to preserve the checkpoint data types."
+        )
+
+    def select_int8_module(
+        module_name: str | None = None,
+        smoothquant: bool = True,
+        smoothquant_layers: list[str] | None = None,
+    ):
+        if module_name is None:
+            return "int8_aiu"
+        smoothquant_on_module = (
+            any([m in module_name for m in smoothquant_layers])
+            if smoothquant_layers is not None
+            else True
+        )
+        use_smoothquant = smoothquant and smoothquant_on_module
+        return "int8_smoothquant_aiu" if use_smoothquant else "int8_aiu"
+
+    if args.int8_smoothquant:
+        # TODO: consider saving this info into config during quantization
+        if any("granite" in p.lower() for p in [args.model_path, args.architecture]):
+            smoothquant_layers = ["key", "value", "w1", "wg"]
+        elif any("roberta" in p.lower() for p in [args.model_path, args.architecture]):
+            smoothquant_layers = ["query", "key", "value", "w1"]
+        else:
+            raise NotImplementedError(
+                "INT8 architecture does not support smoothquant."
+            )
+    else:
+        smoothquant_layers = []
+
+    linear_config = {
+        "linear_type": partial(
+            select_int8_module,
+            smoothquant = args.int8_smoothquant,
+            smoothquant_layers = smoothquant_layers,
+        ),
+        "weight_per_channel": args.int8_weight_per_channel,
+        "activ_quant_type": args.int8_activ_quant_type,
+    }
 else:
     linear_config = {"linear_type": "torch_linear"}

@@ -381,13 +445,13 @@
     fused_weights=fused_weights,
 )

-if args.quantization == "gptq":
+if args.quantization in ["gptq", "int8"]:
     if rank == 0 and args.verbose > 0:
         dprint("PARAMS:\n" + "\n".join(f"{k:60} {str(v.dtype):15} {str(v.device):10} {list(v.size())}" for k,v in model.named_parameters()))
         dprint("BUFFERS:\n" + "\n".join(f"{k:60} {str(v.dtype):15} {str(v.device):10} {list(v.size())}" for k,v in model.named_buffers()))
         dprint("="*60 + "\n")
         if args.architecture == "llama":
-            dprint("[NOTE] It's OK for unused keys to contain bias and rotary embeddings, in GPTQ LLaMA models")
+            dprint("[NOTE] In Llama models, it's OK for bias and rotary embeddings to be marked as unused keys.")
         dprint(model)
         dprint("="*60 + "\n")

@@ -522,6 +586,8 @@ def truncate_prompts_to_max_length(prompts, max_len, max_allowed_length):
     ids, extra_generation_kwargs = pad_input_ids(prompts, min_pad_length=padding_length)
 else:
     ids = prompts
+    if isinstance(ids, list) and len(ids) == 1:
+        ids = ids[0].unsqueeze(0)
     extra_generation_kwargs = None

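For readers skimming the diff: in the INT8 path, the linear_config entry "linear_type" is a partial-bound callable, so each linear module can be routed to either a plain INT8 kernel or a smoothquant one based on its name. Below is a minimal standalone sketch that mirrors the select_int8_module helper added above; the module names passed in are illustrative only, not taken from a real checkpoint.

from functools import partial

# Standalone mirror of the select_int8_module logic from this commit, so the
# routing can be exercised without loading a model.
def select_int8_module(module_name=None, smoothquant=True, smoothquant_layers=None):
    if module_name is None:
        return "int8_aiu"
    smoothquant_on_module = (
        any(m in module_name for m in smoothquant_layers)
        if smoothquant_layers is not None
        else True
    )
    return "int8_smoothquant_aiu" if smoothquant and smoothquant_on_module else "int8_aiu"

# Granite-style layer list from the commit; "dense" is a made-up non-matching name.
linear_type = partial(
    select_int8_module,
    smoothquant=True,
    smoothquant_layers=["key", "value", "w1", "wg"],
)
for name in ["layers.0.attn.key", "layers.0.attn.dense", "layers.0.ff.w1"]:
    print(name, "->", linear_type(name))
# layers.0.attn.key   -> int8_smoothquant_aiu
# layers.0.attn.dense -> int8_aiu
# layers.0.ff.w1      -> int8_smoothquant_aiu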
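The last hunk also covers the case where a single un-padded prompt reaches generation as a one-element list. A minimal illustration of that branch, using dummy token ids, presumably so downstream generation sees a [batch, seq_len] tensor:

import torch

# One tokenized prompt, no padding applied; values are dummy token ids.
ids = [torch.tensor([1, 15043, 3186])]
if isinstance(ids, list) and len(ids) == 1:
    ids = ids[0].unsqueeze(0)  # add a batch dimension -> shape [1, 3]
print(ids.shape)  # torch.Size([1, 3])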