Skip to content

Commit c0b1b76

Browse files
Merge pull request #177 from stochasticai/toan/fix_int4
fix: int4 loading model
2 parents d8bbb07 + 4df89b6 commit c0b1b76

File tree

1 file changed

+30
-20
lines changed

1 file changed

+30
-20
lines changed

src/xturing/engines/llama_engine.py

Lines changed: 30 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,18 @@
11
import os
22
from pathlib import Path
33
from typing import Any, Dict, List, Optional, Tuple, Union
4-
import transformers
54

65
import torch
6+
import transformers
77
from torch import nn
88

99
from xturing.engines.causal import CausalEngine, CausalLoraEngine
1010
from xturing.engines.llama_utils import LlamaConfig, LlamaForCausalLM, LlamaTokenizer
1111
from xturing.engines.lora_engine import prepare_model_for_int8_training
12-
from xturing.engines.quant_utils import make_quant, autotune_warmup
12+
from xturing.engines.quant_utils import autotune_warmup, make_quant
1313
from xturing.utils.hub import ModelHub
1414

15+
1516
class LLamaEngine(CausalEngine):
1617
config_name: str = "llama_engine"
1718

@@ -102,24 +103,28 @@ def __init__(self, weights_path: Optional[Union[str, Path]] = None):
102103
target_modules=["q_proj", "v_proj"],
103104
)
104105

def find_layers(module, layers=(nn.Conv2d, nn.Linear), name=""):
    """Recursively collect submodules of *module* whose exact type is in *layers*.

    Args:
        module: Root ``nn.Module`` to search.
        layers: Module types to match. The comparison is an exact ``type``
            check (deliberately not ``isinstance``), so subclasses of the
            listed types are NOT matched — quantization replaces only the
            plain layer types.
        name: Dotted attribute-path prefix accumulated during recursion;
            leave as ``""`` for the top-level call.

    Returns:
        Dict mapping each matching submodule's dotted path
        (e.g. ``"model.layers.0.self_attn.q_proj"``) to the submodule itself.
    """
    # Tuple default instead of the original mutable list default — a classic
    # Python pitfall. Membership via ``in`` is identical for tuple and list,
    # and callers passing their own list are unaffected.
    if type(module) in layers:
        return {name: module}
    res = {}
    for child_name, child in module.named_children():
        res.update(
            find_layers(
                child,
                layers=layers,
                name=name + "." + child_name if name != "" else child_name,
            )
        )
    return res
114118

119+
115120
class LlamaLoraInt4Engine(CausalLoraEngine):
116121
config_name: str = "llama_lora_int4_engine"
117122

118123
def __init__(self, weights_path: Optional[Union[str, Path]] = None):
119-
model_name = "decapoda-research/llama-7b-hf"
124+
model_name = "decapoda-research/llama-7b-hf"
120125

121126
if weights_path is None:
122-
weights_path = ModelHub().load("x/llama_lora_int4")
127+
weights_path = ModelHub().load("x/llama_lora_int4")
123128

124129
config = LlamaConfig.from_pretrained(model_name)
125130

@@ -129,10 +134,10 @@ def __init__(self, weights_path: Optional[Union[str, Path]] = None):
129134

130135
def noop(*args, **kwargs):
131136
pass
132-
133-
torch.nn.init.kaiming_uniform_ = noop
134-
torch.nn.init.uniform_ = noop
135-
torch.nn.init.normal_ = noop
137+
138+
torch.nn.init.kaiming_uniform_ = noop
139+
torch.nn.init.uniform_ = noop
140+
torch.nn.init.normal_ = noop
136141

137142
torch.set_default_dtype(torch.half)
138143
transformers.modeling_utils._init_weights = False
@@ -143,18 +148,23 @@ def noop(*args, **kwargs):
143148

144149
layers = find_layers(model)
145150

146-
for name in ['lm_head']:
151+
for name in ["lm_head"]:
147152
if name in layers:
148153
del layers[name]
149-
154+
150155
wbits = 4
151156
groupsize = 128
152-
warmup_autotune=True
153-
157+
warmup_autotune = True
158+
154159
make_quant(model, layers, wbits, groupsize)
155-
156160

157-
model.load_state_dict(torch.load(weights_path / Path("pytorch_model.bin")), strict=False)
161+
state_dict = torch.load(
162+
weights_path / Path("pytorch_model.bin"), map_location="cpu"
163+
)
164+
new_state_dict = {}
165+
for key, value in state_dict.items():
166+
new_state_dict[key[6:]] = value
167+
model.load_state_dict(new_state_dict, strict=False)
158168

159169
if warmup_autotune:
160170
autotune_warmup(model)
@@ -171,12 +181,12 @@ def noop(*args, **kwargs):
171181
tokenizer.pad_token_id = tokenizer.eos_token_id
172182

173183
super().__init__(
174-
model=model,
184+
model=model,
175185
tokenizer=tokenizer,
176186
target_modules=[
177187
"q_proj",
178188
"v_proj",
179-
]
189+
],
180190
)
181191

182192
torch.nn.init.kaiming_uniform_ = saved_kaiming_uniform_

0 commit comments

Comments
 (0)