from transformers import AutoModelForCausalLM,BitsAndBytesConfig
# bitsandbytes settings: load the weights in 8-bit with an int8 threshold
# of 200.0.
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=200.0,
)

# Load the causal LM with the quantization settings above.
# `model_id` and `torch` are expected to be defined/imported by the
# surrounding script; they are not part of this snippet.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map='auto',
    torch_dtype=torch.float16,
    quantization_config=quantization_config,
)
import time
from optimum.nvidia import AutoModelForCausalLM, ExportConfig
from optimum.nvidia.utils.cli import (
postprocess_quantization_parameters,
register_common_model_topology_args,
register_optimization_profiles_args,
register_quantization_args,
)
from transformers import AutoTokenizer
# Checkpoint shared by the tokenizer and the export configuration.
_CHECKPOINT = "meta-llama/Llama-2-13b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)

# Start from the checkpoint's export configuration, then override the
# sequence-length, token-count and beam-width limits.
export = ExportConfig.from_pretrained(_CHECKPOINT)
export.max_input_len = 128
export.max_output_len = 64
export.max_num_tokens = 64
export.max_beam_width = 1
def gentext(prompt):
    """Generate a completion for ``prompt`` and time the whole call.

    Relies on the module-level ``tokenizer`` and ``model`` objects.

    Args:
        prompt: Text that is tokenized and handed to ``model.generate``.

    Returns:
        A ``(response, total_time)`` tuple of strings: the decoded output
        and the elapsed wall-clock time in seconds (tokenization,
        generation and decoding included).
    """
    start_time = time.time()
    inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
    # Pass the ids from ``inputs``; the name ``tokens`` is never defined in
    # this script, so referencing it raised NameError before generation ran.
    generated = model.generate(
        inputs["input_ids"],
    )
    # NOTE(review): ``generate`` may return a batch (or a tuple) rather than
    # a single id sequence; if so, index the first sequence or use
    # ``tokenizer.batch_decode`` here. Confirm against the model class in use.
    response = tokenizer.decode(generated, skip_special_tokens=True)
    total_time = time.time() - start_time
    return str(response), str(total_time)
# Load the model with the export configuration prepared above.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-13b-chat-hf",
    device_map="sequential",
    export_config=export,
)

# Smoke test: print the (response, elapsed seconds) pair for one prompt.
_result = gentext("write a poem")
print(_result)
root@sd2-compile-nvidia-ckz65:/optimum-nvidia# nvidia-smi
Sun Aug 11 06:18:10 2024
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.06 Driver Version: 535.183.06 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 Tesla V100-SXM2-16GB On | 00000000:00:1C.0 Off | 0 |
| N/A 37C P0 37W / 300W | 0MiB / 16384MiB | 0% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
| 1 Tesla V100-SXM2-16GB On | 00000000:00:1D.0 Off | 0 |
| N/A 36C P0 40W / 300W | 0MiB / 16384MiB | 0% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
+---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
| No running processes found |
+---------------------------------------------------------------------------------------+
root@sd2-compile-nvidia-ckz65:/optimum-nvidia#
[TensorRT-LLM] TensorRT-LLM version: 0.12.0.dev2024072300
tokenizer_config.json: 100%|████████████████████████████████████████████████████████████| 1.62k/1.62k [00:00<00:00, 11.3MB/s]
tokenizer.model: 100%|████████████████████████████████████████████████████████████████████| 500k/500k [00:00<00:00, 14.6MB/s]
tokenizer.json: 100%|███████████████████████████████████████████████████████████████████| 1.84M/1.84M [00:00<00:00, 22.5MB/s]
special_tokens_map.json: 100%|██████████████████████████████████████████████████████████████| 414/414 [00:00<00:00, 3.82MB/s]
config.json: 100%|██████████████████████████████████████████████████████████████████████████| 587/587 [00:00<00:00, 5.03MB/s]
model.safetensors.index.json: 100%|█████████████████████████████████████████████████████| 33.4k/33.4k [00:00<00:00, 66.4MB/s]
generation_config.json: 100%|███████████████████████████████████████████████████████████████| 188/188 [00:00<00:00, 1.25MB/s]
model-00003-of-00003.safetensors: 100%|██████████████████████████████████████████████████| 6.18G/6.18G [00:17<00:00, 346MB/s]
model-00002-of-00003.safetensors: 100%|██████████████████████████████████████████████████| 9.90G/9.90G [00:30<00:00, 324MB/s]
model-00001-of-00003.safetensors: 100%|█████████████████████████████████████████████████| 9.95G/9.95G [02:19<00:00, 71.3MB/s]
Fetching 6 files: 100%|████████████████████████████████████████████████████████████████████████| 6/6 [02:19<00:00, 23.28s/it]
Fetching 6 files: 100%|██████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 5853.88it/s]
Fetching 6 files: 100%|██████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 7686.57it/s]
/opt/conda/lib/python3.10/site-packages/tensorrt_llm/models/llama/convert.py:1414: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
q, k, v = (torch.tensor(weights[t]) for t in ['q', 'k', 'v'])
[08/11/2024-06:10:36] [TRT] [E] [virtualMemoryBuffer.cpp::resizePhysical::151] Error Code 2: OutOfMemory (Requested size was 16654532608 bytes.)
[08/11/2024-06:10:36] [TRT] [E] [virtualMemoryBuffer.cpp::resizePhysical::151] Error Code 2: OutOfMemory (Requested size was 16654532608 bytes.)
[08/11/2024-06:10:36] [TRT] [E] [globWriter.cpp::makeResizableGpuMemory::435] Error Code 2: OutOfMemory (Requested size was 16654532608 bytes.)
[08/11/2024-06:10:36] [TRT-LLM] [E] Engine building failed, please check the error log.
[TensorRT-LLM][INFO] Engine version 0.12.0.dev2024072300 found in the config file, assuming engine(s) built by new builder API.
[TensorRT-LLM][INFO] MPI size: 1, MPI local size: 1, rank: 0
[TensorRT-LLM][INFO] Rank 0 is using GPU 0
[TensorRT-LLM][INFO] TRTGptModel maxNumSequences: 1
[TensorRT-LLM][INFO] TRTGptModel maxBatchSize: 1
[TensorRT-LLM][INFO] TRTGptModel maxBeamWidth: 1
[TensorRT-LLM][INFO] TRTGptModel maxSequenceLen: 64
[TensorRT-LLM][INFO] TRTGptModel maxDraftLen: 0
[TensorRT-LLM][INFO] TRTGptModel mMaxAttentionWindowSize: 64
[TensorRT-LLM][INFO] TRTGptModel computeContextLogits: 0
[TensorRT-LLM][INFO] TRTGptModel computeGenerationLogits: 0
[TensorRT-LLM][INFO] TRTGptModel enableTrtOverlap: 0
[TensorRT-LLM][INFO] TRTGptModel normalizeLogProbs: 1
[TensorRT-LLM][INFO] TRTGptModel maxNumTokens: 64
[TensorRT-LLM][INFO] TRTGptModel maxInputLen: 63 = min(maxSequenceLen - 1, maxNumTokens) since context FMHA and usePackedInput are enabled
[TensorRT-LLM][INFO] Capacity Scheduler Policy: GUARANTEED_NO_EVICT
[TensorRT-LLM][INFO] Context Chunking Scheduler Policy: None
[TensorRT-LLM][INFO] The logger passed into createInferRuntime differs from one already provided for an existing builder, runtime, or refitter. Uses of the global logger, returned by nvinfer1::getLogger(), will return the existing value.
Traceback (most recent call last):
File "/optimum-nvidia/1.py", line 44, in <module>
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-13b-chat-hf",device_map="sequential", export_config=export)
File "/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn
return fn(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/huggingface_hub/hub_mixin.py", line 569, in from_pretrained
instance = cls._from_pretrained(
File "/opt/conda/lib/python3.10/site-packages/optimum/nvidia/models/auto.py", line 75, in _from_pretrained
model = model_clazz.from_pretrained(
File "/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn
return fn(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/huggingface_hub/hub_mixin.py", line 569, in from_pretrained
instance = cls._from_pretrained(
File "/opt/conda/lib/python3.10/site-packages/optimum/nvidia/hub.py", line 307, in _from_pretrained
return cls(
File "/opt/conda/lib/python3.10/site-packages/optimum/nvidia/runtime.py", line 166, in __init__
InferenceRuntimeBase.__init__(
File "/opt/conda/lib/python3.10/site-packages/optimum/nvidia/runtime.py", line 93, in __init__
self._executor = GenerationExecutor.create(
File "/opt/conda/lib/python3.10/site-packages/tensorrt_llm/executor.py", line 404, in create
return ExecutorBindingsWorker(**worker_kwargs)
File "/opt/conda/lib/python3.10/site-packages/tensorrt_llm/executor.py", line 425, in __init__
self.engine = tllm.Executor(engine_dir,
RuntimeError: [TensorRT-LLM][ERROR] Assertion failed: Error opening engine file: /root/.cache/huggingface/assets/trtllm/0.12.0.dev2024072300/meta-llama--Llama-2-13b-chat-hf/V100-SXM2-16GB/engines/rank0.engine (/home/jenkins/agent/workspace/LLM/main/L0_MergeRequest/tensorrt_llm/cpp/tensorrt_llm/runtime/tllmRuntime.cpp:66)
1 0x7feb878af7e1 tensorrt_llm::common::throwRuntimeError(char const*, int, std::string const&) + 82
2 0x7feb88dca0d0 tensorrt_llm::runtime::TllmRuntime::TllmRuntime(tensorrt_llm::runtime::RawEngine const&, nvinfer1::ILogger*, float, bool) + 3152
3 0x7feb8902a1c9 tensorrt_llm::batch_manager::TrtGptModelInflightBatching::TrtGptModelInflightBatching(std::shared_ptr<nvinfer1::ILogger>, tensorrt_llm::runtime::ModelConfig const&, tensorrt_llm::runtime::WorldConfig const&, tensorrt_llm::runtime::RawEngine const&, bool, tensorrt_llm::batch_manager::TrtGptModelOptionalParams const&) + 1017
4 0x7feb89045a03 tensorrt_llm::executor::Executor::Impl::createModel(tensorrt_llm::runtime::RawEngine const&, tensorrt_llm::runtime::ModelConfig const&, tensorrt_llm::runtime::WorldConfig const&, tensorrt_llm::executor::ExecutorConfig const&) + 419
5 0x7feb89046208 tensorrt_llm::executor::Executor::Impl::loadModel(std::optional<std::filesystem::path> const&, std::optional<std::vector<unsigned char, std::allocator<unsigned char> > > const&, tensorrt_llm::runtime::GptJsonConfig const&, tensorrt_llm::executor::ExecutorConfig const&, bool) + 1272
6 0x7feb8904c239 tensorrt_llm::executor::Executor::Impl::Impl(std::filesystem::path const&, std::optional<std::filesystem::path> const&, tensorrt_llm::executor::ModelType, tensorrt_llm::executor::ExecutorConfig const&) + 1785
7 0x7feb89040570 tensorrt_llm::executor::Executor::Executor(std::filesystem::path const&, tensorrt_llm::executor::ModelType, tensorrt_llm::executor::ExecutorConfig const&) + 64
8 0x7febe2181f92 /opt/conda/lib/python3.10/site-packages/tensorrt_llm/bindings.cpython-310-x86_64-linux-gnu.so(+0xb6f92) [0x7febe2181f92]
9 0x7febe21243ac /opt/conda/lib/python3.10/site-packages/tensorrt_llm/bindings.cpython-310-x86_64-linux-gnu.so(+0x593ac) [0x7febe21243ac]
10 0x55be86842b27 python(+0x13fb27) [0x55be86842b27]
11 0x55be8683c42b _PyObject_MakeTpCall + 619
12 0x55be8684e934 python(+0x14b934) [0x55be8684e934]
13 0x55be8684f322 PyVectorcall_Call + 146
14 0x55be8684c74a python(+0x14974a) [0x55be8684c74a]
15 0x55be8683c7a0 python(+0x1397a0) [0x55be8683c7a0]
16 0x7febe21239cb /opt/conda/lib/python3.10/site-packages/tensorrt_llm/bindings.cpython-310-x86_64-linux-gnu.so(+0x589cb) [0x7febe21239cb]
17 0x55be8683c42b _PyObject_MakeTpCall + 619
18 0x55be8683884e _PyEval_EvalFrameDefault + 23134
19 0x55be86842f8f _PyFunction_Vectorcall + 111
20 0x55be8683b985 _PyObject_FastCallDictTstate + 389
21 0x55be8684c34b python(+0x14934b) [0x55be8684c34b]
22 0x55be8683c7a0 python(+0x1397a0) [0x55be8683c7a0]
23 0x55be8684f139 PyObject_Call + 521
24 0x55be86835cb2 _PyEval_EvalFrameDefault + 11970
25 0x55be86842f8f _PyFunction_Vectorcall + 111
26 0x55be868341c0 _PyEval_EvalFrameDefault + 5072
27 0x55be86842f8f _PyFunction_Vectorcall + 111
28 0x55be86837afd _PyEval_EvalFrameDefault + 19725
29 0x55be86842f8f _PyFunction_Vectorcall + 111
30 0x55be8683b985 _PyObject_FastCallDictTstate + 389
31 0x55be8684c34b python(+0x14934b) [0x55be8684c34b]
32 0x55be8683c47b _PyObject_MakeTpCall + 699
33 0x55be8683884e _PyEval_EvalFrameDefault + 23134
34 0x55be8684e641 python(+0x14b641) [0x55be8684e641]
35 0x55be8684efe8 PyObject_Call + 184
36 0x55be86835cb2 _PyEval_EvalFrameDefault + 11970
37 0x55be86842f8f _PyFunction_Vectorcall + 111
38 0x55be8684efe8 PyObject_Call + 184
39 0x55be86835cb2 _PyEval_EvalFrameDefault + 11970
40 0x55be8684e641 python(+0x14b641) [0x55be8684e641]
41 0x55be8684efe8 PyObject_Call + 184
42 0x55be86835cb2 _PyEval_EvalFrameDefault + 11970
43 0x55be8684e641 python(+0x14b641) [0x55be8684e641]
44 0x55be8684efe8 PyObject_Call + 184
45 0x55be86835cb2 _PyEval_EvalFrameDefault + 11970
46 0x55be86842f8f _PyFunction_Vectorcall + 111
47 0x55be8684efe8 PyObject_Call + 184
48 0x55be86835cb2 _PyEval_EvalFrameDefault + 11970
49 0x55be8684e641 python(+0x14b641) [0x55be8684e641]
50 0x55be868341c0 _PyEval_EvalFrameDefault + 5072
51 0x55be868dba82 python(+0x1d8a82) [0x55be868dba82]
52 0x55be868db9c7 PyEval_EvalCode + 135
53 0x55be8690e82c python(+0x20b82c) [0x55be8690e82c]
54 0x55be86909704 python(+0x206704) [0x55be86909704]
55 0x55be8679a53c python(+0x9753c) [0x55be8679a53c]
56 0x55be86903925 _PyRun_SimpleFileObject + 437
57 0x55be869034d3 _PyRun_AnyFileObject + 67
58 0x55be869006a9 Py_RunMain + 921
59 0x55be868ce089 Py_BytesMain + 57
60 0x7fee5c293083 __libc_start_main + 243
61 0x55be868cdf81 python(+0x1caf81) [0x55be868cdf81]
Exception ignored in: <function PretrainedModel.__del__ at 0x7feb30289480>
Traceback (most recent call last):
File "/opt/conda/lib/python3.10/site-packages/tensorrt_llm/models/modeling_utils.py", line 394, in __del__
self.release()
File "/opt/conda/lib/python3.10/site-packages/tensorrt_llm/models/modeling_utils.py", line 391, in release
release_gc()
File "/opt/conda/lib/python3.10/site-packages/tensorrt_llm/_utils.py", line 483, in release_gc
torch.cuda.ipc_collect()
File "/opt/conda/lib/python3.10/site-packages/torch/cuda/__init__.py", line 804, in ipc_collect
_lazy_init()
File "/opt/conda/lib/python3.10/site-packages/torch/cuda/__init__.py", line 312, in _lazy_init
raise DeferredCudaCallError(msg) from e
torch.cuda.DeferredCudaCallError: CUDA call failed lazily at initialization with error: 'NoneType' object is not iterable
CUDA call was originally invoked at:
File "/optimum-nvidia/1.py", line 5, in <module>
import torch
File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
File "<frozen importlib._bootstrap>", line 1006, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 688, in _load_unlocked
File "<frozen importlib._bootstrap_external>", line 883, in exec_module
File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
File "/opt/conda/lib/python3.10/site-packages/torch/__init__.py", line 1480, in <module>
_C._initExtension(manager_path())
File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
File "<frozen importlib._bootstrap>", line 1006, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 688, in _load_unlocked
File "<frozen importlib._bootstrap_external>", line 883, in exec_module
File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
File "/opt/conda/lib/python3.10/site-packages/torch/cuda/__init__.py", line 1294, in <module>
_lazy_call(_register_triton_kernels)
File "/opt/conda/lib/python3.10/site-packages/torch/cuda/__init__.py", line 235, in _lazy_call
_queued_calls.append((callable, traceback.format_stack()))
I am trying to port the `transformers`-based `AutoModelForCausalLM` to `optimum.nvidia` and I hit `OutOfMemory`. I assume I need to add a `quantization_config` like I do with `transformers`: and pass it in:
The script I run:
The error: