From deadd492308892845385bdcfdf8f9f8651e7d8f5 Mon Sep 17 00:00:00 2001
From: younesbelkada
Date: Fri, 5 Aug 2022 09:23:25 +0200
Subject: [PATCH 1/5] add dtype support on config + added tests

---
 tests/test_modeling_gpt2.py | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)
 create mode 100644 tests/test_modeling_gpt2.py

diff --git a/tests/test_modeling_gpt2.py b/tests/test_modeling_gpt2.py
new file mode 100644
index 00000000..b0f60b10
--- /dev/null
+++ b/tests/test_modeling_gpt2.py
@@ -0,0 +1,36 @@
+"""
+Some tests for minGPT
+"""
+
+import unittest
+import torch
+from mingpt.model import GPT
+
+class GPT2Tester(unittest.TestCase):
+
+    def test_dtypes(self):
+        """
+        Dtype tests for the GPT-2 model
+        """
+        config_fp16 = GPT.get_default_config()
+        config_fp16.merge_from_dict({'dtype': 'float16', 'vocab_size': 50257, 'block_size': 1024})
+        config_fp16.model_type = 'gpt2'
+
+        config_fp32 = GPT.get_default_config()
+        config_fp32.merge_from_dict({'vocab_size': 50257, 'block_size': 1024})
+        config_fp32.model_type = 'gpt2'
+
+
+        model_fp16 = GPT(config_fp16)
+        model_fp32 = GPT(config_fp32)
+
+        # Check whether the dtype has been set correctly
+        self.assertTrue(model_fp16.dtype == torch.float16)
+        self.assertTrue(model_fp32.dtype == torch.float32)
+
+        # Check whether the memory footprint is half that of the fp32 model
+        self.assertTrue(model_fp16.get_memory_footprint() == model_fp32.get_memory_footprint() // 2)
+
+
+if __name__ == '__main__':
+    unittest.main()
\ No newline at end of file

From c4bce59533e8b83a75035df2b799240e144b10bb Mon Sep 17 00:00:00 2001
From: younesbelkada
Date: Fri, 5 Aug 2022 09:24:15 +0200
Subject: [PATCH 2/5] add dtype support

---
 mingpt/model.py | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/mingpt/model.py b/mingpt/model.py
index 83ee22dc..d4050361 100644
--- a/mingpt/model.py
+++ b/mingpt/model.py
@@ -9,6 +9,7 @@
 """
 
 import math
+from multiprocessing.sharedctypes import Value
 
 import torch
 import torch.nn as nn
@@ -110,6 +111,8 @@ def get_default_config():
         C.embd_pdrop = 0.1
         C.resid_pdrop = 0.1
         C.attn_pdrop = 0.1
+        # add parameter dtype
+        C.dtype = torch.float32
         return C
 
     def __init__(self, config):
@@ -118,6 +121,13 @@ def __init__(self, config):
         assert config.block_size is not None
         self.block_size = config.block_size
 
+        if isinstance(config.dtype, str):
+            try:
+                config.dtype = getattr(torch, config.dtype)
+            except AttributeError:
+                raise ValueError(f"Unknown dtype {config.dtype}")
+        self.dtype = config.dtype
+
         type_given = config.model_type is not None
         params_given = all([config.n_layer is not None, config.n_head is not None, config.n_embd is not None])
         assert type_given ^ params_given # exactly one of these (XOR)
@@ -170,6 +180,24 @@ def _init_weights(self, module):
         elif isinstance(module, nn.LayerNorm):
             torch.nn.init.zeros_(module.bias)
             torch.nn.init.ones_(module.weight)
+        module = module.to(self.dtype)
+
+    def get_memory_footprint(self, return_buffers=True):
+        r"""
+        Return the memory footprint of the current model, in bytes. Useful for benchmarking the
+        model's memory usage and for designing tests. Solution inspired by the PyTorch discussion:
+        https://discuss.pytorch.org/t/gpu-memory-that-model-uses/56822/2
+        Arguments:
+            return_buffers (`bool`, *optional*):
+                Whether to include the size of the buffer tensors in the memory footprint.
+                Buffers are tensors that do not require gradients and are not registered as parameters,
+                e.g. the running mean and std in batch norm layers. Please see: https://discuss.pytorch.org/t/what-pytorch-means-by-buffers/120266/2
+        """
+        mem = sum([param.nelement() * param.element_size() for param in self.parameters()])
+        if return_buffers:
+            mem_bufs = sum([buf.nelement() * buf.element_size() for buf in self.buffers()])
+            mem = mem + mem_bufs
+        return mem
 
     @classmethod
     def from_pretrained(cls, model_type):
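Patch 2 carries the substance of the series: the default config gains a `dtype` field, `_init_weights` casts each module to it, and `get_memory_footprint` sums `nelement() * element_size()` over parameters and, optionally, buffers. As a sanity check on that accounting, here is a minimal standalone sketch of the same byte-counting applied to a plain `nn.Linear`; the `memory_footprint` helper name is ours, not part of the patch.

```python
import torch
import torch.nn as nn

def memory_footprint(model: nn.Module, return_buffers: bool = True) -> int:
    # Bytes held by parameters: element count times bytes per element.
    mem = sum(p.nelement() * p.element_size() for p in model.parameters())
    if return_buffers:
        # Buffers (e.g. batch-norm running stats) are counted the same way.
        mem += sum(b.nelement() * b.element_size() for b in model.buffers())
    return mem

# A float16 copy of a layer occupies exactly half the bytes of its float32
# original -- the invariant the new test asserts for the full GPT-2 model.
layer_fp32 = nn.Linear(1024, 1024)
layer_fp16 = nn.Linear(1024, 1024).to(torch.float16)
assert memory_footprint(layer_fp16) == memory_footprint(layer_fp32) // 2
```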
From f345c397cc43f540177e58199796b6104ee909f5 Mon Sep 17 00:00:00 2001
From: younesbelkada
Date: Fri, 5 Aug 2022 09:24:48 +0200
Subject: [PATCH 3/5] remove dummy import

---
 mingpt/model.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mingpt/model.py b/mingpt/model.py
index d4050361..5979fba9 100644
--- a/mingpt/model.py
+++ b/mingpt/model.py
@@ -9,7 +9,6 @@
 """
 
 import math
-from multiprocessing.sharedctypes import Value
 
 import torch
 import torch.nn as nn

From 92b54e7d1dac7cbced9e7e218eb4db414623ec39 Mon Sep 17 00:00:00 2001
From: younesbelkada
Date: Fri, 5 Aug 2022 09:28:51 +0200
Subject: [PATCH 4/5] added new assert

---
 mingpt/model.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/mingpt/model.py b/mingpt/model.py
index 5979fba9..956975ed 100644
--- a/mingpt/model.py
+++ b/mingpt/model.py
@@ -125,7 +125,9 @@ def __init__(self, config):
                 config.dtype = getattr(torch, config.dtype)
             except AttributeError:
                 raise ValueError(f"Unknown dtype {config.dtype}")
+        # check that the dtype is a floating point type
         self.dtype = config.dtype
+        assert self.dtype.is_floating_point
 
         type_given = config.model_type is not None
         params_given = all([config.n_layer is not None, config.n_head is not None, config.n_embd is not None])

From adf1e57252eee4b0d139060f8953e74803a83cc4 Mon Sep 17 00:00:00 2001
From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
Date: Fri, 5 Aug 2022 18:46:15 +0200
Subject: [PATCH 5/5] Update mingpt/model.py

---
 mingpt/model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mingpt/model.py b/mingpt/model.py
index 956975ed..97470b26 100644
--- a/mingpt/model.py
+++ b/mingpt/model.py
@@ -110,7 +110,7 @@ def get_default_config():
         C.embd_pdrop = 0.1
         C.resid_pdrop = 0.1
         C.attn_pdrop = 0.1
-        # add parameter dtype
+        # parameter dtype
         C.dtype = torch.float32
         return C
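End to end, the series lets a caller select the parameter dtype from the config alone, either as a `torch.dtype` or as a string that `__init__` resolves via `getattr(torch, ...)`. A usage sketch, assuming the patched minGPT fork from this series is installed:

```python
import torch
from mingpt.model import GPT

config = GPT.get_default_config()
config.model_type = 'gpt2'
# dtype may be passed as a string; non-floating-point dtypes are rejected
config.merge_from_dict({'dtype': 'float16', 'vocab_size': 50257, 'block_size': 1024})

model = GPT(config)
print(model.dtype)                   # torch.float16
print(model.get_memory_footprint())  # about half the float32 footprint, in bytes
```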