From deadd492308892845385bdcfdf8f9f8651e7d8f5 Mon Sep 17 00:00:00 2001
From: younesbelkada
Date: Fri, 5 Aug 2022 09:23:25 +0200
Subject: [PATCH 1/5] add dtype support on config + added tests

---
 tests/test_modeling_gpt2.py | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)
 create mode 100644 tests/test_modeling_gpt2.py

diff --git a/tests/test_modeling_gpt2.py b/tests/test_modeling_gpt2.py
new file mode 100644
index 00000000..b0f60b10
--- /dev/null
+++ b/tests/test_modeling_gpt2.py
@@ -0,0 +1,36 @@
+"""
+Some tests for minGPT
+"""
+
+import unittest
+import torch
+from mingpt.model import GPT
+
+class GPT2Tester(unittest.TestCase):
+
+    def test_dtypes(self):
+        """
+        Dtype tests for the GPT-2 model
+        """
+        config_fp16 = GPT.get_default_config()
+        config_fp16.merge_from_dict({'dtype': 'float16', 'vocab_size': 50257, 'block_size': 1024})
+        config_fp16.model_type = 'gpt2'
+
+        config_fp32 = GPT.get_default_config()
+        config_fp32.merge_from_dict({'vocab_size': 50257, 'block_size': 1024})
+        config_fp32.model_type = 'gpt2'
+
+
+        model_fp16 = GPT(config_fp16)
+        model_fp32 = GPT(config_fp32)
+
+        # Check whether the dtype has been set correctly
+        self.assertTrue(model_fp16.dtype == torch.float16)
+        self.assertTrue(model_fp32.dtype == torch.float32)
+
+        # Check whether the memory footprint is half that of the fp32 model
+        self.assertTrue(model_fp16.get_memory_footprint() == model_fp32.get_memory_footprint() // 2)
+
+
+if __name__ == '__main__':
+    unittest.main()
\ No newline at end of file

From c4bce59533e8b83a75035df2b799240e144b10bb Mon Sep 17 00:00:00 2001
From: younesbelkada
Date: Fri, 5 Aug 2022 09:24:15 +0200
Subject: [PATCH 2/5] add dtype support

---
 mingpt/model.py | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/mingpt/model.py b/mingpt/model.py
index 83ee22dc..d4050361 100644
--- a/mingpt/model.py
+++ b/mingpt/model.py
@@ -9,6 +9,7 @@
 """
 
 import math
+from multiprocessing.sharedctypes import Value
 
 import torch
 import torch.nn as nn
@@ -110,6 +111,8 @@ def get_default_config():
         C.embd_pdrop = 0.1
         C.resid_pdrop = 0.1
         C.attn_pdrop = 0.1
+        # add parameter dtype
+        C.dtype = torch.float32
         return C
 
     def __init__(self, config):
@@ -118,6 +121,13 @@ def __init__(self, config):
         assert config.block_size is not None
         self.block_size = config.block_size
 
+        if isinstance(config.dtype, str):
+            try:
+                config.dtype = getattr(torch, config.dtype)
+            except AttributeError:
+                raise ValueError(f"Unknown dtype {config.dtype}")
+        self.dtype = config.dtype
+
         type_given = config.model_type is not None
         params_given = all([config.n_layer is not None, config.n_head is not None, config.n_embd is not None])
         assert type_given ^ params_given # exactly one of these (XOR)
@@ -170,6 +180,24 @@ def _init_weights(self, module):
         elif isinstance(module, nn.LayerNorm):
             torch.nn.init.zeros_(module.bias)
             torch.nn.init.ones_(module.weight)
+        module = module.to(self.dtype)
+
+    def get_memory_footprint(self, return_buffers=True):
+        r"""
+        Return the memory footprint of the current model, in bytes. Useful for benchmarking the
+        model's memory usage and for designing tests. Solution inspired by the PyTorch discussion:
+        https://discuss.pytorch.org/t/gpu-memory-that-model-uses/56822/2
+        Arguments:
+            return_buffers (`bool`, *optional*):
+                Whether to include the size of the buffer tensors in the memory footprint.
+                Buffers are tensors that do not require gradients and are not registered as parameters,
+                e.g. the running mean and std in batch norm layers. Please see: https://discuss.pytorch.org/t/what-pytorch-means-by-buffers/120266/2
+        """
+        mem = sum([param.nelement() * param.element_size() for param in self.parameters()])
+        if return_buffers:
+            mem_bufs = sum([buf.nelement() * buf.element_size() for buf in self.buffers()])
+            mem = mem + mem_bufs
+        return mem
 
     @classmethod
     def from_pretrained(cls, model_type):
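Patch 2 carries the substance of the series: the default config gains a `dtype` field, `_init_weights` casts each module to it, and `get_memory_footprint` sums `nelement() * element_size()` over parameters and, optionally, buffers. As a sanity check on that accounting, here is a minimal standalone sketch of the same byte-counting applied to a plain `nn.Linear`; the `memory_footprint` helper name is ours, not part of the patch.

```python
import torch
import torch.nn as nn

def memory_footprint(model: nn.Module, return_buffers: bool = True) -> int:
    # Bytes held by parameters: element count times bytes per element.
    mem = sum(p.nelement() * p.element_size() for p in model.parameters())
    if return_buffers:
        # Buffers (e.g. batch-norm running stats) are counted the same way.
        mem += sum(b.nelement() * b.element_size() for b in model.buffers())
    return mem

# A float16 copy of a layer occupies exactly half the bytes of its float32
# original -- the invariant the new test asserts for the full GPT-2 model.
layer_fp32 = nn.Linear(1024, 1024)
layer_fp16 = nn.Linear(1024, 1024).to(torch.float16)
assert memory_footprint(layer_fp16) == memory_footprint(layer_fp32) // 2
```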
From f345c397cc43f540177e58199796b6104ee909f5 Mon Sep 17 00:00:00 2001
From: younesbelkada
Date: Fri, 5 Aug 2022 09:24:48 +0200
Subject: [PATCH 3/5] remove dummy import

---
 mingpt/model.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mingpt/model.py b/mingpt/model.py
index d4050361..5979fba9 100644
--- a/mingpt/model.py
+++ b/mingpt/model.py
@@ -9,7 +9,6 @@
 """
 
 import math
-from multiprocessing.sharedctypes import Value
 
 import torch
 import torch.nn as nn

From 92b54e7d1dac7cbced9e7e218eb4db414623ec39 Mon Sep 17 00:00:00 2001
From: younesbelkada
Date: Fri, 5 Aug 2022 09:28:51 +0200
Subject: [PATCH 4/5] added new assert

---
 mingpt/model.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/mingpt/model.py b/mingpt/model.py
index 5979fba9..956975ed 100644
--- a/mingpt/model.py
+++ b/mingpt/model.py
@@ -125,7 +125,9 @@ def __init__(self, config):
                 config.dtype = getattr(torch, config.dtype)
             except AttributeError:
                 raise ValueError(f"Unknown dtype {config.dtype}")
+        # check that the dtype is a floating point type
         self.dtype = config.dtype
+        assert self.dtype.is_floating_point
 
         type_given = config.model_type is not None
         params_given = all([config.n_layer is not None, config.n_head is not None, config.n_embd is not None])

From adf1e57252eee4b0d139060f8953e74803a83cc4 Mon Sep 17 00:00:00 2001
From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
Date: Fri, 5 Aug 2022 18:46:15 +0200
Subject: [PATCH 5/5] Update mingpt/model.py

---
 mingpt/model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mingpt/model.py b/mingpt/model.py
index 956975ed..97470b26 100644
--- a/mingpt/model.py
+++ b/mingpt/model.py
@@ -110,7 +110,7 @@ def get_default_config():
         C.embd_pdrop = 0.1
         C.resid_pdrop = 0.1
         C.attn_pdrop = 0.1
-        # add parameter dtype
+        # parameter dtype
         C.dtype = torch.float32
         return C
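End to end, the series lets a caller select the parameter dtype from the config alone, either as a `torch.dtype` or as a string that `__init__` resolves via `getattr(torch, ...)`. A usage sketch, assuming the patched minGPT fork from this series is installed:

```python
import torch
from mingpt.model import GPT

config = GPT.get_default_config()
config.model_type = 'gpt2'
# dtype may be passed as a string; non-floating-point dtypes are rejected
config.merge_from_dict({'dtype': 'float16', 'vocab_size': 50257, 'block_size': 1024})

model = GPT(config)
print(model.dtype)                   # torch.float16
print(model.get_memory_footprint())  # about half the float32 footprint, in bytes
```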