10 changes: 9 additions & 1 deletion README.md
@@ -1,4 +1,12 @@
# bloomz.cpp
# Clone of bloomz.cpp, modified to quantize and run inference on the BLOOM-176B model

You can quantize and run inference on BLOOM-176B.

- The 4-bit quantized BLOOM model file is ~112 GB (a rough size estimate is sketched below this list).
- Running inference takes about 105 GB of RAM, though peak allocation reaches 111 GB due to shortcomings in the code that are not yet fixed.
- Performance-wise, this is not fast at all: about 17 seconds per token on 96 threads, which is very close to the full-size BLOOM model on the same CPU.
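
As a sanity check on the ~112 GB figure, here is a rough back-of-the-envelope estimate assuming the q4_0 block layout used further down in this diff (one f32 scale plus 32 packed 4-bit weights per block, i.e. 20 bytes per 32 weights). Attributing the gap to ~112 GB to tensors kept in f16/f32 and to file metadata is an assumption, not a measured breakdown:

```c
#include <stdio.h>

int main(void) {
    // q4_0 block layout as in ggml_quantize_q4_0: one f32 scale + qk packed 4-bit weights
    const int    qk              = 32;                        // assumed block size
    const double bytes_per_block = sizeof(float) + qk / 2.0;  // 4 + 16 = 20 bytes
    const double n_params        = 176e9;                     // approx. BLOOM-176B parameter count

    const double total_bytes = n_params / qk * bytes_per_block;       // ~1.10e11 bytes
    printf("approx. 4-bit model size: %.0f GB\n", total_bytes / 1e9); // prints ~110 GB
    return 0;
}
```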

## bloomz.cpp

Inference of HuggingFace's [BLOOM-like](https://huggingface.co/docs/transformers/model_doc/bloom) models in pure C/C++.

4 changes: 2 additions & 2 deletions ggml.c
@@ -2229,10 +2229,10 @@ void ggml_print_objects(const struct ggml_context * ctx) {
GGML_PRINT("%s: --- end ---\n", __func__);
}

int ggml_nelements(const struct ggml_tensor * tensor) {
size_t ggml_nelements(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
return (size_t) tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
}

int ggml_nrows(const struct ggml_tensor * tensor) {
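Context for the int → size_t change above: the largest BLOOM-176B tensors have more elements than fit in a 32-bit int. The token-embedding matrix alone is about 250880 × 14336 ≈ 3.6 billion entries (shapes taken from the published BLOOM configuration and hard-coded here purely for illustration). A minimal sketch of the arithmetic the patched ggml_nelements has to handle:

```c
#include <stdio.h>
#include <stdint.h>
#include <limits.h>
#include <stddef.h>

int main(void) {
    // Approximate BLOOM-176B token-embedding shape (vocab x hidden), for illustration only
    const int64_t ne0 = 250880;
    const int64_t ne1 = 14336;

    const int64_t n = ne0 * ne1;   // 3,596,615,680 elements
    printf("elements   : %lld\n", (long long)n);
    printf("INT_MAX    : %d\n", INT_MAX);
    printf("fits in int: %s\n", n <= INT_MAX ? "yes" : "no -- hence the size_t return type");

    // The patched ggml_nelements promotes to size_t before multiplying, avoiding the overflow
    const size_t nelements = (size_t)ne0 * ne1;
    printf("as size_t  : %zu\n", nelements);
    return 0;
}
```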
2 changes: 1 addition & 1 deletion ggml.h
@@ -327,7 +327,7 @@ int64_t ggml_cycles_per_ms(void);
void ggml_print_object (const struct ggml_object * obj);
void ggml_print_objects(const struct ggml_context * ctx);

int ggml_nelements(const struct ggml_tensor * tensor);
size_t ggml_nelements(const struct ggml_tensor * tensor);
size_t ggml_nbytes (const struct ggml_tensor * tensor);

int ggml_blck_size (enum ggml_type type);
8 changes: 6 additions & 2 deletions main.cpp
@@ -216,6 +216,9 @@ bool bloom_model_load(const std::string & fname, bloom_model & model, gpt_vocab
printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
}

// FIXME: increase the context size as it seems above calculations are off...
ctx_size += 5000000000; // + 5GB, seems to be the least amount of GB that works

// create the ggml context
{
struct ggml_init_params params = {
@@ -355,7 +358,7 @@ bool bloom_model_load(const std::string & fname, bloom_model & model, gpt_vocab
break;
}

int32_t nelements = 1;
int64_t nelements = 1;
int32_t ne[2] = { 1, 1 };
for (int i = 0; i < n_dims; ++i) {
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
@@ -551,7 +554,8 @@ bool bloom_eval(

const int d_key = n_embd/n_head;

static size_t buf_size = 512u*1024*1024;
//static size_t buf_size = 512u*1024*1024;
static size_t buf_size = 1024u*1024*1024;
static void * buf = malloc(buf_size);

if (mem_per_token > 0 && mem_per_token*N > buf_size) {
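Bumping the static scratch buffer from 512 MiB to 1 GiB avoids immediate reallocation for the larger model, and the guard shown above still grows the buffer whenever mem_per_token*N exceeds the current size. The branch body is cut off in this view; for context, a sketch of the usual grow-on-demand pattern from the ggml example programs (the headroom factor and the error handling are assumptions, not quoted from this PR):

```c
#include <stdio.h>
#include <stdlib.h>

// Grow the eval scratch buffer when the per-token estimate says it is too small.
// buf/buf_size mirror the statics in bloom_eval; the 10% headroom is an assumption.
static void ensure_scratch(void **buf, size_t *buf_size, size_t mem_per_token, size_t N) {
    if (mem_per_token > 0 && mem_per_token * N > *buf_size) {
        const size_t new_size = (size_t)(1.1 * (double)(mem_per_token * N));
        void *p = realloc(*buf, new_size);
        if (p == NULL) {
            fprintf(stderr, "failed to grow scratch buffer to %zu bytes\n", new_size);
            exit(EXIT_FAILURE);
        }
        *buf = p;
        *buf_size = new_size;
    }
}
```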
2 changes: 1 addition & 1 deletion quantize.cpp
@@ -150,7 +150,7 @@ bool bloom_model_quantize(const std::string & fname_inp, const std::string & fna
break;
}

int32_t nelements = 1;
int64_t nelements = 1;
int32_t ne[2] = { 1, 1 };
for (int i = 0; i < n_dims; ++i) {
finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
8 changes: 4 additions & 4 deletions utils.cpp
@@ -486,7 +486,7 @@ gpt_vocab::id bloom_sample_top_p(
}


size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
size_t ggml_quantize_q4_0(float * src, void * dst, int64_t n, int k, int qk, int64_t * hist) {
const int nb = k / qk;
const size_t bs = (sizeof(float) + sizeof(uint8_t)*qk/2);
const size_t row_size = nb*bs;
@@ -498,7 +498,7 @@ size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t

char * pdst = (char *) dst;

for (int j = 0; j < n; j += k) {
for (int64_t j = 0; j < n; j += k) {
uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs);
uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float));

@@ -542,7 +542,7 @@ size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t
return (n/k)*row_size;
}

size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
size_t ggml_quantize_q4_1(float * src, void * dst, int64_t n, int k, int qk, int64_t * hist) {
const int nb = k / qk;
const size_t row_size = nb*(2*sizeof(float) + sizeof(uint8_t)*qk/2);

@@ -553,7 +553,7 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t

char * pdst = (char *) dst;

for (int j = 0; j < n; j += k) {
for (int64_t j = 0; j < n; j += k) {
float * pm = (float *) (pdst + (j/k)*row_size);
float * pd = (float *) (pm + nb);
uint8_t * pb = (uint8_t *) (pd + nb);
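In ggml_quantize_q4_0 and ggml_quantize_q4_1, n is the flat element count of the tensor being quantized, so for the biggest BLOOM tensors it no longer fits in an int, while k (the row length) and the per-row bookkeeping stay small. A short check of the arithmetic, following the bs / row_size / (n/k)*row_size layout shown above, with the embedding shape hard-coded for illustration:

```c
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

int main(void) {
    const int     qk  = 32;        // assumed q4_0 block size
    const int64_t ne0 = 14336;     // row length (hidden size), for illustration
    const int64_t ne1 = 250880;    // number of rows (vocab size), for illustration

    const int64_t n        = ne0 * ne1;                        // ~3.6e9 elements: needs 64 bits
    const int     k        = (int)ne0;                         // elements per quantized row
    const int     nb       = k / qk;                           // blocks per row
    const size_t  bs       = sizeof(float) + (size_t)qk / 2;   // 20 bytes per block
    const size_t  row_size = (size_t)nb * bs;

    // Mirrors the function's return value, (n/k)*row_size
    const size_t out_bytes = (size_t)(n / k) * row_size;
    printf("n = %lld elements -> quantized output = %zu bytes (~%.2f GB)\n",
           (long long)n, out_bytes, out_bytes / 1e9);
    return 0;
}
```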
4 changes: 2 additions & 2 deletions utils.h
@@ -101,5 +101,5 @@ gpt_vocab::id bloom_sample_top_p(
// Quantization
//

size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist);
size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist);
size_t ggml_quantize_q4_0(float * src, void * dst, int64_t n, int k, int qk, int64_t * hist);
size_t ggml_quantize_q4_1(float * src, void * dst, int64_t n, int k, int qk, int64_t * hist);