10 changes: 9 additions & 1 deletion README.md
@@ -1,4 +1,12 @@
# bloomz.cpp
# Clone of bloomz.cpp, modified to quantize and run inference on the BLOOM-176B model

You can quantize and run inference on BLOOM-176B.

- The 4-bit quantized BLOOM model file is ~112 GB (a rough size estimate is sketched below this list).
- Running inference takes about 105 GB of RAM, though peak allocation reaches 111 GB due to shortcomings in the code that are not yet fixed.
- Performance-wise, this is not fast at all: about 17 seconds per token on 96 threads, which is very close to the full-size BLOOM model on the same CPU.
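
As a sanity check on the ~112 GB figure, here is a rough back-of-the-envelope estimate assuming the q4_0 block layout used further down in this diff (one f32 scale plus 32 packed 4-bit weights per block, i.e. 20 bytes per 32 weights). Attributing the gap to ~112 GB to tensors kept in f16/f32 and to file metadata is an assumption, not a measured breakdown:

```c
#include <stdio.h>

int main(void) {
    // q4_0 block layout as in ggml_quantize_q4_0: one f32 scale + qk packed 4-bit weights
    const int    qk              = 32;                        // assumed block size
    const double bytes_per_block = sizeof(float) + qk / 2.0;  // 4 + 16 = 20 bytes
    const double n_params        = 176e9;                     // approx. BLOOM-176B parameter count

    const double total_bytes = n_params / qk * bytes_per_block;       // ~1.10e11 bytes
    printf("approx. 4-bit model size: %.0f GB\n", total_bytes / 1e9); // prints ~110 GB
    return 0;
}
```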

## bloomz.cpp

Inference of HuggingFace's [BLOOM-like](https://huggingface.co/docs/transformers/model_doc/bloom) models in pure C/C++.

4 changes: 2 additions & 2 deletions ggml.c
@@ -2229,10 +2229,10 @@ void ggml_print_objects(const struct ggml_context * ctx) {
GGML_PRINT("%s: --- end ---\n", __func__);
}

int ggml_nelements(const struct ggml_tensor * tensor) {
size_t ggml_nelements(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
return (size_t) tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
}

int ggml_nrows(const struct ggml_tensor * tensor) {
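Context for the int → size_t change above: the largest BLOOM-176B tensors have more elements than fit in a 32-bit int. The token-embedding matrix alone is about 250880 × 14336 ≈ 3.6 billion entries (shapes taken from the published BLOOM configuration and hard-coded here purely for illustration). A minimal sketch of the arithmetic the patched ggml_nelements has to handle:

```c
#include <stdio.h>
#include <stdint.h>
#include <limits.h>
#include <stddef.h>

int main(void) {
    // Approximate BLOOM-176B token-embedding shape (vocab x hidden), for illustration only
    const int64_t ne0 = 250880;
    const int64_t ne1 = 14336;

    const int64_t n = ne0 * ne1;   // 3,596,615,680 elements
    printf("elements   : %lld\n", (long long)n);
    printf("INT_MAX    : %d\n", INT_MAX);
    printf("fits in int: %s\n", n <= INT_MAX ? "yes" : "no -- hence the size_t return type");

    // The patched ggml_nelements promotes to size_t before multiplying, avoiding the overflow
    const size_t nelements = (size_t)ne0 * ne1;
    printf("as size_t  : %zu\n", nelements);
    return 0;
}
```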
2 changes: 1 addition & 1 deletion ggml.h
@@ -327,7 +327,7 @@ int64_t ggml_cycles_per_ms(void);
void ggml_print_object (const struct ggml_object * obj);
void ggml_print_objects(const struct ggml_context * ctx);

int ggml_nelements(const struct ggml_tensor * tensor);
size_t ggml_nelements(const struct ggml_tensor * tensor);
size_t ggml_nbytes (const struct ggml_tensor * tensor);

int ggml_blck_size (enum ggml_type type);
8 changes: 6 additions & 2 deletions main.cpp
@@ -216,6 +216,9 @@ bool bloom_model_load(const std::string & fname, bloom_model & model, gpt_vocab
printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
}

// FIXME: increase the context size as it seems above calculations are off...
ctx_size += 5000000000; // + 5GB, seems to be the least amount of GB that works

// create the ggml context
{
struct ggml_init_params params = {
@@ -355,7 +358,7 @@ bool bloom_model_load(const std::string & fname, bloom_model & model, gpt_vocab
break;
}

int32_t nelements = 1;
int64_t nelements = 1;
int32_t ne[2] = { 1, 1 };
for (int i = 0; i < n_dims; ++i) {
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
@@ -551,7 +554,8 @@ bool bloom_eval(

const int d_key = n_embd/n_head;

static size_t buf_size = 512u*1024*1024;
//static size_t buf_size = 512u*1024*1024;
static size_t buf_size = 1024u*1024*1024;
static void * buf = malloc(buf_size);

if (mem_per_token > 0 && mem_per_token*N > buf_size) {
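Bumping the static scratch buffer from 512 MiB to 1 GiB avoids immediate reallocation for the larger model, and the guard shown above still grows the buffer whenever mem_per_token*N exceeds the current size. The branch body is cut off in this view; for context, a sketch of the usual grow-on-demand pattern from the ggml example programs (the headroom factor and the error handling are assumptions, not quoted from this PR):

```c
#include <stdio.h>
#include <stdlib.h>

// Grow the eval scratch buffer when the per-token estimate says it is too small.
// buf/buf_size mirror the statics in bloom_eval; the 10% headroom is an assumption.
static void ensure_scratch(void **buf, size_t *buf_size, size_t mem_per_token, size_t N) {
    if (mem_per_token > 0 && mem_per_token * N > *buf_size) {
        const size_t new_size = (size_t)(1.1 * (double)(mem_per_token * N));
        void *p = realloc(*buf, new_size);
        if (p == NULL) {
            fprintf(stderr, "failed to grow scratch buffer to %zu bytes\n", new_size);
            exit(EXIT_FAILURE);
        }
        *buf = p;
        *buf_size = new_size;
    }
}
```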
2 changes: 1 addition & 1 deletion quantize.cpp
@@ -150,7 +150,7 @@ bool bloom_model_quantize(const std::string & fname_inp, const std::string & fna
break;
}

int32_t nelements = 1;
int64_t nelements = 1;
int32_t ne[2] = { 1, 1 };
for (int i = 0; i < n_dims; ++i) {
finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
8 changes: 4 additions & 4 deletions utils.cpp
@@ -486,7 +486,7 @@ gpt_vocab::id bloom_sample_top_p(
}


size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
size_t ggml_quantize_q4_0(float * src, void * dst, int64_t n, int k, int qk, int64_t * hist) {
const int nb = k / qk;
const size_t bs = (sizeof(float) + sizeof(uint8_t)*qk/2);
const size_t row_size = nb*bs;
@@ -498,7 +498,7 @@ size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t

char * pdst = (char *) dst;

for (int j = 0; j < n; j += k) {
for (int64_t j = 0; j < n; j += k) {
uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs);
uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float));

@@ -542,7 +542,7 @@ size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t
return (n/k)*row_size;
}

size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
size_t ggml_quantize_q4_1(float * src, void * dst, int64_t n, int k, int qk, int64_t * hist) {
const int nb = k / qk;
const size_t row_size = nb*(2*sizeof(float) + sizeof(uint8_t)*qk/2);

@@ -553,7 +553,7 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t

char * pdst = (char *) dst;

for (int j = 0; j < n; j += k) {
for (int64_t j = 0; j < n; j += k) {
float * pm = (float *) (pdst + (j/k)*row_size);
float * pd = (float *) (pm + nb);
uint8_t * pb = (uint8_t *) (pd + nb);
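In ggml_quantize_q4_0 and ggml_quantize_q4_1, n is the flat element count of the tensor being quantized, so for the biggest BLOOM tensors it no longer fits in an int, while k (the row length) and the per-row bookkeeping stay small. A short check of the arithmetic, following the bs / row_size / (n/k)*row_size layout shown above, with the embedding shape hard-coded for illustration:

```c
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

int main(void) {
    const int     qk  = 32;        // assumed q4_0 block size
    const int64_t ne0 = 14336;     // row length (hidden size), for illustration
    const int64_t ne1 = 250880;    // number of rows (vocab size), for illustration

    const int64_t n        = ne0 * ne1;                        // ~3.6e9 elements: needs 64 bits
    const int     k        = (int)ne0;                         // elements per quantized row
    const int     nb       = k / qk;                           // blocks per row
    const size_t  bs       = sizeof(float) + (size_t)qk / 2;   // 20 bytes per block
    const size_t  row_size = (size_t)nb * bs;

    // Mirrors the function's return value, (n/k)*row_size
    const size_t out_bytes = (size_t)(n / k) * row_size;
    printf("n = %lld elements -> quantized output = %zu bytes (~%.2f GB)\n",
           (long long)n, out_bytes, out_bytes / 1e9);
    return 0;
}
```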
4 changes: 2 additions & 2 deletions utils.h
@@ -101,5 +101,5 @@ gpt_vocab::id bloom_sample_top_p(
// Quantization
//

size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist);
size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist);
size_t ggml_quantize_q4_0(float * src, void * dst, int64_t n, int k, int qk, int64_t * hist);
size_t ggml_quantize_q4_1(float * src, void * dst, int64_t n, int k, int qk, int64_t * hist);