From e94132b5515be5ced81efe39d1349428c260506c Mon Sep 17 00:00:00 2001 From: smirrashidi Date: Wed, 3 Jul 2024 16:50:58 +0000 Subject: [PATCH 01/41] started on geneartor --- llava/GAN/pipeline.py | 57 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 llava/GAN/pipeline.py diff --git a/llava/GAN/pipeline.py b/llava/GAN/pipeline.py new file mode 100644 index 000000000..a6c0df83a --- /dev/null +++ b/llava/GAN/pipeline.py @@ -0,0 +1,57 @@ +import argparse +import torch +import os +import random +import json + + +from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN +from llava.conversation import conv_templates, SeparatorStyle +from llava.model.builder import load_pretrained_model +from llava.utils import disable_torch_init +from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path + +from discriminator.py import discriminator # import Laya's discriminator + +def get_data(image_folder, language_file): + '''takes in a folder of images and returns the tokens - images go trough both the clip encoder and the mm_projector; also takes in a language file and gets those toekens''' + return NotImplemented + +def train_gan(args): + EPOCHS = 10 + G_losses = [] + D_losses = [] + + ## boot up model and get everything running properly + disable_torch_init() + model_path = os.path.expanduser(args.model_path) + model_name = get_model_name_from_path(model_path) + tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) + + for epoch in range(EPOCHS): + + # how to decide whether to pass an image or language token to the discriminator + tkn_type = 0 if (random.random() % 2 == 0) else 1 + + # if image - use prepare inputs for mulitmotdal with only one image; or use encode_images + if tkn_type == 1: + + image_tkn = + # if language - use tokenizer_image_token + + # run discriminator; get prediciton + + # calculate loss of discriminator; somehow calculate loss of the generator + + + return NotImplemented + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model-path", type=str, default="facebook/opt-350m") + parser.add_argument("--model-base", type=str, default=None) + parser.add_argument("--image-folder", type=str, default="") + parser.add_argument("--language-file", type=str, default="answer.jsonl") + args = parser.parse_args() + + train_gan(args) \ No newline at end of file From 44934a86d3412ca6330b635965512c7a52fe05fc Mon Sep 17 00:00:00 2001 From: smirrashidi Date: Wed, 3 Jul 2024 23:22:24 +0000 Subject: [PATCH 02/41] training script --- llava/GAN/{pipeline.py => trainGAN.py} | 37 ++++++++++++-------------- 1 file changed, 17 insertions(+), 20 deletions(-) rename llava/GAN/{pipeline.py => trainGAN.py} (57%) diff --git a/llava/GAN/pipeline.py b/llava/GAN/trainGAN.py similarity index 57% rename from llava/GAN/pipeline.py rename to llava/GAN/trainGAN.py index a6c0df83a..c709087c3 100644 --- a/llava/GAN/pipeline.py +++ b/llava/GAN/trainGAN.py @@ -2,8 +2,7 @@ import torch import os import random -import json - +from datetime import datetime from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN from llava.conversation import conv_templates, SeparatorStyle @@ -12,10 +11,7 @@ from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path from discriminator.py import discriminator # import Laya's 
discriminator - -def get_data(image_folder, language_file): - '''takes in a folder of images and returns the tokens - images go trough both the clip encoder and the mm_projector; also takes in a language file and gets those toekens''' - return NotImplemented +from make_data.py import CustomDataset # impor tht edataset class def train_gan(args): EPOCHS = 10 @@ -28,21 +24,23 @@ def train_gan(args): model_name = get_model_name_from_path(model_path) tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) - for epoch in range(EPOCHS): + # get data + d = CustomDataset(args.data) + + ds = {} + ds["im_tok"] = d.im_toks + ds["lang_tok"] = d.lang_toks - # how to decide whether to pass an image or language token to the discriminator - tkn_type = 0 if (random.random() % 2 == 0) else 1 + tkn_lst = random.shuffle([ds["im_tok"],ds["lang_tok"]]) + - # if image - use prepare inputs for mulitmotdal with only one image; or use encode_images - if tkn_type == 1: - - image_tkn = - # if language - use tokenizer_image_token - - # run discriminator; get prediciton - - # calculate loss of discriminator; somehow calculate loss of the generator + for epoch in range(EPOCHS): + for tkn in tkn_lst: + # send token to discriminator; get prediciton + # calculate loss of discriminator; somehow calculate loss of the generator + pass + pass return NotImplemented @@ -50,8 +48,7 @@ def train_gan(args): parser = argparse.ArgumentParser() parser.add_argument("--model-path", type=str, default="facebook/opt-350m") parser.add_argument("--model-base", type=str, default=None) - parser.add_argument("--image-folder", type=str, default="") - parser.add_argument("--language-file", type=str, default="answer.jsonl") + parser.add_argument("--data, type=str, default=") args = parser.parse_args() train_gan(args) \ No newline at end of file From 965741337103c3bf01b960285cc274582dbb70ce Mon Sep 17 00:00:00 2001 From: smirrashidi Date: Fri, 5 Jul 2024 06:44:47 +0000 Subject: [PATCH 03/41] going through codebase --- llava/GAN/send.py | 34 ++++++++++++++++++++++++++++++++++ llava/GAN/trainGAN.py | 29 +++++++++++++++++++++++++++-- 2 files changed, 61 insertions(+), 2 deletions(-) create mode 100644 llava/GAN/send.py diff --git a/llava/GAN/send.py b/llava/GAN/send.py new file mode 100644 index 000000000..6509ad862 --- /dev/null +++ b/llava/GAN/send.py @@ -0,0 +1,34 @@ +import argparse +import torch +import os +import random +from datetime import datetime + +from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN +from llava.conversation import conv_templates, SeparatorStyle +from llava.model.builder import load_pretrained_model +from llava.utils import disable_torch_init +from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path + +from discriminator.py import Discriminator # import Laya's discriminator +from make_data.py import CustomDataset # impor the dataset class + +def get_data(image_folder, language_file): + '''takes in images and the language file and outputs a shuffled list + of both the images after going through _projector and the tokenized language tokens''' + + return None + +def send_to_discriminator(): + ## boot up model and get everything running properly + disable_torch_init() + model_path = os.path.expanduser(args.model_path) + model_name = get_model_name_from_path(model_path) + tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)] 
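
A note on the token mixing in trainGAN.py above: random.shuffle shuffles its list argument in place and returns None, so the assignment tkn_lst = random.shuffle([ds["im_tok"], ds["lang_tok"]]) leaves tkn_lst set to None, and the earlier tkn_type = 0 if (random.random() % 2 == 0) else 1 is almost always 1, because random.random() returns a float in [0, 1). A minimal sketch of the intended behaviour, reusing the names from that file, might be:

    import random

    # shuffle in place; do not assign the return value
    tkn_lst = [ds["im_tok"], ds["lang_tok"]]
    random.shuffle(tkn_lst)

    # unbiased 50/50 choice between an image token (1) and a language token (0)
    tkn_type = 1 if random.random() < 0.5 else 0
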
+ +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model-path", type=str, default="facebook/opt-350m") + parser.add_argument("--model-base", type=str, default=None) + parser.add_argument("--data, type=str, default=") + args = parser.parse_args() \ No newline at end of file diff --git a/llava/GAN/trainGAN.py b/llava/GAN/trainGAN.py index c709087c3..e0bd1147c 100644 --- a/llava/GAN/trainGAN.py +++ b/llava/GAN/trainGAN.py @@ -10,10 +10,23 @@ from llava.utils import disable_torch_init from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path -from discriminator.py import discriminator # import Laya's discriminator -from make_data.py import CustomDataset # impor tht edataset class +from discriminator.py import Discriminator # import Laya's discriminator +from make_data.py import CustomDataset # impor the dataset class + +# from DCGAN tutorial: according to GAN paper, model weights should be randomly +# initalized from mean 0 sd = 0.2; but this is for image classification, maybe +# we want something different for our purpose? + +def weights_init(m): + classname = m.__class__.__name__ + if classname.find('Conv') != -1: + nn.init.normal_(m.weight.data, 0.0, 0.02) + elif classname.find('BatchNorm') != -1: + nn.init.normal_(m.weight.data, 1.0, 0.02) + nn.init.constant_(m.bias.data, 0) def train_gan(args): + device = 'cuda' # set device appropriately EPOCHS = 10 G_losses = [] D_losses = [] @@ -33,7 +46,19 @@ def train_gan(args): tkn_lst = random.shuffle([ds["im_tok"],ds["lang_tok"]]) + # instantiate discriminator and send to device + + IMAGE_SIZE = 1024 * 5 # TODO: ensure this is correct + NUM_CLASSES = 2 + discrim = Discriminator(IMAGE_SIZE, NUM_CLASSES) + discrim.to(device) + discrim.apply(weights_init) + + # instantiate generator and send to device + + + for epoch in range(EPOCHS): for tkn in tkn_lst: # send token to discriminator; get prediciton From 94da3a59a44655db5bbd942014fde9afb7d2e20a Mon Sep 17 00:00:00 2001 From: smirrashidi Date: Fri, 12 Jul 2024 05:23:21 +0000 Subject: [PATCH 04/41] reworkig things --- llava/GAN/trainGAN.py | 119 +++++++++++++++++++++++++++++++++--------- 1 file changed, 94 insertions(+), 25 deletions(-) diff --git a/llava/GAN/trainGAN.py b/llava/GAN/trainGAN.py index e0bd1147c..543e9f4ef 100644 --- a/llava/GAN/trainGAN.py +++ b/llava/GAN/trainGAN.py @@ -1,5 +1,10 @@ import argparse import torch +import torch.nn as nn +import json +import torch.nn.parallel +import torch.optim as optim +import torch.utils.data import os import random from datetime import datetime @@ -10,60 +15,124 @@ from llava.utils import disable_torch_init from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path -from discriminator.py import Discriminator # import Laya's discriminator +from discriminator.py import Discriminator # import Laya's discriminatorcod from make_data.py import CustomDataset # impor the dataset class # from DCGAN tutorial: according to GAN paper, model weights should be randomly # initalized from mean 0 sd = 0.2; but this is for image classification, maybe # we want something different for our purpose? 
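
For reference, the DCGAN-style initialization the comment above refers to draws Conv/BatchNorm weights from a normal distribution (mean 0, std 0.02 in the accompanying weights_init). If the same scheme were ever wanted for the MLP discriminator used in these patches, a Linear-layer variant (an illustration, not part of any patch here) could look like:

    import torch.nn as nn

    def weights_init_linear(m):
        # DCGAN-style init adapted to nn.Linear layers
        if isinstance(m, nn.Linear):
            nn.init.normal_(m.weight, 0.0, 0.02)
            nn.init.constant_(m.bias, 0.0)

    # usage: discrim.apply(weights_init_linear)
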
-def weights_init(m): - classname = m.__class__.__name__ - if classname.find('Conv') != -1: - nn.init.normal_(m.weight.data, 0.0, 0.02) - elif classname.find('BatchNorm') != -1: - nn.init.normal_(m.weight.data, 1.0, 0.02) - nn.init.constant_(m.bias.data, 0) +def split_list(lst, n): # taken from model_vqa.py + """Split a list into n (roughly) equal-sized chunks""" + chunk_size = math.ceil(len(lst) / n) # integer division + return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] -def train_gan(args): + +def get_chunk(lst, n, k): # taken from model_vqa.py + chunks = split_list(lst, n) + return chunks[k] + +def prep_convo_for_discrim(prompt, image): + '''takes in one prompt and one image and returns a dictionary that has the language tokens + from the prompt as one entry and the image tokens from the prompt as another''' + +def train_gen(args): device = 'cuda' # set device appropriately EPOCHS = 10 G_losses = [] D_losses = [] + iters = 0 - ## boot up model and get everything running properly + ## boot up model and get everything running properly - THIS IS THE GENERATOR disable_torch_init() model_path = os.path.expanduser(args.model_path) model_name = get_model_name_from_path(model_path) tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) # get data - d = CustomDataset(args.data) - - ds = {} - ds["im_tok"] = d.im_toks - ds["lang_tok"] = d.lang_toks + questions = [json.loads(q) for q in open(os.path.expanduser(args.data), "r")] + questions = get_chunk(questions, args.num_chunks, args.chunk_idx) + + for line in questions: + idx = line["question_id"] + image_file = line["image"] + qs = line["text"] + curr_prompt = qs + + + - tkn_lst = random.shuffle([ds["im_tok"],ds["lang_tok"]]) + + real_label = 1 + fake_label = 0 + + + + lr = 0.0002 + beta1 = 0.5 + # instantiate discriminator and send to device IMAGE_SIZE = 1024 * 5 # TODO: ensure this is correct NUM_CLASSES = 2 - discrim = Discriminator(IMAGE_SIZE, NUM_CLASSES) + discrim = Discriminator(IMAGE_SIZE, NUM_CLASSES) # TODO: apply weights discrim.to(device) - discrim.apply(weights_init) - - - # instantiate generator and send to device + # TODO: instantiate generator and send to device - i dont think this code is correct, maybe its better to load the weights directly into a new class + mm_projector_weights = torch.load(os.path.join(model_path, 'mm_projector.bin'), map_location='cpu') + mm_projector_weights = {k: v.to(torch.float16) for k, v in mm_projector_weights.items()} + gen = model.load_state_dict(mm_projector_weights, strict=False) + # loss functions and optimizers + criterion = nn.BCELoss() + optimizerD = optim.Adam(discrim.parameters(), lr=lr, betas=(beta1, 0.999)) + optimizerG = optim.Adam(gen.parameters(), lr=lr, betas=(beta1, 0.999)) for epoch in range(EPOCHS): - for tkn in tkn_lst: - # send token to discriminator; get prediciton + # TODO: do we train the discrim at all? 
or simply send generated tokens to discrim and calcualte loss + for i, (data1, data0) in enumerate(zip(dataloader1, dataloader0)): # this loop is following along DCGAN tutorial in pytroch documentation + + # train discriminator with all real batch + discrim.zero_grad() + + real_batch = data1[0].to(device) + rb_size = real_batch.size(0) + label = torch.full((rb_size,), real_label, dtype=torch.float, device=device) + + # forward pass through discrim + output = discrim(real_batch).view(-1) + + # calculate loss on real token batch + errD_real = criterion(output, label) + + #calculate gradients backward pass + errD_real.backward() + D_x = output.mean().item() + + # train discriminator with all fake batch + fake_batch = data0[0].to(device) + fb_size = fake_batch.size(0) + label = torch.full((fb_size,), fake_label, dtype=torch.float, device=device) + + # project the tokens + fake_tkns = gen(fake_batch) + + # send fake batch to discrim for classification + output = discrim(fake_tkns.detach()).view(-1) + + # calculate discrim loss on fake batch + errD_fake = criterion(output, label) + D_G_z1 = output.mean().item() + + # compute error of D as sum over the fake and the real batches + errD = errD_real + errD_fake + + # update D + optimizerD.step() + + - # calculate loss of discriminator; somehow calculate loss of the generator pass pass @@ -76,4 +145,4 @@ def train_gan(args): parser.add_argument("--data, type=str, default=") args = parser.parse_args() - train_gan(args) \ No newline at end of file + train_gen(args) \ No newline at end of file From 7caaf558ce1bcad7b4c00f66eb78ac167a44f8d4 Mon Sep 17 00:00:00 2001 From: smirrashidi Date: Tue, 16 Jul 2024 05:20:14 +0000 Subject: [PATCH 05/41] filtering tokens --- llava/GAN/trainGAN.py | 57 +++++++++++++++++++++++++++++++++------ llava/model/llava_arch.py | 8 +++--- 2 files changed, 53 insertions(+), 12 deletions(-) diff --git a/llava/GAN/trainGAN.py b/llava/GAN/trainGAN.py index 543e9f4ef..fcedce2ec 100644 --- a/llava/GAN/trainGAN.py +++ b/llava/GAN/trainGAN.py @@ -1,6 +1,7 @@ import argparse import torch import torch.nn as nn +from PIL import Image import json import torch.nn.parallel import torch.optim as optim @@ -18,10 +19,6 @@ from discriminator.py import Discriminator # import Laya's discriminatorcod from make_data.py import CustomDataset # impor the dataset class -# from DCGAN tutorial: according to GAN paper, model weights should be randomly -# initalized from mean 0 sd = 0.2; but this is for image classification, maybe -# we want something different for our purpose? - def split_list(lst, n): # taken from model_vqa.py """Split a list into n (roughly) equal-sized chunks""" chunk_size = math.ceil(len(lst) / n) # integer division @@ -32,9 +29,38 @@ def get_chunk(lst, n, k): # taken from model_vqa.py chunks = split_list(lst, n) return chunks[k] -def prep_convo_for_discrim(prompt, image): +def prep_batches(input_ids, image_tensor, model): '''takes in one prompt and one image and returns a dictionary that has the language tokens from the prompt as one entry and the image tokens from the prompt as another''' + position_ids = None # set to None in generate() + attention_mask = None # set to None in generate() must figure out if this and the above is acceptable + image_size = image_tensor.size() + + # prep_inputs... 
returns None as the first value, but idk why + none_q, position_ids, attention_mask, past_key_values, input_embeds, labels, chunk_sizes = model.prepare_inputs_labels_for_multimodal( + input_ids = input_ids, + position_ids = position_ids, + attention_mask = attention_mask, + past_key_values = None, + labels = None, + images = image_tensor, + image_sizes=image_size + ) + + # filter output to create the batch CURRENTLY ONLY WORKS IN ONE CASE where its text - image - text -> NEED TO GENERALIZE + # also has not been tested yet so does it work? tbh idk + split_embeddings = torch.split(input_embeds, chunk_sizes, dim=0) + lang_tkns = torch.cat(split_embeddings[0], split_embeddings[2]) + img_tkns = split_embeddings[1] + + tkn_dict = { + lang_tkns: lang_tkns, + img_tkns: img_tkns + } + + return tkn_dict + + def train_gen(args): device = 'cuda' # set device appropriately @@ -49,15 +75,30 @@ def train_gen(args): model_name = get_model_name_from_path(model_path) tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) - # get data + # get data - following along with model_vqa.py questions = [json.loads(q) for q in open(os.path.expanduser(args.data), "r")] questions = get_chunk(questions, args.num_chunks, args.chunk_idx) for line in questions: - idx = line["question_id"] + idx = line["question_id"] # can be used to identify each batch, probably good to use to keep track of progress during training image_file = line["image"] qs = line["text"] - curr_prompt = qs + if model.config.mm_use_im_start_end: + qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs + else: + qs = DEFAULT_IMAGE_TOKEN + '\n' + qs + + conv = conv_templates[args.conv_mode].copy() + conv.append_message(conv.roles[0], qs) + conv.append_message(conv.roles[1], None) + prompt = conv.get_prompt() + + input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() + + image = Image.open(os.path.join(args.image_folder, image_file)).convert('RGB') + image_tensor = process_images([image], image_processor, model.config)[0] + + prep_batches(input_ids, image_tensor, model) diff --git a/llava/model/llava_arch.py b/llava/model/llava_arch.py index d71650eac..71507f9c4 100644 --- a/llava/model/llava_arch.py +++ b/llava/model/llava_arch.py @@ -199,7 +199,7 @@ def prepare_inputs_labels_for_multimodal( else: raise ValueError(f"Unexpected mm_patch_merge_type: {self.config.mm_patch_merge_type}") else: - image_features = self.encode_images(images) + image_features = self.encode_images(images) # send image through CLIP, and projected by W matrix into language space # TODO: image start / end is not implemented here to support pretraining. 
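
As context for the filtering that get_tkns() in pipeline.py performs on the embeddings produced here: prepare_inputs_labels_for_multimodal returns one embedding sequence per prompt in which the projected image patches sit between the surrounding text tokens, and torch.split with per-chunk sizes pulls the three segments apart. A tiny synthetic illustration (the sizes are assumptions in the spirit of LLaVA-1.5-13B: hidden size 5120, 576 CLIP patches):

    import torch

    embeds = torch.randn(700, 5120)   # one prompt: 700 embedded tokens (synthetic)
    chunk_sizes = [35, 576, 89]       # [text before image, image patches, text after image]
    pre_text, img_patches, post_text = torch.split(embeds, chunk_sizes, dim=0)
    assert img_patches.shape == (576, 5120)
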
if getattr(self.config, 'tune_mm_mlp_adapter', False) and getattr(self.config, 'mm_use_im_start_end', False): @@ -216,9 +216,9 @@ def prepare_inputs_labels_for_multimodal( attention_mask = torch.ones_like(input_ids, dtype=torch.bool) else: attention_mask = attention_mask.bool() - if position_ids is None: + if position_ids is None: # identifies the start of each token position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device) - if labels is None: + if labels is None: # i dont see where this is used labels = torch.full_like(input_ids, IGNORE_INDEX) # remove the padding using attention_mask -- FIXME @@ -296,7 +296,7 @@ def prepare_inputs_labels_for_multimodal( new_labels_padded[i, -cur_len:] = cur_new_labels attention_mask[i, -cur_len:] = True position_ids[i, -cur_len:] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device) - else: + else: # zero padding new_input_embeds_padded.append(torch.cat(( cur_new_embed, torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device) From 9afb3b86480e05337f536549eebb44ebb4a27c7b Mon Sep 17 00:00:00 2001 From: smirrashidi Date: Wed, 17 Jul 2024 23:00:29 +0000 Subject: [PATCH 06/41] rename --- llava/{GAN/trainGAN.py => VLLMSafety/pipeline.py} | 2 +- llava/{GAN => VLLMSafety}/send.py | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename llava/{GAN/trainGAN.py => VLLMSafety/pipeline.py} (99%) rename llava/{GAN => VLLMSafety}/send.py (100%) diff --git a/llava/GAN/trainGAN.py b/llava/VLLMSafety/pipeline.py similarity index 99% rename from llava/GAN/trainGAN.py rename to llava/VLLMSafety/pipeline.py index fcedce2ec..80f32cec9 100644 --- a/llava/GAN/trainGAN.py +++ b/llava/VLLMSafety/pipeline.py @@ -98,7 +98,7 @@ def train_gen(args): image = Image.open(os.path.join(args.image_folder, image_file)).convert('RGB') image_tensor = process_images([image], image_processor, model.config)[0] - prep_batches(input_ids, image_tensor, model) + tkn_dict = prep_batches(input_ids, image_tensor, model) diff --git a/llava/GAN/send.py b/llava/VLLMSafety/send.py similarity index 100% rename from llava/GAN/send.py rename to llava/VLLMSafety/send.py From 8351c7745b8c0adf282517c1332a582b9a5c383d Mon Sep 17 00:00:00 2001 From: smirrashidi Date: Sun, 21 Jul 2024 02:39:27 +0000 Subject: [PATCH 07/41] saving --- llava/VLLMSafety/pipeline.py | 106 +++++------------------------------ 1 file changed, 15 insertions(+), 91 deletions(-) diff --git a/llava/VLLMSafety/pipeline.py b/llava/VLLMSafety/pipeline.py index 80f32cec9..52a6500e4 100644 --- a/llava/VLLMSafety/pipeline.py +++ b/llava/VLLMSafety/pipeline.py @@ -15,9 +15,9 @@ from llava.model.builder import load_pretrained_model from llava.utils import disable_torch_init from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path +from llava.llava_arch import prepare_inputs_labels_for_multimodal -from discriminator.py import Discriminator # import Laya's discriminatorcod -from make_data.py import CustomDataset # impor the dataset class +from discriminator.py import Discriminator # import Laya's discriminator def split_list(lst, n): # taken from model_vqa.py """Split a list into n (roughly) equal-sized chunks""" @@ -61,15 +61,14 @@ def prep_batches(input_ids, image_tensor, model): return tkn_dict - -def train_gen(args): +def train(args): device = 'cuda' # set device appropriately EPOCHS = 10 G_losses = [] D_losses = [] iters = 0 - ## boot up model and get everything running properly - THIS IS THE 
GENERATOR + ## boot up model and get everything running properly disable_torch_init() model_path = os.path.expanduser(args.model_path) model_name = get_model_name_from_path(model_path) @@ -83,7 +82,7 @@ def train_gen(args): idx = line["question_id"] # can be used to identify each batch, probably good to use to keep track of progress during training image_file = line["image"] qs = line["text"] - if model.config.mm_use_im_start_end: + if model.config.mm_use_im_start_end: # gets skipped with current config qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs else: qs = DEFAULT_IMAGE_TOKEN + '\n' + qs @@ -99,91 +98,16 @@ def train_gen(args): image_tensor = process_images([image], image_processor, model.config)[0] tkn_dict = prep_batches(input_ids, image_tensor, model) - - - - - - real_label = 1 - fake_label = 0 - - - - - lr = 0.0002 - beta1 = 0.5 - - # instantiate discriminator and send to device - - IMAGE_SIZE = 1024 * 5 # TODO: ensure this is correct - NUM_CLASSES = 2 - discrim = Discriminator(IMAGE_SIZE, NUM_CLASSES) # TODO: apply weights - discrim.to(device) - - # TODO: instantiate generator and send to device - i dont think this code is correct, maybe its better to load the weights directly into a new class - mm_projector_weights = torch.load(os.path.join(model_path, 'mm_projector.bin'), map_location='cpu') - mm_projector_weights = {k: v.to(torch.float16) for k, v in mm_projector_weights.items()} - gen = model.load_state_dict(mm_projector_weights, strict=False) - - # loss functions and optimizers - criterion = nn.BCELoss() - optimizerD = optim.Adam(discrim.parameters(), lr=lr, betas=(beta1, 0.999)) - optimizerG = optim.Adam(gen.parameters(), lr=lr, betas=(beta1, 0.999)) - - for epoch in range(EPOCHS): - # TODO: do we train the discrim at all? 
or simply send generated tokens to discrim and calcualte loss - for i, (data1, data0) in enumerate(zip(dataloader1, dataloader0)): # this loop is following along DCGAN tutorial in pytroch documentation - - # train discriminator with all real batch - discrim.zero_grad() - - real_batch = data1[0].to(device) - rb_size = real_batch.size(0) - label = torch.full((rb_size,), real_label, dtype=torch.float, device=device) - - # forward pass through discrim - output = discrim(real_batch).view(-1) - - # calculate loss on real token batch - errD_real = criterion(output, label) - - #calculate gradients backward pass - errD_real.backward() - D_x = output.mean().item() - - # train discriminator with all fake batch - fake_batch = data0[0].to(device) - fb_size = fake_batch.size(0) - label = torch.full((fb_size,), fake_label, dtype=torch.float, device=device) - - # project the tokens - fake_tkns = gen(fake_batch) - - # send fake batch to discrim for classification - output = discrim(fake_tkns.detach()).view(-1) - - # calculate discrim loss on fake batch - errD_fake = criterion(output, label) - D_G_z1 = output.mean().item() - - # compute error of D as sum over the fake and the real batches - errD = errD_real + errD_fake - - # update D - optimizerD.step() - - - - pass - pass - return NotImplemented + real_label = 1 + fake_label = 0 -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--model-path", type=str, default="facebook/opt-350m") - parser.add_argument("--model-base", type=str, default=None) - parser.add_argument("--data, type=str, default=") - args = parser.parse_args() + if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model-path", type=str, default= "/home/smirrashidi/llava-v1.5-13b") + parser.add_argument("--model-base", type=str, default=None) + parser.add_argument("--image_folder", type=str, default= "/home/smirrashidi/coco_data/images") + parser.add_argument("--conversation_file", type=str, default= "/home/smirrashidi/coco_data/discrim_data.jsonl") + args = parser.parse_args() - train_gen(args) \ No newline at end of file + train(args) \ No newline at end of file From 35a0d75334d47f746abddc58636d3ce56bfa3b9f Mon Sep 17 00:00:00 2001 From: smirrashidi Date: Sun, 21 Jul 2024 06:26:37 +0000 Subject: [PATCH 08/41] debugging pipeline --- llava/VLLMSafety/pipeline.py | 89 +++++++++++++++++++++--------------- 1 file changed, 51 insertions(+), 38 deletions(-) diff --git a/llava/VLLMSafety/pipeline.py b/llava/VLLMSafety/pipeline.py index 52a6500e4..d09a503fa 100644 --- a/llava/VLLMSafety/pipeline.py +++ b/llava/VLLMSafety/pipeline.py @@ -6,6 +6,7 @@ import torch.nn.parallel import torch.optim as optim import torch.utils.data +import math import os import random from datetime import datetime @@ -15,9 +16,9 @@ from llava.model.builder import load_pretrained_model from llava.utils import disable_torch_init from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path -from llava.llava_arch import prepare_inputs_labels_for_multimodal +#from llava.model.llava_arch import prepare_inputs_labels_for_multimodal -from discriminator.py import Discriminator # import Laya's discriminator +# from discriminator.py import Discriminator # import Lthe discriminator class def split_list(lst, n): # taken from model_vqa.py """Split a list into n (roughly) equal-sized chunks""" @@ -29,26 +30,23 @@ def get_chunk(lst, n, k): # taken from model_vqa.py chunks = split_list(lst, n) return chunks[k] -def prep_batches(input_ids, 
image_tensor, model): +def get_tkns(input_ids, image_tensor, model, img_size): '''takes in one prompt and one image and returns a dictionary that has the language tokens from the prompt as one entry and the image tokens from the prompt as another''' position_ids = None # set to None in generate() attention_mask = None # set to None in generate() must figure out if this and the above is acceptable - image_size = image_tensor.size() # prep_inputs... returns None as the first value, but idk why - none_q, position_ids, attention_mask, past_key_values, input_embeds, labels, chunk_sizes = model.prepare_inputs_labels_for_multimodal( + none_q, position_ids, attention_mask, past_key_values, input_embeds, labels, chunk_sizes = model.prepare_inputs_labels_for_multimodal( input_ids = input_ids, position_ids = position_ids, attention_mask = attention_mask, past_key_values = None, labels = None, - images = image_tensor, - image_sizes=image_size + images = image_tensor.unsqueeze(0).half().cuda(), + image_sizes = img_size ) - # filter output to create the batch CURRENTLY ONLY WORKS IN ONE CASE where its text - image - text -> NEED TO GENERALIZE - # also has not been tested yet so does it work? tbh idk split_embeddings = torch.split(input_embeds, chunk_sizes, dim=0) lang_tkns = torch.cat(split_embeddings[0], split_embeddings[2]) img_tkns = split_embeddings[1] @@ -60,8 +58,31 @@ def prep_batches(input_ids, image_tensor, model): return tkn_dict +def prep_batches(line, model, tokenizer, image_processor, rags, **kwargs): + idx = line["id"] # can be used to identify each batch, probably good to use to keep track of progress during training + image_file = line["image"] + qs = line["text"] + + if qs.startswith(f"{DEFAULT_IMAGE_TOKEN}\n") == False: + qs = DEFAULT_IMAGE_TOKEN + '\n' + qs + + conv = conv_templates[args.conv_mode].copy() + conv.append_message(conv.roles[0], qs) + conv.append_message(conv.roles[1], None) + prompt = conv.get_prompt() + + input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() + + image = Image.open(os.path.join(args.image_folder, image_file)).convert('RGB') + image_tensor = process_images([image], image_processor, model.config)[0] + image_sizes = [image.size] + + tkn_dict = get_tkns(input_ids, image_tensor, model, image_sizes) + def train(args): + args_dict = vars(args) + device = 'cuda' # set device appropriately EPOCHS = 10 G_losses = [] @@ -75,39 +96,31 @@ def train(args): tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) # get data - following along with model_vqa.py - questions = [json.loads(q) for q in open(os.path.expanduser(args.data), "r")] + questions = [json.loads(q) for q in open(os.path.expanduser(args.conversation_file), "r")] questions = get_chunk(questions, args.num_chunks, args.chunk_idx) - for line in questions: - idx = line["question_id"] # can be used to identify each batch, probably good to use to keep track of progress during training - image_file = line["image"] - qs = line["text"] - if model.config.mm_use_im_start_end: # gets skipped with current config - qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs - else: - qs = DEFAULT_IMAGE_TOKEN + '\n' + qs - - conv = conv_templates[args.conv_mode].copy() - conv.append_message(conv.roles[0], qs) - conv.append_message(conv.roles[1], None) - prompt = conv.get_prompt() - - input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, 
return_tensors='pt').unsqueeze(0).cuda() + # right now each batch is created one by one for each conversation in the file, maybe we want to precompute all the + # batches ahead of time? or maybe we consolidate this for-loop into a function? for now it should work but + # just some things to think about - image = Image.open(os.path.join(args.image_folder, image_file)).convert('RGB') - image_tensor = process_images([image], image_processor, model.config)[0] - - tkn_dict = prep_batches(input_ids, image_tensor, model) + for line in questions: + tkn_dict = prep_batches(line, model, tokenizer, image_processor, args, **args_dict) real_label = 1 fake_label = 0 - if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--model-path", type=str, default= "/home/smirrashidi/llava-v1.5-13b") - parser.add_argument("--model-base", type=str, default=None) - parser.add_argument("--image_folder", type=str, default= "/home/smirrashidi/coco_data/images") - parser.add_argument("--conversation_file", type=str, default= "/home/smirrashidi/coco_data/discrim_data.jsonl") - args = parser.parse_args() - - train(args) \ No newline at end of file +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model-path", type=str, default= "/home/smirrashidi/llava-v1.5-13b") + parser.add_argument("--model-base", type=str, default=None) + parser.add_argument("--image_folder", type=str, default= "/home/smirrashidi/coco_data/images") + parser.add_argument("--conversation_file", type=str, default= "/home/smirrashidi/coco_data/discrim_data.jsonl") + parser.add_argument("--conv-mode", type=str, default="llava_v1") + parser.add_argument("--num-chunks", type=int, default=1) + parser.add_argument("--chunk-idx", type=int, default=0) + parser.add_argument("--temperature", type=float, default=0.2) + parser.add_argument("--top_p", type=float, default=None) + parser.add_argument("--num_beams", type=int, default=1) + args = parser.parse_args() + + train(args) \ No newline at end of file From f4075a1b7f965d54a17ee652a51eaf7c8c2c33b7 Mon Sep 17 00:00:00 2001 From: smirrashidi Date: Mon, 22 Jul 2024 02:41:54 +0000 Subject: [PATCH 09/41] finished debugging --- llava/VLLMSafety/pipeline.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/llava/VLLMSafety/pipeline.py b/llava/VLLMSafety/pipeline.py index d09a503fa..fff84bc2b 100644 --- a/llava/VLLMSafety/pipeline.py +++ b/llava/VLLMSafety/pipeline.py @@ -47,8 +47,8 @@ def get_tkns(input_ids, image_tensor, model, img_size): image_sizes = img_size ) - split_embeddings = torch.split(input_embeds, chunk_sizes, dim=0) - lang_tkns = torch.cat(split_embeddings[0], split_embeddings[2]) + split_embeddings = torch.split(input_embeds[0], chunk_sizes, dim=0) + lang_tkns = torch.cat((split_embeddings[0], split_embeddings[2]), 0) img_tkns = split_embeddings[1] tkn_dict = { @@ -59,12 +59,19 @@ def get_tkns(input_ids, image_tensor, model, img_size): return tkn_dict def prep_batches(line, model, tokenizer, image_processor, rags, **kwargs): - idx = line["id"] # can be used to identify each batch, probably good to use to keep track of progress during training + q_id = line["id"] # can be used to identify each batch, probably good to use to keep track of progress during training image_file = line["image"] qs = line["text"] if qs.startswith(f"{DEFAULT_IMAGE_TOKEN}\n") == False: + idx = qs.find(DEFAULT_IMAGE_TOKEN) + len(DEFAULT_IMAGE_TOKEN) + qs = qs[idx:].strip() + qs = qs[idx:] qs = DEFAULT_IMAGE_TOKEN + '\n' + qs + 
assert qs.startswith(f"{DEFAULT_IMAGE_TOKEN}\n") == True, f'no image tag found in text \n text = {qs} \n id = {q_id}' + + # something to note: this appends a default prompt to each prompt, might impact discrim since it will keep getting trained on + # the same tokens. i'll adjust to remove this soon conv = conv_templates[args.conv_mode].copy() conv.append_message(conv.roles[0], qs) From 50ddf73ca3507ac4a916b47632f2020916f9240c Mon Sep 17 00:00:00 2001 From: smirrashidi Date: Mon, 22 Jul 2024 04:39:39 +0000 Subject: [PATCH 10/41] update --- llava/VLLMSafety/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llava/VLLMSafety/pipeline.py b/llava/VLLMSafety/pipeline.py index fff84bc2b..bde2b4aad 100644 --- a/llava/VLLMSafety/pipeline.py +++ b/llava/VLLMSafety/pipeline.py @@ -48,7 +48,7 @@ def get_tkns(input_ids, image_tensor, model, img_size): ) split_embeddings = torch.split(input_embeds[0], chunk_sizes, dim=0) - lang_tkns = torch.cat((split_embeddings[0], split_embeddings[2]), 0) + lang_tkns = split_embeddings[2] # only the second to avoid adding the same tokens over and over img_tkns = split_embeddings[1] tkn_dict = { From a96449e05578acaa983294b9205aa75c41a28b16 Mon Sep 17 00:00:00 2001 From: Laya Pullela Date: Tue, 23 Jul 2024 09:10:25 -0700 Subject: [PATCH 11/41] integrated discriminator --- llava/VLLMSafety/discriminator.py | 135 ++++++++++++++++++++++++++++++ llava/VLLMSafety/pipeline.py | 40 ++++----- 2 files changed, 155 insertions(+), 20 deletions(-) create mode 100644 llava/VLLMSafety/discriminator.py diff --git a/llava/VLLMSafety/discriminator.py b/llava/VLLMSafety/discriminator.py new file mode 100644 index 000000000..6e5eb4f89 --- /dev/null +++ b/llava/VLLMSafety/discriminator.py @@ -0,0 +1,135 @@ +import torch +import torch.nn as nn +import torch.optim as optim +import torch.nn.functional as F +import numpy as np +import random + + +class EasyNeuralNetwork(nn.Module): + def __init__(self, input_size, classes): + super().__init__() # we can add more layers later + # layer 1 + self.fc1 = nn.Linear(input_size, 50) + # layer 2 + self.fc2 = nn.Linear(50, classes) + + def forward(self, x): + # run x through the layers and activation functions + # (relu activation function is just max(0, x)) + x = F.relu(self.fc1(x)) + # normally there's no activation function on last layer (except softmax etc. 
when needed) + x = self.fc2(x) + + return x + + +def evaluate(model, loss_function, X, y): + predictions = model(X) # pass thorugh model + loss = loss_function(predictions, y) + predictions = predictions.argmax(dim=1).cpu().numpy() + acc = (predictions == y.cpu().numpy()).mean() + return predictions, acc, loss + + +def train(training_dataloader, IMAGE_SHAPE=1024 * 5, NUM_CLASSES=2, device='cuda', EPOCHS=10): + model = EasyNeuralNetwork(IMAGE_SHAPE, NUM_CLASSES) + model.to(device) # put the model on the device (remember its cuda on workstation) + optimizer = optim.Adam(model.parameters(), lr=0.001) + loss_function = nn.CrossEntropyLoss() + + training_acc_lst, training_loss_lst = [], [] + + epochs_acc = [] + for epoch in range(EPOCHS): + print(f'Epoch {epoch + 1}') + epoch_acc = [] + training_acc_checkpoint, training_loss_checkpoint = [], [] + for step, (data, labels) in enumerate(training_dataloader): + data = data.float().unsqueeze(0) + labels = labels.unsqueeze(0) + + data, labels = data.to(device), labels.to(device) # Convert labels to tensor if not already + + predictions, acc, loss = evaluate(model, loss_function, data, labels) + training_acc_checkpoint.append(acc) + epoch_acc.append(acc) + + # loss already calculated in the evaluate() call. just append it + training_loss_checkpoint.append(loss.item()) + + # back propagation + loss.backward() + + # gradient descent + optimizer.step() + + # zero the gradients so they do not accumulate + optimizer.zero_grad() + + # epoch end + print("Accuracy: ", np.mean(epoch_acc)) + epochs_acc.append(np.mean(epoch_acc)) + + # can do some optimizations here if you want early stopping, right now im not gonna implement this + + model.train(mode=False) # exit training mode + + return training_acc_lst, training_loss_lst, model + + +# def test(): +# model.train(False) # since were testing + +# test_loss = [] +# test_acc = [] + +# for X,y in test_loader: +# with torch.no_grad(): +# X, y = X.to(device), y.to(device) +# predictions = model(X) #as above: check dimentions + +# loss = loss_function(predictions, y) +# test_loss.append(loss.item()) + +# test_acc.append((predictions.argmax(dim=1).cpu().numpy() == y.cpu().numpy()).mean()) + +# print(f'Accuracy: {np.mean(test_acc):.2f}, Loss: {np.mean(test_loss):.2f}') + +# return test_acc #idc about test_loss + + +def preprocess_and_call_train(get_tkns): + # set device to cpu + device = 'mps:0' if torch.backends.mps.is_available() else 'cpu' # if we are running this on workstation change this to cuda + + # Example data loading (assuming you have loaded im_tok and lang_tok) + im_tok = [entry['img_tkns'] for entry in get_tkns.items()] + lang_tok = [entry['lang_tkns'] for entry in get_tkns.items()] + + combined_tokens = [(token, torch.tensor(0)) for token in im_tok] + [(token, torch.tensor(1)) for token in lang_tok] + + # Optionally shuffle the combined list to randomize the order + random.shuffle(combined_tokens) + + # testing code... if our embeddings are the wrong side we are doing something wrong. 
+ assert im_tok[0].flatten().size() == [1024*5, + 1], ("flattened image tokens fed to discriminator do not match the size of " + "disc first layer") + assert lang_tok[0].flatten().size() == [1024*5, + 1], ("flattened language tokens fed to discriminator do not match the size " + "of disc first layer") + + # train network + training_acc_lst, training_loss_lst, model = train(combined_tokens, device=device) + + print("------final training accuracy:", training_acc_lst[-1]) + + # not gonna do any eval for now + # test_acc = test() + + # save the model + # PATH = 'models/desc_v1_llava.pth' + # torch.save(model, PATH) + + return model diff --git a/llava/VLLMSafety/pipeline.py b/llava/VLLMSafety/pipeline.py index bde2b4aad..ef860b6e4 100644 --- a/llava/VLLMSafety/pipeline.py +++ b/llava/VLLMSafety/pipeline.py @@ -18,7 +18,7 @@ from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path #from llava.model.llava_arch import prepare_inputs_labels_for_multimodal -# from discriminator.py import Discriminator # import Lthe discriminator class +from discriminator import preprocess_and_call_train def split_list(lst, n): # taken from model_vqa.py """Split a list into n (roughly) equal-sized chunks""" @@ -31,24 +31,24 @@ def get_chunk(lst, n, k): # taken from model_vqa.py return chunks[k] def get_tkns(input_ids, image_tensor, model, img_size): - '''takes in one prompt and one image and returns a dictionary that has the language tokens + '''takes in one prompt and one image and returns a dictionary that has the language tokens from the prompt as one entry and the image tokens from the prompt as another''' position_ids = None # set to None in generate() - attention_mask = None # set to None in generate() must figure out if this and the above is acceptable + attention_mask = None # set to None in generate() must figure out if this and the above is acceptable - # prep_inputs... returns None as the first value, but idk why + # prep_inputs... returns None as the first value, but idk why none_q, position_ids, attention_mask, past_key_values, input_embeds, labels, chunk_sizes = model.prepare_inputs_labels_for_multimodal( input_ids = input_ids, position_ids = position_ids, - attention_mask = attention_mask, + attention_mask = attention_mask, past_key_values = None, - labels = None, + labels = None, images = image_tensor.unsqueeze(0).half().cuda(), - image_sizes = img_size - ) + image_sizes = img_size + ) split_embeddings = torch.split(input_embeds[0], chunk_sizes, dim=0) - lang_tkns = split_embeddings[2] # only the second to avoid adding the same tokens over and over + lang_tkns = split_embeddings[2] # only the second to avoid adding the same tokens over and over img_tkns = split_embeddings[1] tkn_dict = { @@ -71,7 +71,7 @@ def prep_batches(line, model, tokenizer, image_processor, rags, **kwargs): assert qs.startswith(f"{DEFAULT_IMAGE_TOKEN}\n") == True, f'no image tag found in text \n text = {qs} \n id = {q_id}' # something to note: this appends a default prompt to each prompt, might impact discrim since it will keep getting trained on - # the same tokens. i'll adjust to remove this soon + # the same tokens. 
i'll adjust to remove this soon conv = conv_templates[args.conv_mode].copy() conv.append_message(conv.roles[0], qs) @@ -84,17 +84,18 @@ def prep_batches(line, model, tokenizer, image_processor, rags, **kwargs): image_tensor = process_images([image], image_processor, model.config)[0] image_sizes = [image.size] - tkn_dict = get_tkns(input_ids, image_tensor, model, image_sizes) + tkn_dict = get_tkns(input_ids, image_tensor, model, image_sizes) #returns tkn_dict with image and language tokens + projection_model = preprocess_and_call_train(tkn_dict) def train(args): args_dict = vars(args) - + device = 'cuda' # set device appropriately EPOCHS = 10 G_losses = [] D_losses = [] - iters = 0 + iters = 0 ## boot up model and get everything running properly disable_torch_init() @@ -106,15 +107,14 @@ def train(args): questions = [json.loads(q) for q in open(os.path.expanduser(args.conversation_file), "r")] questions = get_chunk(questions, args.num_chunks, args.chunk_idx) - # right now each batch is created one by one for each conversation in the file, maybe we want to precompute all the - # batches ahead of time? or maybe we consolidate this for-loop into a function? for now it should work but - # just some things to think about + # right now each batch is created one by one for each conversation in the file, maybe we want to precompute all the + # batches ahead of time? maybe we consolidate this for-loop into a function? for now it should work but + # just some things to think about - for line in questions: + for line in questions: tkn_dict = prep_batches(line, model, tokenizer, image_processor, args, **args_dict) - real_label = 1 - fake_label = 0 + if __name__ == "__main__": parser = argparse.ArgumentParser() @@ -130,4 +130,4 @@ def train(args): parser.add_argument("--num_beams", type=int, default=1) args = parser.parse_args() - train(args) \ No newline at end of file + train(args) From 76fb9ceee7cfb98fd61b84f2835f2304fe07e572 Mon Sep 17 00:00:00 2001 From: smirrashidi Date: Wed, 24 Jul 2024 20:08:52 +0000 Subject: [PATCH 12/41] fix? 
--- llava/VLLMSafety/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llava/VLLMSafety/pipeline.py b/llava/VLLMSafety/pipeline.py index ef860b6e4..bc9fb436a 100644 --- a/llava/VLLMSafety/pipeline.py +++ b/llava/VLLMSafety/pipeline.py @@ -58,7 +58,7 @@ def get_tkns(input_ids, image_tensor, model, img_size): return tkn_dict -def prep_batches(line, model, tokenizer, image_processor, rags, **kwargs): +def prep_batches(line, model, tokenizer, image_processor, args, **kwargs): q_id = line["id"] # can be used to identify each batch, probably good to use to keep track of progress during training image_file = line["image"] qs = line["text"] From ee61bfdf556f8082f2b80dfa35ec7c7274a4a831 Mon Sep 17 00:00:00 2001 From: lpullela Date: Wed, 24 Jul 2024 20:15:26 +0000 Subject: [PATCH 13/41] move call to training bug --- llava/VLLMSafety/pipeline.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/llava/VLLMSafety/pipeline.py b/llava/VLLMSafety/pipeline.py index ef860b6e4..a1fd9aec7 100644 --- a/llava/VLLMSafety/pipeline.py +++ b/llava/VLLMSafety/pipeline.py @@ -86,7 +86,7 @@ def prep_batches(line, model, tokenizer, image_processor, rags, **kwargs): tkn_dict = get_tkns(input_ids, image_tensor, model, image_sizes) #returns tkn_dict with image and language tokens - projection_model = preprocess_and_call_train(tkn_dict) + return tkn_dict def train(args): args_dict = vars(args) @@ -101,6 +101,7 @@ def train(args): disable_torch_init() model_path = os.path.expanduser(args.model_path) model_name = get_model_name_from_path(model_path) + breakpoint() tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) # get data - following along with model_vqa.py @@ -114,6 +115,8 @@ def train(args): for line in questions: tkn_dict = prep_batches(line, model, tokenizer, image_processor, args, **args_dict) + projection_model = preprocess_and_call_train(tkn_dict) + if __name__ == "__main__": From 1262f5a9ce233a80974a426ec585deecb8784766 Mon Sep 17 00:00:00 2001 From: lpullela Date: Wed, 24 Jul 2024 21:26:21 +0000 Subject: [PATCH 14/41] adding updated llava arch --- llava/model/llava_arch.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/llava/model/llava_arch.py b/llava/model/llava_arch.py index 71507f9c4..15a16b6c2 100644 --- a/llava/model/llava_arch.py +++ b/llava/model/llava_arch.py @@ -150,6 +150,7 @@ def prepare_inputs_labels_for_multimodal( if vision_tower is None or images is None or input_ids.shape[1] == 1: return input_ids, position_ids, attention_mask, past_key_values, None, labels + assert images.ndim < 5, "image dimension exceeds 5" if type(images) is list or images.ndim == 5: if type(images) is list: images = [x.unsqueeze(0) if x.ndim == 3 else x for x in images] @@ -199,7 +200,7 @@ def prepare_inputs_labels_for_multimodal( else: raise ValueError(f"Unexpected mm_patch_merge_type: {self.config.mm_patch_merge_type}") else: - image_features = self.encode_images(images) # send image through CLIP, and projected by W matrix into language space + image_features = self.encode_images(images) # TODO: image start / end is not implemented here to support pretraining. if getattr(self.config, 'tune_mm_mlp_adapter', False) and getattr(self.config, 'mm_use_im_start_end', False): @@ -209,6 +210,7 @@ def prepare_inputs_labels_for_multimodal( # it is a headache to deal with None all the time. # But it is not ideal, and if you have a better idea, # please open an issue / submit a PR, thanks. 
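
As a reading aid for the chunk_sizes bookkeeping added further down in this patch: the three entries are the number of text tokens before the image tag, the number of image patches, and the number of text tokens after the tag. A toy illustration (IMAGE_TOKEN_INDEX is -200 in llava.constants; the other numbers are made up):

    import torch

    IMAGE_TOKEN_INDEX = -200
    input_ids = torch.tensor([1, 319, 13, -200, 29871, 13, 2])  # toy prompt with one image tag
    img_pos = (input_ids == IMAGE_TOKEN_INDEX).nonzero().item()
    split_sizes = [img_pos, input_ids.numel() - img_pos - 1]    # text before / after the tag
    num_patches = 576                                           # image_features.size(1) for a 336px CLIP-L/14 tower
    chunk_sizes = [split_sizes[0], num_patches, split_sizes[1]] # -> [3, 576, 3]
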
+ _labels = labels _position_ids = position_ids _attention_mask = attention_mask @@ -216,9 +218,9 @@ def prepare_inputs_labels_for_multimodal( attention_mask = torch.ones_like(input_ids, dtype=torch.bool) else: attention_mask = attention_mask.bool() - if position_ids is None: # identifies the start of each token + if position_ids is None: position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device) - if labels is None: # i dont see where this is used + if labels is None: labels = torch.full_like(input_ids, IGNORE_INDEX) # remove the padding using attention_mask -- FIXME @@ -253,6 +255,8 @@ def prepare_inputs_labels_for_multimodal( cur_new_input_embeds = [] cur_new_labels = [] + chunk_sizes = [split_sizes[0], image_features.size(1), split_sizes[1]] # used for filtering for the discriminator + for i in range(num_images + 1): cur_new_input_embeds.append(cur_input_embeds_no_im[i]) cur_new_labels.append(cur_labels_noim[i]) @@ -296,7 +300,7 @@ def prepare_inputs_labels_for_multimodal( new_labels_padded[i, -cur_len:] = cur_new_labels attention_mask[i, -cur_len:] = True position_ids[i, -cur_len:] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device) - else: # zero padding + else: new_input_embeds_padded.append(torch.cat(( cur_new_embed, torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device) @@ -305,6 +309,8 @@ def prepare_inputs_labels_for_multimodal( new_labels_padded[i, :cur_len] = cur_new_labels attention_mask[i, :cur_len] = True position_ids[i, :cur_len] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device) + + assert torch.equal(new_input_embeds[0], new_input_embeds_padded[0]) == True, "padding changed the tensor in prepare_inputs_labels_for_multimodal, so tensor filtering will not work" new_input_embeds = torch.stack(new_input_embeds_padded, dim=0) @@ -321,7 +327,7 @@ def prepare_inputs_labels_for_multimodal( if _position_ids is None: position_ids = None - return None, position_ids, attention_mask, past_key_values, new_input_embeds, new_labels + return None, position_ids, attention_mask, past_key_values, new_input_embeds, new_labels, chunk_sizes def initialize_vision_tokenizer(self, model_args, tokenizer): if model_args.mm_use_im_patch_token: @@ -365,4 +371,4 @@ def initialize_vision_tokenizer(self, model_args, tokenizer): for p in self.get_input_embeddings().parameters(): p.requires_grad = False for p in self.get_output_embeddings().parameters(): - p.requires_grad = False + p.requires_grad = False \ No newline at end of file From 1287702471345485efe05d03a543e67050e6c159 Mon Sep 17 00:00:00 2001 From: lpullela Date: Thu, 25 Jul 2024 14:40:26 +0000 Subject: [PATCH 15/41] disc working with 92% acc, can add more layers to nn for better acc --- llava/VLLMSafety/discriminator.py | 27 +++++++++++++-------------- llava/VLLMSafety/pipeline.py | 9 +++------ 2 files changed, 16 insertions(+), 20 deletions(-) diff --git a/llava/VLLMSafety/discriminator.py b/llava/VLLMSafety/discriminator.py index 6e5eb4f89..c910ab1a8 100644 --- a/llava/VLLMSafety/discriminator.py +++ b/llava/VLLMSafety/discriminator.py @@ -32,14 +32,12 @@ def evaluate(model, loss_function, X, y): return predictions, acc, loss -def train(training_dataloader, IMAGE_SHAPE=1024 * 5, NUM_CLASSES=2, device='cuda', EPOCHS=10): +def train(training_dataloader, IMAGE_SHAPE=1024 * 5, NUM_CLASSES=2, device='cuda', EPOCHS=100): model = EasyNeuralNetwork(IMAGE_SHAPE, NUM_CLASSES) model.to(device) # put the 
model on the device (remember its cuda on workstation) optimizer = optim.Adam(model.parameters(), lr=0.001) loss_function = nn.CrossEntropyLoss() - training_acc_lst, training_loss_lst = [], [] - epochs_acc = [] for epoch in range(EPOCHS): print(f'Epoch {epoch + 1}') @@ -75,7 +73,7 @@ def train(training_dataloader, IMAGE_SHAPE=1024 * 5, NUM_CLASSES=2, device='cuda model.train(mode=False) # exit training mode - return training_acc_lst, training_loss_lst, model + return epochs_acc, model # def test(): @@ -101,29 +99,30 @@ def train(training_dataloader, IMAGE_SHAPE=1024 * 5, NUM_CLASSES=2, device='cuda def preprocess_and_call_train(get_tkns): # set device to cpu - device = 'mps:0' if torch.backends.mps.is_available() else 'cpu' # if we are running this on workstation change this to cuda + device = 'cuda' if torch.cuda.is_available() else 'cpu' # if we are running this on workstation change this to cuda # Example data loading (assuming you have loaded im_tok and lang_tok) - im_tok = [entry['img_tkns'] for entry in get_tkns.items()] - lang_tok = [entry['lang_tkns'] for entry in get_tkns.items()] - combined_tokens = [(token, torch.tensor(0)) for token in im_tok] + [(token, torch.tensor(1)) for token in lang_tok] + im_tok = get_tkns["img_tkns"].tolist() + lang_tok = get_tkns["lang_tkns"].tolist() + + combined_tokens = [(torch.tensor(token), torch.tensor(0)) for token in im_tok] + [(torch.tensor(token), torch.tensor(1)) for token in lang_tok] # Optionally shuffle the combined list to randomize the order random.shuffle(combined_tokens) # testing code... if our embeddings are the wrong side we are doing something wrong. - assert im_tok[0].flatten().size() == [1024*5, - 1], ("flattened image tokens fed to discriminator do not match the size of " + assert combined_tokens[0][0].flatten().size() == torch.Size([1024*5]), ("flattened image tokens fed to discriminator do not match the size of " "disc first layer") - assert lang_tok[0].flatten().size() == [1024*5, - 1], ("flattened language tokens fed to discriminator do not match the size " + assert combined_tokens[0][0].flatten().size() == torch.Size([1024*5]), ("flattened language tokens fed to discriminator do not match the size " "of disc first layer") # train network - training_acc_lst, training_loss_lst, model = train(combined_tokens, device=device) + epochs_acc, model = train(combined_tokens, device=device) + - print("------final training accuracy:", training_acc_lst[-1]) + if( len(epochs_acc) > 0 ): + print("-----------final epochs acc--------------: ", epochs_acc[-1]) # not gonna do any eval for now # test_acc = test() diff --git a/llava/VLLMSafety/pipeline.py b/llava/VLLMSafety/pipeline.py index a3c569c67..d2e9b9db9 100644 --- a/llava/VLLMSafety/pipeline.py +++ b/llava/VLLMSafety/pipeline.py @@ -52,13 +52,13 @@ def get_tkns(input_ids, image_tensor, model, img_size): img_tkns = split_embeddings[1] tkn_dict = { - lang_tkns: lang_tkns, - img_tkns: img_tkns + "lang_tkns": lang_tkns, + "img_tkns": img_tkns } return tkn_dict -def prep_batches(line, model, tokenizer, image_processor, args, **kwargs): +def prep_batches(line, model, tokenizer, image_processor, rags, **kwargs): q_id = line["id"] # can be used to identify each batch, probably good to use to keep track of progress during training image_file = line["image"] qs = line["text"] @@ -101,7 +101,6 @@ def train(args): disable_torch_init() model_path = os.path.expanduser(args.model_path) model_name = get_model_name_from_path(model_path) - breakpoint() tokenizer, model, image_processor, context_len = 
load_pretrained_model(model_path, args.model_base, model_name) # get data - following along with model_vqa.py @@ -117,8 +116,6 @@ def train(args): projection_model = preprocess_and_call_train(tkn_dict) - - if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--model-path", type=str, default= "/home/smirrashidi/llava-v1.5-13b") From b2d8882b0328a323e507522743acc7921aa998f1 Mon Sep 17 00:00:00 2001 From: lpullela Date: Sun, 28 Jul 2024 00:25:27 +0000 Subject: [PATCH 16/41] laya fine tuning script --- scripts/finetune_test.sh | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 scripts/finetune_test.sh diff --git a/scripts/finetune_test.sh b/scripts/finetune_test.sh new file mode 100644 index 000000000..425911f4f --- /dev/null +++ b/scripts/finetune_test.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +deepspeed llava/train/train_mem.py \ + --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ + --deepspeed ./scripts/zero3.json \ + --model_name_or_path liuhaotian/llava-v1.5-13b \ + --version v1 \ + --data_path /home/lpullela/LLaVA/playground/data/llava_v1_5_mix665k_subset10.json \ + --image_folder /home/lpullela/LLaVA/playground/data/ \ + --vision_tower openai/clip-vit-large-patch14-336 \ + --mm_projector_type mlp2x_gelu \ + --mm_vision_select_layer -2 \ + --mm_use_im_start_end False \ + --mm_use_im_patch_token False \ + --image_aspect_ratio pad \ + --group_by_modality_length True \ + --bf16 True \ + --output_dir ./checkpoints/llava-v1.5-13b-task-lora \ + --num_train_epochs 1 \ + --per_device_train_batch_size 16 \ + --per_device_eval_batch_size 4 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 50000 \ + --save_total_limit 1 \ + --learning_rate 2e-4 \ + --weight_decay 0. 
\ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --tf32 True \ + --model_max_length 2048 \ + --gradient_checkpointing True \ + --dataloader_num_workers 4 \ + --lazy_preprocess True \ + --report_to wandb \ No newline at end of file From e143c1ff0fdf975c5251686984e166c0dc970486 Mon Sep 17 00:00:00 2001 From: lpullela Date: Mon, 29 Jul 2024 00:32:28 +0000 Subject: [PATCH 17/41] integrated discriminator within fine tuning script, i believe this should run with cuda space available but untested till then --- llava/VLLMSafety/discriminator.py | 178 +++++++++++++--------- llava/model/language_model/llava_llama.py | 8 +- llava/model/llava_arch.py | 17 ++- 3 files changed, 120 insertions(+), 83 deletions(-) diff --git a/llava/VLLMSafety/discriminator.py b/llava/VLLMSafety/discriminator.py index c910ab1a8..8f508be12 100644 --- a/llava/VLLMSafety/discriminator.py +++ b/llava/VLLMSafety/discriminator.py @@ -5,7 +5,6 @@ import numpy as np import random - class EasyNeuralNetwork(nn.Module): def __init__(self, input_size, classes): super().__init__() # we can add more layers later @@ -23,112 +22,141 @@ def forward(self, x): return x +class Discriminator: + + def __init__(self): + self.model = EasyNeuralNetwork(5120, 2) + + def evaluate(self,model, loss_function, X, y): + predictions = model(X) # pass thorugh model + # print("shape of y: ", y.shape) + # print("prediction: ", predictions) + loss = loss_function(predictions, y) + predictions = predictions.argmax(dim=1).cpu().numpy() + acc = (predictions == y.cpu().numpy()).mean() + return predictions, acc, loss + + + def train(self,training_dataloader, IMAGE_SHAPE=1024 * 5, NUM_CLASSES=2, device='cuda', EPOCHS=1): + self.model.train(mode=True) + self.model.to(device) # put the model on the device (remember its cuda on workstation) + optimizer = optim.Adam(self.model.parameters(), lr=0.001) + loss_function = nn.CrossEntropyLoss() + + epochs_acc = [] + for epoch in range(EPOCHS): + print(f'Epoch {epoch + 1}') + epoch_acc = [] + training_acc_checkpoint, training_loss_checkpoint = [], [] + for step, (data, labels) in enumerate(training_dataloader): + data = data.float().unsqueeze(0) + labels = labels.unsqueeze(0) + + data, labels = data.to(device), labels.to(device) # Convert labels to tensor if not already + + predictions, acc, loss = self.evaluate(self.model, loss_function, data, labels) + training_acc_checkpoint.append(acc) + epoch_acc.append(acc) -def evaluate(model, loss_function, X, y): - predictions = model(X) # pass thorugh model - loss = loss_function(predictions, y) - predictions = predictions.argmax(dim=1).cpu().numpy() - acc = (predictions == y.cpu().numpy()).mean() - return predictions, acc, loss + # loss already calculated in the evaluate() call. 
just append it + training_loss_checkpoint.append(loss.item()) + # back propagation + loss.backward() -def train(training_dataloader, IMAGE_SHAPE=1024 * 5, NUM_CLASSES=2, device='cuda', EPOCHS=100): - model = EasyNeuralNetwork(IMAGE_SHAPE, NUM_CLASSES) - model.to(device) # put the model on the device (remember its cuda on workstation) - optimizer = optim.Adam(model.parameters(), lr=0.001) - loss_function = nn.CrossEntropyLoss() + # gradient descent + optimizer.step() - epochs_acc = [] - for epoch in range(EPOCHS): - print(f'Epoch {epoch + 1}') - epoch_acc = [] - training_acc_checkpoint, training_loss_checkpoint = [], [] - for step, (data, labels) in enumerate(training_dataloader): - data = data.float().unsqueeze(0) - labels = labels.unsqueeze(0) + # zero the gradients so they do not accumulate + optimizer.zero_grad() - data, labels = data.to(device), labels.to(device) # Convert labels to tensor if not already + # epoch end + print("Accuracy: ", np.mean(epoch_acc)) + epochs_acc.append(np.mean(epoch_acc)) - predictions, acc, loss = evaluate(model, loss_function, data, labels) - training_acc_checkpoint.append(acc) - epoch_acc.append(acc) + # can do some optimizations here if you want early stopping, right now im not gonna implement this - # loss already calculated in the evaluate() call. just append it - training_loss_checkpoint.append(loss.item()) + self.model.train(mode=False) # exit training mode - # back propagation - loss.backward() + return epochs_acc, self.model - # gradient descent - optimizer.step() - # zero the gradients so they do not accumulate - optimizer.zero_grad() + # def test(): + # model.train(False) # since were testing - # epoch end - print("Accuracy: ", np.mean(epoch_acc)) - epochs_acc.append(np.mean(epoch_acc)) + # test_loss = [] + # test_acc = [] - # can do some optimizations here if you want early stopping, right now im not gonna implement this + # for X,y in test_loader: + # with torch.no_grad(): + # X, y = X.to(device), y.to(device) + # predictions = model(X) #as above: check dimentions - model.train(mode=False) # exit training mode + # loss = loss_function(predictions, y) + # test_loss.append(loss.item()) - return epochs_acc, model + # test_acc.append((predictions.argmax(dim=1).cpu().numpy() == y.cpu().numpy()).mean()) + # print(f'Accuracy: {np.mean(test_acc):.2f}, Loss: {np.mean(test_loss):.2f}') -# def test(): -# model.train(False) # since were testing + # return test_acc #idc about test_loss -# test_loss = [] -# test_acc = [] -# for X,y in test_loader: -# with torch.no_grad(): -# X, y = X.to(device), y.to(device) -# predictions = model(X) #as above: check dimentions + def preprocess_and_call_train(self,get_tkns): + # set device to cpu + device = 'cuda' if torch.cuda.is_available() else 'cpu' # if we are running this on workstation change this to cuda -# loss = loss_function(predictions, y) -# test_loss.append(loss.item()) + # Example data loading (assuming you have loaded im_tok and lang_tok) -# test_acc.append((predictions.argmax(dim=1).cpu().numpy() == y.cpu().numpy()).mean()) + im_tok = get_tkns["image"] + lang_tok = get_tkns["lang"] -# print(f'Accuracy: {np.mean(test_acc):.2f}, Loss: {np.mean(test_loss):.2f}') + lang_tok_list = [] + for tensor in lang_tok: + for i in range(tensor.size(0)): + lang_tok_list.append(tensor[i, :]) -# return test_acc #idc about test_loss + im_tok_list = [] + for tensor in im_tok: + for i in range(tensor.size(0)): + for j in range(tensor.size(1)): + im_tok_list.append(tensor[i, j, :]) + # print("image tokens arr length: ", 
len(im_tok)) + # print("image tokens[0] shape: ", im_tok[0].shape) # image tokens[0] shape: torch.Size([16, 576, 5120]) -def preprocess_and_call_train(get_tkns): - # set device to cpu - device = 'cuda' if torch.cuda.is_available() else 'cpu' # if we are running this on workstation change this to cuda + # print("lang tokens arr length: ", len(lang_tok)) + # print("lang tokens[0] shape: ", lang_tok[0].shape) # lang tokens[0] shape: torch.Size([1277, 5120]) - # Example data loading (assuming you have loaded im_tok and lang_tok) - im_tok = get_tkns["img_tkns"].tolist() - lang_tok = get_tkns["lang_tkns"].tolist() + combined_tokens = [(torch.tensor(token).float(), torch.tensor(0).float()) for token in im_tok_list] + [(torch.tensor(token).float(), torch.tensor(1).float()) for token in lang_tok_list] - combined_tokens = [(torch.tensor(token), torch.tensor(0)) for token in im_tok] + [(torch.tensor(token), torch.tensor(1)) for token in lang_tok] + print("im_tok: ", im_tok[0].shape) + print("lang_tok: ", lang_tok_list[0].shape) - # Optionally shuffle the combined list to randomize the order - random.shuffle(combined_tokens) + # Optionally shuffle the combined list to randomize the order + random.shuffle(combined_tokens) - # testing code... if our embeddings are the wrong side we are doing something wrong. - assert combined_tokens[0][0].flatten().size() == torch.Size([1024*5]), ("flattened image tokens fed to discriminator do not match the size of " - "disc first layer") - assert combined_tokens[0][0].flatten().size() == torch.Size([1024*5]), ("flattened language tokens fed to discriminator do not match the size " - "of disc first layer") + # testing code... if our embeddings are the wrong side we are doing something wrong. + print("combined_tokens[0][0].flatten().size(): ", combined_tokens[0][0].flatten().size()) + assert combined_tokens[0][0].flatten().size() == torch.Size([1024*5]), ("flattened image tokens fed to discriminator do not match the size of " + "disc first layer") + print("combined_tokens[-1][0].flatten().size(): ", combined_tokens[-1][0].flatten().size()) + assert combined_tokens[-1][0].flatten().size() == torch.Size([1024*5]), ("flattened language tokens fed to discriminator do not match the size " + "of disc first layer") - # train network - epochs_acc, model = train(combined_tokens, device=device) + # train network + epochs_acc, model = self.train(combined_tokens, device=device) - if( len(epochs_acc) > 0 ): - print("-----------final epochs acc--------------: ", epochs_acc[-1]) + if( len(epochs_acc) > 0 ): + print("-----------final epochs acc--------------: ", epochs_acc[-1]) - # not gonna do any eval for now - # test_acc = test() + # not gonna do any eval for now + # test_acc = test() - # save the model - # PATH = 'models/desc_v1_llava.pth' - # torch.save(model, PATH) + # save the model + # PATH = 'models/desc_v1_llava.pth' + # torch.save(model, PATH) - return model + return model diff --git a/llava/model/language_model/llava_llama.py b/llava/model/language_model/llava_llama.py index 069d0d1c1..f570cc5f2 100644 --- a/llava/model/language_model/llava_llama.py +++ b/llava/model/language_model/llava_llama.py @@ -23,6 +23,7 @@ from transformers.modeling_outputs import CausalLMOutputWithPast from transformers.generation.utils import GenerateOutput +from llava.VLLMSafety.discriminator import Discriminator from ..llava_arch import LlavaMetaModel, LlavaMetaForCausalLM @@ -47,6 +48,11 @@ def __init__(self, config): self.pretraining_tp = config.pretraining_tp self.vocab_size = 
config.vocab_size self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + self.disc_data = { + "images": [], + "lang": [], + } + self.discriminator = Discriminator() # Initialize weights and apply final processing self.post_init() @@ -155,4 +161,4 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, return inputs AutoConfig.register("llava_llama", LlavaConfig) -AutoModelForCausalLM.register(LlavaConfig, LlavaLlamaForCausalLM) +AutoModelForCausalLM.register(LlavaConfig, LlavaLlamaForCausalLM) \ No newline at end of file diff --git a/llava/model/llava_arch.py b/llava/model/llava_arch.py index 15a16b6c2..8b3867b2b 100644 --- a/llava/model/llava_arch.py +++ b/llava/model/llava_arch.py @@ -140,17 +140,20 @@ def get_vision_tower(self): def encode_images(self, images): image_features = self.get_model().get_vision_tower()(images) image_features = self.get_model().mm_projector(image_features) + + self.disc_data['image'].append(image_features) return image_features def prepare_inputs_labels_for_multimodal( self, input_ids, position_ids, attention_mask, past_key_values, labels, images, image_sizes=None ): + self.disc_data['image'] = [] + self.disc_data['lang'] = [] vision_tower = self.get_vision_tower() if vision_tower is None or images is None or input_ids.shape[1] == 1: return input_ids, position_ids, attention_mask, past_key_values, None, labels - assert images.ndim < 5, "image dimension exceeds 5" if type(images) is list or images.ndim == 5: if type(images) is list: images = [x.unsqueeze(0) if x.ndim == 3 else x for x in images] @@ -210,7 +213,6 @@ def prepare_inputs_labels_for_multimodal( # it is a headache to deal with None all the time. # But it is not ideal, and if you have a better idea, # please open an issue / submit a PR, thanks. 
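        # Shapes flowing into disc_data here (matching the debug prints in
        # discriminator.py): mm_projector outputs are (batch, 576, 5120) per image and
        # the text embeddings are (n_tokens, 5120), so every 5120-dim row becomes one
        # discriminator example. A minimal sketch of that flattening, using the label
        # convention from preprocess_and_call_train (0 = image, 1 = language); this is
        # an illustrative sketch, not a line of the patch:
        #
        #   img_rows = torch.cat([f.reshape(-1, 5120) for f in self.disc_data['image']])
        #   lang_rows = torch.cat([e.reshape(-1, 5120) for e in self.disc_data['lang']])
        #   examples = [(r, torch.tensor(0)) for r in img_rows] + \
        #              [(r, torch.tensor(1)) for r in lang_rows]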
- _labels = labels _position_ids = position_ids _attention_mask = attention_mask @@ -255,8 +257,6 @@ def prepare_inputs_labels_for_multimodal( cur_new_input_embeds = [] cur_new_labels = [] - chunk_sizes = [split_sizes[0], image_features.size(1), split_sizes[1]] # used for filtering for the discriminator - for i in range(num_images + 1): cur_new_input_embeds.append(cur_input_embeds_no_im[i]) cur_new_labels.append(cur_labels_noim[i]) @@ -274,6 +274,11 @@ def prepare_inputs_labels_for_multimodal( new_input_embeds.append(cur_new_input_embeds) new_labels.append(cur_new_labels) + self.disc_data['lang'] = new_input_embeds + + # call discriminator: + self.discriminator.preprocess_and_call_train(self.disc_data) + # Truncate sequences to max length as image embeddings can make the sequence longer tokenizer_model_max_length = getattr(self.config, 'tokenizer_model_max_length', None) if tokenizer_model_max_length is not None: @@ -309,8 +314,6 @@ def prepare_inputs_labels_for_multimodal( new_labels_padded[i, :cur_len] = cur_new_labels attention_mask[i, :cur_len] = True position_ids[i, :cur_len] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device) - - assert torch.equal(new_input_embeds[0], new_input_embeds_padded[0]) == True, "padding changed the tensor in prepare_inputs_labels_for_multimodal, so tensor filtering will not work" new_input_embeds = torch.stack(new_input_embeds_padded, dim=0) @@ -327,7 +330,7 @@ def prepare_inputs_labels_for_multimodal( if _position_ids is None: position_ids = None - return None, position_ids, attention_mask, past_key_values, new_input_embeds, new_labels, chunk_sizes + return None, position_ids, attention_mask, past_key_values, new_input_embeds, new_labels def initialize_vision_tokenizer(self, model_args, tokenizer): if model_args.mm_use_im_patch_token: From 05db79222165ef9ef2d6afd96cd42dcc60eda0da Mon Sep 17 00:00:00 2001 From: lpullela Date: Mon, 29 Jul 2024 00:36:09 +0000 Subject: [PATCH 18/41] float conversions not necessary unless lora --- llava/VLLMSafety/discriminator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llava/VLLMSafety/discriminator.py b/llava/VLLMSafety/discriminator.py index 8f508be12..088b98bd9 100644 --- a/llava/VLLMSafety/discriminator.py +++ b/llava/VLLMSafety/discriminator.py @@ -129,7 +129,7 @@ def preprocess_and_call_train(self,get_tkns): # print("lang tokens[0] shape: ", lang_tok[0].shape) # lang tokens[0] shape: torch.Size([1277, 5120]) - combined_tokens = [(torch.tensor(token).float(), torch.tensor(0).float()) for token in im_tok_list] + [(torch.tensor(token).float(), torch.tensor(1).float()) for token in lang_tok_list] + combined_tokens = [(torch.tensor(token), torch.tensor(0)) for token in im_tok_list] + [(torch.tensor(token), torch.tensor(1)) for token in lang_tok_list] print("im_tok: ", im_tok[0].shape) print("lang_tok: ", lang_tok_list[0].shape) From 87e7336f19bcbbc3ba44096276f95b37869645cf Mon Sep 17 00:00:00 2001 From: lpullela Date: Mon, 29 Jul 2024 04:09:29 +0000 Subject: [PATCH 19/41] laya: error in getting pure language token embeddings --- llava/model/llava_arch.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llava/model/llava_arch.py b/llava/model/llava_arch.py index 8b3867b2b..209f62f3f 100644 --- a/llava/model/llava_arch.py +++ b/llava/model/llava_arch.py @@ -149,7 +149,9 @@ def prepare_inputs_labels_for_multimodal( images, image_sizes=None ): self.disc_data['image'] = [] - self.disc_data['lang'] = [] + #print("shape of disdc data image shape", 
self.disc_data['image'].shape) + self.disc_data['lang'] = self.get_model().embed_tokens(input_ids) + vision_tower = self.get_vision_tower() if vision_tower is None or images is None or input_ids.shape[1] == 1: return input_ids, position_ids, attention_mask, past_key_values, None, labels @@ -274,8 +276,6 @@ def prepare_inputs_labels_for_multimodal( new_input_embeds.append(cur_new_input_embeds) new_labels.append(cur_new_labels) - self.disc_data['lang'] = new_input_embeds - # call discriminator: self.discriminator.preprocess_and_call_train(self.disc_data) From 3aab4f8ccb57bd13618d436a20587c0b57bf58bc Mon Sep 17 00:00:00 2001 From: lpullela Date: Mon, 29 Jul 2024 15:39:50 +0000 Subject: [PATCH 20/41] check this --- llava/model/llava_arch.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/llava/model/llava_arch.py b/llava/model/llava_arch.py index 209f62f3f..4eae1b9b0 100644 --- a/llava/model/llava_arch.py +++ b/llava/model/llava_arch.py @@ -149,8 +149,7 @@ def prepare_inputs_labels_for_multimodal( images, image_sizes=None ): self.disc_data['image'] = [] - #print("shape of disdc data image shape", self.disc_data['image'].shape) - self.disc_data['lang'] = self.get_model().embed_tokens(input_ids) + self.disc_data['lang'] = [] vision_tower = self.get_vision_tower() if vision_tower is None or images is None or input_ids.shape[1] == 1: @@ -256,6 +255,11 @@ def prepare_inputs_labels_for_multimodal( split_sizes = [x.shape[0] for x in cur_labels_noim] cur_input_embeds = self.get_model().embed_tokens(torch.cat(cur_input_ids_noim)) cur_input_embeds_no_im = torch.split(cur_input_embeds, split_sizes, dim=0) + + + #curr input embeds is coming from cur_input_ids_noim which means its already filitered + self.disc_data['lang'].append(cur_input_embeds) + cur_new_input_embeds = [] cur_new_labels = [] From ea4fa59ad99abbf1fcd4ac52089d1459cdab2557 Mon Sep 17 00:00:00 2001 From: smirrashidi Date: Wed, 14 Aug 2024 16:25:37 +0000 Subject: [PATCH 21/41] adding local changes --- llava/train/llava_trainer.py | 548 ++++++++++++++++++++++++++++++++++ scripts/v1_5/finetune_lora.sh | 2 +- 2 files changed, 549 insertions(+), 1 deletion(-) diff --git a/llava/train/llava_trainer.py b/llava/train/llava_trainer.py index ce2853a41..c746db62a 100644 --- a/llava/train/llava_trainer.py +++ b/llava/train/llava_trainer.py @@ -131,6 +131,11 @@ def __iter__(self): class LLaVATrainer(Trainer): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.d_optimizer = optim.Adam(model.discrminator.parameters(), lr=lr, betas=(beta1, 0.999)) # what kind of optimizer do we want to use? + # also need to figure out how to accesst the discriminator, also what learning rate/betas do we want? + def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]: if self.train_dataset is None or not has_length(self.train_dataset): @@ -253,3 +258,546 @@ def _save(self, output_dir: Optional[str] = None, state_dict=None): pass else: super(LLaVATrainer, self)._save(output_dir, state_dict) + + def compute_loss(self, model, inputs, return_outputs=False): + if self.label_smoother is not None and "labels" in inputs: + labels = inputs.pop("labels") + else: + labels = None + + loss_dict = model(**inputs) # i think this is correct + #outputs = model(**inputs) + + ## not sure if we need the rest of this? can we just return the loss? + + # Save past state if it exists + # TODO: this needs to be fixed and made cleaner later. 
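        # A minimal form this override could take, assuming the model returns a dict of
        # per-objective losses (e.g. "model_loss" and "d_loss", as the later
        # llava_llama.py changes sketch); the equal weighting of the two losses is an
        # assumption, not something decided by these patches:
        #
        #   outputs = model(**inputs)
        #   if isinstance(outputs, dict) and "d_loss" in outputs:
        #       loss = outputs["model_loss"] + outputs["d_loss"]
        #   else:
        #       loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]
        #   return (loss, outputs) if return_outputs else loss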
+ if self.args.past_index >= 0: + self._past = outputs[self.args.past_index] + + if labels is not None: + unwrapped_model = self.accelerator.unwrap_model(model) + if _is_peft_model(unwrapped_model): + model_name = unwrapped_model.base_model.model._get_name() + else: + model_name = unwrapped_model._get_name() + if model_name in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): + loss = self.label_smoother(outputs, labels, shift_labels=True) + else: + loss = self.label_smoother(outputs, labels) + else: + if isinstance(outputs, dict) and "loss" not in outputs: + raise ValueError( + "The model did not return a loss from the inputs, only the following keys: " + f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}." + ) + # We don't use .loss here since the model may return tuples instead of ModelOutput. + loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] + + return (loss, outputs) if return_outputs else loss + + def _inner_training_loop( + self, batch_size=None, args=None, resume_from_checkpoint=None, trial=None, ignore_keys_for_eval=None + ): + self.accelerator.free_memory() + self._train_batch_size = batch_size + if self.args.auto_find_batch_size: + if self.state.train_batch_size != self._train_batch_size: + from accelerate.utils import release_memory + + (self.model_wrapped,) = release_memory(self.model_wrapped) + self.model_wrapped = self.model + + # Check for DeepSpeed *after* the intial pass and modify the config + if self.is_deepspeed_enabled: + # Temporarily unset `self.args.train_batch_size` + original_bs = self.args.per_device_train_batch_size + self.args.per_device_train_batch_size = self._train_batch_size // max(1, self.args.n_gpu) + self.propagate_args_to_deepspeed(True) + self.args.per_device_train_batch_size = original_bs + self.state.train_batch_size = self._train_batch_size + logger.debug(f"Currently training with a batch size of: {self._train_batch_size}") + # Data loader and number of training steps + train_dataloader = self.get_train_dataloader() + if self.is_fsdp_xla_v2_enabled: + train_dataloader = tpu_spmd_dataloader(train_dataloader) + + # Setting up training control variables: + # number of training epochs: num_train_epochs + # number of training steps per epoch: num_update_steps_per_epoch + # total number of training steps to execute: max_steps + total_train_batch_size = self._train_batch_size * args.gradient_accumulation_steps * args.world_size + + len_dataloader = None + num_train_tokens = None + if has_length(train_dataloader): + len_dataloader = len(train_dataloader) + num_update_steps_per_epoch = len_dataloader // args.gradient_accumulation_steps + num_update_steps_per_epoch = max(num_update_steps_per_epoch, 1) + num_examples = self.num_examples(train_dataloader) + if args.max_steps > 0: + max_steps = args.max_steps + num_train_epochs = args.max_steps // num_update_steps_per_epoch + int( + args.max_steps % num_update_steps_per_epoch > 0 + ) + # May be slightly incorrect if the last batch in the training dataloader has a smaller size but it's + # the best we can do. 
+ num_train_samples = args.max_steps * total_train_batch_size + if args.include_tokens_per_second: + num_train_tokens = ( + self.num_tokens(train_dataloader, args.max_steps) * args.gradient_accumulation_steps + ) + else: + max_steps = math.ceil(args.num_train_epochs * num_update_steps_per_epoch) + num_train_epochs = math.ceil(args.num_train_epochs) + num_train_samples = self.num_examples(train_dataloader) * args.num_train_epochs + if args.include_tokens_per_second: + num_train_tokens = self.num_tokens(train_dataloader) * args.num_train_epochs + elif args.max_steps > 0: # Rely on max_steps when dataloader does not have a working size + max_steps = args.max_steps + # Setting a very large number of epochs so we go as many times as necessary over the iterator. + num_train_epochs = sys.maxsize + num_update_steps_per_epoch = max_steps + num_examples = total_train_batch_size * args.max_steps + num_train_samples = args.max_steps * total_train_batch_size + if args.include_tokens_per_second: + num_train_tokens = self.num_tokens(train_dataloader, args.max_steps) * args.gradient_accumulation_steps + else: + raise ValueError( + "args.max_steps must be set to a positive value if dataloader does not have a length, was" + f" {args.max_steps}" + ) + + if DebugOption.UNDERFLOW_OVERFLOW in self.args.debug: + if self.args.n_gpu > 1: + # nn.DataParallel(model) replicates the model, creating new variables and module + # references registered here no longer work on other gpus, breaking the module + raise ValueError( + "Currently --debug underflow_overflow is not supported under DP. Please use DDP" + " (torchrun or torch.distributed.launch (deprecated))." + ) + else: + debug_overflow = DebugUnderflowOverflow(self.model) # noqa + + delay_optimizer_creation = is_sagemaker_mp_enabled() or self.is_fsdp_xla_enabled or self.is_fsdp_enabled + + # We need to reset the scheduler, as its parameters may be different on subsequent calls + if self._created_lr_scheduler: + self.lr_scheduler = None + self._created_lr_scheduler = False + + if self.is_deepspeed_enabled: + self.optimizer, self.lr_scheduler = deepspeed_init(self, num_training_steps=max_steps) + + if not delay_optimizer_creation: + self.create_optimizer_and_scheduler(num_training_steps=max_steps) + + self.state = TrainerState( + stateful_callbacks=[ + cb for cb in self.callback_handler.callbacks + [self.control] if isinstance(cb, ExportableState) + ] + ) + self.state.is_hyper_param_search = trial is not None + self.state.train_batch_size = self._train_batch_size + + # Compute absolute values for logging, eval, and save if given as ratio + if args.logging_steps is not None: + if args.logging_steps < 1: + self.state.logging_steps = math.ceil(max_steps * args.logging_steps) + else: + self.state.logging_steps = args.logging_steps + if args.eval_steps is not None: + if args.eval_steps < 1: + self.state.eval_steps = math.ceil(max_steps * args.eval_steps) + else: + self.state.eval_steps = args.eval_steps + if args.save_steps is not None: + if args.save_steps < 1: + self.state.save_steps = math.ceil(max_steps * args.save_steps) + else: + self.state.save_steps = args.save_steps + + # Activate gradient checkpointing if needed + if args.gradient_checkpointing: + if args.gradient_checkpointing_kwargs is None: + gradient_checkpointing_kwargs = {} + else: + gradient_checkpointing_kwargs = args.gradient_checkpointing_kwargs + + self.model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=gradient_checkpointing_kwargs) + + model = self._wrap_model(self.model_wrapped) + + 
# as the model is wrapped, don't use `accelerator.prepare` + # this is for unhandled cases such as + # FSDP-XLA, SageMaker MP/DP, DataParallel, IPEX + use_accelerator_prepare = True if model is self.model else False + + if delay_optimizer_creation: + if use_accelerator_prepare: + self._fsdp_qlora_plugin_updates() + self.model = self.accelerator.prepare(self.model) + self.create_optimizer_and_scheduler(num_training_steps=max_steps) + + # prepare using `accelerator` prepare + if use_accelerator_prepare: + self.model.train() + if hasattr(self.lr_scheduler, "step"): + if self.use_apex: + model = self.accelerator.prepare(self.model) + else: + model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer) + else: + # to handle cases wherein we pass "DummyScheduler" such as when it is specified in DeepSpeed config. + model, self.optimizer, self.lr_scheduler = self.accelerator.prepare( + self.model, self.optimizer, self.lr_scheduler + ) + elif self.args.optim in [OptimizerNames.LOMO, OptimizerNames.ADALOMO]: + # In this case we are in DDP + LOMO, which should be supported + self.optimizer = self.accelerator.prepare(self.optimizer) + + if self.is_fsdp_enabled: + self.model = self.model_wrapped = model + + # for the rest of this function `model` is the outside model, whether it was wrapped or not + if model is not self.model: + self.model_wrapped = model + + # backward compatibility + if self.is_deepspeed_enabled: + self.deepspeed = self.model_wrapped + + # ckpt loading + if resume_from_checkpoint is not None: + if self.is_deepspeed_enabled: + deepspeed_load_checkpoint( + self.model_wrapped, resume_from_checkpoint, load_module_strict=not _is_peft_model(self.model) + ) + elif is_sagemaker_mp_enabled() or self.is_fsdp_enabled: + self._load_from_checkpoint(resume_from_checkpoint, self.model_wrapped) + + # Check if saved optimizer or scheduler states exist + self._load_optimizer_and_scheduler(resume_from_checkpoint) + + # important: at this point: + # self.model is the Transformers Model + # self.model_wrapped is DDP(Transformers Model), Deepspeed(Transformers Model), + # FSDP(Transformers Model), Dynamo Optimized Module(Transformers Model) etc. + + # Train! + logger.info("***** Running training *****") + logger.info(f" Num examples = {num_examples:,}") + logger.info(f" Num Epochs = {num_train_epochs:,}") + logger.info(f" Instantaneous batch size per device = {self.args.per_device_train_batch_size:,}") + if self.args.per_device_train_batch_size != self._train_batch_size: + logger.info(f" Training with DataParallel so batch size has been adjusted to: {self._train_batch_size:,}") + logger.info(f" Total train batch size (w. 
parallel, distributed & accumulation) = {total_train_batch_size:,}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Total optimization steps = {max_steps:,}") + logger.info(f" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}") + + self.state.epoch = 0 + start_time = time.time() + epochs_trained = 0 + steps_trained_in_current_epoch = 0 + steps_trained_progress_bar = None + + # Check if continuing training from a checkpoint + if resume_from_checkpoint is not None and os.path.isfile( + os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME) + ): + self.state = TrainerState.load_from_json(os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME)) + self.compare_trainer_and_checkpoint_args(self.args, self.state) + self._load_callback_state() + epochs_trained = int(self.state.global_step // num_update_steps_per_epoch) + if not args.ignore_data_skip: + steps_trained_in_current_epoch = self.state.global_step % (num_update_steps_per_epoch) + steps_trained_in_current_epoch *= args.gradient_accumulation_steps + else: + steps_trained_in_current_epoch = 0 + + logger.info(" Continuing training from checkpoint, will skip to saved global_step") + logger.info(f" Continuing training from epoch {epochs_trained}") + logger.info(f" Continuing training from global step {self.state.global_step}") + if not args.ignore_data_skip: + logger.info( + f" Will skip the first {epochs_trained} epochs then the first" + f" {steps_trained_in_current_epoch} batches in the first epoch." + ) + + # Update the references + self.callback_handler.model = self.model + self.callback_handler.optimizer = self.optimizer + self.callback_handler.lr_scheduler = self.lr_scheduler + self.callback_handler.train_dataloader = train_dataloader + if self.hp_name is not None and self._trial is not None: + # use self._trial because the SigOpt/Optuna hpo only call `_hp_search_setup(trial)` instead of passing trial + # parameter to Train when using DDP. + self.state.trial_name = self.hp_name(self._trial) + if trial is not None: + assignments = trial.assignments if self.hp_search_backend == HPSearchBackend.SIGOPT else trial + self.state.trial_params = hp_params(assignments) + else: + self.state.trial_params = None + # This should be the same if the state has been saved but in case the training arguments changed, it's safer + # to set this after the load. + self.state.max_steps = max_steps + self.state.num_train_epochs = num_train_epochs + self.state.is_local_process_zero = self.is_local_process_zero() + self.state.is_world_process_zero = self.is_world_process_zero() + + # tr_loss is a tensor to avoid synchronization of TPUs through .item() + tr_loss = torch.tensor(0.0).to(args.device) + # _total_loss_scalar is updated everytime .item() has to be called on tr_loss and stores the sum of all losses + self._total_loss_scalar = 0.0 + self._globalstep_last_logged = self.state.global_step + model.zero_grad() + grad_norm: Optional[float] = None + self.control = self.callback_handler.on_train_begin(args, self.state, self.control) + + if args.eval_on_start: + self._evaluate(trial, ignore_keys_for_eval, skip_scheduler=True) + + total_batched_samples = 0 + for epoch in range(epochs_trained, num_train_epochs): + epoch_iterator = train_dataloader + if hasattr(epoch_iterator, "set_epoch"): + epoch_iterator.set_epoch(epoch) + + # Reset the past mems state at the beginning of each epoch if necessary. 
+ if args.past_index >= 0: + self._past = None + + steps_in_epoch = ( + len(epoch_iterator) + if len_dataloader is not None + else args.max_steps * args.gradient_accumulation_steps + ) + self.control = self.callback_handler.on_epoch_begin(args, self.state, self.control) + + if epoch == epochs_trained and resume_from_checkpoint is not None and steps_trained_in_current_epoch == 0: + self._load_rng_state(resume_from_checkpoint) + + rng_to_sync = False + steps_skipped = 0 + if steps_trained_in_current_epoch > 0: + epoch_iterator = skip_first_batches(epoch_iterator, steps_trained_in_current_epoch) + steps_skipped = steps_trained_in_current_epoch + steps_trained_in_current_epoch = 0 + rng_to_sync = True + + step = -1 + for step, inputs in enumerate(epoch_iterator): + total_batched_samples += 1 + + if self.args.include_num_input_tokens_seen: + main_input_name = getattr(self.model, "main_input_name", "input_ids") + if main_input_name not in inputs: + logger.warning( + "Tried to track the number of tokens seen, however the current model is " + "not configured properly to know what item is the input. To fix this, add " + "a `main_input_name` attribute to the model class you are using." + ) + else: + self.state.num_input_tokens_seen += ( + torch.sum( + self.accelerator.gather( + torch.tensor( + inputs[main_input_name].numel(), device=self.args.device, dtype=torch.int64 + ) + ) + ) + .cpu() + .item() + ) + if rng_to_sync: + self._load_rng_state(resume_from_checkpoint) + rng_to_sync = False + + # Skip past any already trained steps if resuming training + if steps_trained_in_current_epoch > 0: + steps_trained_in_current_epoch -= 1 + if steps_trained_progress_bar is not None: + steps_trained_progress_bar.update(1) + if steps_trained_in_current_epoch == 0: + self._load_rng_state(resume_from_checkpoint) + continue + elif steps_trained_progress_bar is not None: + steps_trained_progress_bar.close() + steps_trained_progress_bar = None + + if step % args.gradient_accumulation_steps == 0: + self.control = self.callback_handler.on_step_begin(args, self.state, self.control) + + with self.accelerator.accumulate(model): + tr_loss_step = self.training_step(model, inputs) + + if ( + args.logging_nan_inf_filter + and not is_torch_xla_available() + and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step)) + ): + # if loss is nan or inf simply add the average of previous logged losses + tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged) + else: + if tr_loss.device != tr_loss_step.device: + raise ValueError( + f"Calculated loss must be on the original device: {tr_loss.device} but device in use is {tr_loss_step.device}" + ) + tr_loss += tr_loss_step + + self.current_flos += float(self.floating_point_ops(inputs)) + + is_last_step_and_steps_less_than_grad_acc = ( + steps_in_epoch <= args.gradient_accumulation_steps and (step + 1) == steps_in_epoch + ) + + if ( + total_batched_samples % args.gradient_accumulation_steps == 0 + or + # last step in epoch but step is always smaller than gradient_accumulation_steps + is_last_step_and_steps_less_than_grad_acc + ): + # the `or` condition of `is_last_step_and_steps_less_than_grad_acc` is not covered + # in accelerate. So, explicitly enable sync gradients to True in that case. 
+ if is_last_step_and_steps_less_than_grad_acc: + self.accelerator.gradient_state._set_sync_gradients(True) + + # Gradient clipping + if args.max_grad_norm is not None and args.max_grad_norm > 0: + # deepspeed does its own clipping + + if is_sagemaker_mp_enabled() and args.fp16: + _grad_norm = self.optimizer.clip_master_grads(args.max_grad_norm) + elif self.use_apex: + # Revert to normal clipping otherwise, handling Apex or full precision + _grad_norm = nn.utils.clip_grad_norm_( + amp.master_params(self.optimizer), + args.max_grad_norm, + ) + else: + _grad_norm = self.accelerator.clip_grad_norm_( + model.parameters(), + args.max_grad_norm, + ) + + if ( + is_accelerate_available() + and self.accelerator.distributed_type == DistributedType.DEEPSPEED + ): + grad_norm = model.get_global_grad_norm() + # In some cases the grad norm may not return a float + if hasattr(grad_norm, "item"): + grad_norm = grad_norm.item() + else: + grad_norm = _grad_norm + + self.optimizer.step() # need to calculate the gradients for the discriminator somewhere, not sure where yet, i think in training_step? + self.d_optimizer.step() + + self.control = self.callback_handler.on_optimizer_step(args, self.state, self.control) + + optimizer_was_run = not self.accelerator.optimizer_step_was_skipped + if optimizer_was_run: + # Delay optimizer scheduling until metrics are generated + if not isinstance(self.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau): + self.lr_scheduler.step() + + model.zero_grad() + self.state.global_step += 1 + self.state.epoch = epoch + (step + 1 + steps_skipped) / steps_in_epoch + self.control = self.callback_handler.on_step_end(args, self.state, self.control) + + self._maybe_log_save_evaluate(tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval) + else: + self.control = self.callback_handler.on_substep_end(args, self.state, self.control) + + if self.control.should_epoch_stop or self.control.should_training_stop: + # PyTorch/XLA relies on the data loader to insert the mark_step for + # each step. Since we are breaking the loop early, we need to manually + # insert the mark_step here. + if is_torch_xla_available(): + xm.mark_step() + break + if step < 0: + logger.warning( + "There seems not to be a single sample in your epoch_iterator, stopping training at step" + f" {self.state.global_step}! This is expected if you're using an IterableDataset and set" + f" num_steps ({max_steps}) higher than the number of available samples." + ) + self.control.should_training_stop = True + + self.control = self.callback_handler.on_epoch_end(args, self.state, self.control) + self._maybe_log_save_evaluate(tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval) + + if DebugOption.TPU_METRICS_DEBUG in self.args.debug: + if is_torch_xla_available(): + # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.) + xm.master_print(met.metrics_report()) + else: + logger.warning( + "You enabled PyTorch/XLA debug metrics but you don't have a TPU " + "configured. Check your training configuration if this is unexpected." + ) + if self.control.should_training_stop: + break + + if args.past_index and hasattr(self, "_past"): + # Clean the state at the end of training + delattr(self, "_past") + + logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n") + if args.load_best_model_at_end and self.state.best_model_checkpoint is not None: + # Wait for everyone to get here so we are sure the model has been saved by process 0. 
+ if is_torch_xla_available(): + xm.rendezvous("load_best_model_at_end") + elif args.parallel_mode == ParallelMode.DISTRIBUTED: + dist.barrier() + elif is_sagemaker_mp_enabled(): + smp.barrier() + + self._load_best_model() + + # add remaining tr_loss + self._total_loss_scalar += tr_loss.item() + effective_global_step = max(self.state.global_step, 0.001) # Avoid ZeroDivisionError + train_loss = self._total_loss_scalar / effective_global_step + + metrics = speed_metrics( + "train", + start_time, + num_samples=num_train_samples, + num_steps=self.state.max_steps, + num_tokens=num_train_tokens, + ) + self.store_flos() + metrics["total_flos"] = self.state.total_flos + metrics["train_loss"] = train_loss + + self.is_in_train = False + + self._memory_tracker.stop_and_update_metrics(metrics) + + self.log(metrics) + + run_dir = self._get_output_dir(trial) + checkpoints_sorted = self._sorted_checkpoints(use_mtime=False, output_dir=run_dir) + + # Delete the last checkpoint when save_total_limit=1 if it's different from the best checkpoint and process allowed to save. + if self.args.should_save and self.state.best_model_checkpoint is not None and self.args.save_total_limit == 1: + for checkpoint in checkpoints_sorted: + if not os.path.samefile(checkpoint, self.state.best_model_checkpoint): + logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit") + shutil.rmtree(checkpoint, ignore_errors=True) + + self.control = self.callback_handler.on_train_end(args, self.state, self.control) + + # Wait for the checkpoint to be uploaded. + self._finish_current_push() + + # After training we make sure to retrieve back the original forward pass method + # for the embedding layer by removing the forward post hook. + if self.neftune_noise_alpha is not None: + self._deactivate_neftune(self.model) + + return TrainOutput(self.state.global_step, train_loss, metrics) \ No newline at end of file diff --git a/scripts/v1_5/finetune_lora.sh b/scripts/v1_5/finetune_lora.sh index 90f00707c..761086631 100644 --- a/scripts/v1_5/finetune_lora.sh +++ b/scripts/v1_5/finetune_lora.sh @@ -8,7 +8,7 @@ deepspeed llava/train/train_mem.py \ --data_path ./playground/data/llava_v1_5_mix665k.json \ --image_folder ./playground/data \ --vision_tower openai/clip-vit-large-patch14-336 \ - --pretrain_mm_mlp_adapter ./checkpoints/llava-v1.5-13b-pretrain/mm_projector.bin \ + --pretrain_mm_mlp_adapter ./checkpoints/llava-v1.5-mlp2x-336px-pretrain-vicuna-7b-v1.5/mm_projector.bin \ --mm_projector_type mlp2x_gelu \ --mm_vision_select_layer -2 \ --mm_use_im_start_end False \ From f95905271149108dabc76427ad99aee12b405157 Mon Sep 17 00:00:00 2001 From: smirrashidi Date: Thu, 15 Aug 2024 21:39:41 +0000 Subject: [PATCH 22/41] provisional training changes --- llava/model/language_model/llava_llama.py | 30 ++++++++++++++++++++--- llava/train/llava_trainer.py | 9 ++++--- llava/train/train.py | 2 ++ scripts/v1_5/finetune_lora.sh | 2 +- 4 files changed, 35 insertions(+), 8 deletions(-) diff --git a/llava/model/language_model/llava_llama.py b/llava/model/language_model/llava_llama.py index f570cc5f2..da5efa883 100644 --- a/llava/model/language_model/llava_llama.py +++ b/llava/model/language_model/llava_llama.py @@ -74,8 +74,9 @@ def forward( images: Optional[torch.FloatTensor] = None, image_sizes: Optional[List[List[int]]] = None, return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - + d_mode: Optional[bool] = False # False means run without discriminator first + ) -> Union[Tuple, CausalLMOutputWithPast]: + if 
inputs_embeds is None: ( input_ids, @@ -94,7 +95,8 @@ def forward( image_sizes ) - return super().forward( + if d_mode == False: + return super().forward( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, @@ -106,6 +108,28 @@ def forward( output_hidden_states=output_hidden_states, return_dict=return_dict ) + else: + real_d_loss = self.discriminator.evaluate(self.disc_data["lang"])[2] # are we training discrim here? should we be calculating gradients? + fake_d_loss = self.discriminator.evaluate(self.disc_data["images"])[2] + model_loss = tuple(super().forward( # i dont think we can directly unpack the output of forward() so i convert to tuple: may not be necessary + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + labels=labels, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict + ))[0] # i think the loss is the first item? + + final_d_loss = real_d_loss + fake_d_loss + + return { + "model_loss": model_loss, + "d_loss": final_d_loss + } @torch.no_grad() def generate( diff --git a/llava/train/llava_trainer.py b/llava/train/llava_trainer.py index c746db62a..e7cfb33c3 100644 --- a/llava/train/llava_trainer.py +++ b/llava/train/llava_trainer.py @@ -1,6 +1,7 @@ import os import torch import torch.nn as nn +import torch.optim as optim from torch.utils.data import Sampler @@ -133,8 +134,8 @@ def __iter__(self): class LLaVATrainer(Trainer): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.d_optimizer = optim.Adam(model.discrminator.parameters(), lr=lr, betas=(beta1, 0.999)) # what kind of optimizer do we want to use? - # also need to figure out how to accesst the discriminator, also what learning rate/betas do we want? + self.d_optimizer = optim.Adam(self.model.di(), lr=lr, betas=(beta1, 0.999)) # what kind of optimizer do we want to use? + # also need to figure out how to access the discriminator, also what learning rate/betas do we want? def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]: @@ -692,8 +693,8 @@ def _inner_training_loop( else: grad_norm = _grad_norm - self.optimizer.step() # need to calculate the gradients for the discriminator somewhere, not sure where yet, i think in training_step? - self.d_optimizer.step() + self.optimizer.step() + self.d_optimizer.step() # need to calculate the gradients for the discriminator somewhere, not sure where yet, i think in training_step? 
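                    # The discriminator gradients consumed by this step() could be
                    # produced earlier, e.g. in training_step / compute_loss. A rough
                    # sketch assuming the {"model_loss", "d_loss"} dict returned by the
                    # d_mode branch in llava_llama.py; whether retain_graph is needed
                    # depends on how the two losses share the projector graph and is not
                    # settled by this patch:
                    #
                    #   outputs = model(**inputs)
                    #   self.accelerator.backward(outputs["model_loss"], retain_graph=True)
                    #   outputs["d_loss"].backward()   # fills the grads read by self.d_optimizer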
self.control = self.callback_handler.on_optimizer_step(args, self.state, self.control) diff --git a/llava/train/train.py b/llava/train/train.py index 477c668b6..4ee9b8861 100644 --- a/llava/train/train.py +++ b/llava/train/train.py @@ -958,6 +958,8 @@ def make_inputs_require_grad(module, input, output): data_module = make_supervised_data_module(tokenizer=tokenizer, data_args=data_args) + + model.to("cuda") trainer = LLaVATrainer(model=model, tokenizer=tokenizer, args=training_args, diff --git a/scripts/v1_5/finetune_lora.sh b/scripts/v1_5/finetune_lora.sh index 761086631..db2b3ac33 100644 --- a/scripts/v1_5/finetune_lora.sh +++ b/scripts/v1_5/finetune_lora.sh @@ -8,7 +8,7 @@ deepspeed llava/train/train_mem.py \ --data_path ./playground/data/llava_v1_5_mix665k.json \ --image_folder ./playground/data \ --vision_tower openai/clip-vit-large-patch14-336 \ - --pretrain_mm_mlp_adapter ./checkpoints/llava-v1.5-mlp2x-336px-pretrain-vicuna-7b-v1.5/mm_projector.bin \ + --pretrain_mm_mlp_adapter ./checkpoints/llava-v1.5-13b/mm_projector.bin \ --mm_projector_type mlp2x_gelu \ --mm_vision_select_layer -2 \ --mm_use_im_start_end False \ From ea030ef71a344938794414ff041558b5bfa66d2c Mon Sep 17 00:00:00 2001 From: smirrashidi Date: Thu, 15 Aug 2024 21:40:18 +0000 Subject: [PATCH 23/41] forgot a file --- llava/train/llava_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llava/train/llava_trainer.py b/llava/train/llava_trainer.py index e7cfb33c3..d1faa6630 100644 --- a/llava/train/llava_trainer.py +++ b/llava/train/llava_trainer.py @@ -134,7 +134,7 @@ def __iter__(self): class LLaVATrainer(Trainer): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.d_optimizer = optim.Adam(self.model.di(), lr=lr, betas=(beta1, 0.999)) # what kind of optimizer do we want to use? + self.d_optimizer = optim.Adam(self.model.discriminator(), lr=lr, betas=(beta1, 0.999)) # what kind of optimizer do we want to use? # also need to figure out how to access the discriminator, also what learning rate/betas do we want? 
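A minimal sketch of how this discriminator optimizer could be wired up, with Adam hyperparameters borrowed from common GAN practice; the learning rate, betas, and the `build_discriminator_optimizer` helper name are illustrative assumptions rather than values fixed by these patches, and the attribute path assumes the unwrapped LlavaLlamaForCausalLM:

import torch.optim as optim

def build_discriminator_optimizer(model, lr=2e-4, betas=(0.5, 0.999)):
    # model.discriminator is the Discriminator wrapper added to LlavaLlamaForCausalLM;
    # its trainable weights live on the inner EasyNeuralNetwork at .model
    return optim.Adam(model.discriminator.model.parameters(), lr=lr, betas=betas)

# inside LLaVATrainer.__init__:
#   self.d_optimizer = build_discriminator_optimizer(self.model)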
From 60ece7c713412a29f08ffcf26fa920d2d4c5a4fd Mon Sep 17 00:00:00 2001 From: smirrashidi Date: Mon, 19 Aug 2024 02:04:43 +0000 Subject: [PATCH 24/41] fixed debugger and working on adding the the d_mode: --- llava/model/language_model/llava_llama.py | 4 +- llava/train/llava_trainer.py | 196 ++++++++++------------ 2 files changed, 89 insertions(+), 111 deletions(-) diff --git a/llava/model/language_model/llava_llama.py b/llava/model/language_model/llava_llama.py index da5efa883..9f4993225 100644 --- a/llava/model/language_model/llava_llama.py +++ b/llava/model/language_model/llava_llama.py @@ -94,7 +94,7 @@ def forward( images, image_sizes ) - + d_mode = False if d_mode == False: return super().forward( input_ids=input_ids, @@ -126,7 +126,7 @@ def forward( final_d_loss = real_d_loss + fake_d_loss - return { + return { # sum the loss and return as a tuple like the first branch "model_loss": model_loss, "d_loss": final_d_loss } diff --git a/llava/train/llava_trainer.py b/llava/train/llava_trainer.py index d1faa6630..7f8512c5d 100644 --- a/llava/train/llava_trainer.py +++ b/llava/train/llava_trainer.py @@ -1,7 +1,9 @@ import os import torch import torch.nn as nn +import math import torch.optim as optim +import time from torch.utils.data import Sampler @@ -10,10 +12,19 @@ is_sagemaker_mp_enabled, get_parameter_names, has_length, + get_model_param_count, + hp_params, + skip_first_batches, ALL_LAYERNORM_LAYERS, logger, + DebugOption, + DebugUnderflowOverflow, + TrainerState, + OptimizerNames, + TrainOutput ) from typing import List, Optional +TRAINER_STATE_NAME = "trainer_state.json" def maybe_zero_3(param, ignore_status=False, name=None): @@ -134,7 +145,7 @@ def __iter__(self): class LLaVATrainer(Trainer): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.d_optimizer = optim.Adam(self.model.discriminator(), lr=lr, betas=(beta1, 0.999)) # what kind of optimizer do we want to use? + #self.d_optimizer = optim.Adam(self.model.discriminator(), lr=lr, betas=(beta1, 0.999)) # what kind of optimizer do we want to use? # also need to figure out how to access the discriminator, also what learning rate/betas do we want? @@ -260,43 +271,6 @@ def _save(self, output_dir: Optional[str] = None, state_dict=None): else: super(LLaVATrainer, self)._save(output_dir, state_dict) - def compute_loss(self, model, inputs, return_outputs=False): - if self.label_smoother is not None and "labels" in inputs: - labels = inputs.pop("labels") - else: - labels = None - - loss_dict = model(**inputs) # i think this is correct - #outputs = model(**inputs) - - ## not sure if we need the rest of this? can we just return the loss? - - # Save past state if it exists - # TODO: this needs to be fixed and made cleaner later. - if self.args.past_index >= 0: - self._past = outputs[self.args.past_index] - - if labels is not None: - unwrapped_model = self.accelerator.unwrap_model(model) - if _is_peft_model(unwrapped_model): - model_name = unwrapped_model.base_model.model._get_name() - else: - model_name = unwrapped_model._get_name() - if model_name in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): - loss = self.label_smoother(outputs, labels, shift_labels=True) - else: - loss = self.label_smoother(outputs, labels) - else: - if isinstance(outputs, dict) and "loss" not in outputs: - raise ValueError( - "The model did not return a loss from the inputs, only the following keys: " - f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}." 
- ) - # We don't use .loss here since the model may return tuples instead of ModelOutput. - loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] - - return (loss, outputs) if return_outputs else loss - def _inner_training_loop( self, batch_size=None, args=None, resume_from_checkpoint=None, trial=None, ignore_keys_for_eval=None ): @@ -320,8 +294,6 @@ def _inner_training_loop( logger.debug(f"Currently training with a batch size of: {self._train_batch_size}") # Data loader and number of training steps train_dataloader = self.get_train_dataloader() - if self.is_fsdp_xla_v2_enabled: - train_dataloader = tpu_spmd_dataloader(train_dataloader) # Setting up training control variables: # number of training epochs: num_train_epochs @@ -393,11 +365,7 @@ def _inner_training_loop( if not delay_optimizer_creation: self.create_optimizer_and_scheduler(num_training_steps=max_steps) - self.state = TrainerState( - stateful_callbacks=[ - cb for cb in self.callback_handler.callbacks + [self.control] if isinstance(cb, ExportableState) - ] - ) + self.state = TrainerState() self.state.is_hyper_param_search = trial is not None self.state.train_batch_size = self._train_batch_size @@ -435,9 +403,6 @@ def _inner_training_loop( use_accelerator_prepare = True if model is self.model else False if delay_optimizer_creation: - if use_accelerator_prepare: - self._fsdp_qlora_plugin_updates() - self.model = self.accelerator.prepare(self.model) self.create_optimizer_and_scheduler(num_training_steps=max_steps) # prepare using `accelerator` prepare @@ -453,9 +418,6 @@ def _inner_training_loop( model, self.optimizer, self.lr_scheduler = self.accelerator.prepare( self.model, self.optimizer, self.lr_scheduler ) - elif self.args.optim in [OptimizerNames.LOMO, OptimizerNames.ADALOMO]: - # In this case we are in DDP + LOMO, which should be supported - self.optimizer = self.accelerator.prepare(self.optimizer) if self.is_fsdp_enabled: self.model = self.model_wrapped = model @@ -471,9 +433,7 @@ def _inner_training_loop( # ckpt loading if resume_from_checkpoint is not None: if self.is_deepspeed_enabled: - deepspeed_load_checkpoint( - self.model_wrapped, resume_from_checkpoint, load_module_strict=not _is_peft_model(self.model) - ) + deepspeed_load_checkpoint(self.model_wrapped, resume_from_checkpoint) elif is_sagemaker_mp_enabled() or self.is_fsdp_enabled: self._load_from_checkpoint(resume_from_checkpoint, self.model_wrapped) @@ -508,9 +468,7 @@ def _inner_training_loop( os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME) ): self.state = TrainerState.load_from_json(os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME)) - self.compare_trainer_and_checkpoint_args(self.args, self.state) - self._load_callback_state() - epochs_trained = int(self.state.global_step // num_update_steps_per_epoch) + epochs_trained = self.state.global_step // num_update_steps_per_epoch if not args.ignore_data_skip: steps_trained_in_current_epoch = self.state.global_step % (num_update_steps_per_epoch) steps_trained_in_current_epoch *= args.gradient_accumulation_steps @@ -553,11 +511,26 @@ def _inner_training_loop( self._total_loss_scalar = 0.0 self._globalstep_last_logged = self.state.global_step model.zero_grad() - grad_norm: Optional[float] = None + self.control = self.callback_handler.on_train_begin(args, self.state, self.control) - if args.eval_on_start: - self._evaluate(trial, ignore_keys_for_eval, skip_scheduler=True) + # Skip the first epochs_trained epochs to get the random state of the dataloader at the right point. 
+ if not args.ignore_data_skip: + for epoch in range(epochs_trained): + sampler = get_dataloader_sampler(train_dataloader) + sampler_kinds = [RandomSampler] + if version.parse(accelerate_version) > version.parse("0.23.0"): + sampler_kinds.append(SeedableRandomSampler) + is_random_sampler = isinstance(sampler, tuple(sampler_kinds)) + if not is_random_sampler: + # We just need to begin an iteration to create the randomization of the sampler. + for _ in train_dataloader: + break + else: + # Otherwise we need to call the whooooole sampler cause there is some random operation added + # AT THE VERY END! + sampler = sampler if sampler is not None else [] + _ = list(sampler) total_batched_samples = 0 for epoch in range(epochs_trained, num_train_epochs): @@ -589,6 +562,7 @@ def _inner_training_loop( step = -1 for step, inputs in enumerate(epoch_iterator): + inputs['d_mode'] = True if (step % 2 == 0) else False total_batched_samples += 1 if self.args.include_num_input_tokens_seen: @@ -600,17 +574,7 @@ def _inner_training_loop( "a `main_input_name` attribute to the model class you are using." ) else: - self.state.num_input_tokens_seen += ( - torch.sum( - self.accelerator.gather( - torch.tensor( - inputs[main_input_name].numel(), device=self.args.device, dtype=torch.int64 - ) - ) - ) - .cpu() - .item() - ) + self.state.num_input_tokens_seen += self.accelerator.gather(inputs[main_input_name]).numel() if rng_to_sync: self._load_rng_state(resume_from_checkpoint) rng_to_sync = False @@ -635,16 +599,12 @@ def _inner_training_loop( if ( args.logging_nan_inf_filter - and not is_torch_xla_available() + and not is_torch_tpu_available() and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step)) ): # if loss is nan or inf simply add the average of previous logged losses tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged) else: - if tr_loss.device != tr_loss_step.device: - raise ValueError( - f"Calculated loss must be on the original device: {tr_loss.device} but device in use is {tr_loss_step.device}" - ) tr_loss += tr_loss_step self.current_flos += float(self.floating_point_ops(inputs)) @@ -669,35 +629,21 @@ def _inner_training_loop( # deepspeed does its own clipping if is_sagemaker_mp_enabled() and args.fp16: - _grad_norm = self.optimizer.clip_master_grads(args.max_grad_norm) + self.optimizer.clip_master_grads(args.max_grad_norm) elif self.use_apex: # Revert to normal clipping otherwise, handling Apex or full precision - _grad_norm = nn.utils.clip_grad_norm_( + nn.utils.clip_grad_norm_( amp.master_params(self.optimizer), args.max_grad_norm, ) else: - _grad_norm = self.accelerator.clip_grad_norm_( + self.accelerator.clip_grad_norm_( model.parameters(), args.max_grad_norm, ) - if ( - is_accelerate_available() - and self.accelerator.distributed_type == DistributedType.DEEPSPEED - ): - grad_norm = model.get_global_grad_norm() - # In some cases the grad norm may not return a float - if hasattr(grad_norm, "item"): - grad_norm = grad_norm.item() - else: - grad_norm = _grad_norm - - self.optimizer.step() - self.d_optimizer.step() # need to calculate the gradients for the discriminator somewhere, not sure where yet, i think in training_step? 
- - self.control = self.callback_handler.on_optimizer_step(args, self.state, self.control) - + # Optimizer step + self.optimizer.step() optimizer_was_run = not self.accelerator.optimizer_step_was_skipped if optimizer_was_run: # Delay optimizer scheduling until metrics are generated @@ -709,30 +655,25 @@ def _inner_training_loop( self.state.epoch = epoch + (step + 1 + steps_skipped) / steps_in_epoch self.control = self.callback_handler.on_step_end(args, self.state, self.control) - self._maybe_log_save_evaluate(tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval) + self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval) else: self.control = self.callback_handler.on_substep_end(args, self.state, self.control) if self.control.should_epoch_stop or self.control.should_training_stop: - # PyTorch/XLA relies on the data loader to insert the mark_step for - # each step. Since we are breaking the loop early, we need to manually - # insert the mark_step here. - if is_torch_xla_available(): - xm.mark_step() break if step < 0: logger.warning( - "There seems not to be a single sample in your epoch_iterator, stopping training at step" + "There seems to be not a single sample in your epoch_iterator, stopping training at step" f" {self.state.global_step}! This is expected if you're using an IterableDataset and set" f" num_steps ({max_steps}) higher than the number of available samples." ) self.control.should_training_stop = True self.control = self.callback_handler.on_epoch_end(args, self.state, self.control) - self._maybe_log_save_evaluate(tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval) + self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval) if DebugOption.TPU_METRICS_DEBUG in self.args.debug: - if is_torch_xla_available(): + if is_torch_tpu_available(): # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.) xm.master_print(met.metrics_report()) else: @@ -750,7 +691,7 @@ def _inner_training_loop( logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n") if args.load_best_model_at_end and self.state.best_model_checkpoint is not None: # Wait for everyone to get here so we are sure the model has been saved by process 0. 
- if is_torch_xla_available(): + if is_torch_tpu_available(): xm.rendezvous("load_best_model_at_end") elif args.parallel_mode == ParallelMode.DISTRIBUTED: dist.barrier() @@ -761,8 +702,7 @@ def _inner_training_loop( # add remaining tr_loss self._total_loss_scalar += tr_loss.item() - effective_global_step = max(self.state.global_step, 0.001) # Avoid ZeroDivisionError - train_loss = self._total_loss_scalar / effective_global_step + train_loss = self._total_loss_scalar / self.state.global_step metrics = speed_metrics( "train", @@ -789,7 +729,7 @@ def _inner_training_loop( for checkpoint in checkpoints_sorted: if not os.path.samefile(checkpoint, self.state.best_model_checkpoint): logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit") - shutil.rmtree(checkpoint, ignore_errors=True) + shutil.rmtree(checkpoint) self.control = self.callback_handler.on_train_end(args, self.state, self.control) @@ -801,4 +741,42 @@ def _inner_training_loop( if self.neftune_noise_alpha is not None: self._deactivate_neftune(self.model) - return TrainOutput(self.state.global_step, train_loss, metrics) \ No newline at end of file + return TrainOutput(self.state.global_step, train_loss, metrics) + + # def compute_loss(self, model, inputs, return_outputs=False): + # if self.label_smoother is not None and "labels" in inputs: + # labels = inputs.pop("labels") + # else: + # labels = None + + # loss_dict = model(**inputs) # i think this is correct + # #outputs = model(**inputs) + + # ## not sure if we need the rest of this? can we just return the loss? + + # # Save past state if it exists + # # TODO: this needs to be fixed and made cleaner later. + # if self.args.past_index >= 0: + # self._past = outputs[self.args.past_index] + + # if labels is not None: + # unwrapped_model = self.accelerator.unwrap_model(model) + # if _is_peft_model(unwrapped_model): + # model_name = unwrapped_model.base_model.model._get_name() + # else: + # model_name = unwrapped_model._get_name() + # if model_name in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): + # loss = self.label_smoother(outputs, labels, shift_labels=True) + # else: + # loss = self.label_smoother(outputs, labels) + # else: + # if isinstance(outputs, dict) and "loss" not in outputs: + # raise ValueError( + # "The model did not return a loss from the inputs, only the following keys: " + # f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}." + # ) + # # We don't use .loss here since the model may return tuples instead of ModelOutput. 
+ # loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] + + # return (loss, outputs) if return_outputs else loss + From 13eac7c9ad67af799f712afcf49cac71c888b9c6 Mon Sep 17 00:00:00 2001 From: smirrashidi Date: Tue, 20 Aug 2024 06:23:38 +0000 Subject: [PATCH 25/41] added in d_mode --- llava/model/language_model/llava_llama.py | 7 +++---- llava/train/llava_trainer.py | 23 ++++++++++++++++++----- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/llava/model/language_model/llava_llama.py b/llava/model/language_model/llava_llama.py index 9f4993225..696414ab2 100644 --- a/llava/model/language_model/llava_llama.py +++ b/llava/model/language_model/llava_llama.py @@ -74,7 +74,7 @@ def forward( images: Optional[torch.FloatTensor] = None, image_sizes: Optional[List[List[int]]] = None, return_dict: Optional[bool] = None, - d_mode: Optional[bool] = False # False means run without discriminator first + d_mode: Optional[bool] = False # False means run without discriminator ) -> Union[Tuple, CausalLMOutputWithPast]: if inputs_embeds is None: @@ -94,7 +94,6 @@ def forward( images, image_sizes ) - d_mode = False if d_mode == False: return super().forward( input_ids=input_ids, @@ -111,7 +110,7 @@ def forward( else: real_d_loss = self.discriminator.evaluate(self.disc_data["lang"])[2] # are we training discrim here? should we be calculating gradients? fake_d_loss = self.discriminator.evaluate(self.disc_data["images"])[2] - model_loss = tuple(super().forward( # i dont think we can directly unpack the output of forward() so i convert to tuple: may not be necessary + model_loss = super().forward( # i dont think we can directly unpack the output of forward() so i convert to tuple: may not be necessary input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, @@ -122,7 +121,7 @@ def forward( output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict - ))[0] # i think the loss is the first item? + )[0] # i think the loss is the first item? 
final_d_loss = real_d_loss + fake_d_loss diff --git a/llava/train/llava_trainer.py b/llava/train/llava_trainer.py index 7f8512c5d..1d6d2df2c 100644 --- a/llava/train/llava_trainer.py +++ b/llava/train/llava_trainer.py @@ -3,9 +3,13 @@ import torch.nn as nn import math import torch.optim as optim +from packaging import version import time +import sys from torch.utils.data import Sampler +#from accelerate.data_loader import SeedableRandomSampler +from transformers.integrations.deepspeed import deepspeed_init, deepspeed_load_checkpoint from transformers import Trainer from transformers.trainer import ( @@ -13,19 +17,28 @@ get_parameter_names, has_length, get_model_param_count, + speed_metrics, + get_dataloader_sampler, hp_params, skip_first_batches, + is_torch_tpu_available, ALL_LAYERNORM_LAYERS, logger, + accelerate_version, DebugOption, DebugUnderflowOverflow, - TrainerState, - OptimizerNames, - TrainOutput + TrainerState, + HPSearchBackend, + TrainOutput, + shutil, + RandomSampler, + ParallelMode ) + from typing import List, Optional -TRAINER_STATE_NAME = "trainer_state.json" +TRAINER_STATE_NAME = "trainer_state.json" + def maybe_zero_3(param, ignore_status=False, name=None): from deepspeed import zero @@ -562,7 +575,7 @@ def _inner_training_loop( step = -1 for step, inputs in enumerate(epoch_iterator): - inputs['d_mode'] = True if (step % 2 == 0) else False + inputs['d_mode'] = True if (step % 2 == 0) else False # set d_mode total_batched_samples += 1 if self.args.include_num_input_tokens_seen: From a30a34ecfc9c8e58507fc8b7fe6e2717e5bcfc90 Mon Sep 17 00:00:00 2001 From: smirrashidi Date: Tue, 20 Aug 2024 21:44:17 +0000 Subject: [PATCH 26/41] created second optimizer --- llava/VLLMSafety/discriminator.py | 23 +++++++- llava/model/language_model/llava_llama.py | 23 ++++---- llava/model/llava_arch.py | 8 +-- llava/train/llava_trainer.py | 69 +++++++++-------------- 4 files changed, 62 insertions(+), 61 deletions(-) diff --git a/llava/VLLMSafety/discriminator.py b/llava/VLLMSafety/discriminator.py index 088b98bd9..b7dc56975 100644 --- a/llava/VLLMSafety/discriminator.py +++ b/llava/VLLMSafety/discriminator.py @@ -26,8 +26,7 @@ class Discriminator: def __init__(self): self.model = EasyNeuralNetwork(5120, 2) - - def evaluate(self,model, loss_function, X, y): + def evaluate(self, model, loss_function, X, y): predictions = model(X) # pass thorugh model # print("shape of y: ", y.shape) # print("prediction: ", predictions) @@ -35,7 +34,27 @@ def evaluate(self,model, loss_function, X, y): predictions = predictions.argmax(dim=1).cpu().numpy() acc = (predictions == y.cpu().numpy()).mean() return predictions, acc, loss + + + def call_discrim(self, data): + device = 'cuda' + loss_function = nn.BCELoss() # from DCGAN + + img_tok = data["image"] + lang_tok = data["lang"] + + img_label = torch.full((img_tok.size(0),), 1, dtype=torch.float, device=device) # 1 for images + lang_label = torch.full((lang_tok.size(0),), 0, dtype=torch.float, device=device) # 0 for language + + _, _, img_loss = self.evaluate(self. 
model, loss_function, img_tok, img_label) + _, _, lang_loss = self.evaluate(self.model, loss_function, lang_tok, lang_label) + + img_loss.backward() + lang_loss.backward() + + final_loss = img_loss + lang_loss + return final_loss def train(self,training_dataloader, IMAGE_SHAPE=1024 * 5, NUM_CLASSES=2, device='cuda', EPOCHS=1): self.model.train(mode=True) diff --git a/llava/model/language_model/llava_llama.py b/llava/model/language_model/llava_llama.py index 696414ab2..10c9641df 100644 --- a/llava/model/language_model/llava_llama.py +++ b/llava/model/language_model/llava_llama.py @@ -49,8 +49,8 @@ def __init__(self, config): self.vocab_size = config.vocab_size self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) self.disc_data = { - "images": [], - "lang": [], + "image": None, + "lang": None, } self.discriminator = Discriminator() @@ -94,6 +94,7 @@ def forward( images, image_sizes ) + if d_mode == False: return super().forward( input_ids=input_ids, @@ -107,10 +108,9 @@ def forward( output_hidden_states=output_hidden_states, return_dict=return_dict ) - else: - real_d_loss = self.discriminator.evaluate(self.disc_data["lang"])[2] # are we training discrim here? should we be calculating gradients? - fake_d_loss = self.discriminator.evaluate(self.disc_data["images"])[2] - model_loss = super().forward( # i dont think we can directly unpack the output of forward() so i convert to tuple: may not be necessary + else: + d_loss = self.discriminator.call_discrim(self.disc_data) # d loss is sum of disc loss on images and lang, i think it should be just images + model_output = super().forward( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, @@ -121,14 +121,11 @@ def forward( output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict - )[0] # i think the loss is the first item? - - final_d_loss = real_d_loss + fake_d_loss + ) + + model_output.loss = model_output.loss + d_loss # not sure if add or subtract cannot tell - return { # sum the loss and return as a tuple like the first branch - "model_loss": model_loss, - "d_loss": final_d_loss - } + return model_output @torch.no_grad() def generate( diff --git a/llava/model/llava_arch.py b/llava/model/llava_arch.py index 4eae1b9b0..52aa949dc 100644 --- a/llava/model/llava_arch.py +++ b/llava/model/llava_arch.py @@ -141,7 +141,6 @@ def encode_images(self, images): image_features = self.get_model().get_vision_tower()(images) image_features = self.get_model().mm_projector(image_features) - self.disc_data['image'].append(image_features) return image_features def prepare_inputs_labels_for_multimodal( @@ -206,6 +205,8 @@ def prepare_inputs_labels_for_multimodal( else: image_features = self.encode_images(images) + self.disc_data['image'] = image_features + # TODO: image start / end is not implemented here to support pretraining. 
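# A self-contained illustration of the labelling convention used by the discriminator above:
# projected image tokens are labelled 1, language tokens 0, and nn.BCELoss is taken over
# per-token sigmoid scores. The 5120 hidden size and 576 patches per image match the
# projector shapes used in this patch series; the two-layer sigmoid head is the form the
# later "fixed discriminator" commit settles on, and the batch sizes and random inputs here
# are made up purely for the example.
import torch
import torch.nn as nn

hidden = 5120
img_tok = torch.randn(2, 576, hidden)      # [batch, patches, hidden] from mm_projector
lang_tok = torch.randn(1277, hidden)       # [tokens, hidden] text embeddings

img_tok = img_tok.view(-1, hidden)         # flatten image features to per-token samples

disc = nn.Sequential(nn.Linear(hidden, 50), nn.ReLU(), nn.Linear(50, 1), nn.Sigmoid())

bce = nn.BCELoss()
img_pred = disc(img_tok)
lang_pred = disc(lang_tok)
d_loss = bce(img_pred, torch.ones_like(img_pred)) + bce(lang_pred, torch.zeros_like(lang_pred))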
if getattr(self.config, 'tune_mm_mlp_adapter', False) and getattr(self.config, 'mm_use_im_start_end', False): raise NotImplementedError @@ -258,7 +259,7 @@ def prepare_inputs_labels_for_multimodal( #curr input embeds is coming from cur_input_ids_noim which means its already filitered - self.disc_data['lang'].append(cur_input_embeds) + self.disc_data['lang'] = cur_input_embeds cur_new_input_embeds = [] cur_new_labels = [] @@ -280,9 +281,6 @@ def prepare_inputs_labels_for_multimodal( new_input_embeds.append(cur_new_input_embeds) new_labels.append(cur_new_labels) - # call discriminator: - self.discriminator.preprocess_and_call_train(self.disc_data) - # Truncate sequences to max length as image embeddings can make the sequence longer tokenizer_model_max_length = getattr(self.config, 'tokenizer_model_max_length', None) if tokenizer_model_max_length is not None: diff --git a/llava/train/llava_trainer.py b/llava/train/llava_trainer.py index 1d6d2df2c..bde6acdf9 100644 --- a/llava/train/llava_trainer.py +++ b/llava/train/llava_trainer.py @@ -8,8 +8,10 @@ import sys from torch.utils.data import Sampler +from transformers.trainer_utils import SchedulerType #from accelerate.data_loader import SeedableRandomSampler from transformers.integrations.deepspeed import deepspeed_init, deepspeed_load_checkpoint +from transformers.optimization import get_scheduler from transformers import Trainer from transformers.trainer import ( @@ -35,9 +37,11 @@ ParallelMode ) -from typing import List, Optional +from typing import List, Optional, Union TRAINER_STATE_NAME = "trainer_state.json" +lr = 0.0002 +beta1 = 0.5 def maybe_zero_3(param, ignore_status=False, name=None): @@ -122,7 +126,7 @@ def get_length_grouped_indices(lengths, batch_size, world_size, generator=None, class LengthGroupedSampler(Sampler): - r""" + """ Sampler that samples indices in a way that groups together features of the dataset of roughly the same length while keeping a bit of randomness. """ @@ -177,6 +181,20 @@ def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]: else: return super()._get_train_sampler() + + def create_optimizer_and_scheduler(self, num_training_steps: int): + """ + Setup the optimizer and the learning rate scheduler. + + We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the + Trainer's init through `optimizers`, or subclass and override this method (or `create_optimizer` and/or + `create_scheduler`) in a subclass. + """ + self.create_optimizer() + + optimizer = self.optimizer + self.create_scheduler(num_training_steps=num_training_steps, optimizer=optimizer) + def create_optimizer(self): """ Setup the optimizer. @@ -255,6 +273,8 @@ def create_optimizer(self): logger.debug(f"bitsandbytes: will optimize {module} in fp32") logger.info(f"skipped: {skipped/2**20}M params") + self.d_optimizer = optim.Adam(opt_model.discriminator.model.parameters(), lr= lr, betas=(beta1, 0.999)) # how to get discriminator parameters? 
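# One possible answer to the "how to get discriminator parameters?" question above, sketched
# in isolation: build the second Adam over just the discriminator sub-module and keep those
# parameters out of the main optimizer's groups, so the two updates never touch the same
# weights. `opt_model` stands in for the unwrapped model and the main-optimizer learning
# rate is a placeholder; lr=0.0002 / beta1=0.5 mirror the DCGAN-style constants defined at
# the top of this file.
import torch.optim as optim

def build_optimizers(opt_model, lr_main=2e-5, lr_d=0.0002, beta1=0.5):
    d_params = [p for p in opt_model.discriminator.parameters() if p.requires_grad]
    d_param_ids = {id(p) for p in d_params}

    # everything that is not the discriminator goes to the main optimizer
    main_params = [p for p in opt_model.parameters()
                   if p.requires_grad and id(p) not in d_param_ids]

    optimizer = optim.AdamW(main_params, lr=lr_main)
    d_optimizer = optim.Adam(d_params, lr=lr_d, betas=(beta1, 0.999))
    return optimizer, d_optimizer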
+ return self.optimizer def _save_checkpoint(self, model, trial, metrics=None): @@ -657,6 +677,11 @@ def _inner_training_loop( # Optimizer step self.optimizer.step() + + if inputs["d_mode"] == True: + self.d_optimizer.step() + model.discriminator.model.zero_grad() + optimizer_was_run = not self.accelerator.optimizer_step_was_skipped if optimizer_was_run: # Delay optimizer scheduling until metrics are generated @@ -754,42 +779,4 @@ def _inner_training_loop( if self.neftune_noise_alpha is not None: self._deactivate_neftune(self.model) - return TrainOutput(self.state.global_step, train_loss, metrics) - - # def compute_loss(self, model, inputs, return_outputs=False): - # if self.label_smoother is not None and "labels" in inputs: - # labels = inputs.pop("labels") - # else: - # labels = None - - # loss_dict = model(**inputs) # i think this is correct - # #outputs = model(**inputs) - - # ## not sure if we need the rest of this? can we just return the loss? - - # # Save past state if it exists - # # TODO: this needs to be fixed and made cleaner later. - # if self.args.past_index >= 0: - # self._past = outputs[self.args.past_index] - - # if labels is not None: - # unwrapped_model = self.accelerator.unwrap_model(model) - # if _is_peft_model(unwrapped_model): - # model_name = unwrapped_model.base_model.model._get_name() - # else: - # model_name = unwrapped_model._get_name() - # if model_name in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): - # loss = self.label_smoother(outputs, labels, shift_labels=True) - # else: - # loss = self.label_smoother(outputs, labels) - # else: - # if isinstance(outputs, dict) and "loss" not in outputs: - # raise ValueError( - # "The model did not return a loss from the inputs, only the following keys: " - # f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}." - # ) - # # We don't use .loss here since the model may return tuples instead of ModelOutput. - # loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] - - # return (loss, outputs) if return_outputs else loss - + return TrainOutput(self.state.global_step, train_loss, metrics) \ No newline at end of file From 538c928f74f5265e76129a413c18ab450864a74e Mon Sep 17 00:00:00 2001 From: smirrashidi Date: Sat, 24 Aug 2024 03:55:22 +0000 Subject: [PATCH 27/41] fixed discriminator --- llava/VLLMSafety/discriminator.py | 256 ++++++++++++---------- llava/model/language_model/llava_llama.py | 35 +-- llava/train/llava_trainer.py | 10 +- 3 files changed, 165 insertions(+), 136 deletions(-) diff --git a/llava/VLLMSafety/discriminator.py b/llava/VLLMSafety/discriminator.py index b7dc56975..d5ea98eb4 100644 --- a/llava/VLLMSafety/discriminator.py +++ b/llava/VLLMSafety/discriminator.py @@ -5,177 +5,199 @@ import numpy as np import random -class EasyNeuralNetwork(nn.Module): +class Discriminator(nn.Module): def __init__(self, input_size, classes): super().__init__() # we can add more layers later - # layer 1 + self.fc1 = nn.Linear(input_size, 50) - # layer 2 - self.fc2 = nn.Linear(50, classes) + self.fc2 = nn.Linear(50, 1) - def forward(self, x): - # run x through the layers and activation functions - # (relu activation function is just max(0, x)) + def linear(self, x): x = F.relu(self.fc1(x)) - # normally there's no activation function on last layer (except softmax etc. 
when needed) - x = self.fc2(x) + x = torch.sigmoid(self.fc2(x)) return x -class Discriminator: - - def __init__(self): - self.model = EasyNeuralNetwork(5120, 2) - def evaluate(self, model, loss_function, X, y): - predictions = model(X) # pass thorugh model - # print("shape of y: ", y.shape) - # print("prediction: ", predictions) - loss = loss_function(predictions, y) - predictions = predictions.argmax(dim=1).cpu().numpy() - acc = (predictions == y.cpu().numpy()).mean() - return predictions, acc, loss - - - def call_discrim(self, data): + + def forward(self, data, d_mode): device = 'cuda' loss_function = nn.BCELoss() # from DCGAN img_tok = data["image"] lang_tok = data["lang"] + + img_tok = img_tok.view(-1, 5120) # image tokens have dim=3 + + img_label = torch.full((img_tok.size(0), 1), 1, dtype=torch.bfloat16, device=device) # 1 for images + lang_label = torch.full((lang_tok.size(0), 1), 0, dtype=torch.bfloat16, device=device) # 0 for lang + + img_pred = self.linear(img_tok) # BCE expects output from a sigmoid (i think) + lang_pred = self.linear(lang_tok) + + img_loss = loss_function(img_pred, img_label) + lang_loss = loss_function(lang_pred, lang_label) # BCE expects output from a sigmoid (i think) + + if d_mode == True: + return img_loss + lang_loss # returning both losses to train disc - img_label = torch.full((img_tok.size(0),), 1, dtype=torch.float, device=device) # 1 for images - lang_label = torch.full((lang_tok.size(0),), 0, dtype=torch.float, device=device) # 0 for language + else: + return img_loss # returning image loss to maximize disc loss when training generator + +# class Discriminator: - _, _, img_loss = self.evaluate(self. model, loss_function, img_tok, img_label) - _, _, lang_loss = self.evaluate(self.model, loss_function, lang_tok, lang_label) +# def __init__(self): +# self.model = EasyNeuralNetwork(5120, 2) + +# # def evaluate(self, model, loss_function, X, y): +# def evaluate(self, loss_function, X, y): +# # predictions = model(X) # pass thorugh model +# predictions = self.model(X) +# # print("shape of y: ", y.shape) +# # print("prediction: ", predictions) +# loss = loss_function(predictions, y) +# predictions = predictions.argmax(dim=1).cpu().numpy() +# acc = (predictions == y.cpu().numpy()).mean() +# return predictions, acc, loss + - img_loss.backward() - lang_loss.backward() +# def call_discrim(self, data): +# device = 'cuda' +# loss_function = nn.BCELoss() # from DCGAN - final_loss = img_loss + lang_loss +# img_tok = data["image"] +# lang_tok = data["lang"] + +# img_label = torch.full((img_tok.size(0),), 1, dtype=torch.float, device=device) # 1 for images +# lang_label = torch.full((lang_tok.size(0),), 0, dtype=torch.float, device=device) # 0 for language - return final_loss +# _, _, img_loss = self.evaluate(self.model, loss_function, img_tok, img_label) +# _, _, lang_loss = self.evaluate(self.model, loss_function, lang_tok, lang_label) - def train(self,training_dataloader, IMAGE_SHAPE=1024 * 5, NUM_CLASSES=2, device='cuda', EPOCHS=1): - self.model.train(mode=True) - self.model.to(device) # put the model on the device (remember its cuda on workstation) - optimizer = optim.Adam(self.model.parameters(), lr=0.001) - loss_function = nn.CrossEntropyLoss() +# final_loss = img_loss + lang_loss - epochs_acc = [] - for epoch in range(EPOCHS): - print(f'Epoch {epoch + 1}') - epoch_acc = [] - training_acc_checkpoint, training_loss_checkpoint = [], [] - for step, (data, labels) in enumerate(training_dataloader): - data = data.float().unsqueeze(0) - labels = labels.unsqueeze(0) 
+# return final_loss - data, labels = data.to(device), labels.to(device) # Convert labels to tensor if not already +# def train(self,training_dataloader, IMAGE_SHAPE=1024 * 5, NUM_CLASSES=2, device='cuda', EPOCHS=1): +# self.model.train(mode=True) +# self.model.to(device) # put the model on the device (remember its cuda on workstation) +# optimizer = optim.Adam(self.model.parameters(), lr=0.001) +# loss_function = nn.CrossEntropyLoss() - predictions, acc, loss = self.evaluate(self.model, loss_function, data, labels) - training_acc_checkpoint.append(acc) - epoch_acc.append(acc) +# epochs_acc = [] +# for epoch in range(EPOCHS): +# print(f'Epoch {epoch + 1}') +# epoch_acc = [] +# training_acc_checkpoint, training_loss_checkpoint = [], [] +# for step, (data, labels) in enumerate(training_dataloader): +# data = data.float().unsqueeze(0) +# labels = labels.unsqueeze(0) - # loss already calculated in the evaluate() call. just append it - training_loss_checkpoint.append(loss.item()) +# data, labels = data.to(device), labels.to(device) # Convert labels to tensor if not already - # back propagation - loss.backward() +# predictions, acc, loss = self.evaluate(self.model, loss_function, data, labels) +# training_acc_checkpoint.append(acc) +# epoch_acc.append(acc) - # gradient descent - optimizer.step() +# # loss already calculated in the evaluate() call. just append it +# training_loss_checkpoint.append(loss.item()) - # zero the gradients so they do not accumulate - optimizer.zero_grad() +# # back propagation +# loss.backward() - # epoch end - print("Accuracy: ", np.mean(epoch_acc)) - epochs_acc.append(np.mean(epoch_acc)) +# # gradient descent +# optimizer.step() - # can do some optimizations here if you want early stopping, right now im not gonna implement this +# # zero the gradients so they do not accumulate +# optimizer.zero_grad() - self.model.train(mode=False) # exit training mode +# # epoch end +# print("Accuracy: ", np.mean(epoch_acc)) +# epochs_acc.append(np.mean(epoch_acc)) - return epochs_acc, self.model +# # can do some optimizations here if you want early stopping, right now im not gonna implement this +# self.model.train(mode=False) # exit training mode - # def test(): - # model.train(False) # since were testing +# return epochs_acc, self.model - # test_loss = [] - # test_acc = [] - # for X,y in test_loader: - # with torch.no_grad(): - # X, y = X.to(device), y.to(device) - # predictions = model(X) #as above: check dimentions +# # def test(): +# # model.train(False) # since were testing - # loss = loss_function(predictions, y) - # test_loss.append(loss.item()) +# # test_loss = [] +# # test_acc = [] - # test_acc.append((predictions.argmax(dim=1).cpu().numpy() == y.cpu().numpy()).mean()) +# # for X,y in test_loader: +# # with torch.no_grad(): +# # X, y = X.to(device), y.to(device) +# # predictions = model(X) #as above: check dimentions - # print(f'Accuracy: {np.mean(test_acc):.2f}, Loss: {np.mean(test_loss):.2f}') +# # loss = loss_function(predictions, y) +# # test_loss.append(loss.item()) - # return test_acc #idc about test_loss +# # test_acc.append((predictions.argmax(dim=1).cpu().numpy() == y.cpu().numpy()).mean()) +# # print(f'Accuracy: {np.mean(test_acc):.2f}, Loss: {np.mean(test_loss):.2f}') - def preprocess_and_call_train(self,get_tkns): - # set device to cpu - device = 'cuda' if torch.cuda.is_available() else 'cpu' # if we are running this on workstation change this to cuda +# # return test_acc #idc about test_loss - # Example data loading (assuming you have loaded im_tok and 
lang_tok) - im_tok = get_tkns["image"] - lang_tok = get_tkns["lang"] +# def preprocess_and_call_train(self,get_tkns): +# # set device to cpu +# device = 'cuda' if torch.cuda.is_available() else 'cpu' # if we are running this on workstation change this to cuda - lang_tok_list = [] - for tensor in lang_tok: - for i in range(tensor.size(0)): - lang_tok_list.append(tensor[i, :]) +# # Example data loading (assuming you have loaded im_tok and lang_tok) - im_tok_list = [] - for tensor in im_tok: - for i in range(tensor.size(0)): - for j in range(tensor.size(1)): - im_tok_list.append(tensor[i, j, :]) +# im_tok = get_tkns["image"] +# lang_tok = get_tkns["lang"] - # print("image tokens arr length: ", len(im_tok)) - # print("image tokens[0] shape: ", im_tok[0].shape) # image tokens[0] shape: torch.Size([16, 576, 5120]) +# lang_tok_list = [] +# for tensor in lang_tok: +# for i in range(tensor.size(0)): +# lang_tok_list.append(tensor[i, :]) - # print("lang tokens arr length: ", len(lang_tok)) - # print("lang tokens[0] shape: ", lang_tok[0].shape) # lang tokens[0] shape: torch.Size([1277, 5120]) +# im_tok_list = [] +# for tensor in im_tok: +# for i in range(tensor.size(0)): +# for j in range(tensor.size(1)): +# im_tok_list.append(tensor[i, j, :]) +# # print("image tokens arr length: ", len(im_tok)) +# # print("image tokens[0] shape: ", im_tok[0].shape) # image tokens[0] shape: torch.Size([16, 576, 5120]) - combined_tokens = [(torch.tensor(token), torch.tensor(0)) for token in im_tok_list] + [(torch.tensor(token), torch.tensor(1)) for token in lang_tok_list] +# # print("lang tokens arr length: ", len(lang_tok)) +# # print("lang tokens[0] shape: ", lang_tok[0].shape) # lang tokens[0] shape: torch.Size([1277, 5120]) - print("im_tok: ", im_tok[0].shape) - print("lang_tok: ", lang_tok_list[0].shape) - # Optionally shuffle the combined list to randomize the order - random.shuffle(combined_tokens) +# combined_tokens = [(torch.tensor(token), torch.tensor(0)) for token in im_tok_list] + [(torch.tensor(token), torch.tensor(1)) for token in lang_tok_list] - # testing code... if our embeddings are the wrong side we are doing something wrong. - print("combined_tokens[0][0].flatten().size(): ", combined_tokens[0][0].flatten().size()) - assert combined_tokens[0][0].flatten().size() == torch.Size([1024*5]), ("flattened image tokens fed to discriminator do not match the size of " - "disc first layer") - print("combined_tokens[-1][0].flatten().size(): ", combined_tokens[-1][0].flatten().size()) - assert combined_tokens[-1][0].flatten().size() == torch.Size([1024*5]), ("flattened language tokens fed to discriminator do not match the size " - "of disc first layer") +# print("im_tok: ", im_tok[0].shape) +# print("lang_tok: ", lang_tok_list[0].shape) - # train network - epochs_acc, model = self.train(combined_tokens, device=device) +# # Optionally shuffle the combined list to randomize the order +# random.shuffle(combined_tokens) +# # testing code... if our embeddings are the wrong side we are doing something wrong. 
+# print("combined_tokens[0][0].flatten().size(): ", combined_tokens[0][0].flatten().size()) +# assert combined_tokens[0][0].flatten().size() == torch.Size([1024*5]), ("flattened image tokens fed to discriminator do not match the size of " +# "disc first layer") +# print("combined_tokens[-1][0].flatten().size(): ", combined_tokens[-1][0].flatten().size()) +# assert combined_tokens[-1][0].flatten().size() == torch.Size([1024*5]), ("flattened language tokens fed to discriminator do not match the size " +# "of disc first layer") - if( len(epochs_acc) > 0 ): - print("-----------final epochs acc--------------: ", epochs_acc[-1]) +# # train network +# epochs_acc, model = self.train(combined_tokens, device=device) - # not gonna do any eval for now - # test_acc = test() - # save the model - # PATH = 'models/desc_v1_llava.pth' - # torch.save(model, PATH) +# if( len(epochs_acc) > 0 ): +# print("-----------final epochs acc--------------: ", epochs_acc[-1]) - return model +# # not gonna do any eval for now +# # test_acc = test() + +# # save the model +# # PATH = 'models/desc_v1_llava.pth' +# # torch.save(model, PATH) + +# return model + diff --git a/llava/model/language_model/llava_llama.py b/llava/model/language_model/llava_llama.py index 10c9641df..8490f0f54 100644 --- a/llava/model/language_model/llava_llama.py +++ b/llava/model/language_model/llava_llama.py @@ -52,7 +52,7 @@ def __init__(self, config): "image": None, "lang": None, } - self.discriminator = Discriminator() + self.discriminator = Discriminator(5120, 2) # hard coding in sizes for now # Initialize weights and apply final processing self.post_init() @@ -96,20 +96,25 @@ def forward( ) if d_mode == False: - return super().forward( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - labels=labels, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict - ) + d_loss = self.discriminator.forward(self.disc_data, d_mode=False) # d loss is sum of disc loss on images and lang + model_output = super().forward( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + labels=labels, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict + ) + + model_output.loss = d_loss # returning only discriminator loss + + return model_output else: - d_loss = self.discriminator.call_discrim(self.disc_data) # d loss is sum of disc loss on images and lang, i think it should be just images + d_loss = self.discriminator.forward(self.disc_data, d_mode=True) # d loss is only the disc loss on the image tokens (following DCGAN) model_output = super().forward( input_ids=input_ids, attention_mask=attention_mask, @@ -123,7 +128,7 @@ def forward( return_dict=return_dict ) - model_output.loss = model_output.loss + d_loss # not sure if add or subtract cannot tell + model_output.loss = model_output.loss + d_loss # returning sum of model and discriminator loss return model_output diff --git a/llava/train/llava_trainer.py b/llava/train/llava_trainer.py index bde6acdf9..2f00cf6e9 100644 --- a/llava/train/llava_trainer.py +++ b/llava/train/llava_trainer.py @@ -273,7 +273,7 @@ def create_optimizer(self): logger.debug(f"bitsandbytes: will optimize {module} in fp32") logger.info(f"skipped: {skipped/2**20}M params") - self.d_optimizer = 
optim.Adam(opt_model.discriminator.model.parameters(), lr= lr, betas=(beta1, 0.999)) # how to get discriminator parameters? + self.d_optimizer = optim.Adam(opt_model.discriminator.parameters(), lr= lr, betas=(beta1, 0.999)) # how to get discriminator parameters? return self.optimizer @@ -676,11 +676,13 @@ def _inner_training_loop( ) # Optimizer step - self.optimizer.step() - + if inputs["d_mode"] == True: self.d_optimizer.step() - model.discriminator.model.zero_grad() + model.discriminator.zero_grad() + + else: + self.optimizer.step() optimizer_was_run = not self.accelerator.optimizer_step_was_skipped if optimizer_was_run: From 0568ab29509ce834d61021d3a4c1960af6b7ab0b Mon Sep 17 00:00:00 2001 From: smirrashidi Date: Wed, 28 Aug 2024 19:00:33 +0000 Subject: [PATCH 28/41] pipeline is ready --- llava/VLLMSafety/discriminator.py | 17 ++++++++++------- llava/model/language_model/llava_llama.py | 6 +++--- llava/train/llava_trainer.py | 5 ++--- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/llava/VLLMSafety/discriminator.py b/llava/VLLMSafety/discriminator.py index d5ea98eb4..1f84e0739 100644 --- a/llava/VLLMSafety/discriminator.py +++ b/llava/VLLMSafety/discriminator.py @@ -27,21 +27,24 @@ def forward(self, data, d_mode): lang_tok = data["lang"] img_tok = img_tok.view(-1, 5120) # image tokens have dim=3 - - img_label = torch.full((img_tok.size(0), 1), 1, dtype=torch.bfloat16, device=device) # 1 for images - lang_label = torch.full((lang_tok.size(0), 1), 0, dtype=torch.bfloat16, device=device) # 0 for lang img_pred = self.linear(img_tok) # BCE expects output from a sigmoid (i think) lang_pred = self.linear(lang_tok) - - img_loss = loss_function(img_pred, img_label) - lang_loss = loss_function(lang_pred, lang_label) # BCE expects output from a sigmoid (i think) if d_mode == True: + img_label = torch.full((img_tok.size(0), 1), 1, dtype=torch.bfloat16, device=device) # 1 for images + lang_label = torch.full((lang_tok.size(0), 1), 0, dtype=torch.bfloat16, device=device) # 0 for lang + + img_loss = loss_function(img_pred, img_label) + lang_loss = loss_function(lang_pred, lang_label) + return img_loss + lang_loss # returning both losses to train disc else: - return img_loss # returning image loss to maximize disc loss when training generator + lang_label = torch.full((img_tok.size(0), 1), 0, dtype=torch.bfloat16, device=device) # 0 for lang + img_with_lang_label_loss = loss_function(img_pred, lang_label) # trying to follow DCGAN + + return img_with_lang_label_loss # returning image loss to maximize disc loss when training generator # class Discriminator: diff --git a/llava/model/language_model/llava_llama.py b/llava/model/language_model/llava_llama.py index 8490f0f54..6c6444ef0 100644 --- a/llava/model/language_model/llava_llama.py +++ b/llava/model/language_model/llava_llama.py @@ -95,8 +95,8 @@ def forward( image_sizes ) - if d_mode == False: - d_loss = self.discriminator.forward(self.disc_data, d_mode=False) # d loss is sum of disc loss on images and lang + if d_mode == True: + d_loss = self.discriminator.forward(self.disc_data, d_mode=True) # d loss is sum of disc loss on images and lang model_output = super().forward( input_ids=input_ids, attention_mask=attention_mask, @@ -114,7 +114,7 @@ def forward( return model_output else: - d_loss = self.discriminator.forward(self.disc_data, d_mode=True) # d loss is only the disc loss on the image tokens (following DCGAN) + d_loss = self.discriminator.forward(self.disc_data, d_mode=False) # d loss is disc loss on the image toke with lang 
labels (following DCGAN) model_output = super().forward( input_ids=input_ids, attention_mask=attention_mask, diff --git a/llava/train/llava_trainer.py b/llava/train/llava_trainer.py index 2f00cf6e9..b7894ee79 100644 --- a/llava/train/llava_trainer.py +++ b/llava/train/llava_trainer.py @@ -5,6 +5,7 @@ import torch.optim as optim from packaging import version import time +import deepspeed import sys from torch.utils.data import Sampler @@ -162,8 +163,6 @@ def __iter__(self): class LLaVATrainer(Trainer): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - #self.d_optimizer = optim.Adam(self.model.discriminator(), lr=lr, betas=(beta1, 0.999)) # what kind of optimizer do we want to use? - # also need to figure out how to access the discriminator, also what learning rate/betas do we want? def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]: @@ -679,7 +678,7 @@ def _inner_training_loop( if inputs["d_mode"] == True: self.d_optimizer.step() - model.discriminator.zero_grad() + model.module.base_model.model.discriminator.zero_grad() else: self.optimizer.step() From fd9ede016cceef273ec949233198b7b88a438c1f Mon Sep 17 00:00:00 2001 From: smirrashidi Date: Wed, 4 Sep 2024 18:10:32 +0000 Subject: [PATCH 29/41] trying to test discrim --- llava/VLLMSafety/evaluate_disc.py | 1019 +++++++++++++++++++++++++++++ llava/VLLMSafety/test_discrim.py | 97 +++ 2 files changed, 1116 insertions(+) create mode 100644 llava/VLLMSafety/evaluate_disc.py create mode 100644 llava/VLLMSafety/test_discrim.py diff --git a/llava/VLLMSafety/evaluate_disc.py b/llava/VLLMSafety/evaluate_disc.py new file mode 100644 index 000000000..37783b4f5 --- /dev/null +++ b/llava/VLLMSafety/evaluate_disc.py @@ -0,0 +1,1019 @@ +# Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright: +# Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright: +# Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
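# evaluate_disc.py below largely mirrors the training script's setup; a much smaller sanity
# check of a trained discriminator could look like the sketch here. The 0.5 decision
# threshold and the equal weighting of the two classes are assumptions for illustration;
# `discriminator.linear` is the sigmoid scoring method defined in discriminator.py above.
import torch

@torch.no_grad()
def discriminator_accuracy(discriminator, img_tok, lang_tok, threshold=0.5):
    """Fraction of tokens classified correctly: image tokens should score >= threshold
    (label 1), language tokens below it (label 0)."""
    img_scores = discriminator.linear(img_tok.view(-1, 5120))
    lang_scores = discriminator.linear(lang_tok.view(-1, 5120))

    img_acc = (img_scores >= threshold).float().mean()
    lang_acc = (lang_scores < threshold).float().mean()
    return 0.5 * (img_acc + lang_acc).item()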
+ +import os +import copy +from dataclasses import dataclass, field +import json +import logging +import pathlib +from typing import Dict, Optional, Sequence, List + +import torch + +import transformers +import tokenizers + +from llava.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN +from torch.utils.data import Dataset +from llava.train.llava_trainer import LLaVATrainer + +from llava import conversation as conversation_lib +from llava.model import * +from llava.mm_utils import tokenizer_image_token + +from PIL import Image + +from llava.VLLMSafety.discriminator import Discriminator # REMOVE ME + + +local_rank = None + + +def rank0_print(*args): + if local_rank == 0: + print(*args) + + +from packaging import version +IS_TOKENIZER_GREATER_THAN_0_14 = version.parse(tokenizers.__version__) >= version.parse('0.14') + + +@dataclass +class ModelArguments: + model_name_or_path: Optional[str] = field(default="facebook/opt-125m") + version: Optional[str] = field(default="v0") + freeze_backbone: bool = field(default=False) + tune_mm_mlp_adapter: bool = field(default=False) + vision_tower: Optional[str] = field(default=None) + mm_vision_select_layer: Optional[int] = field(default=-1) # default to the last layer + pretrain_mm_mlp_adapter: Optional[str] = field(default=None) + mm_projector_type: Optional[str] = field(default='linear') + mm_use_im_start_end: bool = field(default=False) + mm_use_im_patch_token: bool = field(default=True) + mm_patch_merge_type: Optional[str] = field(default='flat') + mm_vision_select_feature: Optional[str] = field(default="patch") + + +@dataclass +class DataArguments: + data_path: str = field(default=None, + metadata={"help": "Path to the training data."}) + lazy_preprocess: bool = False + is_multimodal: bool = False + image_folder: Optional[str] = field(default=None) + image_aspect_ratio: str = 'square' + + +@dataclass +class TrainingArguments(transformers.TrainingArguments): + cache_dir: Optional[str] = field(default=None) + optim: str = field(default="adamw_torch") + remove_unused_columns: bool = field(default=False) + freeze_mm_mlp_adapter: bool = field(default=False) + mpt_attn_impl: Optional[str] = field(default="triton") + model_max_length: int = field( + default=512, + metadata={ + "help": + "Maximum sequence length. Sequences will be right padded (and possibly truncated)." + }, + ) + double_quant: bool = field( + default=True, + metadata={"help": "Compress the quantization statistics through double quantization."} + ) + quant_type: str = field( + default="nf4", + metadata={"help": "Quantization data type to use. 
Should be one of `fp4` or `nf4`."} + ) + bits: int = field( + default=16, + metadata={"help": "How many bits to use."} + ) + lora_enable: bool = False + lora_r: int = 64 + lora_alpha: int = 16 + lora_dropout: float = 0.05 + lora_weight_path: str = "" + lora_bias: str = "none" + mm_projector_lr: Optional[float] = None + group_by_modality_length: bool = field(default=False) + + +def maybe_zero_3(param, ignore_status=False, name=None): + from deepspeed import zero + from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus + if hasattr(param, "ds_id"): + if param.ds_status == ZeroParamStatus.NOT_AVAILABLE: + if not ignore_status: + logging.warning(f"{name}: param.ds_status != ZeroParamStatus.NOT_AVAILABLE: {param.ds_status}") + with zero.GatheredParameters([param]): + param = param.data.detach().cpu().clone() + else: + param = param.detach().cpu().clone() + return param + + +# Borrowed from peft.utils.get_peft_model_state_dict +def get_peft_state_maybe_zero_3(named_params, bias): + if bias == "none": + to_return = {k: t for k, t in named_params if "lora_" in k} + elif bias == "all": + to_return = {k: t for k, t in named_params if "lora_" in k or "bias" in k} + elif bias == "lora_only": + to_return = {} + maybe_lora_bias = {} + lora_bias_names = set() + for k, t in named_params: + if "lora_" in k: + to_return[k] = t + bias_name = k.split("lora_")[0] + "bias" + lora_bias_names.add(bias_name) + elif "bias" in k: + maybe_lora_bias[k] = t + for k, t in maybe_lora_bias: + if bias_name in lora_bias_names: + to_return[bias_name] = t + else: + raise NotImplementedError + to_return = {k: maybe_zero_3(v, ignore_status=True) for k, v in to_return.items()} + return to_return + + +def get_peft_state_non_lora_maybe_zero_3(named_params, require_grad_only=True): + to_return = {k: t for k, t in named_params if "lora_" not in k} + if require_grad_only: + to_return = {k: t for k, t in to_return.items() if t.requires_grad} + to_return = {k: maybe_zero_3(v, ignore_status=True).cpu() for k, v in to_return.items()} + return to_return + + +def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match): + to_return = {k: t for k, t in named_params if any(key_match in k for key_match in keys_to_match)} + to_return = {k: maybe_zero_3(v, ignore_status=True).cpu() for k, v in to_return.items()} + return to_return + + +def find_all_linear_names(model): + cls = torch.nn.Linear + lora_module_names = set() + multimodal_keywords = ['mm_projector', 'vision_tower', 'vision_resampler'] + for name, module in model.named_modules(): + if any(mm_keyword in name for mm_keyword in multimodal_keywords): + continue + if isinstance(module, cls): + names = name.split('.') + lora_module_names.add(names[0] if len(names) == 1 else names[-1]) + + if 'lm_head' in lora_module_names: # needed for 16-bit + lora_module_names.remove('lm_head') + return list(lora_module_names) + + +def safe_save_model_for_hf_trainer(trainer: transformers.Trainer, + output_dir: str): + """Collects the state dict and dump to disk.""" + + if getattr(trainer.args, "tune_mm_mlp_adapter", False): + # Only save Adapter + keys_to_match = ['mm_projector'] + if getattr(trainer.args, "use_im_start_end", False): + keys_to_match.extend(['embed_tokens', 'embed_in']) + + weight_to_save = get_mm_adapter_state_maybe_zero_3(trainer.model.named_parameters(), keys_to_match) + trainer.model.config.save_pretrained(output_dir) + + current_folder = output_dir.split('/')[-1] + parent_folder = os.path.dirname(output_dir) + if trainer.args.local_rank == 0 or 
trainer.args.local_rank == -1: + if current_folder.startswith('checkpoint-'): + mm_projector_folder = os.path.join(parent_folder, "mm_projector") + os.makedirs(mm_projector_folder, exist_ok=True) + torch.save(weight_to_save, os.path.join(mm_projector_folder, f'{current_folder}.bin')) + else: + torch.save(weight_to_save, os.path.join(output_dir, f'mm_projector.bin')) + return + + if trainer.deepspeed: + torch.cuda.synchronize() + trainer.save_model(output_dir) + return + + state_dict = trainer.model.state_dict() + if trainer.args.should_save: + cpu_state_dict = { + key: value.cpu() + for key, value in state_dict.items() + } + del state_dict + trainer._save(output_dir, state_dict=cpu_state_dict) # noqa + + +def smart_tokenizer_and_embedding_resize( + special_tokens_dict: Dict, + tokenizer: transformers.PreTrainedTokenizer, + model: transformers.PreTrainedModel, +): + """Resize tokenizer and embedding. + + Note: This is the unoptimized version that may make your embedding size not be divisible by 64. + """ + num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict) + model.resize_token_embeddings(len(tokenizer)) + + if num_new_tokens > 0: + input_embeddings = model.get_input_embeddings().weight.data + output_embeddings = model.get_output_embeddings().weight.data + + input_embeddings_avg = input_embeddings[:-num_new_tokens].mean( + dim=0, keepdim=True) + output_embeddings_avg = output_embeddings[:-num_new_tokens].mean( + dim=0, keepdim=True) + + input_embeddings[-num_new_tokens:] = input_embeddings_avg + output_embeddings[-num_new_tokens:] = output_embeddings_avg + + +def _tokenize_fn(strings: Sequence[str], + tokenizer: transformers.PreTrainedTokenizer) -> Dict: + """Tokenize a list of strings.""" + tokenized_list = [ + tokenizer( + text, + return_tensors="pt", + padding="longest", + max_length=tokenizer.model_max_length, + truncation=True, + ) for text in strings + ] + input_ids = labels = [ + tokenized.input_ids[0] for tokenized in tokenized_list + ] + input_ids_lens = labels_lens = [ + tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item() + for tokenized in tokenized_list + ] + return dict( + input_ids=input_ids, + labels=labels, + input_ids_lens=input_ids_lens, + labels_lens=labels_lens, + ) + + +def _mask_targets(target, tokenized_lens, speakers): + # cur_idx = 0 + cur_idx = tokenized_lens[0] + tokenized_lens = tokenized_lens[1:] + target[:cur_idx] = IGNORE_INDEX + for tokenized_len, speaker in zip(tokenized_lens, speakers): + if speaker == "human": + target[cur_idx+2:cur_idx + tokenized_len] = IGNORE_INDEX + cur_idx += tokenized_len + + +def _add_speaker_and_signal(header, source, get_conversation=True): + """Add speaker and start/end signal on each round.""" + BEGIN_SIGNAL = "### " + END_SIGNAL = "\n" + conversation = header + for sentence in source: + from_str = sentence["from"] + if from_str.lower() == "human": + from_str = conversation_lib.default_conversation.roles[0] + elif from_str.lower() == "gpt": + from_str = conversation_lib.default_conversation.roles[1] + else: + from_str = 'unknown' + sentence["value"] = (BEGIN_SIGNAL + from_str + ": " + + sentence["value"] + END_SIGNAL) + if get_conversation: + conversation += sentence["value"] + conversation += BEGIN_SIGNAL + return conversation + + +def preprocess_multimodal( + sources: Sequence[str], + data_args: DataArguments +) -> Dict: + is_multimodal = data_args.is_multimodal + if not is_multimodal: + return sources + + for source in sources: + for sentence in source: + if DEFAULT_IMAGE_TOKEN in sentence['value']: + 
sentence['value'] = sentence['value'].replace(DEFAULT_IMAGE_TOKEN, '').strip() + sentence['value'] = DEFAULT_IMAGE_TOKEN + '\n' + sentence['value'] + sentence['value'] = sentence['value'].strip() + if "mmtag" in conversation_lib.default_conversation.version: + sentence['value'] = sentence['value'].replace(DEFAULT_IMAGE_TOKEN, '' + DEFAULT_IMAGE_TOKEN + '') + replace_token = DEFAULT_IMAGE_TOKEN + if data_args.mm_use_im_start_end: + replace_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN + sentence["value"] = sentence["value"].replace(DEFAULT_IMAGE_TOKEN, replace_token) + + return sources + + +def preprocess_llama_2( + sources, + tokenizer: transformers.PreTrainedTokenizer, + has_image: bool = False +) -> Dict: + conv = conversation_lib.default_conversation.copy() + roles = {"human": conv.roles[0], "gpt": conv.roles[1]} + + # Apply prompt templates + conversations = [] + for i, source in enumerate(sources): + if roles[source[0]["from"]] != conv.roles[0]: + # Skip the first one if it is not from human + source = source[1:] + + conv.messages = [] + for j, sentence in enumerate(source): + role = roles[sentence["from"]] + assert role == conv.roles[j % 2], f"{i}" + conv.append_message(role, sentence["value"]) + conversations.append(conv.get_prompt()) + + # Tokenize conversations + + if has_image: + input_ids = torch.stack([tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0) + else: + input_ids = tokenizer( + conversations, + return_tensors="pt", + padding="longest", + max_length=tokenizer.model_max_length, + truncation=True, + ).input_ids + + targets = input_ids.clone() + + assert conv.sep_style == conversation_lib.SeparatorStyle.LLAMA_2 + + # Mask targets + sep = "[/INST] " + for conversation, target in zip(conversations, targets): + total_len = int(target.ne(tokenizer.pad_token_id).sum()) + + rounds = conversation.split(conv.sep2) + cur_len = 1 + target[:cur_len] = IGNORE_INDEX + for i, rou in enumerate(rounds): + if rou == "": + break + + parts = rou.split(sep) + if len(parts) != 2: + break + parts[0] += sep + + if has_image: + round_len = len(tokenizer_image_token(rou, tokenizer)) + instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 2 + else: + round_len = len(tokenizer(rou).input_ids) + instruction_len = len(tokenizer(parts[0]).input_ids) - 2 + + target[cur_len : cur_len + instruction_len] = IGNORE_INDEX + + cur_len += round_len + target[cur_len:] = IGNORE_INDEX + + if cur_len < tokenizer.model_max_length: + if cur_len != total_len: + target[:] = IGNORE_INDEX + print( + f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}." 
+ f" (ignored)" + ) + + return dict( + input_ids=input_ids, + labels=targets, + ) + + +def preprocess_v1( + sources, + tokenizer: transformers.PreTrainedTokenizer, + has_image: bool = False +) -> Dict: + conv = conversation_lib.default_conversation.copy() + roles = {"human": conv.roles[0], "gpt": conv.roles[1]} + + # Apply prompt templates + conversations = [] + for i, source in enumerate(sources): + if roles[source[0]["from"]] != conv.roles[0]: + # Skip the first one if it is not from human + source = source[1:] + + conv.messages = [] + for j, sentence in enumerate(source): + role = roles[sentence["from"]] + assert role == conv.roles[j % 2], f"{i}" + conv.append_message(role, sentence["value"]) + conversations.append(conv.get_prompt()) + + # Tokenize conversations + + if has_image: + input_ids = torch.stack([tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0) + else: + input_ids = tokenizer( + conversations, + return_tensors="pt", + padding="longest", + max_length=tokenizer.model_max_length, + truncation=True, + ).input_ids + + targets = input_ids.clone() + + assert conv.sep_style == conversation_lib.SeparatorStyle.TWO + + # Mask targets + sep = conv.sep + conv.roles[1] + ": " + for conversation, target in zip(conversations, targets): + total_len = int(target.ne(tokenizer.pad_token_id).sum()) + + rounds = conversation.split(conv.sep2) + cur_len = 1 + target[:cur_len] = IGNORE_INDEX + for i, rou in enumerate(rounds): + if rou == "": + break + + parts = rou.split(sep) + if len(parts) != 2: + break + parts[0] += sep + + if has_image: + round_len = len(tokenizer_image_token(rou, tokenizer)) + instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 2 + else: + round_len = len(tokenizer(rou).input_ids) + instruction_len = len(tokenizer(parts[0]).input_ids) - 2 + + if i != 0 and not tokenizer.legacy and IS_TOKENIZER_GREATER_THAN_0_14: + round_len -= 1 + instruction_len -= 1 + + target[cur_len : cur_len + instruction_len] = IGNORE_INDEX + + cur_len += round_len + target[cur_len:] = IGNORE_INDEX + + if cur_len < tokenizer.model_max_length: + if cur_len != total_len: + target[:] = IGNORE_INDEX + print( + f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}." 
+ f" (ignored)" + ) + + return dict( + input_ids=input_ids, + labels=targets, + ) + + +def preprocess_mpt( + sources, + tokenizer: transformers.PreTrainedTokenizer, + has_image: bool = False +) -> Dict: + conv = conversation_lib.default_conversation.copy() + roles = {"human": conv.roles[0], "gpt": conv.roles[1]} + + # Apply prompt templates + conversations = [] + for i, source in enumerate(sources): + if roles[source[0]["from"]] != conv.roles[0]: + # Skip the first one if it is not from human + source = source[1:] + + conv.messages = [] + for j, sentence in enumerate(source): + role = roles[sentence["from"]] + assert role == conv.roles[j % 2], f"{i}" + conv.append_message(role, sentence["value"]) + conversations.append(conv.get_prompt()) + + # Tokenize conversations + + if has_image: + input_ids = torch.stack([tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0) + else: + input_ids = tokenizer( + conversations, + return_tensors="pt", + padding="longest", + max_length=tokenizer.model_max_length, + truncation=True, + ).input_ids + + targets = input_ids.clone() + assert conv.sep_style == conversation_lib.SeparatorStyle.MPT + + # Mask targets + sep = conv.sep + conv.roles[1] + for conversation, target in zip(conversations, targets): + total_len = int(target.ne(tokenizer.pad_token_id).sum()) + + rounds = conversation.split(conv.sep) + re_rounds = [conv.sep.join(rounds[:3])] # system + user + gpt + for conv_idx in range(3, len(rounds), 2): + re_rounds.append(conv.sep.join(rounds[conv_idx:conv_idx+2])) # user + gpt + cur_len = 0 + target[:cur_len] = IGNORE_INDEX + for i, rou in enumerate(re_rounds): + if rou == "": + break + + parts = rou.split(sep) + if len(parts) != 2: + break + parts[0] += sep + + if has_image: + round_len = len(tokenizer_image_token(rou, tokenizer)) + instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 1 + else: + round_len = len(tokenizer(rou).input_ids) + instruction_len = len(tokenizer(parts[0]).input_ids) - 1 + + if i != 0 and getattr(tokenizer, 'legacy', False) and IS_TOKENIZER_GREATER_THAN_0_14: + round_len += 1 + instruction_len += 1 + + target[cur_len : cur_len + instruction_len] = IGNORE_INDEX + + cur_len += round_len + target[cur_len:] = IGNORE_INDEX + + if cur_len < tokenizer.model_max_length: + if cur_len != total_len: + target[:] = IGNORE_INDEX + print( + f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}." 
+ f" (ignored)" + ) + + return dict( + input_ids=input_ids, + labels=targets, + ) + + +def preprocess_plain( + sources: Sequence[str], + tokenizer: transformers.PreTrainedTokenizer, +) -> Dict: + # add end signal and concatenate together + conversations = [] + for source in sources: + assert len(source) == 2 + assert DEFAULT_IMAGE_TOKEN in source[0]['value'] + source[0]['value'] = DEFAULT_IMAGE_TOKEN + conversation = source[0]['value'] + source[1]['value'] + conversation_lib.default_conversation.sep + conversations.append(conversation) + # tokenize conversations + input_ids = [tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations] + targets = copy.deepcopy(input_ids) + for target, source in zip(targets, sources): + tokenized_len = len(tokenizer_image_token(source[0]['value'], tokenizer)) + target[:tokenized_len] = IGNORE_INDEX + + return dict(input_ids=input_ids, labels=targets) + + +def preprocess( + sources: Sequence[str], + tokenizer: transformers.PreTrainedTokenizer, + has_image: bool = False +) -> Dict: + """ + Given a list of sources, each is a conversation list. This transform: + 1. Add signal '### ' at the beginning each sentence, with end signal '\n'; + 2. Concatenate conversations together; + 3. Tokenize the concatenated conversation; + 4. Make a deepcopy as the target. Mask human words with IGNORE_INDEX. + """ + if conversation_lib.default_conversation.sep_style == conversation_lib.SeparatorStyle.PLAIN: + return preprocess_plain(sources, tokenizer) + if conversation_lib.default_conversation.sep_style == conversation_lib.SeparatorStyle.LLAMA_2: + return preprocess_llama_2(sources, tokenizer, has_image=has_image) + if conversation_lib.default_conversation.version.startswith("v1"): + return preprocess_v1(sources, tokenizer, has_image=has_image) + if conversation_lib.default_conversation.version == "mpt": + return preprocess_mpt(sources, tokenizer, has_image=has_image) + # add end signal and concatenate together + conversations = [] + for source in sources: + header = f"{conversation_lib.default_conversation.system}\n\n" + conversation = _add_speaker_and_signal(header, source) + conversations.append(conversation) + # tokenize conversations + def get_tokenize_len(prompts): + return [len(tokenizer_image_token(prompt, tokenizer)) for prompt in prompts] + + if has_image: + input_ids = [tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations] + else: + conversations_tokenized = _tokenize_fn(conversations, tokenizer) + input_ids = conversations_tokenized["input_ids"] + + targets = copy.deepcopy(input_ids) + for target, source in zip(targets, sources): + if has_image: + tokenized_lens = get_tokenize_len([header] + [s["value"] for s in source]) + else: + tokenized_lens = _tokenize_fn([header] + [s["value"] for s in source], tokenizer)["input_ids_lens"] + speakers = [sentence["from"] for sentence in source] + _mask_targets(target, tokenized_lens, speakers) + + return dict(input_ids=input_ids, labels=targets) + + +class LazySupervisedDataset(Dataset): + """Dataset for supervised fine-tuning.""" + + def __init__(self, data_path: str, + tokenizer: transformers.PreTrainedTokenizer, + data_args: DataArguments): + super(LazySupervisedDataset, self).__init__() + list_data_dict = json.load(open(data_path, "r")) + + rank0_print("Formatting inputs...Skip in lazy mode") + self.tokenizer = tokenizer + self.list_data_dict = list_data_dict + self.data_args = data_args + + def __len__(self): + return len(self.list_data_dict) + + 
@property + def lengths(self): + length_list = [] + for sample in self.list_data_dict: + img_tokens = 128 if 'image' in sample else 0 + length_list.append(sum(len(conv['value'].split()) for conv in sample['conversations']) + img_tokens) + return length_list + + @property + def modality_lengths(self): + length_list = [] + for sample in self.list_data_dict: + cur_len = sum(len(conv['value'].split()) for conv in sample['conversations']) + cur_len = cur_len if 'image' in sample else -cur_len + length_list.append(cur_len) + return length_list + + def __getitem__(self, i) -> Dict[str, torch.Tensor]: + sources = self.list_data_dict[i] + if isinstance(i, int): + sources = [sources] + assert len(sources) == 1, "Don't know why it is wrapped to a list" # FIXME + if 'image' in sources[0]: + image_file = self.list_data_dict[i]['image'] + image_folder = self.data_args.image_folder + processor = self.data_args.image_processor + image = Image.open(os.path.join(image_folder, image_file)).convert('RGB') + if self.data_args.image_aspect_ratio == 'pad': + def expand2square(pil_img, background_color): + width, height = pil_img.size + if width == height: + return pil_img + elif width > height: + result = Image.new(pil_img.mode, (width, width), background_color) + result.paste(pil_img, (0, (width - height) // 2)) + return result + else: + result = Image.new(pil_img.mode, (height, height), background_color) + result.paste(pil_img, ((height - width) // 2, 0)) + return result + image = expand2square(image, tuple(int(x*255) for x in processor.image_mean)) + image = processor.preprocess(image, return_tensors='pt')['pixel_values'][0] + else: + image = processor.preprocess(image, return_tensors='pt')['pixel_values'][0] + sources = preprocess_multimodal( + copy.deepcopy([e["conversations"] for e in sources]), + self.data_args) + else: + sources = copy.deepcopy([e["conversations"] for e in sources]) + data_dict = preprocess( + sources, + self.tokenizer, + has_image=('image' in self.list_data_dict[i])) + if isinstance(i, int): + data_dict = dict(input_ids=data_dict["input_ids"][0], + labels=data_dict["labels"][0]) + + # image exist in the data + if 'image' in self.list_data_dict[i]: + data_dict['image'] = image + elif self.data_args.is_multimodal: + # image does not exist in the data, but the model is multimodal + crop_size = self.data_args.image_processor.crop_size + data_dict['image'] = torch.zeros(3, crop_size['height'], crop_size['width']) + return data_dict + + +@dataclass +class DataCollatorForSupervisedDataset(object): + """Collate examples for supervised fine-tuning.""" + + tokenizer: transformers.PreTrainedTokenizer + + def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: + input_ids, labels = tuple([instance[key] for instance in instances] + for key in ("input_ids", "labels")) + input_ids = torch.nn.utils.rnn.pad_sequence( + input_ids, + batch_first=True, + padding_value=self.tokenizer.pad_token_id) + labels = torch.nn.utils.rnn.pad_sequence(labels, + batch_first=True, + padding_value=IGNORE_INDEX) + input_ids = input_ids[:, :self.tokenizer.model_max_length] + labels = labels[:, :self.tokenizer.model_max_length] + batch = dict( + input_ids=input_ids, + labels=labels, + attention_mask=input_ids.ne(self.tokenizer.pad_token_id), + ) + + if 'image' in instances[0]: + images = [instance['image'] for instance in instances] + if all(x is not None and x.shape == images[0].shape for x in images): + batch['images'] = torch.stack(images) + else: + batch['images'] = images + + return batch + + +def 
make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer, + data_args) -> Dict: + """Make dataset and collator for supervised fine-tuning.""" + train_dataset = LazySupervisedDataset(tokenizer=tokenizer, + data_path=data_args.data_path, + data_args=data_args) + data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer) + return dict(train_dataset=train_dataset, + eval_dataset=None, + data_collator=data_collator) + + +def eval_disc(attn_implementation=None): + global local_rank + + parser = transformers.HfArgumentParser( + (ModelArguments, DataArguments, TrainingArguments)) + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + local_rank = training_args.local_rank + compute_dtype = (torch.float16 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32)) + + bnb_model_from_pretrained_args = {} + if training_args.bits in [4, 8]: + from transformers import BitsAndBytesConfig + bnb_model_from_pretrained_args.update(dict( + device_map={"": training_args.device}, + load_in_4bit=training_args.bits == 4, + load_in_8bit=training_args.bits == 8, + quantization_config=BitsAndBytesConfig( + load_in_4bit=training_args.bits == 4, + load_in_8bit=training_args.bits == 8, + llm_int8_skip_modules=["mm_projector"], + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=compute_dtype, + bnb_4bit_use_double_quant=training_args.double_quant, + bnb_4bit_quant_type=training_args.quant_type # {'fp4', 'nf4'} + ) + )) + + if model_args.vision_tower is not None: + if 'mpt' in model_args.model_name_or_path: + config = transformers.AutoConfig.from_pretrained(model_args.model_name_or_path, trust_remote_code=True) + config.attn_config['attn_impl'] = training_args.mpt_attn_impl + model = LlavaMptForCausalLM.from_pretrained( + model_args.model_name_or_path, + config=config, + cache_dir=training_args.cache_dir, + **bnb_model_from_pretrained_args + ) + else: + model = LlavaLlamaForCausalLM.from_pretrained( + model_args.model_name_or_path, + cache_dir=training_args.cache_dir, + attn_implementation=attn_implementation, + torch_dtype=(torch.bfloat16 if training_args.bf16 else None), + **bnb_model_from_pretrained_args + ) + else: + model = transformers.LlamaForCausalLM.from_pretrained( + model_args.model_name_or_path, + cache_dir=training_args.cache_dir, + attn_implementation=attn_implementation, + torch_dtype=(torch.bfloat16 if training_args.bf16 else None), + **bnb_model_from_pretrained_args + ) + model.config.use_cache = False + + if model_args.freeze_backbone: + model.model.requires_grad_(False) + + if training_args.bits in [4, 8]: + from peft import prepare_model_for_kbit_training + model.config.torch_dtype=(torch.float32 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32)) + model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=training_args.gradient_checkpointing) + + if training_args.gradient_checkpointing: + if hasattr(model, "enable_input_require_grads"): + model.enable_input_require_grads() + else: + def make_inputs_require_grad(module, input, output): + output.requires_grad_(True) + model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) + + if training_args.lora_enable: + from peft import LoraConfig, get_peft_model + lora_config = LoraConfig( + r=training_args.lora_r, + lora_alpha=training_args.lora_alpha, + target_modules=find_all_linear_names(model), + lora_dropout=training_args.lora_dropout, + bias=training_args.lora_bias, + 
task_type="CAUSAL_LM", + ) + if training_args.bits == 16: + if training_args.bf16: + model.to(torch.bfloat16) + if training_args.fp16: + model.to(torch.float16) + rank0_print("Adding LoRA adapters...") + model = get_peft_model(model, lora_config) + + if 'mpt' in model_args.model_name_or_path: + tokenizer = transformers.AutoTokenizer.from_pretrained( + model_args.model_name_or_path, + cache_dir=training_args.cache_dir, + model_max_length=training_args.model_max_length, + padding_side="right" + ) + else: + tokenizer = transformers.AutoTokenizer.from_pretrained( + model_args.model_name_or_path, + cache_dir=training_args.cache_dir, + model_max_length=training_args.model_max_length, + padding_side="right", + use_fast=False, + ) + + if model_args.version == "v0": + if tokenizer.pad_token is None: + smart_tokenizer_and_embedding_resize( + special_tokens_dict=dict(pad_token="[PAD]"), + tokenizer=tokenizer, + model=model, + ) + elif model_args.version == "v0.5": + tokenizer.pad_token = tokenizer.unk_token + else: + tokenizer.pad_token = tokenizer.unk_token + if model_args.version in conversation_lib.conv_templates: + conversation_lib.default_conversation = conversation_lib.conv_templates[model_args.version] + else: + conversation_lib.default_conversation = conversation_lib.conv_templates["vicuna_v1"] + + if model_args.vision_tower is not None: + model.get_model().initialize_vision_modules( + model_args=model_args, + fsdp=training_args.fsdp + ) + + vision_tower = model.get_vision_tower() + vision_tower.to(dtype=torch.bfloat16 if training_args.bf16 else torch.float16, device=training_args.device) + + data_args.image_processor = vision_tower.image_processor + data_args.is_multimodal = True + + model.config.image_aspect_ratio = data_args.image_aspect_ratio + model.config.tokenizer_padding_side = tokenizer.padding_side + model.config.tokenizer_model_max_length = tokenizer.model_max_length + + model.config.tune_mm_mlp_adapter = training_args.tune_mm_mlp_adapter = model_args.tune_mm_mlp_adapter + if model_args.tune_mm_mlp_adapter: + model.requires_grad_(False) + for p in model.get_model().mm_projector.parameters(): + p.requires_grad = True + + model.config.freeze_mm_mlp_adapter = training_args.freeze_mm_mlp_adapter + if training_args.freeze_mm_mlp_adapter: + for p in model.get_model().mm_projector.parameters(): + p.requires_grad = False + + if training_args.bits in [4, 8]: + model.get_model().mm_projector.to(dtype=compute_dtype, device=training_args.device) + + model.config.mm_use_im_start_end = data_args.mm_use_im_start_end = model_args.mm_use_im_start_end + model.config.mm_projector_lr = training_args.mm_projector_lr + training_args.use_im_start_end = model_args.mm_use_im_start_end + model.config.mm_use_im_patch_token = model_args.mm_use_im_patch_token + model.initialize_vision_tokenizer(model_args, tokenizer=tokenizer) + + if training_args.bits in [4, 8]: + from peft.tuners.lora import LoraLayer + for name, module in model.named_modules(): + if isinstance(module, LoraLayer): + if training_args.bf16: + module = module.to(torch.bfloat16) + if 'norm' in name: + module = module.to(torch.float32) + if 'lm_head' in name or 'embed_tokens' in name: + if hasattr(module, 'weight'): + if training_args.bf16 and module.weight.dtype == torch.float32: + module = module.to(torch.bfloat16) + + data_module = make_supervised_data_module(tokenizer=tokenizer, + data_args=data_args) + + model.to("cuda") + trainer = LLaVATrainer(model=model, + tokenizer=tokenizer, + args=training_args, + **data_module) + + saved_model = 
Discriminator(5120) + + state_dict = torch.load("/home/smirrashidi/LLaVAFork/trained_disc.pt", weights_only=True) + + state_dict["fc1.weight"] = state_dict["fc1.lora_B.default.weight"] + state_dict["fc2.weight"] = state_dict["fc2.base_layer.weight"] + state_dict["fc1.bias"] = torch.zeros(state_dict["fc1.weight"].shape[0]) + state_dict["fc2.bias"] = torch.zeros(state_dict["fc2.weight"].shape[0]) + + keys_to_remove = [ + "fc1.base_layer.weight", "fc1.base_layer.bias", + "fc1.lora_A.default.weight", "fc1.lora_B.default.weight", + "fc2.base_layer.weight", "fc2.base_layer.bias", + "fc2.lora_A.default.weight", "fc2.lora_B.default.weight" + ] + + for key in keys_to_remove: + if key in state_dict: + del state_dict[key] + + saved_model.load_state_dict(state_dict, strict=True) + + trainer.evaluate(data_module["train_dataset"]) + + if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")): + trainer.train(resume_from_checkpoint=True) + else: + trainer.train() + trainer.save_state() + + model.config.use_cache = True + + if training_args.lora_enable: + state_dict = get_peft_state_maybe_zero_3( + model.named_parameters(), training_args.lora_bias + ) + non_lora_state_dict = get_peft_state_non_lora_maybe_zero_3( + model.named_parameters() + ) + if training_args.local_rank == 0 or training_args.local_rank == -1: + model.config.save_pretrained(training_args.output_dir) + model.save_pretrained(training_args.output_dir, state_dict=state_dict) + torch.save(non_lora_state_dict, os.path.join(training_args.output_dir, 'non_lora_trainables.bin')) + else: + safe_save_model_for_hf_trainer(trainer=trainer, + output_dir=training_args.output_dir) + + +if __name__ == "__main__": + eval_disc(attn_implementation="flash_attention_2") diff --git a/llava/VLLMSafety/test_discrim.py b/llava/VLLMSafety/test_discrim.py new file mode 100644 index 000000000..ed59080cf --- /dev/null +++ b/llava/VLLMSafety/test_discrim.py @@ -0,0 +1,97 @@ +import argparse +import torch +import os +import json +from tqdm import tqdm +import shortuuid + +from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN +from llava.conversation import conv_templates, SeparatorStyle +from llava.model.builder import load_pretrained_model +from llava.utils import disable_torch_init +from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path +from llava.train.train import LazySupervisedDataset, DataCollatorForSupervisedDataset, make_supervised_data_module + +from PIL import Image +import math +from llava.VLLMSafety.discriminator import Discriminator + + +def split_list(lst, n): + """Split a list into n (roughly) equal-sized chunks""" + chunk_size = math.ceil(len(lst) / n) # integer division + return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)] + + +def get_chunk(lst, n, k): + chunks = split_list(lst, n) + return chunks[k] + +def eval_model(args): + # Model + disable_torch_init() + model_path = os.path.expanduser(args.model_path) + model_name = get_model_name_from_path(model_path) + tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) + + total = 0 + num_img_correct= 0 + num_lang_correct = 0 + + saved_model = Discriminator(5120) + saved_model.load_state_dict(torch.load("/home/smirrashidi/LLaVAFork/trained_disc_copy"), strict=False) + + model.discriminator = saved_model + + + with torch.inference_mode(): + discrim_dict = model.forward_eval_discrim( + input_ids, + 
images=image_tensor.unsqueeze(0).half().cuda(), + image_sizes=[image.size], + do_sample=True if args.temperature > 0 else False, + temperature=args.temperature, + top_p=args.top_p, + num_beams=args.num_beams, + # no_repeat_ngram_size=3, + max_new_tokens=1024, + use_cache=True, + saved_model = saved_model + ) + + losses.append(discrim_dict["loss"]) + img_is_correct = discrim_dict["img_is_correct"] + lang_is_correct = discrim_dict["lang_is_correct"] + + if img_is_correct == True: + num_img_correct += 1 + + if lang_is_correct == True: + num_lang_correct += 1 + + total += 1 + + img_accuracy = num_img_correct / total + lang_accuracy = num_lang_correct / total + + print(f'Image Accuracy: {img_accuracy} \n Lang Accuracy: {lang_accuracy} \n') + + return losses + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model-path", type=str, default="facebook/opt-350m") + parser.add_argument("--model-base", type=str, default=None) + parser.add_argument("--image-folder", type=str, default="") + parser.add_argument("--question-file", type=str, default="tables/question.jsonl") + parser.add_argument("--answers-file", type=str, default="answer.jsonl") + parser.add_argument("--conv-mode", type=str, default="llava_v1") + parser.add_argument("--num-chunks", type=int, default=1) + parser.add_argument("--chunk-idx", type=int, default=0) + parser.add_argument("--temperature", type=float, default=0.2) + parser.add_argument("--top_p", type=float, default=None) + parser.add_argument("--num_beams", type=int, default=1) + args = parser.parse_args() + + eval_model(args) + From 43c28445c7513c6e76cf3656b4b2fcb3c7e289c1 Mon Sep 17 00:00:00 2001 From: smirrashidi Date: Wed, 11 Sep 2024 06:58:49 +0000 Subject: [PATCH 30/41] tentative eval --- llava/VLLMSafety/discriminator.py | 37 ++- llava/VLLMSafety/evaluate_disc.py | 79 +++--- llava/VLLMSafety/test_discrim.py | 77 +++--- llava/model/language_model/llava_llama.py | 73 +++++- llava/train/llava_trainer.py | 299 +++++++++++++++++++++- scripts/v1_5/finetune.sh | 6 +- scripts/v1_5/finetune_lora.sh | 15 +- 7 files changed, 482 insertions(+), 104 deletions(-) diff --git a/llava/VLLMSafety/discriminator.py b/llava/VLLMSafety/discriminator.py index 1f84e0739..6b04aa689 100644 --- a/llava/VLLMSafety/discriminator.py +++ b/llava/VLLMSafety/discriminator.py @@ -3,10 +3,11 @@ import torch.optim as optim import torch.nn.functional as F import numpy as np +import json import random class Discriminator(nn.Module): - def __init__(self, input_size, classes): + def __init__(self, input_size): super().__init__() # we can add more layers later self.fc1 = nn.Linear(input_size, 50) @@ -31,18 +32,46 @@ def forward(self, data, d_mode): img_pred = self.linear(img_tok) # BCE expects output from a sigmoid (i think) lang_pred = self.linear(lang_tok) + img_pred_binary = torch.ge(img_pred, 0.5).float().to(torch.bfloat16) + lang_pred_binary = torch.ge(lang_pred, 0.5).float().to(torch.bfloat16) + if d_mode == True: img_label = torch.full((img_tok.size(0), 1), 1, dtype=torch.bfloat16, device=device) # 1 for images lang_label = torch.full((lang_tok.size(0), 1), 0, dtype=torch.bfloat16, device=device) # 0 for lang + img_loss = loss_function(img_pred_binary, img_label) + lang_loss = loss_function(lang_pred_binary, lang_label) + img_loss = loss_function(img_pred, img_label) lang_loss = loss_function(lang_pred, lang_label) - - return img_loss + lang_loss # returning both losses to train disc + loss = img_loss + lang_loss + + img_is_correct = torch.eq(img_pred_binary, 
img_label) + + lang_is_correct = torch.eq(lang_pred_binary, lang_label) + + return_dict = { + "loss": loss, + "img_is_correct" : img_is_correct, + "lang_is_correct": lang_is_correct, + } + + json_dict = { + "num_img_corr": return_dict["img_is_correct"].sum().item(), + "num_lang_corr": return_dict["lang_is_correct"].sum().item(), + "img_total": return_dict["img_is_correct"].size(0), + "lang_total": return_dict["img_is_correct"].size(0) + } + + with open("/home/smirrashidi/return_dict.json", "a") as json_file: + json.dump(json_dict, json_file) + json_file.write("\n") + + return return_dict else: lang_label = torch.full((img_tok.size(0), 1), 0, dtype=torch.bfloat16, device=device) # 0 for lang - img_with_lang_label_loss = loss_function(img_pred, lang_label) # trying to follow DCGAN + img_with_lang_label_loss = loss_function(img_pred_binary, lang_label) # trying to follow DCGAN return img_with_lang_label_loss # returning image loss to maximize disc loss when training generator diff --git a/llava/VLLMSafety/evaluate_disc.py b/llava/VLLMSafety/evaluate_disc.py index 37783b4f5..669f446b6 100644 --- a/llava/VLLMSafety/evaluate_disc.py +++ b/llava/VLLMSafety/evaluate_disc.py @@ -37,8 +37,6 @@ from PIL import Image -from llava.VLLMSafety.discriminator import Discriminator # REMOVE ME - local_rank = None @@ -113,6 +111,11 @@ class TrainingArguments(transformers.TrainingArguments): mm_projector_lr: Optional[float] = None group_by_modality_length: bool = field(default=False) +@dataclass +class DiscArguments: + test_data_path: Optional[str] = field(default="/home/smirrashidi/coco_data/coco_test_conversations.json") + test_image_folder:Optional[str] = field(default="/home/smirrashidi/coco_data/coco_test") + def maybe_zero_3(param, ignore_status=False, name=None): from deepspeed import zero @@ -776,23 +779,36 @@ def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer, - data_args) -> Dict: + data_args, disc_args, testing) -> Dict: """Make dataset and collator for supervised fine-tuning.""" - train_dataset = LazySupervisedDataset(tokenizer=tokenizer, + + if testing == True: + data_args.image_folder = disc_args.test_image_folder + data_args.data_path = disc_args.test_data_path + + train_dataset = LazySupervisedDataset(tokenizer=tokenizer, + data_path=data_args.data_path, + data_args=data_args) + + data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer) + + else: + train_dataset = LazySupervisedDataset(tokenizer=tokenizer, data_path=data_args.data_path, data_args=data_args) - data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer) + data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer) + return dict(train_dataset=train_dataset, - eval_dataset=None, - data_collator=data_collator) + eval_dataset=None, + data_collator=data_collator) -def eval_disc(attn_implementation=None): +def train(attn_implementation=None): global local_rank parser = transformers.HfArgumentParser( - (ModelArguments, DataArguments, TrainingArguments)) - model_args, data_args, training_args = parser.parse_args_into_dataclasses() + (ModelArguments, DataArguments, TrainingArguments, DiscArguments)) + model_args, data_args, training_args, disc_args = parser.parse_args_into_dataclasses() local_rank = training_args.local_rank compute_dtype = (torch.float16 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32)) @@ -959,43 +975,34 @@ def make_inputs_require_grad(module, input, 
output): module = module.to(torch.bfloat16) data_module = make_supervised_data_module(tokenizer=tokenizer, - data_args=data_args) + data_args=data_args, + disc_args=disc_args, + testing = False) model.to("cuda") trainer = LLaVATrainer(model=model, tokenizer=tokenizer, args=training_args, **data_module) - - saved_model = Discriminator(5120) - - state_dict = torch.load("/home/smirrashidi/LLaVAFork/trained_disc.pt", weights_only=True) - - state_dict["fc1.weight"] = state_dict["fc1.lora_B.default.weight"] - state_dict["fc2.weight"] = state_dict["fc2.base_layer.weight"] - state_dict["fc1.bias"] = torch.zeros(state_dict["fc1.weight"].shape[0]) - state_dict["fc2.bias"] = torch.zeros(state_dict["fc2.weight"].shape[0]) - - keys_to_remove = [ - "fc1.base_layer.weight", "fc1.base_layer.bias", - "fc1.lora_A.default.weight", "fc1.lora_B.default.weight", - "fc2.base_layer.weight", "fc2.base_layer.bias", - "fc2.lora_A.default.weight", "fc2.lora_B.default.weight" - ] - - for key in keys_to_remove: - if key in state_dict: - del state_dict[key] - - saved_model.load_state_dict(state_dict, strict=True) - - trainer.evaluate(data_module["train_dataset"]) if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")): trainer.train(resume_from_checkpoint=True) else: trainer.train() trainer.save_state() + trainer.save_model() + + test_data_module = make_supervised_data_module(tokenizer=tokenizer, + data_args=data_args, + disc_args=disc_args, + testing = True) + + _, eval_dict = trainer.evaluate(test_data_module["train_dataset"]) + + print(eval_dict) + + with open("/home/smirrashidi/test_discrim.json", "w") as json_file: + json.dump(eval_dict, json_file, indent=4) model.config.use_cache = True @@ -1016,4 +1023,4 @@ def make_inputs_require_grad(module, input, output): if __name__ == "__main__": - eval_disc(attn_implementation="flash_attention_2") + train(attn_implementation="flash_attention_2") diff --git a/llava/VLLMSafety/test_discrim.py b/llava/VLLMSafety/test_discrim.py index ed59080cf..d8410cc2f 100644 --- a/llava/VLLMSafety/test_discrim.py +++ b/llava/VLLMSafety/test_discrim.py @@ -4,17 +4,20 @@ import json from tqdm import tqdm import shortuuid +from torch.utils.data import DataLoader from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN from llava.conversation import conv_templates, SeparatorStyle from llava.model.builder import load_pretrained_model from llava.utils import disable_torch_init from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path -from llava.train.train import LazySupervisedDataset, DataCollatorForSupervisedDataset, make_supervised_data_module +from llava.train.train import LazySupervisedDataset, DataCollatorForSupervisedDataset, DataArguments, TrainingArguments, make_supervised_data_module +from llava.train.llava_trainer import LLaVATrainer from PIL import Image import math from llava.VLLMSafety.discriminator import Discriminator +from llava.train.train import * def split_list(lst, n): @@ -28,55 +31,39 @@ def get_chunk(lst, n, k): return chunks[k] def eval_model(args): - # Model - disable_torch_init() - model_path = os.path.expanduser(args.model_path) - model_name = get_model_name_from_path(model_path) - tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) - - total = 0 - num_img_correct= 0 - num_lang_correct = 0 - - saved_model = Discriminator(5120) - 
saved_model.load_state_dict(torch.load("/home/smirrashidi/LLaVAFork/trained_disc_copy"), strict=False) - - model.discriminator = saved_model + model_path = os.path.expanduser(args.model_path) + model_name = get_model_name_from_path(model_path) + tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) + model_args = ModelArguments(model_name_or_path = args.model_path) + data_args = DataArguments(data_path = args.question_file, + image_folder = args.image_folder) + + training_args = TrainingArguments(output_dir="/home/smirrashidi/dump") + model = LlavaLlamaForCausalLM.from_pretrained( + model_args.model_name_or_path, + cache_dir=training_args.cache_dir, + attn_implementation=None, + torch_dtype=(torch.bfloat16 if training_args.bf16 else None) + ) + + total = 0 + num_img_correct= 0 + num_lang_correct = 0 + test_data = LazySupervisedDataset(tokenizer=tokenizer, + data_path=data_args.data_path, + data_args=data_args) - with torch.inference_mode(): - discrim_dict = model.forward_eval_discrim( - input_ids, - images=image_tensor.unsqueeze(0).half().cuda(), - image_sizes=[image.size], - do_sample=True if args.temperature > 0 else False, - temperature=args.temperature, - top_p=args.top_p, - num_beams=args.num_beams, - # no_repeat_ngram_size=3, - max_new_tokens=1024, - use_cache=True, - saved_model = saved_model - ) - - losses.append(discrim_dict["loss"]) - img_is_correct = discrim_dict["img_is_correct"] - lang_is_correct = discrim_dict["lang_is_correct"] + data_module = make_supervised_data_module(tokenizer=tokenizer, + data_args=data_args) - if img_is_correct == True: - num_img_correct += 1 + trainer = LLaVATrainer(model=model, + tokenizer=tokenizer, + args=training_args, + **data_module) - if lang_is_correct == True: - num_lang_correct += 1 - - total += 1 - - img_accuracy = num_img_correct / total - lang_accuracy = num_lang_correct / total - - print(f'Image Accuracy: {img_accuracy} \n Lang Accuracy: {lang_accuracy} \n') + trainer.evaluate(eval_dataset=test_data) - return losses if __name__ == "__main__": parser = argparse.ArgumentParser() diff --git a/llava/model/language_model/llava_llama.py b/llava/model/language_model/llava_llama.py index 6c6444ef0..ff8848984 100644 --- a/llava/model/language_model/llava_llama.py +++ b/llava/model/language_model/llava_llama.py @@ -52,7 +52,7 @@ def __init__(self, config): "image": None, "lang": None, } - self.discriminator = Discriminator(5120, 2) # hard coding in sizes for now + self.discriminator = Discriminator(5120) # hard coding in sizes for now # Initialize weights and apply final processing self.post_init() @@ -74,8 +74,27 @@ def forward( images: Optional[torch.FloatTensor] = None, image_sizes: Optional[List[List[int]]] = None, return_dict: Optional[bool] = None, - d_mode: Optional[bool] = False # False means run without discriminator + d_mode: Optional[bool] = False, # False means run without discriminator + eval_disc: Optional[bool] = False ) -> Union[Tuple, CausalLMOutputWithPast]: + + if eval_disc == True: + return self.forward_eval_discrim( + input_ids, + attention_mask, + position_ids, + past_key_values, + inputs_embeds, + labels, + use_cache, + output_attentions, + output_hidden_states, + images, + image_sizes, + return_dict, + d_mode, + eval_disc + ) if inputs_embeds is None: ( @@ -94,9 +113,11 @@ def forward( images, image_sizes ) + + d_mode = True # REMOVE WHEN NOT TESTING DISC if d_mode == True: - d_loss = self.discriminator.forward(self.disc_data, d_mode=True) # d loss is sum of disc 
loss on images and lang + discrim_dict = self.discriminator.forward(self.disc_data, d_mode=True) # d loss is sum of disc loss on images and lang model_output = super().forward( input_ids=input_ids, attention_mask=attention_mask, @@ -109,12 +130,14 @@ def forward( output_hidden_states=output_hidden_states, return_dict=return_dict ) + + d_loss = discrim_dict["loss"] model_output.loss = d_loss # returning only discriminator loss return model_output else: - d_loss = self.discriminator.forward(self.disc_data, d_mode=False) # d loss is disc loss on the image toke with lang labels (following DCGAN) + discrim_dict = self.discriminator.forward(self.disc_data, d_mode=True) # d loss is sum of disc loss on images and lang model_output = super().forward( input_ids=input_ids, attention_mask=attention_mask, @@ -127,10 +150,52 @@ def forward( output_hidden_states=output_hidden_states, return_dict=return_dict ) + + d_loss = discrim_dict["loss"] model_output.loss = model_output.loss + d_loss # returning sum of model and discriminator loss return model_output + + def forward_eval_discrim( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + images: Optional[torch.FloatTensor] = None, + image_sizes: Optional[List[List[int]]] = None, + return_dict: Optional[bool] = None, + d_mode: Optional[bool] = True, # False means run without discriminator + eval_disc: Optional[bool] = True + ): + + if inputs_embeds is None: + ( + input_ids, + position_ids, + attention_mask, + past_key_values, + inputs_embeds, + labels + ) = self.prepare_inputs_labels_for_multimodal( + input_ids, + position_ids, + attention_mask, + past_key_values, + labels, + images, + image_sizes + ) + + discrim_dict = self.discriminator.forward(self.disc_data, d_mode=True) # d loss is sum of disc loss on images and lang + + return discrim_dict @torch.no_grad() def generate( diff --git a/llava/train/llava_trainer.py b/llava/train/llava_trainer.py index b7894ee79..e261b5155 100644 --- a/llava/train/llava_trainer.py +++ b/llava/train/llava_trainer.py @@ -7,20 +7,27 @@ import time import deepspeed import sys +import json -from torch.utils.data import Sampler -from transformers.trainer_utils import SchedulerType -#from accelerate.data_loader import SeedableRandomSampler +from typing import Dict, Optional, Union, List, Any, Tuple + +from torch.utils.data import Sampler, Dataset, DataLoader +from transformers.trainer_utils import SchedulerType, PredictionOutput, EvalLoopOutput +from llava.VLLMSafety.discriminator import Discriminator from transformers.integrations.deepspeed import deepspeed_init, deepspeed_load_checkpoint from transformers.optimization import get_scheduler +from transformers.modeling_utils import unwrap_model +from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING_NAMES from transformers import Trainer +from transformers.trainer_pt_utils import nested_detach from transformers.trainer import ( is_sagemaker_mp_enabled, get_parameter_names, has_length, get_model_param_count, speed_metrics, + _is_peft_model, get_dataloader_sampler, hp_params, skip_first_batches, @@ -164,6 +171,12 @@ class LLaVATrainer(Trainer): def __init__(self, *args, 
**kwargs): super().__init__(*args, **kwargs) + self.eval_disc_data = { + "num_img_corr": 0, + "num_lang_corr": 0, + "img_total": 0, + "lang_total": 0, + } def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]: if self.train_dataset is None or not has_length(self.train_dataset): @@ -595,6 +608,7 @@ def _inner_training_loop( step = -1 for step, inputs in enumerate(epoch_iterator): inputs['d_mode'] = True if (step % 2 == 0) else False # set d_mode + inputs["eval_disc"] = None # to avoid errors total_batched_samples += 1 if self.args.include_num_input_tokens_seen: @@ -675,11 +689,12 @@ def _inner_training_loop( ) # Optimizer step - + + inputs["d_mode"] = True # REMOVE WHEN NOT TESTING DISC if inputs["d_mode"] == True: self.d_optimizer.step() model.module.base_model.model.discriminator.zero_grad() - + else: self.optimizer.step() @@ -698,6 +713,8 @@ def _inner_training_loop( else: self.control = self.callback_handler.on_substep_end(args, self.state, self.control) + torch.cuda.empty_cache() + if self.control.should_epoch_stop or self.control.should_training_stop: break if step < 0: @@ -780,4 +797,274 @@ def _inner_training_loop( if self.neftune_noise_alpha is not None: self._deactivate_neftune(self.model) - return TrainOutput(self.state.global_step, train_loss, metrics) \ No newline at end of file + return TrainOutput(self.state.global_step, train_loss, metrics) + + def compute_loss(self, model, inputs, return_outputs=False): + """ + How the loss is computed by Trainer. By default, all models return the loss in the first element. + + Subclass and override for custom behavior. + """ + if inputs["eval_disc"] == True: + discrim_dict = model(**inputs) + print("running compute loss") + + # print(discrim_dict) + + # print( + # "img is correct", discrim_dict["img_is_correct"].sum().item(), + # "lang is correct", discrim_dict["lang_is_correct"].sum().item(), + # "img_total", discrim_dict["img_is_correct"].size(0), + # "lang_total", discrim_dict["img_is_correct"].size(0)) + + self.eval_disc_data["num_img_corr"] += discrim_dict["img_is_correct"].sum().item() + self.eval_disc_data["num_lang_corr"] += discrim_dict["lang_is_correct"].sum().item() + self.eval_disc_data["img_total"] += discrim_dict["img_is_correct"].size(0) + self.eval_disc_data["lang_total"] += discrim_dict["img_is_correct"].size(0) + + json_dict = { + "num_img_corr": discrim_dict["img_is_correct"].sum().item(), + "num_lang_corr": discrim_dict["lang_is_correct"].sum().item(), + "img_total": discrim_dict["img_is_correct"].size(0), + "lang_total": discrim_dict["img_is_correct"].size(0) + } + + with open("/home/smirrashidi/return_dict.json", "a") as json_file: + json.dump("eval", json_file) + json_file.write("\n") + json.dump(json_dict, json_file) + json_file.write("\n") + + if self.label_smoother is not None and "labels" in inputs: + labels = inputs.pop("labels") + else: + labels = None + outputs = model(**inputs) + # Save past state if it exists + # TODO: this needs to be fixed and made cleaner later. 
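# Illustrative sketch, not part of the diff above: the alternating update the modified
# _inner_training_loop performs, shown on stand-in modules. Even steps train the
# discriminator (d_optimizer.step(), labels image=1 / language=0); odd steps train the
# generator side, which tries to get image tokens scored as language, as in DCGAN.
# nn.Linear stand-ins and BCEWithLogitsLoss (sigmoid folded in) replace the real
# projector/LLM and the Discriminator's sigmoid + BCELoss; shapes are arbitrary.
import torch
import torch.nn as nn

gen = nn.Linear(8, 8)    # stand-in for the trainable projector / LLM side
disc = nn.Linear(8, 1)   # stand-in discriminator head
g_opt = torch.optim.AdamW(gen.parameters(), lr=1e-4)
d_opt = torch.optim.AdamW(disc.parameters(), lr=1e-4)
bce = nn.BCEWithLogitsLoss()

for step in range(4):
    g_opt.zero_grad()
    d_opt.zero_grad()
    d_mode = (step % 2 == 0)                  # same rule as inputs['d_mode'] above
    img_tok = gen(torch.randn(16, 8))         # "image tokens" after the projector
    lang_tok = torch.randn(16, 8)             # "language tokens" from the embedding table
    if d_mode:
        # discriminator step: real labels (image=1, language=0), generator frozen
        loss = bce(disc(img_tok.detach()), torch.ones(16, 1)) \
             + bce(disc(lang_tok), torch.zeros(16, 1))
        loss.backward()
        d_opt.step()
    else:
        # generator step: maximize confusion by labeling image tokens as language
        loss = bce(disc(img_tok), torch.zeros(16, 1))
        loss.backward()
        g_opt.step()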
+ if self.args.past_index >= 0: + self._past = outputs[self.args.past_index] + + unwrapped_model = unwrap_model(model) + if _is_peft_model(unwrapped_model): + model_name = unwrapped_model.base_model.model._get_name() + else: + model_name = unwrapped_model._get_name() + if model_name in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): + loss = self.label_smoother(outputs, labels, shift_labels=True) + else: + loss = self.label_smoother(outputs, labels) + else: + if isinstance(outputs, dict) and "loss" not in outputs: + raise ValueError( + "The model did not return a loss from the inputs, only the following keys: " + f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}." + ) + # We don't use .loss here since the model may return tuples instead of ModelOutput. + loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] + + return (loss, outputs) if return_outputs else loss + + def evaluate( + self, + eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None, + ignore_keys: Optional[List[str]] = None, + metric_key_prefix: str = "eval", + ) -> Dict[str, float]: + """ + Run evaluation and returns metrics. + + The calling script will be responsible for providing a method to compute metrics, as they are task-dependent + (pass it to the init `compute_metrics` argument). + + You can also subclass and override this method to inject custom behavior. + + Args: + eval_dataset (Union[`Dataset`, Dict[str, `Dataset`]), *optional*): + Pass a dataset if you wish to override `self.eval_dataset`. If it is a [`~datasets.Dataset`], columns + not accepted by the `model.forward()` method are automatically removed. If it is a dictionary, it will + evaluate on each dataset, prepending the dictionary key to the metric name. Datasets must implement the + `__len__` method. + + + + If you pass a dictionary with names of datasets as keys and datasets as values, evaluate will run + separate evaluations on each dataset. This can be useful to monitor how training affects other + datasets or simply to get a more fine-grained evaluation. + When used with `load_best_model_at_end`, make sure `metric_for_best_model` references exactly one + of the datasets. If you, for example, pass in `{"data1": data1, "data2": data2}` for two datasets + `data1` and `data2`, you could specify `metric_for_best_model="eval_data1_loss"` for using the + loss on `data1` and `metric_for_best_model="eval_data1_loss"` for the loss on `data2`. + + + + ignore_keys (`List[str]`, *optional*): + A list of keys in the output of your model (if it is a dictionary) that should be ignored when + gathering predictions. + metric_key_prefix (`str`, *optional*, defaults to `"eval"`): + An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named + "eval_bleu" if the prefix is "eval" (default) + + Returns: + A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The + dictionary also contains the epoch number which comes from the training state. 
+ """ + # handle multipe eval datasets + eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset + if isinstance(eval_dataset, dict): + metrics = {} + for eval_dataset_name, _eval_dataset in eval_dataset.items(): + dataset_metrics = self.evaluate( + eval_dataset=_eval_dataset, + ignore_keys=ignore_keys, + metric_key_prefix=f"{metric_key_prefix}_{eval_dataset_name}", + ) + metrics.update(dataset_metrics) + return metrics + + # memory metrics - must set up as early as possible + self._memory_tracker.start() + + eval_dataloader = self.get_eval_dataloader(eval_dataset) + start_time = time.time() + + eval_loop = self.evaluation_loop + output = eval_loop( + eval_dataloader, + description="Evaluation", + # No point gathering the predictions if there are no metrics, otherwise we defer to + # self.args.prediction_loss_only + prediction_loss_only=True if self.compute_metrics is None else None, + ignore_keys=ignore_keys, + metric_key_prefix=metric_key_prefix, + ) + + total_batch_size = self.args.eval_batch_size * self.args.world_size + if f"{metric_key_prefix}_jit_compilation_time" in output.metrics: + start_time += output.metrics[f"{metric_key_prefix}_jit_compilation_time"] + output.metrics.update( + speed_metrics( + metric_key_prefix, + start_time, + num_samples=output.num_samples, + num_steps=math.ceil(output.num_samples / total_batch_size), + ) + ) + + self.log(output.metrics) + + if DebugOption.TPU_METRICS_DEBUG in self.args.debug: + # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.) + xm.master_print(met.metrics_report()) + + self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, output.metrics) + + self._memory_tracker.stop_and_update_metrics(output.metrics) + + return output.metrics, self.eval_disc_data + + def prediction_step( + self, + model: nn.Module, + inputs: Dict[str, Union[torch.Tensor, Any]], + prediction_loss_only: bool, + ignore_keys: Optional[List[str]] = None, + ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: + """ + Perform an evaluation step on `model` using `inputs`. + + Subclass and override to inject custom behavior. + + Args: + model (`nn.Module`): + The model to evaluate. + inputs (`Dict[str, Union[torch.Tensor, Any]]`): + The inputs and targets of the model. + + The dictionary will be unpacked before being fed to the model. Most models expect the targets under the + argument `labels`. Check your model's documentation for all accepted arguments. + prediction_loss_only (`bool`): + Whether or not to return the loss only. + ignore_keys (`List[str]`, *optional*): + A list of keys in the output of your model (if it is a dictionary) that should be ignored when + gathering predictions. + + Return: + Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, + logits and labels (each being optional). + """ + has_labels = False if len(self.label_names) == 0 else all(inputs.get(k) is not None for k in self.label_names) + # For CLIP-like models capable of returning loss values. + # If `return_loss` is not specified or being `None` in `inputs`, we check if the default value of `return_loss` + # is `True` in `model.forward`. 
+ return_loss = inputs.get("return_loss", None) + if return_loss is None: + return_loss = self.can_return_loss + loss_without_labels = True if len(self.label_names) == 0 and return_loss else False + + inputs = self._prepare_inputs(inputs) + if ignore_keys is None: + if hasattr(self.model, "config"): + ignore_keys = getattr(self.model.config, "keys_to_ignore_at_inference", []) + else: + ignore_keys = [] + + # labels may be popped when computing the loss (label smoothing for instance) so we grab them first. + if has_labels or loss_without_labels: + labels = nested_detach(tuple(inputs.get(name) for name in self.label_names)) + if len(labels) == 1: + labels = labels[0] + else: + labels = None + + with torch.no_grad(): + if is_sagemaker_mp_enabled(): + raw_outputs = smp_forward_only(model, inputs) + if has_labels or loss_without_labels: + if isinstance(raw_outputs, dict): + loss_mb = raw_outputs["loss"] + logits_mb = tuple(v for k, v in raw_outputs.items() if k not in ignore_keys + ["loss"]) + else: + loss_mb = raw_outputs[0] + logits_mb = raw_outputs[1:] + + loss = loss_mb.reduce_mean().detach().cpu() + logits = smp_nested_concat(logits_mb) + else: + loss = None + if isinstance(raw_outputs, dict): + logits_mb = tuple(v for k, v in raw_outputs.items() if k not in ignore_keys) + else: + logits_mb = raw_outputs + logits = smp_nested_concat(logits_mb) + else: + if has_labels or loss_without_labels: + with self.compute_loss_context_manager(): + inputs["eval_disc"] = True + loss, outputs = self.compute_loss(model, inputs, return_outputs=True) + loss = loss.mean().detach() + + if isinstance(outputs, dict): + logits = tuple(v for k, v in outputs.items() if k not in ignore_keys + ["loss"]) + else: + logits = outputs[1:] + else: + loss = None + with self.compute_loss_context_manager(): + outputs = model(**inputs) + if isinstance(outputs, dict): + logits = tuple(v for k, v in outputs.items() if k not in ignore_keys) + else: + logits = outputs + # TODO: this needs to be fixed and made cleaner later. 
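# Illustrative sketch, not part of the diff above: the decision rule behind the
# img_is_correct / lang_is_correct tensors that compute_loss aggregates. Scores come out
# of a sigmoid, so >= 0.5 is read as "image token" (label 1) and < 0.5 as "language
# token" (label 0); torch.lt already encodes a correct language prediction, so its True
# entries are counted directly rather than compared back against the 0 label. The scores
# below are made up.
import torch

def disc_correctness(img_scores, lang_scores):
    img_correct = torch.ge(img_scores, 0.5)    # image tokens should score high
    lang_correct = torch.lt(lang_scores, 0.5)  # language tokens should score low
    return {
        "num_img_corr": img_correct.sum().item(),
        "num_lang_corr": lang_correct.sum().item(),
        "img_total": img_correct.numel(),
        "lang_total": lang_correct.numel(),    # count language tokens, not image tokens
    }

print(disc_correctness(torch.tensor([0.9, 0.4, 0.7]), torch.tensor([0.2, 0.6])))
# -> {'num_img_corr': 2, 'num_lang_corr': 1, 'img_total': 3, 'lang_total': 2}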
+ if self.args.past_index >= 0: + self._past = outputs[self.args.past_index - 1] + + if prediction_loss_only: + return (loss, None, None) + + logits = nested_detach(logits) + if len(logits) == 1: + logits = logits[0] + + return (loss, logits, labels) \ No newline at end of file diff --git a/scripts/v1_5/finetune.sh b/scripts/v1_5/finetune.sh index 435448394..14c4a94c1 100644 --- a/scripts/v1_5/finetune.sh +++ b/scripts/v1_5/finetune.sh @@ -7,7 +7,7 @@ deepspeed llava/train/train_mem.py \ --data_path ./playground/data/llava_v1_5_mix665k.json \ --image_folder ./playground/data \ --vision_tower openai/clip-vit-large-patch14-336 \ - --pretrain_mm_mlp_adapter ./checkpoints/llava-v1.5-13b-pretrain/mm_projector.bin \ + --pretrain_mm_mlp_adapter ./checkpoints/llava-v1.5-13b/mm_projector.bin \ --mm_projector_type mlp2x_gelu \ --mm_vision_select_layer -2 \ --mm_use_im_start_end False \ @@ -15,9 +15,9 @@ deepspeed llava/train/train_mem.py \ --image_aspect_ratio pad \ --group_by_modality_length True \ --bf16 True \ - --output_dir ./checkpoints/llava-v1.5-13b \ + --output_dir ./checkpoints/llava-v1.5-13b-train_disc_no_lora \ --num_train_epochs 1 \ - --per_device_train_batch_size 16 \ + --per_device_train_batch_size 4 \ --per_device_eval_batch_size 4 \ --gradient_accumulation_steps 1 \ --evaluation_strategy "no" \ diff --git a/scripts/v1_5/finetune_lora.sh b/scripts/v1_5/finetune_lora.sh index db2b3ac33..648be7708 100644 --- a/scripts/v1_5/finetune_lora.sh +++ b/scripts/v1_5/finetune_lora.sh @@ -1,6 +1,10 @@ #!/bin/bash -deepspeed llava/train/train_mem.py \ +# llava/train/train_mem.py + +# REMOVE TEST DATA PATH, TEST DATA IMAGES, TESTING + +deepspeed llava/VLLMSafety/evaluate_disc.py \ --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ --deepspeed ./scripts/zero3.json \ --model_name_or_path lmsys/vicuna-13b-v1.5 \ @@ -16,15 +20,14 @@ deepspeed llava/train/train_mem.py \ --image_aspect_ratio pad \ --group_by_modality_length True \ --bf16 True \ - --output_dir ./checkpoints/llava-v1.5-13b-lora \ + --output_dir ./checkpoints/llava-v1.5-13b-eval_disc \ --num_train_epochs 1 \ - --per_device_train_batch_size 16 \ + --per_device_train_batch_size 4 \ --per_device_eval_batch_size 4 \ --gradient_accumulation_steps 1 \ --evaluation_strategy "no" \ --save_strategy "steps" \ - --save_steps 50000 \ - --save_total_limit 1 \ + --save_steps 50000 --save_total_limit 1 \ --learning_rate 2e-4 \ --weight_decay 0. 
\ --warmup_ratio 0.03 \ @@ -35,4 +38,4 @@ deepspeed llava/train/train_mem.py \ --gradient_checkpointing True \ --dataloader_num_workers 4 \ --lazy_preprocess True \ - --report_to wandb + --report_to wandb \ No newline at end of file From 40c42d769cb4622b78808e48089e5f157ac77449 Mon Sep 17 00:00:00 2001 From: smirrashidi Date: Wed, 11 Sep 2024 20:29:42 +0000 Subject: [PATCH 31/41] code from meeting 9/11 --- llava/VLLMSafety/discriminator.py | 18 +++++++----------- llava/VLLMSafety/evaluate_disc.py | 2 +- llava/train/llava_trainer.py | 2 +- scripts/v1_5/finetune_lora.sh | 2 +- 4 files changed, 10 insertions(+), 14 deletions(-) diff --git a/llava/VLLMSafety/discriminator.py b/llava/VLLMSafety/discriminator.py index 6b04aa689..c73e4a440 100644 --- a/llava/VLLMSafety/discriminator.py +++ b/llava/VLLMSafety/discriminator.py @@ -18,7 +18,6 @@ def linear(self, x): x = torch.sigmoid(self.fc2(x)) return x - def forward(self, data, d_mode): device = 'cuda' @@ -32,22 +31,19 @@ def forward(self, data, d_mode): img_pred = self.linear(img_tok) # BCE expects output from a sigmoid (i think) lang_pred = self.linear(lang_tok) - img_pred_binary = torch.ge(img_pred, 0.5).float().to(torch.bfloat16) - lang_pred_binary = torch.ge(lang_pred, 0.5).float().to(torch.bfloat16) - if d_mode == True: img_label = torch.full((img_tok.size(0), 1), 1, dtype=torch.bfloat16, device=device) # 1 for images lang_label = torch.full((lang_tok.size(0), 1), 0, dtype=torch.bfloat16, device=device) # 0 for lang - img_loss = loss_function(img_pred_binary, img_label) - lang_loss = loss_function(lang_pred_binary, lang_label) - img_loss = loss_function(img_pred, img_label) lang_loss = loss_function(lang_pred, lang_label) + loss = img_loss + lang_loss - img_is_correct = torch.eq(img_pred_binary, img_label) - + img_pred_binary = torch.ge(img_pred, 0.5).float().to(torch.bfloat16) + lang_pred_binary = torch.lt(lang_pred, 0.5).float().to(torch.bfloat16) + + img_is_correct = torch.eq(img_pred_binary, img_label) lang_is_correct = torch.eq(lang_pred_binary, lang_label) return_dict = { @@ -63,7 +59,7 @@ def forward(self, data, d_mode): "lang_total": return_dict["img_is_correct"].size(0) } - with open("/home/smirrashidi/return_dict.json", "a") as json_file: + with open("/home/smirrashidi/return_dict2.json", "a") as json_file: json.dump(json_dict, json_file) json_file.write("\n") @@ -71,7 +67,7 @@ def forward(self, data, d_mode): else: lang_label = torch.full((img_tok.size(0), 1), 0, dtype=torch.bfloat16, device=device) # 0 for lang - img_with_lang_label_loss = loss_function(img_pred_binary, lang_label) # trying to follow DCGAN + img_with_lang_label_loss = loss_function(img_pred, lang_label) # trying to follow DCGAN return img_with_lang_label_loss # returning image loss to maximize disc loss when training generator diff --git a/llava/VLLMSafety/evaluate_disc.py b/llava/VLLMSafety/evaluate_disc.py index 669f446b6..6aee0e979 100644 --- a/llava/VLLMSafety/evaluate_disc.py +++ b/llava/VLLMSafety/evaluate_disc.py @@ -1001,7 +1001,7 @@ def make_inputs_require_grad(module, input, output): print(eval_dict) - with open("/home/smirrashidi/test_discrim.json", "w") as json_file: + with open("/home/smirrashidi/test_discrim2.json", "w") as json_file: json.dump(eval_dict, json_file, indent=4) model.config.use_cache = True diff --git a/llava/train/llava_trainer.py b/llava/train/llava_trainer.py index e261b5155..1c57f0b37 100644 --- a/llava/train/llava_trainer.py +++ b/llava/train/llava_trainer.py @@ -829,7 +829,7 @@ def compute_loss(self, model, inputs, 
return_outputs=False): "lang_total": discrim_dict["img_is_correct"].size(0) } - with open("/home/smirrashidi/return_dict.json", "a") as json_file: + with open("/home/smirrashidi/return_dict2.json", "a") as json_file: json.dump("eval", json_file) json_file.write("\n") json.dump(json_dict, json_file) diff --git a/scripts/v1_5/finetune_lora.sh b/scripts/v1_5/finetune_lora.sh index 648be7708..cbcdd2da3 100644 --- a/scripts/v1_5/finetune_lora.sh +++ b/scripts/v1_5/finetune_lora.sh @@ -20,7 +20,7 @@ deepspeed llava/VLLMSafety/evaluate_disc.py \ --image_aspect_ratio pad \ --group_by_modality_length True \ --bf16 True \ - --output_dir ./checkpoints/llava-v1.5-13b-eval_disc \ + --output_dir ./checkpoints/llava-v1.5-13b-eval_disc2 \ --num_train_epochs 1 \ --per_device_train_batch_size 4 \ --per_device_eval_batch_size 4 \ From f0052643b69835bb3e13e7e0bce3e5d771881ea6 Mon Sep 17 00:00:00 2001 From: smirrashidi Date: Mon, 16 Sep 2024 19:30:17 +0000 Subject: [PATCH 32/41] saving discriminator --- llava/VLLMSafety/builder2.py | 166 +++++++++++++++++++++++++++++++++++ scripts/v1_5/eval_discrim.sh | 43 +++++++++ 2 files changed, 209 insertions(+) create mode 100644 llava/VLLMSafety/builder2.py create mode 100644 scripts/v1_5/eval_discrim.sh diff --git a/llava/VLLMSafety/builder2.py b/llava/VLLMSafety/builder2.py new file mode 100644 index 000000000..efa187c46 --- /dev/null +++ b/llava/VLLMSafety/builder2.py @@ -0,0 +1,166 @@ +import os +import warnings +import shutil + +from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig +import torch +from llava.model import * +from llava.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN +from transformers.modeling_utils import * +from peft.tuners.lora.model import * + + +def load_pretrained_model_discrim(model_path, model_base, model_name, load_8bit=False, load_4bit=False, device_map="auto", device="cuda", use_flash_attn=False, **kwargs): + kwargs = {"device_map": device_map, **kwargs} + + if device != "cuda": + kwargs['device_map'] = {"": device} + + if load_8bit: + kwargs['load_in_8bit'] = True + elif load_4bit: + kwargs['load_in_4bit'] = True + kwargs['quantization_config'] = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4' + ) + else: + kwargs['torch_dtype'] = torch.float16 + + if use_flash_attn: + kwargs['attn_implementation'] = 'flash_attention_2' + + if 'llava' in model_name.lower(): + # Load LLaVA model + if 'lora' in model_name.lower() and model_base is None: + warnings.warn('There is `lora` in model name but no `model_base` is provided. If you are loading a LoRA model, please provide the `model_base` argument. 
Detailed instruction: https://github.com/haotian-liu/LLaVA#launch-a-model-worker-lora-weights-unmerged.') + if 'lora' in model_name.lower() and model_base is not None: + from llava.model.language_model.llava_llama import LlavaConfig + lora_cfg_pretrained = LlavaConfig.from_pretrained(model_path) + tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False) + print('Loading LLaVA from base model...') + model = LlavaLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs) + token_num, tokem_dim = model.lm_head.out_features, model.lm_head.in_features + if model.lm_head.weight.shape[0] != token_num: + model.lm_head.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype)) + model.model.embed_tokens.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype)) + + print('Manually loading Discriminator weights...') + model_lora_state_dict = load_state_dict('/home/smirrashidi/LLaVAFork/checkpoints/llava-v1.5-13b-lora_disc/adapter_model.safetensors') # only has the lora weights + discriminator_state_dict = { + 'discriminator.fc1.lora_A.weight': model_lora_state_dict['base_model.model.discriminator.fc1.lora_A.weight'], # starts with 0.0361, 0.0126 + 'discriminator.fc1.lora_B.weight': model_lora_state_dict['base_model.model.discriminator.fc1.lora_B.weight'], # starts with -2.2095e-02, 1.1414e-02 + 'discriminator.fc2.lora_A.weight': model_lora_state_dict['base_model.model.discriminator.fc1.lora_A.weight'], # starts with 0.0952, 0.0471 + 'discriminator.fc2.lora_B.weight': model_lora_state_dict['base_model.model.discriminator.fc1.lora_B.weight'] # starts with 0.0688, -0.0267 + } + + + model.merge_and_unload(discriminator_state_dict) + + print('Loading additional LLaVA weights...') + if os.path.exists(os.path.join(model_path, 'non_lora_trainables.bin')): + non_lora_trainables = torch.load(os.path.join(model_path, 'non_lora_trainables.bin'), map_location='cpu') + else: + # this is probably from HF Hub + from huggingface_hub import hf_hub_download + def load_from_hf(repo_id, filename, subfolder=None): + cache_file = hf_hub_download( + repo_id=repo_id, + filename=filename, + subfolder=subfolder) + return torch.load(cache_file, map_location='cpu') + non_lora_trainables = load_from_hf(model_path, 'non_lora_trainables.bin') + non_lora_trainables = {(k[11:] if k.startswith('base_model.') else k): v for k, v in non_lora_trainables.items()} + if any(k.startswith('model.model.') for k in non_lora_trainables): + non_lora_trainables = {(k[6:] if k.startswith('model.') else k): v for k, v in non_lora_trainables.items()} + model.load_state_dict(non_lora_trainables, strict=False) + + from peft import PeftModel + print('Loading LoRA weights...') + model = PeftModel.from_pretrained(model, model_path) + print('Merging LoRA weights...') + model = model.merge_and_unload() + print('Model is loaded...') + elif model_base is not None: + # this may be mm projector only + print('Loading LLaVA from base model...') + if 'mpt' in model_name.lower(): + if not os.path.isfile(os.path.join(model_path, 'configuration_mpt.py')): + shutil.copyfile(os.path.join(model_base, 'configuration_mpt.py'), os.path.join(model_path, 'configuration_mpt.py')) + tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=True) + cfg_pretrained = AutoConfig.from_pretrained(model_path, trust_remote_code=True) + model = LlavaMptForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, 
config=cfg_pretrained, **kwargs) + else: + tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False) + cfg_pretrained = AutoConfig.from_pretrained(model_path) + model = LlavaLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs) + + mm_projector_weights = torch.load(os.path.join(model_path, 'mm_projector.bin'), map_location='cpu') + mm_projector_weights = {k: v.to(torch.float16) for k, v in mm_projector_weights.items()} + model.load_state_dict(mm_projector_weights, strict=False) + else: + if 'mpt' in model_name.lower(): + tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True) + model = LlavaMptForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs) + elif 'mistral' in model_name.lower(): + tokenizer = AutoTokenizer.from_pretrained(model_path) + model = LlavaMistralForCausalLM.from_pretrained( + model_path, + low_cpu_mem_usage=True, + **kwargs + ) + else: + tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False) + model = LlavaLlamaForCausalLM.from_pretrained( + model_path, + low_cpu_mem_usage=True, + **kwargs + ) + else: + # Load language model + if model_base is not None: + # PEFT model + from peft import PeftModel + tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False) + model = AutoModelForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, **kwargs) + print(f"Loading LoRA weights from {model_path}") + model = PeftModel.from_pretrained(model, model_path) + print(f"Merging weights") + model = model.merge_and_unload() + print('Convert to FP16...') + model.to(torch.float16) + else: + use_fast = False + if 'mpt' in model_name.lower(): + tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True) + model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, trust_remote_code=True, **kwargs) + else: + tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False) + model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs) + + image_processor = None + + if 'llava' in model_name.lower(): + mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False) + mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True) + if mm_use_im_patch_token: + tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True) + if mm_use_im_start_end: + tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True) + model.resize_token_embeddings(len(tokenizer)) + + vision_tower = model.get_vision_tower() + if not vision_tower.is_loaded: + vision_tower.load_model(device_map=device_map) + if device_map != 'auto': + vision_tower.to(device=device_map, dtype=torch.float16) + image_processor = vision_tower.image_processor + + if hasattr(model.config, "max_sequence_length"): + context_len = model.config.max_sequence_length + else: + context_len = 2048 + + return tokenizer, model, image_processor, context_len diff --git a/scripts/v1_5/eval_discrim.sh b/scripts/v1_5/eval_discrim.sh new file mode 100644 index 000000000..17064766e --- /dev/null +++ b/scripts/v1_5/eval_discrim.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +# llava/train/train_mem.py + +# REMOVE TEST DATA PATH, TEST DATA IMAGES, TESTING + +# change output dir back to llava-v1.5-13b-lora_disc + +deepspeed llava/VLLMSafety/train.py \ + --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ + --deepspeed ./scripts/zero3.json \ + --model_name_or_path lmsys/vicuna-13b-v1.5 \ + --version v1 \ + 
--data_path ./playground/data/llava_v1_5_mix665k.json \ + --image_folder ./playground/data \ + --vision_tower openai/clip-vit-large-patch14-336 \ + --pretrain_mm_mlp_adapter ./checkpoints/llava-v1.5-13b/mm_projector.bin \ + --mm_projector_type mlp2x_gelu \ + --mm_vision_select_layer -2 \ + --mm_use_im_start_end False \ + --mm_use_im_patch_token False \ + --image_aspect_ratio pad \ + --group_by_modality_length True \ + --bf16 True \ + --output_dir ./checkpoints/llava-v1.5-13b-lora_testing \ + --num_train_epochs 1 \ + --per_device_train_batch_size 4 \ + --per_device_eval_batch_size 4 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 50000 --save_total_limit 1 \ + --learning_rate 2e-4 \ + --weight_decay 0. \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --tf32 True \ + --model_max_length 2048 \ + --gradient_checkpointing True \ + --dataloader_num_workers 4 \ + --lazy_preprocess True \ + --report_to wandb \ No newline at end of file From b53918564b54c9a243bd53a0568f0de099cf5862 Mon Sep 17 00:00:00 2001 From: smirrashidi Date: Mon, 16 Sep 2024 19:32:09 +0000 Subject: [PATCH 33/41] adding rest of files from saving the discrim --- llava/VLLMSafety/discriminator.py | 8 +++++- llava/VLLMSafety/test_discrim.py | 33 ++++++++++------------- llava/model/language_model/llava_llama.py | 8 +++--- llava/model/llava_arch.py | 1 + llava/train/llava_trainer.py | 2 ++ llava/train/train.py | 7 ++++- scripts/v1_5/finetune_lora.sh | 8 +++--- 7 files changed, 38 insertions(+), 29 deletions(-) diff --git a/llava/VLLMSafety/discriminator.py b/llava/VLLMSafety/discriminator.py index c73e4a440..84c26ceab 100644 --- a/llava/VLLMSafety/discriminator.py +++ b/llava/VLLMSafety/discriminator.py @@ -32,6 +32,10 @@ def forward(self, data, d_mode): lang_pred = self.linear(lang_tok) if d_mode == True: + print("in discrim forward") + print(self.fc1.weight.requires_grad) + print(self.fc2.weight.requires_grad) + img_label = torch.full((img_tok.size(0), 1), 1, dtype=torch.bfloat16, device=device) # 1 for images lang_label = torch.full((lang_tok.size(0), 1), 0, dtype=torch.bfloat16, device=device) # 0 for lang @@ -43,6 +47,8 @@ def forward(self, data, d_mode): img_pred_binary = torch.ge(img_pred, 0.5).float().to(torch.bfloat16) lang_pred_binary = torch.lt(lang_pred, 0.5).float().to(torch.bfloat16) + print(f'img_loss: {img_loss} lang_loss: {lang_loss} loss: {loss}\n') + img_is_correct = torch.eq(img_pred_binary, img_label) lang_is_correct = torch.eq(lang_pred_binary, lang_label) @@ -59,7 +65,7 @@ def forward(self, data, d_mode): "lang_total": return_dict["img_is_correct"].size(0) } - with open("/home/smirrashidi/return_dict2.json", "a") as json_file: + with open("/home/smirrashidi/return_dict1.json", "a") as json_file: json.dump(json_dict, json_file) json_file.write("\n") diff --git a/llava/VLLMSafety/test_discrim.py b/llava/VLLMSafety/test_discrim.py index d8410cc2f..fa278e6b6 100644 --- a/llava/VLLMSafety/test_discrim.py +++ b/llava/VLLMSafety/test_discrim.py @@ -3,12 +3,11 @@ import os import json from tqdm import tqdm -import shortuuid from torch.utils.data import DataLoader from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN from llava.conversation import conv_templates, SeparatorStyle -from llava.model.builder import load_pretrained_model +from llava.VLLMSafety.builder2 import load_pretrained_model_discrim from llava.utils import disable_torch_init from llava.mm_utils 
import tokenizer_image_token, process_images, get_model_name_from_path from llava.train.train import LazySupervisedDataset, DataCollatorForSupervisedDataset, DataArguments, TrainingArguments, make_supervised_data_module @@ -33,36 +32,32 @@ def get_chunk(lst, n, k): def eval_model(args): model_path = os.path.expanduser(args.model_path) model_name = get_model_name_from_path(model_path) - tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) + tokenizer, model, image_processor, context_len = load_pretrained_model_discrim(model_path, args.model_base, model_name) model_args = ModelArguments(model_name_or_path = args.model_path) data_args = DataArguments(data_path = args.question_file, image_folder = args.image_folder) + training_args = TrainingArguments(output_dir="/home/smirrashidi/dump") - model = LlavaLlamaForCausalLM.from_pretrained( - model_args.model_name_or_path, - cache_dir=training_args.cache_dir, - attn_implementation=None, - torch_dtype=(torch.bfloat16 if training_args.bf16 else None) - ) + total = 0 num_img_correct= 0 num_lang_correct = 0 - test_data = LazySupervisedDataset(tokenizer=tokenizer, - data_path=data_args.data_path, - data_args=data_args) + # test_data = LazySupervisedDataset(tokenizer=tokenizer, + # data_path=data_args.data_path, + # data_args=data_args) - data_module = make_supervised_data_module(tokenizer=tokenizer, - data_args=data_args) + # data_module = make_supervised_data_module(tokenizer=tokenizer, + # data_args=data_args) - trainer = LLaVATrainer(model=model, - tokenizer=tokenizer, - args=training_args, - **data_module) + # trainer = LLaVATrainer(model=model, + # tokenizer=tokenizer, + # args=training_args, + # **data_module) - trainer.evaluate(eval_dataset=test_data) + # trainer.evaluate(eval_dataset=test_data) if __name__ == "__main__": diff --git a/llava/model/language_model/llava_llama.py b/llava/model/language_model/llava_llama.py index ff8848984..3185ed39c 100644 --- a/llava/model/language_model/llava_llama.py +++ b/llava/model/language_model/llava_llama.py @@ -27,6 +27,9 @@ from ..llava_arch import LlavaMetaModel, LlavaMetaForCausalLM +from transformers.modeling_utils import * +from transformers.modeling_utils import _add_variant + class LlavaConfig(LlamaConfig): model_type = "llava_llama" @@ -132,6 +135,8 @@ def forward( ) d_loss = discrim_dict["loss"] + + print(f"pritning from llava forward function \n fc1:{self.discriminator.fc1.weight}") model_output.loss = d_loss # returning only discriminator loss @@ -249,6 +254,3 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, if image_sizes is not None: inputs['image_sizes'] = image_sizes return inputs - -AutoConfig.register("llava_llama", LlavaConfig) -AutoModelForCausalLM.register(LlavaConfig, LlavaLlamaForCausalLM) \ No newline at end of file diff --git a/llava/model/llava_arch.py b/llava/model/llava_arch.py index 52aa949dc..a896e9ed2 100644 --- a/llava/model/llava_arch.py +++ b/llava/model/llava_arch.py @@ -260,6 +260,7 @@ def prepare_inputs_labels_for_multimodal( #curr input embeds is coming from cur_input_ids_noim which means its already filitered self.disc_data['lang'] = cur_input_embeds + #print(f'self.disc_data: {self.disc_data}\n') cur_new_input_embeds = [] cur_new_labels = [] diff --git a/llava/train/llava_trainer.py b/llava/train/llava_trainer.py index 1c57f0b37..689e4028a 100644 --- a/llava/train/llava_trainer.py +++ b/llava/train/llava_trainer.py @@ -797,6 +797,8 @@ def _inner_training_loop( if 
self.neftune_noise_alpha is not None:
             self._deactivate_neftune(self.model)
 
+        print(f"printing from inner_training_loop:{model.module.base_model.model.discriminator.fc1.weight}")
+
         return TrainOutput(self.state.global_step, train_loss, metrics)
 
     def compute_loss(self, model, inputs, return_outputs=False):
diff --git a/llava/train/train.py b/llava/train/train.py
index 4ee9b8861..014b8abbb 100644
--- a/llava/train/train.py
+++ b/llava/train/train.py
@@ -169,7 +169,7 @@ def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match):
 def find_all_linear_names(model):
     cls = torch.nn.Linear
     lora_module_names = set()
-    multimodal_keywords = ['mm_projector', 'vision_tower', 'vision_resampler']
+    multimodal_keywords = ['mm_projector', 'vision_tower', 'vision_resampler', 'discriminator']
     for name, module in model.named_modules():
         if any(mm_keyword in name for mm_keyword in multimodal_keywords):
             continue
@@ -960,6 +960,10 @@ def make_inputs_require_grad(module, input, output):
                                               data_args=data_args)
     model.to("cuda")
+
+    for name, param in model.discriminator.named_parameters():
+        param.requires_grad = True
+
     trainer = LLaVATrainer(model=model,
                     tokenizer=tokenizer,
                     args=training_args,
@@ -980,6 +984,7 @@ def make_inputs_require_grad(module, input, output):
         non_lora_state_dict = get_peft_state_non_lora_maybe_zero_3(
             model.named_parameters()
         )
+        print(non_lora_state_dict)
         if training_args.local_rank == 0 or training_args.local_rank == -1:
             model.config.save_pretrained(training_args.output_dir)
             model.save_pretrained(training_args.output_dir, state_dict=state_dict)
diff --git a/scripts/v1_5/finetune_lora.sh b/scripts/v1_5/finetune_lora.sh
index cbcdd2da3..701c3fbdd 100644
--- a/scripts/v1_5/finetune_lora.sh
+++ b/scripts/v1_5/finetune_lora.sh
@@ -1,10 +1,8 @@
 #!/bin/bash
 
-# llava/train/train_mem.py
+# change output dir back to llava-v1.5-13b-lora_disc
 
-# REMOVE TEST DATA PATH, TEST DATA IMAGES, TESTING
-
-deepspeed llava/VLLMSafety/evaluate_disc.py \
+deepspeed llava/train/train_mem.py \
     --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \
     --deepspeed ./scripts/zero3.json \
     --model_name_or_path lmsys/vicuna-13b-v1.5 \
@@ -20,7 +18,7 @@ deepspeed llava/VLLMSafety/evaluate_disc.py \
     --image_aspect_ratio pad \
     --group_by_modality_length True \
     --bf16 True \
-    --output_dir ./checkpoints/llava-v1.5-13b-eval_disc2 \
+    --output_dir ./checkpoints/llava-v1.5-13b-lora_testing \
     --num_train_epochs 1 \
     --per_device_train_batch_size 4 \
     --per_device_eval_batch_size 4 \

From 9822afb3e6b310dd6f856d5e9d5f776e0475949f Mon Sep 17 00:00:00 2001
From: smirrashidi
Date: Wed, 18 Sep 2024 07:39:52 +0000
Subject: [PATCH 34/41] successfully evaluated discriminator

---
 llava/VLLMSafety/discriminator.py         |  23 +--
 llava/VLLMSafety/evaluate_disc.py         | 232 +++------------------
 llava/VLLMSafety/test_discrim.py          | 118 ++++++++---
 llava/model/language_model/llava_llama.py |  23 +--
 llava/model/llava_arch.py                 |   2 +-
 llava/train/llava_trainer.py              | 182 +----------------
 scripts/v1_5/eval_discrim.sh              |   4 +-
 scripts/v1_5/finetune_lora.sh             |   4 +-
 8 files changed, 130 insertions(+), 458 deletions(-)

diff --git a/llava/VLLMSafety/discriminator.py b/llava/VLLMSafety/discriminator.py
index 84c26ceab..b44c9a605 100644
--- a/llava/VLLMSafety/discriminator.py
+++ b/llava/VLLMSafety/discriminator.py
@@ -20,7 +20,7 @@ def linear(self, x):
         return x
 
     def forward(self, data, d_mode):
-        device = 'cuda'
+        device = 'cuda:0'
         loss_function = nn.BCELoss() # from DCGAN
 
         img_tok = data["image"]
@@ -31,10 +31,10 @@
img_pred = self.linear(img_tok) # BCE expects output from a sigmoid (i think) lang_pred = self.linear(lang_tok) + img_pred = img_pred.to(device) # for testing dsicrim, remove when training + lang_pred = lang_pred.to(device) # for testing discrim, remove when training (i think?) its not like training even works + if d_mode == True: - print("in discrim forward") - print(self.fc1.weight.requires_grad) - print(self.fc2.weight.requires_grad) img_label = torch.full((img_tok.size(0), 1), 1, dtype=torch.bfloat16, device=device) # 1 for images lang_label = torch.full((lang_tok.size(0), 1), 0, dtype=torch.bfloat16, device=device) # 0 for lang @@ -45,10 +45,8 @@ def forward(self, data, d_mode): loss = img_loss + lang_loss img_pred_binary = torch.ge(img_pred, 0.5).float().to(torch.bfloat16) - lang_pred_binary = torch.lt(lang_pred, 0.5).float().to(torch.bfloat16) + lang_pred_binary = torch.ge(lang_pred, 0.5).float().to(torch.bfloat16) # >= because we want the tensor to be all 0s if each value is less than 0.5 - print(f'img_loss: {img_loss} lang_loss: {lang_loss} loss: {loss}\n') - img_is_correct = torch.eq(img_pred_binary, img_label) lang_is_correct = torch.eq(lang_pred_binary, lang_label) @@ -58,17 +56,6 @@ def forward(self, data, d_mode): "lang_is_correct": lang_is_correct, } - json_dict = { - "num_img_corr": return_dict["img_is_correct"].sum().item(), - "num_lang_corr": return_dict["lang_is_correct"].sum().item(), - "img_total": return_dict["img_is_correct"].size(0), - "lang_total": return_dict["img_is_correct"].size(0) - } - - with open("/home/smirrashidi/return_dict1.json", "a") as json_file: - json.dump(json_dict, json_file) - json_file.write("\n") - return return_dict else: diff --git a/llava/VLLMSafety/evaluate_disc.py b/llava/VLLMSafety/evaluate_disc.py index 6aee0e979..bf8cad070 100644 --- a/llava/VLLMSafety/evaluate_disc.py +++ b/llava/VLLMSafety/evaluate_disc.py @@ -28,12 +28,14 @@ import tokenizers from llava.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN -from torch.utils.data import Dataset +from torch.utils.data import Dataset, DataLoader from llava.train.llava_trainer import LLaVATrainer from llava import conversation as conversation_lib from llava.model import * -from llava.mm_utils import tokenizer_image_token +from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path +from llava.model.builder import load_pretrained_model + from PIL import Image @@ -113,8 +115,10 @@ class TrainingArguments(transformers.TrainingArguments): @dataclass class DiscArguments: - test_data_path: Optional[str] = field(default="/home/smirrashidi/coco_data/coco_test_conversations.json") - test_image_folder:Optional[str] = field(default="/home/smirrashidi/coco_data/coco_test") + test_data_path:str = "/home/smirrashidi/coco_data/coco_test_conversations.json" + test_image_folder:str = "/home/smirrashidi/coco_data/coco_test" + model_path: str = "/home/smirrashidi/LLaVAFork/checkpoints/llava-v1.5-13b-lora-disc" + model_base: str = "lmsys/vicuna-13b-v1.3" def maybe_zero_3(param, ignore_status=False, name=None): @@ -811,216 +815,42 @@ def train(attn_implementation=None): model_args, data_args, training_args, disc_args = parser.parse_args_into_dataclasses() local_rank = training_args.local_rank compute_dtype = (torch.float16 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32)) - - bnb_model_from_pretrained_args = {} - if training_args.bits in [4, 8]: - from transformers import 
BitsAndBytesConfig - bnb_model_from_pretrained_args.update(dict( - device_map={"": training_args.device}, - load_in_4bit=training_args.bits == 4, - load_in_8bit=training_args.bits == 8, - quantization_config=BitsAndBytesConfig( - load_in_4bit=training_args.bits == 4, - load_in_8bit=training_args.bits == 8, - llm_int8_skip_modules=["mm_projector"], - llm_int8_threshold=6.0, - llm_int8_has_fp16_weight=False, - bnb_4bit_compute_dtype=compute_dtype, - bnb_4bit_use_double_quant=training_args.double_quant, - bnb_4bit_quant_type=training_args.quant_type # {'fp4', 'nf4'} - ) - )) - - if model_args.vision_tower is not None: - if 'mpt' in model_args.model_name_or_path: - config = transformers.AutoConfig.from_pretrained(model_args.model_name_or_path, trust_remote_code=True) - config.attn_config['attn_impl'] = training_args.mpt_attn_impl - model = LlavaMptForCausalLM.from_pretrained( - model_args.model_name_or_path, - config=config, - cache_dir=training_args.cache_dir, - **bnb_model_from_pretrained_args - ) - else: - model = LlavaLlamaForCausalLM.from_pretrained( - model_args.model_name_or_path, - cache_dir=training_args.cache_dir, - attn_implementation=attn_implementation, - torch_dtype=(torch.bfloat16 if training_args.bf16 else None), - **bnb_model_from_pretrained_args - ) - else: - model = transformers.LlamaForCausalLM.from_pretrained( - model_args.model_name_or_path, - cache_dir=training_args.cache_dir, - attn_implementation=attn_implementation, - torch_dtype=(torch.bfloat16 if training_args.bf16 else None), - **bnb_model_from_pretrained_args - ) - model.config.use_cache = False - - if model_args.freeze_backbone: - model.model.requires_grad_(False) - - if training_args.bits in [4, 8]: - from peft import prepare_model_for_kbit_training - model.config.torch_dtype=(torch.float32 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32)) - model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=training_args.gradient_checkpointing) - - if training_args.gradient_checkpointing: - if hasattr(model, "enable_input_require_grads"): - model.enable_input_require_grads() - else: - def make_inputs_require_grad(module, input, output): - output.requires_grad_(True) - model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) - - if training_args.lora_enable: - from peft import LoraConfig, get_peft_model - lora_config = LoraConfig( - r=training_args.lora_r, - lora_alpha=training_args.lora_alpha, - target_modules=find_all_linear_names(model), - lora_dropout=training_args.lora_dropout, - bias=training_args.lora_bias, - task_type="CAUSAL_LM", - ) - if training_args.bits == 16: - if training_args.bf16: - model.to(torch.bfloat16) - if training_args.fp16: - model.to(torch.float16) - rank0_print("Adding LoRA adapters...") - model = get_peft_model(model, lora_config) - - if 'mpt' in model_args.model_name_or_path: - tokenizer = transformers.AutoTokenizer.from_pretrained( - model_args.model_name_or_path, - cache_dir=training_args.cache_dir, - model_max_length=training_args.model_max_length, - padding_side="right" - ) - else: - tokenizer = transformers.AutoTokenizer.from_pretrained( - model_args.model_name_or_path, - cache_dir=training_args.cache_dir, - model_max_length=training_args.model_max_length, - padding_side="right", - use_fast=False, - ) - - if model_args.version == "v0": - if tokenizer.pad_token is None: - smart_tokenizer_and_embedding_resize( - special_tokens_dict=dict(pad_token="[PAD]"), - tokenizer=tokenizer, - model=model, - ) - elif 
model_args.version == "v0.5": - tokenizer.pad_token = tokenizer.unk_token - else: - tokenizer.pad_token = tokenizer.unk_token - if model_args.version in conversation_lib.conv_templates: - conversation_lib.default_conversation = conversation_lib.conv_templates[model_args.version] - else: - conversation_lib.default_conversation = conversation_lib.conv_templates["vicuna_v1"] - - if model_args.vision_tower is not None: - model.get_model().initialize_vision_modules( - model_args=model_args, - fsdp=training_args.fsdp - ) - - vision_tower = model.get_vision_tower() - vision_tower.to(dtype=torch.bfloat16 if training_args.bf16 else torch.float16, device=training_args.device) - - data_args.image_processor = vision_tower.image_processor - data_args.is_multimodal = True - - model.config.image_aspect_ratio = data_args.image_aspect_ratio - model.config.tokenizer_padding_side = tokenizer.padding_side - model.config.tokenizer_model_max_length = tokenizer.model_max_length - - model.config.tune_mm_mlp_adapter = training_args.tune_mm_mlp_adapter = model_args.tune_mm_mlp_adapter - if model_args.tune_mm_mlp_adapter: - model.requires_grad_(False) - for p in model.get_model().mm_projector.parameters(): - p.requires_grad = True - - model.config.freeze_mm_mlp_adapter = training_args.freeze_mm_mlp_adapter - if training_args.freeze_mm_mlp_adapter: - for p in model.get_model().mm_projector.parameters(): - p.requires_grad = False - - if training_args.bits in [4, 8]: - model.get_model().mm_projector.to(dtype=compute_dtype, device=training_args.device) - - model.config.mm_use_im_start_end = data_args.mm_use_im_start_end = model_args.mm_use_im_start_end - model.config.mm_projector_lr = training_args.mm_projector_lr - training_args.use_im_start_end = model_args.mm_use_im_start_end - model.config.mm_use_im_patch_token = model_args.mm_use_im_patch_token - model.initialize_vision_tokenizer(model_args, tokenizer=tokenizer) - - if training_args.bits in [4, 8]: - from peft.tuners.lora import LoraLayer - for name, module in model.named_modules(): - if isinstance(module, LoraLayer): - if training_args.bf16: - module = module.to(torch.bfloat16) - if 'norm' in name: - module = module.to(torch.float32) - if 'lm_head' in name or 'embed_tokens' in name: - if hasattr(module, 'weight'): - if training_args.bf16 and module.weight.dtype == torch.float32: - module = module.to(torch.bfloat16) - - data_module = make_supervised_data_module(tokenizer=tokenizer, - data_args=data_args, - disc_args=disc_args, - testing = False) + model_path = os.path.expanduser(disc_args.model_path) + model_name = get_model_name_from_path(model_path) + tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, disc_args.model_base, model_name) model.to("cuda") - trainer = LLaVATrainer(model=model, - tokenizer=tokenizer, - args=training_args, - **data_module) - - if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")): - trainer.train(resume_from_checkpoint=True) - else: - trainer.train() - trainer.save_state() - trainer.save_model() test_data_module = make_supervised_data_module(tokenizer=tokenizer, data_args=data_args, disc_args=disc_args, testing = True) - _, eval_dict = trainer.evaluate(test_data_module["train_dataset"]) + data_collator = test_data_module['data_collator'] + + # Create DataLoader for test dataset + test_dataloader = DataLoader( + test_data_module['train_dataset'], # In this context, train_dataset is your test dataset + batch_size=4, # Adjust batch size as needed + collate_fn=data_collator, # Use the provided 
data_collator here + shuffle=False) + - print(eval_dict) + for i, batch in enumerate(test_dataloader): + input_ids = batch['input_ids'] + image = batch['image'] - with open("/home/smirrashidi/test_discrim2.json", "w") as json_file: - json.dump(eval_dict, json_file, indent=4) + with torch.inference_mode(): + discrim_dict = model.forward_eval_discrim( + input_ids = input_ids, + image = image + ) - model.config.use_cache = True + # with open("/home/smirrashidi/test_discrim2.json", "w") as json_file: + # json.dump(eval_dict, json_file, indent=4) - if training_args.lora_enable: - state_dict = get_peft_state_maybe_zero_3( - model.named_parameters(), training_args.lora_bias - ) - non_lora_state_dict = get_peft_state_non_lora_maybe_zero_3( - model.named_parameters() - ) - if training_args.local_rank == 0 or training_args.local_rank == -1: - model.config.save_pretrained(training_args.output_dir) - model.save_pretrained(training_args.output_dir, state_dict=state_dict) - torch.save(non_lora_state_dict, os.path.join(training_args.output_dir, 'non_lora_trainables.bin')) - else: - safe_save_model_for_hf_trainer(trainer=trainer, - output_dir=training_args.output_dir) + # print(eval_dict) if __name__ == "__main__": train(attn_implementation="flash_attention_2") diff --git a/llava/VLLMSafety/test_discrim.py b/llava/VLLMSafety/test_discrim.py index fa278e6b6..0c888f287 100644 --- a/llava/VLLMSafety/test_discrim.py +++ b/llava/VLLMSafety/test_discrim.py @@ -7,17 +7,22 @@ from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN from llava.conversation import conv_templates, SeparatorStyle -from llava.VLLMSafety.builder2 import load_pretrained_model_discrim +from llava.model.builder import load_pretrained_model from llava.utils import disable_torch_init from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path -from llava.train.train import LazySupervisedDataset, DataCollatorForSupervisedDataset, DataArguments, TrainingArguments, make_supervised_data_module +from llava.train.train import * from llava.train.llava_trainer import LLaVATrainer from PIL import Image import math from llava.VLLMSafety.discriminator import Discriminator -from llava.train.train import * +from datetime import date +class DiscArguments: + test_data_path: str = "/home/smirrashidi/coco_data/coco_test_conversations.json" + test_image_folder: str = "/home/smirrashidi/coco_data/coco_test" + model_path: str = "/home/smirrashidi/LLaVAFork/checkpoints/llava-v1.5-13b-lora-disc" + model_base: str = "lmsys/vicuna-13b-v1.3" def split_list(lst, n): """Split a list into n (roughly) equal-sized chunks""" @@ -29,43 +34,96 @@ def get_chunk(lst, n, k): chunks = split_list(lst, n) return chunks[k] -def eval_model(args): - model_path = os.path.expanduser(args.model_path) - model_name = get_model_name_from_path(model_path) - tokenizer, model, image_processor, context_len = load_pretrained_model_discrim(model_path, args.model_base, model_name) - model_args = ModelArguments(model_name_or_path = args.model_path) - data_args = DataArguments(data_path = args.question_file, - image_folder = args.image_folder) +def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer, + data_args, disc_args, testing) -> Dict: + """Make dataset and collator for supervised fine-tuning.""" + + if testing == True: + data_args.image_folder = "/home/smirrashidi/coco_data/coco_test" + data_args.data_path = "/home/smirrashidi/coco_data/coco_test_conversations.json" + + train_dataset 
= LazySupervisedDataset(tokenizer=tokenizer, + data_path=data_args.data_path, + data_args=data_args) - - training_args = TrainingArguments(output_dir="/home/smirrashidi/dump") + data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer) + else: + train_dataset = LazySupervisedDataset(tokenizer=tokenizer, + data_path=data_args.data_path, + data_args=data_args) + data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer) - total = 0 - num_img_correct= 0 - num_lang_correct = 0 - - # test_data = LazySupervisedDataset(tokenizer=tokenizer, - # data_path=data_args.data_path, - # data_args=data_args) + return dict(train_dataset=train_dataset, + eval_dataset=None, + data_collator=data_collator) + + +def eval_model(args): + # Model + disable_torch_init() + model_path = os.path.expanduser(args.model_path) + model_name = get_model_name_from_path(model_path) + model_args = ModelArguments(model_name_or_path = args.model_path) + data_args = DataArguments(data_path = args.question_file, + image_folder = args.image_folder) + training_args = TrainingArguments(output_dir="/home/smirrashidi/dump") + disc_args = DiscArguments + + tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name) + + model = model.to(torch.bfloat16) + + data_args.image_processor = image_processor + + test_data_module = make_supervised_data_module(tokenizer=tokenizer, + data_args=data_args, + disc_args=disc_args, + testing = True) + + data_collator = test_data_module['data_collator'] + + test_dataloader = DataLoader( + test_data_module['train_dataset'], + batch_size=4, + collate_fn=data_collator, + shuffle=False) + + eval_disc_data = { + "num_img_corr": 0, + "num_lang_corr": 0, + "img_total": 0, + "lang_total": 0, + } - # data_module = make_supervised_data_module(tokenizer=tokenizer, - # data_args=data_args) + for i, batch in enumerate(test_dataloader): + print(f"Iteration #{i}") + input_ids = batch['input_ids'] + image = batch['images'] + with torch.inference_mode(): + discrim_dict = model.forward_eval_discrim( + input_ids = input_ids, + images = image + ) - # trainer = LLaVATrainer(model=model, - # tokenizer=tokenizer, - # args=training_args, - # **data_module) + eval_disc_data["num_img_corr"] += discrim_dict["img_is_correct"].sum().item() + eval_disc_data["num_lang_corr"] += discrim_dict["lang_is_correct"].sum().item() + eval_disc_data["img_total"] += discrim_dict["img_is_correct"].size(0) + eval_disc_data["lang_total"] += discrim_dict["lang_is_correct"].size(0) - # trainer.evaluate(eval_dataset=test_data) + eval_disc_data["date"] = date.today().strftime('%Y-%m-%d') + print(eval_disc_data) + with open("/home/smirrashidi/eval_discrim_results.json", "a") as json_file: + json.dump(eval_disc_data, json_file) + json_file.write("\n") if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--model-path", type=str, default="facebook/opt-350m") - parser.add_argument("--model-base", type=str, default=None) - parser.add_argument("--image-folder", type=str, default="") - parser.add_argument("--question-file", type=str, default="tables/question.jsonl") + parser.add_argument("--model-path", type=str, default="/home/smirrashidi/LLaVAFork/checkpoints/llava-v1.5-13b-lora-disc") + parser.add_argument("--model-base", type=str, default="lmsys/vicuna-13b-v1.3") + parser.add_argument("--image-folder", type=str, default="/home/smirrashidi/coco_data/coco_test") + parser.add_argument("--question-file", type=str, 
default="/home/smirrashidi/coco_data/coco_test_conversations.json") parser.add_argument("--answers-file", type=str, default="answer.jsonl") parser.add_argument("--conv-mode", type=str, default="llava_v1") parser.add_argument("--num-chunks", type=int, default=1) diff --git a/llava/model/language_model/llava_llama.py b/llava/model/language_model/llava_llama.py index 3185ed39c..3d73144cd 100644 --- a/llava/model/language_model/llava_llama.py +++ b/llava/model/language_model/llava_llama.py @@ -78,27 +78,8 @@ def forward( image_sizes: Optional[List[List[int]]] = None, return_dict: Optional[bool] = None, d_mode: Optional[bool] = False, # False means run without discriminator - eval_disc: Optional[bool] = False ) -> Union[Tuple, CausalLMOutputWithPast]: - if eval_disc == True: - return self.forward_eval_discrim( - input_ids, - attention_mask, - position_ids, - past_key_values, - inputs_embeds, - labels, - use_cache, - output_attentions, - output_hidden_states, - images, - image_sizes, - return_dict, - d_mode, - eval_disc - ) - if inputs_embeds is None: ( input_ids, @@ -135,9 +116,7 @@ def forward( ) d_loss = discrim_dict["loss"] - - print(f"pritning from llava forward function \n fc1:{self.discriminator.fc1.weight}") - + model_output.loss = d_loss # returning only discriminator loss return model_output diff --git a/llava/model/llava_arch.py b/llava/model/llava_arch.py index a896e9ed2..2677c226e 100644 --- a/llava/model/llava_arch.py +++ b/llava/model/llava_arch.py @@ -138,9 +138,9 @@ def get_vision_tower(self): return self.get_model().get_vision_tower() def encode_images(self, images): + images = images.to(torch.bfloat16) if images.dtype != torch.bfloat16 else images # added for testing the discriminator, not part of source code image_features = self.get_model().get_vision_tower()(images) image_features = self.get_model().mm_projector(image_features) - return image_features def prepare_inputs_labels_for_multimodal( diff --git a/llava/train/llava_trainer.py b/llava/train/llava_trainer.py index 689e4028a..5e2fed2db 100644 --- a/llava/train/llava_trainer.py +++ b/llava/train/llava_trainer.py @@ -171,13 +171,6 @@ class LLaVATrainer(Trainer): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.eval_disc_data = { - "num_img_corr": 0, - "num_lang_corr": 0, - "img_total": 0, - "lang_total": 0, - } - def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]: if self.train_dataset is None or not has_length(self.train_dataset): return None @@ -608,7 +601,6 @@ def _inner_training_loop( step = -1 for step, inputs in enumerate(epoch_iterator): inputs['d_mode'] = True if (step % 2 == 0) else False # set d_mode - inputs["eval_disc"] = None # to avoid errors total_batched_samples += 1 if self.args.include_num_input_tokens_seen: @@ -800,72 +792,6 @@ def _inner_training_loop( print(f"printing from inner_training_loop:{model.module.base_model.model.discriminator.fc1.weight}") return TrainOutput(self.state.global_step, train_loss, metrics) - - def compute_loss(self, model, inputs, return_outputs=False): - """ - How the loss is computed by Trainer. By default, all models return the loss in the first element. - - Subclass and override for custom behavior. 
- """ - if inputs["eval_disc"] == True: - discrim_dict = model(**inputs) - print("running compute loss") - - # print(discrim_dict) - - # print( - # "img is correct", discrim_dict["img_is_correct"].sum().item(), - # "lang is correct", discrim_dict["lang_is_correct"].sum().item(), - # "img_total", discrim_dict["img_is_correct"].size(0), - # "lang_total", discrim_dict["img_is_correct"].size(0)) - - self.eval_disc_data["num_img_corr"] += discrim_dict["img_is_correct"].sum().item() - self.eval_disc_data["num_lang_corr"] += discrim_dict["lang_is_correct"].sum().item() - self.eval_disc_data["img_total"] += discrim_dict["img_is_correct"].size(0) - self.eval_disc_data["lang_total"] += discrim_dict["img_is_correct"].size(0) - - json_dict = { - "num_img_corr": discrim_dict["img_is_correct"].sum().item(), - "num_lang_corr": discrim_dict["lang_is_correct"].sum().item(), - "img_total": discrim_dict["img_is_correct"].size(0), - "lang_total": discrim_dict["img_is_correct"].size(0) - } - - with open("/home/smirrashidi/return_dict2.json", "a") as json_file: - json.dump("eval", json_file) - json_file.write("\n") - json.dump(json_dict, json_file) - json_file.write("\n") - - if self.label_smoother is not None and "labels" in inputs: - labels = inputs.pop("labels") - else: - labels = None - outputs = model(**inputs) - # Save past state if it exists - # TODO: this needs to be fixed and made cleaner later. - if self.args.past_index >= 0: - self._past = outputs[self.args.past_index] - - unwrapped_model = unwrap_model(model) - if _is_peft_model(unwrapped_model): - model_name = unwrapped_model.base_model.model._get_name() - else: - model_name = unwrapped_model._get_name() - if model_name in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): - loss = self.label_smoother(outputs, labels, shift_labels=True) - else: - loss = self.label_smoother(outputs, labels) - else: - if isinstance(outputs, dict) and "loss" not in outputs: - raise ValueError( - "The model did not return a loss from the inputs, only the following keys: " - f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}." - ) - # We don't use .loss here since the model may return tuples instead of ModelOutput. - loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] - - return (loss, outputs) if return_outputs else loss def evaluate( self, @@ -963,110 +889,4 @@ def evaluate( self._memory_tracker.stop_and_update_metrics(output.metrics) - return output.metrics, self.eval_disc_data - - def prediction_step( - self, - model: nn.Module, - inputs: Dict[str, Union[torch.Tensor, Any]], - prediction_loss_only: bool, - ignore_keys: Optional[List[str]] = None, - ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: - """ - Perform an evaluation step on `model` using `inputs`. - - Subclass and override to inject custom behavior. - - Args: - model (`nn.Module`): - The model to evaluate. - inputs (`Dict[str, Union[torch.Tensor, Any]]`): - The inputs and targets of the model. - - The dictionary will be unpacked before being fed to the model. Most models expect the targets under the - argument `labels`. Check your model's documentation for all accepted arguments. - prediction_loss_only (`bool`): - Whether or not to return the loss only. - ignore_keys (`List[str]`, *optional*): - A list of keys in the output of your model (if it is a dictionary) that should be ignored when - gathering predictions. 
- - Return: - Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, - logits and labels (each being optional). - """ - has_labels = False if len(self.label_names) == 0 else all(inputs.get(k) is not None for k in self.label_names) - # For CLIP-like models capable of returning loss values. - # If `return_loss` is not specified or being `None` in `inputs`, we check if the default value of `return_loss` - # is `True` in `model.forward`. - return_loss = inputs.get("return_loss", None) - if return_loss is None: - return_loss = self.can_return_loss - loss_without_labels = True if len(self.label_names) == 0 and return_loss else False - - inputs = self._prepare_inputs(inputs) - if ignore_keys is None: - if hasattr(self.model, "config"): - ignore_keys = getattr(self.model.config, "keys_to_ignore_at_inference", []) - else: - ignore_keys = [] - - # labels may be popped when computing the loss (label smoothing for instance) so we grab them first. - if has_labels or loss_without_labels: - labels = nested_detach(tuple(inputs.get(name) for name in self.label_names)) - if len(labels) == 1: - labels = labels[0] - else: - labels = None - - with torch.no_grad(): - if is_sagemaker_mp_enabled(): - raw_outputs = smp_forward_only(model, inputs) - if has_labels or loss_without_labels: - if isinstance(raw_outputs, dict): - loss_mb = raw_outputs["loss"] - logits_mb = tuple(v for k, v in raw_outputs.items() if k not in ignore_keys + ["loss"]) - else: - loss_mb = raw_outputs[0] - logits_mb = raw_outputs[1:] - - loss = loss_mb.reduce_mean().detach().cpu() - logits = smp_nested_concat(logits_mb) - else: - loss = None - if isinstance(raw_outputs, dict): - logits_mb = tuple(v for k, v in raw_outputs.items() if k not in ignore_keys) - else: - logits_mb = raw_outputs - logits = smp_nested_concat(logits_mb) - else: - if has_labels or loss_without_labels: - with self.compute_loss_context_manager(): - inputs["eval_disc"] = True - loss, outputs = self.compute_loss(model, inputs, return_outputs=True) - loss = loss.mean().detach() - - if isinstance(outputs, dict): - logits = tuple(v for k, v in outputs.items() if k not in ignore_keys + ["loss"]) - else: - logits = outputs[1:] - else: - loss = None - with self.compute_loss_context_manager(): - outputs = model(**inputs) - if isinstance(outputs, dict): - logits = tuple(v for k, v in outputs.items() if k not in ignore_keys) - else: - logits = outputs - # TODO: this needs to be fixed and made cleaner later. 
- if self.args.past_index >= 0: - self._past = outputs[self.args.past_index - 1] - - if prediction_loss_only: - return (loss, None, None) - - logits = nested_detach(logits) - if len(logits) == 1: - logits = logits[0] - - return (loss, logits, labels) \ No newline at end of file + return output.metrics, self.eval_disc_data \ No newline at end of file diff --git a/scripts/v1_5/eval_discrim.sh b/scripts/v1_5/eval_discrim.sh index 17064766e..d4a274e20 100644 --- a/scripts/v1_5/eval_discrim.sh +++ b/scripts/v1_5/eval_discrim.sh @@ -6,7 +6,7 @@ # change output dir back to llava-v1.5-13b-lora_disc -deepspeed llava/VLLMSafety/train.py \ +deepspeed llava/VLLMSafety/evaluate_disc.py \ --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ --deepspeed ./scripts/zero3.json \ --model_name_or_path lmsys/vicuna-13b-v1.5 \ @@ -22,7 +22,7 @@ deepspeed llava/VLLMSafety/train.py \ --image_aspect_ratio pad \ --group_by_modality_length True \ --bf16 True \ - --output_dir ./checkpoints/llava-v1.5-13b-lora_testing \ + --output_dir ./checkpoints/llava-v1.5-13b-lora_eval \ --num_train_epochs 1 \ --per_device_train_batch_size 4 \ --per_device_eval_batch_size 4 \ diff --git a/scripts/v1_5/finetune_lora.sh b/scripts/v1_5/finetune_lora.sh index 701c3fbdd..453c5d707 100644 --- a/scripts/v1_5/finetune_lora.sh +++ b/scripts/v1_5/finetune_lora.sh @@ -1,7 +1,5 @@ #!/bin/bash -# change output dir back to llava-v1.5-13b-lora_disc - deepspeed llava/train/train_mem.py \ --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \ --deepspeed ./scripts/zero3.json \ @@ -18,7 +16,7 @@ deepspeed llava/train/train_mem.py \ --image_aspect_ratio pad \ --group_by_modality_length True \ --bf16 True \ - --output_dir ./checkpoints/llava-v1.5-13b-lora_testing \ + --output_dir ./checkpoints/llava-v1.5-13b-lora-disc \ --num_train_epochs 1 \ --per_device_train_batch_size 4 \ --per_device_eval_batch_size 4 \ From 2c461557f7ce312b601a15719d5a58a1d4e1bf97 Mon Sep 17 00:00:00 2001 From: smirrashidi Date: Sat, 21 Sep 2024 22:36:22 +0000 Subject: [PATCH 35/41] finished training: --- llava/VLLMSafety/discriminator.py | 6 +++--- llava/model/language_model/llava_llama.py | 4 +--- llava/model/llava_arch.py | 2 +- llava/train/llava_trainer.py | 1 - llava/train/train.py | 2 ++ scripts/v1_5/finetune_lora.sh | 5 +++-- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/llava/VLLMSafety/discriminator.py b/llava/VLLMSafety/discriminator.py index b44c9a605..003b0ccea 100644 --- a/llava/VLLMSafety/discriminator.py +++ b/llava/VLLMSafety/discriminator.py @@ -20,7 +20,7 @@ def linear(self, x): return x def forward(self, data, d_mode): - device = 'cuda:0' + device = 'cuda' loss_function = nn.BCELoss() # from DCGAN img_tok = data["image"] @@ -31,8 +31,8 @@ def forward(self, data, d_mode): img_pred = self.linear(img_tok) # BCE expects output from a sigmoid (i think) lang_pred = self.linear(lang_tok) - img_pred = img_pred.to(device) # for testing dsicrim, remove when training - lang_pred = lang_pred.to(device) # for testing discrim, remove when training (i think?) its not like training even works + # img_pred = img_pred.to(device) # for testing dsicrim, remove when training + # lang_pred = lang_pred.to(device) # for testing discrim, remove when training (i think?) 
its not like training even works if d_mode == True: diff --git a/llava/model/language_model/llava_llama.py b/llava/model/language_model/llava_llama.py index 3d73144cd..683009b8e 100644 --- a/llava/model/language_model/llava_llama.py +++ b/llava/model/language_model/llava_llama.py @@ -97,8 +97,6 @@ def forward( images, image_sizes ) - - d_mode = True # REMOVE WHEN NOT TESTING DISC if d_mode == True: discrim_dict = self.discriminator.forward(self.disc_data, d_mode=True) # d loss is sum of disc loss on images and lang @@ -121,7 +119,7 @@ def forward( return model_output else: - discrim_dict = self.discriminator.forward(self.disc_data, d_mode=True) # d loss is sum of disc loss on images and lang + discrim_dict = self.discriminator.forward(self.disc_data, d_mode=True) # d loss is sum of disc loss on images and lang; same call in both if and else model_output = super().forward( input_ids=input_ids, attention_mask=attention_mask, diff --git a/llava/model/llava_arch.py b/llava/model/llava_arch.py index 2677c226e..535733cef 100644 --- a/llava/model/llava_arch.py +++ b/llava/model/llava_arch.py @@ -138,7 +138,7 @@ def get_vision_tower(self): return self.get_model().get_vision_tower() def encode_images(self, images): - images = images.to(torch.bfloat16) if images.dtype != torch.bfloat16 else images # added for testing the discriminator, not part of source code + # images = images.to(torch.bfloat16) if images.dtype != torch.bfloat16 else images # added for testing the discriminator, not part of source code image_features = self.get_model().get_vision_tower()(images) image_features = self.get_model().mm_projector(image_features) return image_features diff --git a/llava/train/llava_trainer.py b/llava/train/llava_trainer.py index 5e2fed2db..94a6f76e1 100644 --- a/llava/train/llava_trainer.py +++ b/llava/train/llava_trainer.py @@ -682,7 +682,6 @@ def _inner_training_loop( # Optimizer step - inputs["d_mode"] = True # REMOVE WHEN NOT TESTING DISC if inputs["d_mode"] == True: self.d_optimizer.step() model.module.base_model.model.discriminator.zero_grad() diff --git a/llava/train/train.py b/llava/train/train.py index 014b8abbb..2e465f0b9 100644 --- a/llava/train/train.py +++ b/llava/train/train.py @@ -928,11 +928,13 @@ def make_inputs_require_grad(module, input, output): model.requires_grad_(False) for p in model.get_model().mm_projector.parameters(): p.requires_grad = True + print("Tuning mm_projector") model.config.freeze_mm_mlp_adapter = training_args.freeze_mm_mlp_adapter if training_args.freeze_mm_mlp_adapter: for p in model.get_model().mm_projector.parameters(): p.requires_grad = False + print("if this is printing then you are not training the projector") if training_args.bits in [4, 8]: model.get_model().mm_projector.to(dtype=compute_dtype, device=training_args.device) diff --git a/scripts/v1_5/finetune_lora.sh b/scripts/v1_5/finetune_lora.sh index 453c5d707..f53e9fb72 100644 --- a/scripts/v1_5/finetune_lora.sh +++ b/scripts/v1_5/finetune_lora.sh @@ -16,7 +16,7 @@ deepspeed llava/train/train_mem.py \ --image_aspect_ratio pad \ --group_by_modality_length True \ --bf16 True \ - --output_dir ./checkpoints/llava-v1.5-13b-lora-disc \ + --output_dir ./checkpoints/llava-v1.5-13b-lora-9-20 \ --num_train_epochs 1 \ --per_device_train_batch_size 4 \ --per_device_eval_batch_size 4 \ @@ -34,4 +34,5 @@ deepspeed llava/train/train_mem.py \ --gradient_checkpointing True \ --dataloader_num_workers 4 \ --lazy_preprocess True \ - --report_to wandb \ No newline at end of file + --report_to wandb \ + --tune_mm_mlp_adapter 
True
\ No newline at end of file

From 765dd47c89f861f492315e9f7c780b82a8582db1 Mon Sep 17 00:00:00 2001
From: smirrashidi
Date: Mon, 23 Sep 2024 21:40:24 +0000
Subject: [PATCH 36/41] eval works but now training doesn't

---
 llava/VLLMSafety/builder2.py      | 166 ------
 llava/VLLMSafety/discriminator.py |   6 +-
 llava/VLLMSafety/evaluate_disc.py | 856 ------
 llava/model/llava_arch.py         |   1 +
 llava/train/llava_trainer.py      |  98 ----
 llava/train/train.py              |   9 +-
 scripts/v1_5/finetune_lora.sh     |   4 +-
 7 files changed, 14 insertions(+), 1126 deletions(-)
 delete mode 100644 llava/VLLMSafety/builder2.py
 delete mode 100644 llava/VLLMSafety/evaluate_disc.py

diff --git a/llava/VLLMSafety/builder2.py b/llava/VLLMSafety/builder2.py
deleted file mode 100644
index efa187c46..000000000
--- a/llava/VLLMSafety/builder2.py
+++ /dev/null
@@ -1,166 +0,0 @@
-import os
-import warnings
-import shutil
-
-from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig
-import torch
-from llava.model import *
-from llava.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
-from transformers.modeling_utils import *
-from peft.tuners.lora.model import *
-
-
-def load_pretrained_model_discrim(model_path, model_base, model_name, load_8bit=False, load_4bit=False, device_map="auto", device="cuda", use_flash_attn=False, **kwargs):
-    kwargs = {"device_map": device_map, **kwargs}
-
-    if device != "cuda":
-        kwargs['device_map'] = {"": device}
-
-    if load_8bit:
-        kwargs['load_in_8bit'] = True
-    elif load_4bit:
-        kwargs['load_in_4bit'] = True
-        kwargs['quantization_config'] = BitsAndBytesConfig(
-            load_in_4bit=True,
-            bnb_4bit_compute_dtype=torch.float16,
-            bnb_4bit_use_double_quant=True,
-            bnb_4bit_quant_type='nf4'
-        )
-    else:
-        kwargs['torch_dtype'] = torch.float16
-
-    if use_flash_attn:
-        kwargs['attn_implementation'] = 'flash_attention_2'
-
-    if 'llava' in model_name.lower():
-        # Load LLaVA model
-        if 'lora' in model_name.lower() and model_base is None:
-            warnings.warn('There is `lora` in model name but no `model_base` is provided. If you are loading a LoRA model, please provide the `model_base` argument.
Detailed instruction: https://github.com/haotian-liu/LLaVA#launch-a-model-worker-lora-weights-unmerged.') - if 'lora' in model_name.lower() and model_base is not None: - from llava.model.language_model.llava_llama import LlavaConfig - lora_cfg_pretrained = LlavaConfig.from_pretrained(model_path) - tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False) - print('Loading LLaVA from base model...') - model = LlavaLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs) - token_num, tokem_dim = model.lm_head.out_features, model.lm_head.in_features - if model.lm_head.weight.shape[0] != token_num: - model.lm_head.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype)) - model.model.embed_tokens.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype)) - - print('Manually loading Discriminator weights...') - model_lora_state_dict = load_state_dict('/home/smirrashidi/LLaVAFork/checkpoints/llava-v1.5-13b-lora_disc/adapter_model.safetensors') # only has the lora weights - discriminator_state_dict = { - 'discriminator.fc1.lora_A.weight': model_lora_state_dict['base_model.model.discriminator.fc1.lora_A.weight'], # starts with 0.0361, 0.0126 - 'discriminator.fc1.lora_B.weight': model_lora_state_dict['base_model.model.discriminator.fc1.lora_B.weight'], # starts with -2.2095e-02, 1.1414e-02 - 'discriminator.fc2.lora_A.weight': model_lora_state_dict['base_model.model.discriminator.fc1.lora_A.weight'], # starts with 0.0952, 0.0471 - 'discriminator.fc2.lora_B.weight': model_lora_state_dict['base_model.model.discriminator.fc1.lora_B.weight'] # starts with 0.0688, -0.0267 - } - - - model.merge_and_unload(discriminator_state_dict) - - print('Loading additional LLaVA weights...') - if os.path.exists(os.path.join(model_path, 'non_lora_trainables.bin')): - non_lora_trainables = torch.load(os.path.join(model_path, 'non_lora_trainables.bin'), map_location='cpu') - else: - # this is probably from HF Hub - from huggingface_hub import hf_hub_download - def load_from_hf(repo_id, filename, subfolder=None): - cache_file = hf_hub_download( - repo_id=repo_id, - filename=filename, - subfolder=subfolder) - return torch.load(cache_file, map_location='cpu') - non_lora_trainables = load_from_hf(model_path, 'non_lora_trainables.bin') - non_lora_trainables = {(k[11:] if k.startswith('base_model.') else k): v for k, v in non_lora_trainables.items()} - if any(k.startswith('model.model.') for k in non_lora_trainables): - non_lora_trainables = {(k[6:] if k.startswith('model.') else k): v for k, v in non_lora_trainables.items()} - model.load_state_dict(non_lora_trainables, strict=False) - - from peft import PeftModel - print('Loading LoRA weights...') - model = PeftModel.from_pretrained(model, model_path) - print('Merging LoRA weights...') - model = model.merge_and_unload() - print('Model is loaded...') - elif model_base is not None: - # this may be mm projector only - print('Loading LLaVA from base model...') - if 'mpt' in model_name.lower(): - if not os.path.isfile(os.path.join(model_path, 'configuration_mpt.py')): - shutil.copyfile(os.path.join(model_base, 'configuration_mpt.py'), os.path.join(model_path, 'configuration_mpt.py')) - tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=True) - cfg_pretrained = AutoConfig.from_pretrained(model_path, trust_remote_code=True) - model = LlavaMptForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, 
config=cfg_pretrained, **kwargs) - else: - tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False) - cfg_pretrained = AutoConfig.from_pretrained(model_path) - model = LlavaLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs) - - mm_projector_weights = torch.load(os.path.join(model_path, 'mm_projector.bin'), map_location='cpu') - mm_projector_weights = {k: v.to(torch.float16) for k, v in mm_projector_weights.items()} - model.load_state_dict(mm_projector_weights, strict=False) - else: - if 'mpt' in model_name.lower(): - tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True) - model = LlavaMptForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs) - elif 'mistral' in model_name.lower(): - tokenizer = AutoTokenizer.from_pretrained(model_path) - model = LlavaMistralForCausalLM.from_pretrained( - model_path, - low_cpu_mem_usage=True, - **kwargs - ) - else: - tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False) - model = LlavaLlamaForCausalLM.from_pretrained( - model_path, - low_cpu_mem_usage=True, - **kwargs - ) - else: - # Load language model - if model_base is not None: - # PEFT model - from peft import PeftModel - tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False) - model = AutoModelForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, **kwargs) - print(f"Loading LoRA weights from {model_path}") - model = PeftModel.from_pretrained(model, model_path) - print(f"Merging weights") - model = model.merge_and_unload() - print('Convert to FP16...') - model.to(torch.float16) - else: - use_fast = False - if 'mpt' in model_name.lower(): - tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True) - model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, trust_remote_code=True, **kwargs) - else: - tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False) - model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs) - - image_processor = None - - if 'llava' in model_name.lower(): - mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False) - mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True) - if mm_use_im_patch_token: - tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True) - if mm_use_im_start_end: - tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True) - model.resize_token_embeddings(len(tokenizer)) - - vision_tower = model.get_vision_tower() - if not vision_tower.is_loaded: - vision_tower.load_model(device_map=device_map) - if device_map != 'auto': - vision_tower.to(device=device_map, dtype=torch.float16) - image_processor = vision_tower.image_processor - - if hasattr(model.config, "max_sequence_length"): - context_len = model.config.max_sequence_length - else: - context_len = 2048 - - return tokenizer, model, image_processor, context_len diff --git a/llava/VLLMSafety/discriminator.py b/llava/VLLMSafety/discriminator.py index 003b0ccea..b771d5de0 100644 --- a/llava/VLLMSafety/discriminator.py +++ b/llava/VLLMSafety/discriminator.py @@ -20,7 +20,7 @@ def linear(self, x): return x def forward(self, data, d_mode): - device = 'cuda' + device = 'cuda:0' # CHANGE TO JUST CUDA WHEN NOT USING VLMEVAL loss_function = nn.BCELoss() # from DCGAN img_tok = data["image"] @@ -28,8 +28,8 @@ def forward(self, data, d_mode): img_tok = img_tok.view(-1, 5120) # image tokens have dim=3 - img_pred = 
self.linear(img_tok) # BCE expects output from a sigmoid (i think) - lang_pred = self.linear(lang_tok) + img_pred = self.linear(img_tok).to(device) # BCE expects output from a sigmoid (i think) + lang_pred = self.linear(lang_tok).to(device) # img_pred = img_pred.to(device) # for testing dsicrim, remove when training # lang_pred = lang_pred.to(device) # for testing discrim, remove when training (i think?) its not like training even works diff --git a/llava/VLLMSafety/evaluate_disc.py b/llava/VLLMSafety/evaluate_disc.py deleted file mode 100644 index bf8cad070..000000000 --- a/llava/VLLMSafety/evaluate_disc.py +++ /dev/null @@ -1,856 +0,0 @@ -# Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright: -# Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright: -# Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import copy -from dataclasses import dataclass, field -import json -import logging -import pathlib -from typing import Dict, Optional, Sequence, List - -import torch - -import transformers -import tokenizers - -from llava.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN -from torch.utils.data import Dataset, DataLoader -from llava.train.llava_trainer import LLaVATrainer - -from llava import conversation as conversation_lib -from llava.model import * -from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path -from llava.model.builder import load_pretrained_model - - -from PIL import Image - - -local_rank = None - - -def rank0_print(*args): - if local_rank == 0: - print(*args) - - -from packaging import version -IS_TOKENIZER_GREATER_THAN_0_14 = version.parse(tokenizers.__version__) >= version.parse('0.14') - - -@dataclass -class ModelArguments: - model_name_or_path: Optional[str] = field(default="facebook/opt-125m") - version: Optional[str] = field(default="v0") - freeze_backbone: bool = field(default=False) - tune_mm_mlp_adapter: bool = field(default=False) - vision_tower: Optional[str] = field(default=None) - mm_vision_select_layer: Optional[int] = field(default=-1) # default to the last layer - pretrain_mm_mlp_adapter: Optional[str] = field(default=None) - mm_projector_type: Optional[str] = field(default='linear') - mm_use_im_start_end: bool = field(default=False) - mm_use_im_patch_token: bool = field(default=True) - mm_patch_merge_type: Optional[str] = field(default='flat') - mm_vision_select_feature: Optional[str] = field(default="patch") - - -@dataclass -class DataArguments: - data_path: str = field(default=None, - metadata={"help": "Path to the training data."}) - lazy_preprocess: bool = False - is_multimodal: bool = False - image_folder: Optional[str] = field(default=None) - image_aspect_ratio: str = 'square' - - -@dataclass -class TrainingArguments(transformers.TrainingArguments): - cache_dir: Optional[str] = field(default=None) - optim: str = 
field(default="adamw_torch") - remove_unused_columns: bool = field(default=False) - freeze_mm_mlp_adapter: bool = field(default=False) - mpt_attn_impl: Optional[str] = field(default="triton") - model_max_length: int = field( - default=512, - metadata={ - "help": - "Maximum sequence length. Sequences will be right padded (and possibly truncated)." - }, - ) - double_quant: bool = field( - default=True, - metadata={"help": "Compress the quantization statistics through double quantization."} - ) - quant_type: str = field( - default="nf4", - metadata={"help": "Quantization data type to use. Should be one of `fp4` or `nf4`."} - ) - bits: int = field( - default=16, - metadata={"help": "How many bits to use."} - ) - lora_enable: bool = False - lora_r: int = 64 - lora_alpha: int = 16 - lora_dropout: float = 0.05 - lora_weight_path: str = "" - lora_bias: str = "none" - mm_projector_lr: Optional[float] = None - group_by_modality_length: bool = field(default=False) - -@dataclass -class DiscArguments: - test_data_path:str = "/home/smirrashidi/coco_data/coco_test_conversations.json" - test_image_folder:str = "/home/smirrashidi/coco_data/coco_test" - model_path: str = "/home/smirrashidi/LLaVAFork/checkpoints/llava-v1.5-13b-lora-disc" - model_base: str = "lmsys/vicuna-13b-v1.3" - - -def maybe_zero_3(param, ignore_status=False, name=None): - from deepspeed import zero - from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus - if hasattr(param, "ds_id"): - if param.ds_status == ZeroParamStatus.NOT_AVAILABLE: - if not ignore_status: - logging.warning(f"{name}: param.ds_status != ZeroParamStatus.NOT_AVAILABLE: {param.ds_status}") - with zero.GatheredParameters([param]): - param = param.data.detach().cpu().clone() - else: - param = param.detach().cpu().clone() - return param - - -# Borrowed from peft.utils.get_peft_model_state_dict -def get_peft_state_maybe_zero_3(named_params, bias): - if bias == "none": - to_return = {k: t for k, t in named_params if "lora_" in k} - elif bias == "all": - to_return = {k: t for k, t in named_params if "lora_" in k or "bias" in k} - elif bias == "lora_only": - to_return = {} - maybe_lora_bias = {} - lora_bias_names = set() - for k, t in named_params: - if "lora_" in k: - to_return[k] = t - bias_name = k.split("lora_")[0] + "bias" - lora_bias_names.add(bias_name) - elif "bias" in k: - maybe_lora_bias[k] = t - for k, t in maybe_lora_bias: - if bias_name in lora_bias_names: - to_return[bias_name] = t - else: - raise NotImplementedError - to_return = {k: maybe_zero_3(v, ignore_status=True) for k, v in to_return.items()} - return to_return - - -def get_peft_state_non_lora_maybe_zero_3(named_params, require_grad_only=True): - to_return = {k: t for k, t in named_params if "lora_" not in k} - if require_grad_only: - to_return = {k: t for k, t in to_return.items() if t.requires_grad} - to_return = {k: maybe_zero_3(v, ignore_status=True).cpu() for k, v in to_return.items()} - return to_return - - -def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match): - to_return = {k: t for k, t in named_params if any(key_match in k for key_match in keys_to_match)} - to_return = {k: maybe_zero_3(v, ignore_status=True).cpu() for k, v in to_return.items()} - return to_return - - -def find_all_linear_names(model): - cls = torch.nn.Linear - lora_module_names = set() - multimodal_keywords = ['mm_projector', 'vision_tower', 'vision_resampler'] - for name, module in model.named_modules(): - if any(mm_keyword in name for mm_keyword in multimodal_keywords): - continue - if 
isinstance(module, cls): - names = name.split('.') - lora_module_names.add(names[0] if len(names) == 1 else names[-1]) - - if 'lm_head' in lora_module_names: # needed for 16-bit - lora_module_names.remove('lm_head') - return list(lora_module_names) - - -def safe_save_model_for_hf_trainer(trainer: transformers.Trainer, - output_dir: str): - """Collects the state dict and dump to disk.""" - - if getattr(trainer.args, "tune_mm_mlp_adapter", False): - # Only save Adapter - keys_to_match = ['mm_projector'] - if getattr(trainer.args, "use_im_start_end", False): - keys_to_match.extend(['embed_tokens', 'embed_in']) - - weight_to_save = get_mm_adapter_state_maybe_zero_3(trainer.model.named_parameters(), keys_to_match) - trainer.model.config.save_pretrained(output_dir) - - current_folder = output_dir.split('/')[-1] - parent_folder = os.path.dirname(output_dir) - if trainer.args.local_rank == 0 or trainer.args.local_rank == -1: - if current_folder.startswith('checkpoint-'): - mm_projector_folder = os.path.join(parent_folder, "mm_projector") - os.makedirs(mm_projector_folder, exist_ok=True) - torch.save(weight_to_save, os.path.join(mm_projector_folder, f'{current_folder}.bin')) - else: - torch.save(weight_to_save, os.path.join(output_dir, f'mm_projector.bin')) - return - - if trainer.deepspeed: - torch.cuda.synchronize() - trainer.save_model(output_dir) - return - - state_dict = trainer.model.state_dict() - if trainer.args.should_save: - cpu_state_dict = { - key: value.cpu() - for key, value in state_dict.items() - } - del state_dict - trainer._save(output_dir, state_dict=cpu_state_dict) # noqa - - -def smart_tokenizer_and_embedding_resize( - special_tokens_dict: Dict, - tokenizer: transformers.PreTrainedTokenizer, - model: transformers.PreTrainedModel, -): - """Resize tokenizer and embedding. - - Note: This is the unoptimized version that may make your embedding size not be divisible by 64. 
- """ - num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict) - model.resize_token_embeddings(len(tokenizer)) - - if num_new_tokens > 0: - input_embeddings = model.get_input_embeddings().weight.data - output_embeddings = model.get_output_embeddings().weight.data - - input_embeddings_avg = input_embeddings[:-num_new_tokens].mean( - dim=0, keepdim=True) - output_embeddings_avg = output_embeddings[:-num_new_tokens].mean( - dim=0, keepdim=True) - - input_embeddings[-num_new_tokens:] = input_embeddings_avg - output_embeddings[-num_new_tokens:] = output_embeddings_avg - - -def _tokenize_fn(strings: Sequence[str], - tokenizer: transformers.PreTrainedTokenizer) -> Dict: - """Tokenize a list of strings.""" - tokenized_list = [ - tokenizer( - text, - return_tensors="pt", - padding="longest", - max_length=tokenizer.model_max_length, - truncation=True, - ) for text in strings - ] - input_ids = labels = [ - tokenized.input_ids[0] for tokenized in tokenized_list - ] - input_ids_lens = labels_lens = [ - tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item() - for tokenized in tokenized_list - ] - return dict( - input_ids=input_ids, - labels=labels, - input_ids_lens=input_ids_lens, - labels_lens=labels_lens, - ) - - -def _mask_targets(target, tokenized_lens, speakers): - # cur_idx = 0 - cur_idx = tokenized_lens[0] - tokenized_lens = tokenized_lens[1:] - target[:cur_idx] = IGNORE_INDEX - for tokenized_len, speaker in zip(tokenized_lens, speakers): - if speaker == "human": - target[cur_idx+2:cur_idx + tokenized_len] = IGNORE_INDEX - cur_idx += tokenized_len - - -def _add_speaker_and_signal(header, source, get_conversation=True): - """Add speaker and start/end signal on each round.""" - BEGIN_SIGNAL = "### " - END_SIGNAL = "\n" - conversation = header - for sentence in source: - from_str = sentence["from"] - if from_str.lower() == "human": - from_str = conversation_lib.default_conversation.roles[0] - elif from_str.lower() == "gpt": - from_str = conversation_lib.default_conversation.roles[1] - else: - from_str = 'unknown' - sentence["value"] = (BEGIN_SIGNAL + from_str + ": " + - sentence["value"] + END_SIGNAL) - if get_conversation: - conversation += sentence["value"] - conversation += BEGIN_SIGNAL - return conversation - - -def preprocess_multimodal( - sources: Sequence[str], - data_args: DataArguments -) -> Dict: - is_multimodal = data_args.is_multimodal - if not is_multimodal: - return sources - - for source in sources: - for sentence in source: - if DEFAULT_IMAGE_TOKEN in sentence['value']: - sentence['value'] = sentence['value'].replace(DEFAULT_IMAGE_TOKEN, '').strip() - sentence['value'] = DEFAULT_IMAGE_TOKEN + '\n' + sentence['value'] - sentence['value'] = sentence['value'].strip() - if "mmtag" in conversation_lib.default_conversation.version: - sentence['value'] = sentence['value'].replace(DEFAULT_IMAGE_TOKEN, '' + DEFAULT_IMAGE_TOKEN + '') - replace_token = DEFAULT_IMAGE_TOKEN - if data_args.mm_use_im_start_end: - replace_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN - sentence["value"] = sentence["value"].replace(DEFAULT_IMAGE_TOKEN, replace_token) - - return sources - - -def preprocess_llama_2( - sources, - tokenizer: transformers.PreTrainedTokenizer, - has_image: bool = False -) -> Dict: - conv = conversation_lib.default_conversation.copy() - roles = {"human": conv.roles[0], "gpt": conv.roles[1]} - - # Apply prompt templates - conversations = [] - for i, source in enumerate(sources): - if roles[source[0]["from"]] != conv.roles[0]: - # Skip the first one if 
it is not from human - source = source[1:] - - conv.messages = [] - for j, sentence in enumerate(source): - role = roles[sentence["from"]] - assert role == conv.roles[j % 2], f"{i}" - conv.append_message(role, sentence["value"]) - conversations.append(conv.get_prompt()) - - # Tokenize conversations - - if has_image: - input_ids = torch.stack([tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0) - else: - input_ids = tokenizer( - conversations, - return_tensors="pt", - padding="longest", - max_length=tokenizer.model_max_length, - truncation=True, - ).input_ids - - targets = input_ids.clone() - - assert conv.sep_style == conversation_lib.SeparatorStyle.LLAMA_2 - - # Mask targets - sep = "[/INST] " - for conversation, target in zip(conversations, targets): - total_len = int(target.ne(tokenizer.pad_token_id).sum()) - - rounds = conversation.split(conv.sep2) - cur_len = 1 - target[:cur_len] = IGNORE_INDEX - for i, rou in enumerate(rounds): - if rou == "": - break - - parts = rou.split(sep) - if len(parts) != 2: - break - parts[0] += sep - - if has_image: - round_len = len(tokenizer_image_token(rou, tokenizer)) - instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 2 - else: - round_len = len(tokenizer(rou).input_ids) - instruction_len = len(tokenizer(parts[0]).input_ids) - 2 - - target[cur_len : cur_len + instruction_len] = IGNORE_INDEX - - cur_len += round_len - target[cur_len:] = IGNORE_INDEX - - if cur_len < tokenizer.model_max_length: - if cur_len != total_len: - target[:] = IGNORE_INDEX - print( - f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}." - f" (ignored)" - ) - - return dict( - input_ids=input_ids, - labels=targets, - ) - - -def preprocess_v1( - sources, - tokenizer: transformers.PreTrainedTokenizer, - has_image: bool = False -) -> Dict: - conv = conversation_lib.default_conversation.copy() - roles = {"human": conv.roles[0], "gpt": conv.roles[1]} - - # Apply prompt templates - conversations = [] - for i, source in enumerate(sources): - if roles[source[0]["from"]] != conv.roles[0]: - # Skip the first one if it is not from human - source = source[1:] - - conv.messages = [] - for j, sentence in enumerate(source): - role = roles[sentence["from"]] - assert role == conv.roles[j % 2], f"{i}" - conv.append_message(role, sentence["value"]) - conversations.append(conv.get_prompt()) - - # Tokenize conversations - - if has_image: - input_ids = torch.stack([tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0) - else: - input_ids = tokenizer( - conversations, - return_tensors="pt", - padding="longest", - max_length=tokenizer.model_max_length, - truncation=True, - ).input_ids - - targets = input_ids.clone() - - assert conv.sep_style == conversation_lib.SeparatorStyle.TWO - - # Mask targets - sep = conv.sep + conv.roles[1] + ": " - for conversation, target in zip(conversations, targets): - total_len = int(target.ne(tokenizer.pad_token_id).sum()) - - rounds = conversation.split(conv.sep2) - cur_len = 1 - target[:cur_len] = IGNORE_INDEX - for i, rou in enumerate(rounds): - if rou == "": - break - - parts = rou.split(sep) - if len(parts) != 2: - break - parts[0] += sep - - if has_image: - round_len = len(tokenizer_image_token(rou, tokenizer)) - instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 2 - else: - round_len = len(tokenizer(rou).input_ids) - instruction_len = len(tokenizer(parts[0]).input_ids) - 2 - - if i != 0 and not tokenizer.legacy and 
IS_TOKENIZER_GREATER_THAN_0_14: - round_len -= 1 - instruction_len -= 1 - - target[cur_len : cur_len + instruction_len] = IGNORE_INDEX - - cur_len += round_len - target[cur_len:] = IGNORE_INDEX - - if cur_len < tokenizer.model_max_length: - if cur_len != total_len: - target[:] = IGNORE_INDEX - print( - f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}." - f" (ignored)" - ) - - return dict( - input_ids=input_ids, - labels=targets, - ) - - -def preprocess_mpt( - sources, - tokenizer: transformers.PreTrainedTokenizer, - has_image: bool = False -) -> Dict: - conv = conversation_lib.default_conversation.copy() - roles = {"human": conv.roles[0], "gpt": conv.roles[1]} - - # Apply prompt templates - conversations = [] - for i, source in enumerate(sources): - if roles[source[0]["from"]] != conv.roles[0]: - # Skip the first one if it is not from human - source = source[1:] - - conv.messages = [] - for j, sentence in enumerate(source): - role = roles[sentence["from"]] - assert role == conv.roles[j % 2], f"{i}" - conv.append_message(role, sentence["value"]) - conversations.append(conv.get_prompt()) - - # Tokenize conversations - - if has_image: - input_ids = torch.stack([tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0) - else: - input_ids = tokenizer( - conversations, - return_tensors="pt", - padding="longest", - max_length=tokenizer.model_max_length, - truncation=True, - ).input_ids - - targets = input_ids.clone() - assert conv.sep_style == conversation_lib.SeparatorStyle.MPT - - # Mask targets - sep = conv.sep + conv.roles[1] - for conversation, target in zip(conversations, targets): - total_len = int(target.ne(tokenizer.pad_token_id).sum()) - - rounds = conversation.split(conv.sep) - re_rounds = [conv.sep.join(rounds[:3])] # system + user + gpt - for conv_idx in range(3, len(rounds), 2): - re_rounds.append(conv.sep.join(rounds[conv_idx:conv_idx+2])) # user + gpt - cur_len = 0 - target[:cur_len] = IGNORE_INDEX - for i, rou in enumerate(re_rounds): - if rou == "": - break - - parts = rou.split(sep) - if len(parts) != 2: - break - parts[0] += sep - - if has_image: - round_len = len(tokenizer_image_token(rou, tokenizer)) - instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 1 - else: - round_len = len(tokenizer(rou).input_ids) - instruction_len = len(tokenizer(parts[0]).input_ids) - 1 - - if i != 0 and getattr(tokenizer, 'legacy', False) and IS_TOKENIZER_GREATER_THAN_0_14: - round_len += 1 - instruction_len += 1 - - target[cur_len : cur_len + instruction_len] = IGNORE_INDEX - - cur_len += round_len - target[cur_len:] = IGNORE_INDEX - - if cur_len < tokenizer.model_max_length: - if cur_len != total_len: - target[:] = IGNORE_INDEX - print( - f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}." 
- f" (ignored)" - ) - - return dict( - input_ids=input_ids, - labels=targets, - ) - - -def preprocess_plain( - sources: Sequence[str], - tokenizer: transformers.PreTrainedTokenizer, -) -> Dict: - # add end signal and concatenate together - conversations = [] - for source in sources: - assert len(source) == 2 - assert DEFAULT_IMAGE_TOKEN in source[0]['value'] - source[0]['value'] = DEFAULT_IMAGE_TOKEN - conversation = source[0]['value'] + source[1]['value'] + conversation_lib.default_conversation.sep - conversations.append(conversation) - # tokenize conversations - input_ids = [tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations] - targets = copy.deepcopy(input_ids) - for target, source in zip(targets, sources): - tokenized_len = len(tokenizer_image_token(source[0]['value'], tokenizer)) - target[:tokenized_len] = IGNORE_INDEX - - return dict(input_ids=input_ids, labels=targets) - - -def preprocess( - sources: Sequence[str], - tokenizer: transformers.PreTrainedTokenizer, - has_image: bool = False -) -> Dict: - """ - Given a list of sources, each is a conversation list. This transform: - 1. Add signal '### ' at the beginning each sentence, with end signal '\n'; - 2. Concatenate conversations together; - 3. Tokenize the concatenated conversation; - 4. Make a deepcopy as the target. Mask human words with IGNORE_INDEX. - """ - if conversation_lib.default_conversation.sep_style == conversation_lib.SeparatorStyle.PLAIN: - return preprocess_plain(sources, tokenizer) - if conversation_lib.default_conversation.sep_style == conversation_lib.SeparatorStyle.LLAMA_2: - return preprocess_llama_2(sources, tokenizer, has_image=has_image) - if conversation_lib.default_conversation.version.startswith("v1"): - return preprocess_v1(sources, tokenizer, has_image=has_image) - if conversation_lib.default_conversation.version == "mpt": - return preprocess_mpt(sources, tokenizer, has_image=has_image) - # add end signal and concatenate together - conversations = [] - for source in sources: - header = f"{conversation_lib.default_conversation.system}\n\n" - conversation = _add_speaker_and_signal(header, source) - conversations.append(conversation) - # tokenize conversations - def get_tokenize_len(prompts): - return [len(tokenizer_image_token(prompt, tokenizer)) for prompt in prompts] - - if has_image: - input_ids = [tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations] - else: - conversations_tokenized = _tokenize_fn(conversations, tokenizer) - input_ids = conversations_tokenized["input_ids"] - - targets = copy.deepcopy(input_ids) - for target, source in zip(targets, sources): - if has_image: - tokenized_lens = get_tokenize_len([header] + [s["value"] for s in source]) - else: - tokenized_lens = _tokenize_fn([header] + [s["value"] for s in source], tokenizer)["input_ids_lens"] - speakers = [sentence["from"] for sentence in source] - _mask_targets(target, tokenized_lens, speakers) - - return dict(input_ids=input_ids, labels=targets) - - -class LazySupervisedDataset(Dataset): - """Dataset for supervised fine-tuning.""" - - def __init__(self, data_path: str, - tokenizer: transformers.PreTrainedTokenizer, - data_args: DataArguments): - super(LazySupervisedDataset, self).__init__() - list_data_dict = json.load(open(data_path, "r")) - - rank0_print("Formatting inputs...Skip in lazy mode") - self.tokenizer = tokenizer - self.list_data_dict = list_data_dict - self.data_args = data_args - - def __len__(self): - return len(self.list_data_dict) - - 
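# Editor's note: the `lengths` / `modality_lengths` properties defined just below exist so a
# length-grouped sampler can batch samples of similar size while keeping multimodal and
# text-only examples apart: the sign of each entry encodes the modality. A minimal sketch of
# such a consumer (the function name is illustrative, not part of the codebase):
def split_by_modality(modality_lengths):
    multimodal = [i for i, n in enumerate(modality_lengths) if n > 0]
    text_only = [i for i, n in enumerate(modality_lengths) if n < 0]
    # each group would then be sorted by abs(length) and chunked into per-device batches
    return multimodal, text_only

# e.g. split_by_modality([120, -45, 300, -12]) -> ([0, 2], [1, 3])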
@property - def lengths(self): - length_list = [] - for sample in self.list_data_dict: - img_tokens = 128 if 'image' in sample else 0 - length_list.append(sum(len(conv['value'].split()) for conv in sample['conversations']) + img_tokens) - return length_list - - @property - def modality_lengths(self): - length_list = [] - for sample in self.list_data_dict: - cur_len = sum(len(conv['value'].split()) for conv in sample['conversations']) - cur_len = cur_len if 'image' in sample else -cur_len - length_list.append(cur_len) - return length_list - - def __getitem__(self, i) -> Dict[str, torch.Tensor]: - sources = self.list_data_dict[i] - if isinstance(i, int): - sources = [sources] - assert len(sources) == 1, "Don't know why it is wrapped to a list" # FIXME - if 'image' in sources[0]: - image_file = self.list_data_dict[i]['image'] - image_folder = self.data_args.image_folder - processor = self.data_args.image_processor - image = Image.open(os.path.join(image_folder, image_file)).convert('RGB') - if self.data_args.image_aspect_ratio == 'pad': - def expand2square(pil_img, background_color): - width, height = pil_img.size - if width == height: - return pil_img - elif width > height: - result = Image.new(pil_img.mode, (width, width), background_color) - result.paste(pil_img, (0, (width - height) // 2)) - return result - else: - result = Image.new(pil_img.mode, (height, height), background_color) - result.paste(pil_img, ((height - width) // 2, 0)) - return result - image = expand2square(image, tuple(int(x*255) for x in processor.image_mean)) - image = processor.preprocess(image, return_tensors='pt')['pixel_values'][0] - else: - image = processor.preprocess(image, return_tensors='pt')['pixel_values'][0] - sources = preprocess_multimodal( - copy.deepcopy([e["conversations"] for e in sources]), - self.data_args) - else: - sources = copy.deepcopy([e["conversations"] for e in sources]) - data_dict = preprocess( - sources, - self.tokenizer, - has_image=('image' in self.list_data_dict[i])) - if isinstance(i, int): - data_dict = dict(input_ids=data_dict["input_ids"][0], - labels=data_dict["labels"][0]) - - # image exist in the data - if 'image' in self.list_data_dict[i]: - data_dict['image'] = image - elif self.data_args.is_multimodal: - # image does not exist in the data, but the model is multimodal - crop_size = self.data_args.image_processor.crop_size - data_dict['image'] = torch.zeros(3, crop_size['height'], crop_size['width']) - return data_dict - - -@dataclass -class DataCollatorForSupervisedDataset(object): - """Collate examples for supervised fine-tuning.""" - - tokenizer: transformers.PreTrainedTokenizer - - def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: - input_ids, labels = tuple([instance[key] for instance in instances] - for key in ("input_ids", "labels")) - input_ids = torch.nn.utils.rnn.pad_sequence( - input_ids, - batch_first=True, - padding_value=self.tokenizer.pad_token_id) - labels = torch.nn.utils.rnn.pad_sequence(labels, - batch_first=True, - padding_value=IGNORE_INDEX) - input_ids = input_ids[:, :self.tokenizer.model_max_length] - labels = labels[:, :self.tokenizer.model_max_length] - batch = dict( - input_ids=input_ids, - labels=labels, - attention_mask=input_ids.ne(self.tokenizer.pad_token_id), - ) - - if 'image' in instances[0]: - images = [instance['image'] for instance in instances] - if all(x is not None and x.shape == images[0].shape for x in images): - batch['images'] = torch.stack(images) - else: - batch['images'] = images - - return batch - - -def 
make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer, - data_args, disc_args, testing) -> Dict: - """Make dataset and collator for supervised fine-tuning.""" - - if testing == True: - data_args.image_folder = disc_args.test_image_folder - data_args.data_path = disc_args.test_data_path - - train_dataset = LazySupervisedDataset(tokenizer=tokenizer, - data_path=data_args.data_path, - data_args=data_args) - - data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer) - - else: - train_dataset = LazySupervisedDataset(tokenizer=tokenizer, - data_path=data_args.data_path, - data_args=data_args) - data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer) - - return dict(train_dataset=train_dataset, - eval_dataset=None, - data_collator=data_collator) - - -def train(attn_implementation=None): - global local_rank - - parser = transformers.HfArgumentParser( - (ModelArguments, DataArguments, TrainingArguments, DiscArguments)) - model_args, data_args, training_args, disc_args = parser.parse_args_into_dataclasses() - local_rank = training_args.local_rank - compute_dtype = (torch.float16 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32)) - model_path = os.path.expanduser(disc_args.model_path) - model_name = get_model_name_from_path(model_path) - tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, disc_args.model_base, model_name) - - model.to("cuda") - - test_data_module = make_supervised_data_module(tokenizer=tokenizer, - data_args=data_args, - disc_args=disc_args, - testing = True) - - data_collator = test_data_module['data_collator'] - - # Create DataLoader for test dataset - test_dataloader = DataLoader( - test_data_module['train_dataset'], # In this context, train_dataset is your test dataset - batch_size=4, # Adjust batch size as needed - collate_fn=data_collator, # Use the provided data_collator here - shuffle=False) - - - for i, batch in enumerate(test_dataloader): - input_ids = batch['input_ids'] - image = batch['image'] - - with torch.inference_mode(): - discrim_dict = model.forward_eval_discrim( - input_ids = input_ids, - image = image - ) - - # with open("/home/smirrashidi/test_discrim2.json", "w") as json_file: - # json.dump(eval_dict, json_file, indent=4) - - - # print(eval_dict) - -if __name__ == "__main__": - train(attn_implementation="flash_attention_2") diff --git a/llava/model/llava_arch.py b/llava/model/llava_arch.py index 535733cef..333df217c 100644 --- a/llava/model/llava_arch.py +++ b/llava/model/llava_arch.py @@ -147,6 +147,7 @@ def prepare_inputs_labels_for_multimodal( self, input_ids, position_ids, attention_mask, past_key_values, labels, images, image_sizes=None ): + torch.cuda.set_device(0) self.disc_data['image'] = [] self.disc_data['lang'] = [] diff --git a/llava/train/llava_trainer.py b/llava/train/llava_trainer.py index 94a6f76e1..324ee36b0 100644 --- a/llava/train/llava_trainer.py +++ b/llava/train/llava_trainer.py @@ -791,101 +791,3 @@ def _inner_training_loop( print(f"printing from inner_training_loop:{model.module.base_model.model.discriminator.fc1.weight}") return TrainOutput(self.state.global_step, train_loss, metrics) - - def evaluate( - self, - eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None, - ignore_keys: Optional[List[str]] = None, - metric_key_prefix: str = "eval", - ) -> Dict[str, float]: - """ - Run evaluation and returns metrics. 
- - The calling script will be responsible for providing a method to compute metrics, as they are task-dependent - (pass it to the init `compute_metrics` argument). - - You can also subclass and override this method to inject custom behavior. - - Args: - eval_dataset (Union[`Dataset`, Dict[str, `Dataset`]), *optional*): - Pass a dataset if you wish to override `self.eval_dataset`. If it is a [`~datasets.Dataset`], columns - not accepted by the `model.forward()` method are automatically removed. If it is a dictionary, it will - evaluate on each dataset, prepending the dictionary key to the metric name. Datasets must implement the - `__len__` method. - - - - If you pass a dictionary with names of datasets as keys and datasets as values, evaluate will run - separate evaluations on each dataset. This can be useful to monitor how training affects other - datasets or simply to get a more fine-grained evaluation. - When used with `load_best_model_at_end`, make sure `metric_for_best_model` references exactly one - of the datasets. If you, for example, pass in `{"data1": data1, "data2": data2}` for two datasets - `data1` and `data2`, you could specify `metric_for_best_model="eval_data1_loss"` for using the - loss on `data1` and `metric_for_best_model="eval_data1_loss"` for the loss on `data2`. - - - - ignore_keys (`List[str]`, *optional*): - A list of keys in the output of your model (if it is a dictionary) that should be ignored when - gathering predictions. - metric_key_prefix (`str`, *optional*, defaults to `"eval"`): - An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named - "eval_bleu" if the prefix is "eval" (default) - - Returns: - A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The - dictionary also contains the epoch number which comes from the training state. - """ - # handle multipe eval datasets - eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset - if isinstance(eval_dataset, dict): - metrics = {} - for eval_dataset_name, _eval_dataset in eval_dataset.items(): - dataset_metrics = self.evaluate( - eval_dataset=_eval_dataset, - ignore_keys=ignore_keys, - metric_key_prefix=f"{metric_key_prefix}_{eval_dataset_name}", - ) - metrics.update(dataset_metrics) - return metrics - - # memory metrics - must set up as early as possible - self._memory_tracker.start() - - eval_dataloader = self.get_eval_dataloader(eval_dataset) - start_time = time.time() - - eval_loop = self.evaluation_loop - output = eval_loop( - eval_dataloader, - description="Evaluation", - # No point gathering the predictions if there are no metrics, otherwise we defer to - # self.args.prediction_loss_only - prediction_loss_only=True if self.compute_metrics is None else None, - ignore_keys=ignore_keys, - metric_key_prefix=metric_key_prefix, - ) - - total_batch_size = self.args.eval_batch_size * self.args.world_size - if f"{metric_key_prefix}_jit_compilation_time" in output.metrics: - start_time += output.metrics[f"{metric_key_prefix}_jit_compilation_time"] - output.metrics.update( - speed_metrics( - metric_key_prefix, - start_time, - num_samples=output.num_samples, - num_steps=math.ceil(output.num_samples / total_batch_size), - ) - ) - - self.log(output.metrics) - - if DebugOption.TPU_METRICS_DEBUG in self.args.debug: - # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.) 
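# Editor's note: the evaluation path being deleted here (the standalone evaluate_disc script and
# this trainer override) reduces to the loop below: build a DataLoader with the supervised
# collator, run the model under inference mode, and tally how often the discriminator separates
# image tokens from language tokens. This is a hedged sketch; the batch keys and the fields
# assumed in the `forward_eval_discrim` return value are illustrative assumptions only.
import torch
from torch.utils.data import DataLoader

def evaluate_discriminator(model, test_dataset, collator, batch_size=4):
    loader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collator, shuffle=False)
    img_correct = img_total = lang_correct = lang_total = 0
    model.eval()
    with torch.inference_mode():
        for batch in loader:
            out = model.forward_eval_discrim(input_ids=batch["input_ids"], image=batch["image"])
            img_correct += int(out["img_is_correct"].sum())
            img_total += out["img_is_correct"].numel()
            lang_correct += int(out["lang_is_correct"].sum())
            lang_total += out["lang_is_correct"].numel()
    return {"img_acc": img_correct / max(img_total, 1),
            "lang_acc": lang_correct / max(lang_total, 1)}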
- xm.master_print(met.metrics_report()) - - self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, output.metrics) - - self._memory_tracker.stop_and_update_metrics(output.metrics) - - return output.metrics, self.eval_disc_data \ No newline at end of file diff --git a/llava/train/train.py b/llava/train/train.py index 2e465f0b9..5b3c774c9 100644 --- a/llava/train/train.py +++ b/llava/train/train.py @@ -787,6 +787,7 @@ def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer, def train(attn_implementation=None): global local_rank + print("Starting Training") parser = transformers.HfArgumentParser( (ModelArguments, DataArguments, TrainingArguments)) @@ -934,7 +935,7 @@ def make_inputs_require_grad(module, input, output): if training_args.freeze_mm_mlp_adapter: for p in model.get_model().mm_projector.parameters(): p.requires_grad = False - print("if this is printing then you are not training the projector") + print("\n\nif this is printing then you are not training the projector") if training_args.bits in [4, 8]: model.get_model().mm_projector.to(dtype=compute_dtype, device=training_args.device) @@ -966,6 +967,12 @@ def make_inputs_require_grad(module, input, output): for name, param in model.discriminator.named_parameters(): param.requires_grad = True + for name, param in model.discriminator.named_parameters(): + assert param.requires_grad, f"Parameter {name} does not have requires_grad set to True" + + for name, param in model.get_model().mm_projector.named_parameters(): + assert param.requires_grad, f"Parameter {name} does not have requires_grad set to True" + trainer = LLaVATrainer(model=model, tokenizer=tokenizer, args=training_args, diff --git a/scripts/v1_5/finetune_lora.sh b/scripts/v1_5/finetune_lora.sh index f53e9fb72..086694ced 100644 --- a/scripts/v1_5/finetune_lora.sh +++ b/scripts/v1_5/finetune_lora.sh @@ -16,9 +16,9 @@ deepspeed llava/train/train_mem.py \ --image_aspect_ratio pad \ --group_by_modality_length True \ --bf16 True \ - --output_dir ./checkpoints/llava-v1.5-13b-lora-9-20 \ + --output_dir ./checkpoints/llava-v1.5-13b-lora-9-23 \ --num_train_epochs 1 \ - --per_device_train_batch_size 4 \ + --per_device_train_batch_size 2 \ --per_device_eval_batch_size 4 \ --gradient_accumulation_steps 1 \ --evaluation_strategy "no" \ From 9646ea8164e9f9ab6e3bc11028753ae14b837928 Mon Sep 17 00:00:00 2001 From: smirrashidi Date: Fri, 27 Sep 2024 05:57:46 +0000 Subject: [PATCH 37/41] updated code for training - added in checkpointing and changed some forward pass logic --- llava/VLLMSafety/discriminator.py | 169 +--------------------- llava/model/language_model/llava_llama.py | 18 ++- llava/model/llava_arch.py | 2 - llava/train/llava_trainer.py | 28 +++- scripts/v1_5/finetune_lora.sh | 2 +- 5 files changed, 41 insertions(+), 178 deletions(-) diff --git a/llava/VLLMSafety/discriminator.py b/llava/VLLMSafety/discriminator.py index b771d5de0..9700df0a5 100644 --- a/llava/VLLMSafety/discriminator.py +++ b/llava/VLLMSafety/discriminator.py @@ -20,7 +20,7 @@ def linear(self, x): return x def forward(self, data, d_mode): - device = 'cuda:0' # CHANGE TO JUST CUDA WHEN NOT USING VLMEVAL + device = 'cuda' # CHANGE TO JUST CUDA WHEN NOT USING VLMEVAL loss_function = nn.BCELoss() # from DCGAN img_tok = data["image"] @@ -28,11 +28,8 @@ def forward(self, data, d_mode): img_tok = img_tok.view(-1, 5120) # image tokens have dim=3 - img_pred = self.linear(img_tok).to(device) # BCE expects output from a sigmoid (i think) - lang_pred = 
self.linear(lang_tok).to(device) - - # img_pred = img_pred.to(device) # for testing dsicrim, remove when training - # lang_pred = lang_pred.to(device) # for testing discrim, remove when training (i think?) its not like training even works + img_pred = self.linear(img_tok) + lang_pred = self.linear(lang_tok) if d_mode == True: @@ -63,162 +60,4 @@ def forward(self, data, d_mode): img_with_lang_label_loss = loss_function(img_pred, lang_label) # trying to follow DCGAN return img_with_lang_label_loss # returning image loss to maximize disc loss when training generator - -# class Discriminator: - -# def __init__(self): -# self.model = EasyNeuralNetwork(5120, 2) - -# # def evaluate(self, model, loss_function, X, y): -# def evaluate(self, loss_function, X, y): -# # predictions = model(X) # pass thorugh model -# predictions = self.model(X) -# # print("shape of y: ", y.shape) -# # print("prediction: ", predictions) -# loss = loss_function(predictions, y) -# predictions = predictions.argmax(dim=1).cpu().numpy() -# acc = (predictions == y.cpu().numpy()).mean() -# return predictions, acc, loss - - -# def call_discrim(self, data): -# device = 'cuda' -# loss_function = nn.BCELoss() # from DCGAN - -# img_tok = data["image"] -# lang_tok = data["lang"] - -# img_label = torch.full((img_tok.size(0),), 1, dtype=torch.float, device=device) # 1 for images -# lang_label = torch.full((lang_tok.size(0),), 0, dtype=torch.float, device=device) # 0 for language - -# _, _, img_loss = self.evaluate(self.model, loss_function, img_tok, img_label) -# _, _, lang_loss = self.evaluate(self.model, loss_function, lang_tok, lang_label) - -# final_loss = img_loss + lang_loss - -# return final_loss - -# def train(self,training_dataloader, IMAGE_SHAPE=1024 * 5, NUM_CLASSES=2, device='cuda', EPOCHS=1): -# self.model.train(mode=True) -# self.model.to(device) # put the model on the device (remember its cuda on workstation) -# optimizer = optim.Adam(self.model.parameters(), lr=0.001) -# loss_function = nn.CrossEntropyLoss() - -# epochs_acc = [] -# for epoch in range(EPOCHS): -# print(f'Epoch {epoch + 1}') -# epoch_acc = [] -# training_acc_checkpoint, training_loss_checkpoint = [], [] -# for step, (data, labels) in enumerate(training_dataloader): -# data = data.float().unsqueeze(0) -# labels = labels.unsqueeze(0) - -# data, labels = data.to(device), labels.to(device) # Convert labels to tensor if not already - -# predictions, acc, loss = self.evaluate(self.model, loss_function, data, labels) -# training_acc_checkpoint.append(acc) -# epoch_acc.append(acc) - -# # loss already calculated in the evaluate() call. 
just append it -# training_loss_checkpoint.append(loss.item()) - -# # back propagation -# loss.backward() - -# # gradient descent -# optimizer.step() - -# # zero the gradients so they do not accumulate -# optimizer.zero_grad() - -# # epoch end -# print("Accuracy: ", np.mean(epoch_acc)) -# epochs_acc.append(np.mean(epoch_acc)) - -# # can do some optimizations here if you want early stopping, right now im not gonna implement this - -# self.model.train(mode=False) # exit training mode - -# return epochs_acc, self.model - - -# # def test(): -# # model.train(False) # since were testing - -# # test_loss = [] -# # test_acc = [] - -# # for X,y in test_loader: -# # with torch.no_grad(): -# # X, y = X.to(device), y.to(device) -# # predictions = model(X) #as above: check dimentions - -# # loss = loss_function(predictions, y) -# # test_loss.append(loss.item()) - -# # test_acc.append((predictions.argmax(dim=1).cpu().numpy() == y.cpu().numpy()).mean()) - -# # print(f'Accuracy: {np.mean(test_acc):.2f}, Loss: {np.mean(test_loss):.2f}') - -# # return test_acc #idc about test_loss - - -# def preprocess_and_call_train(self,get_tkns): -# # set device to cpu -# device = 'cuda' if torch.cuda.is_available() else 'cpu' # if we are running this on workstation change this to cuda - -# # Example data loading (assuming you have loaded im_tok and lang_tok) - -# im_tok = get_tkns["image"] -# lang_tok = get_tkns["lang"] - -# lang_tok_list = [] -# for tensor in lang_tok: -# for i in range(tensor.size(0)): -# lang_tok_list.append(tensor[i, :]) - -# im_tok_list = [] -# for tensor in im_tok: -# for i in range(tensor.size(0)): -# for j in range(tensor.size(1)): -# im_tok_list.append(tensor[i, j, :]) - -# # print("image tokens arr length: ", len(im_tok)) -# # print("image tokens[0] shape: ", im_tok[0].shape) # image tokens[0] shape: torch.Size([16, 576, 5120]) - -# # print("lang tokens arr length: ", len(lang_tok)) -# # print("lang tokens[0] shape: ", lang_tok[0].shape) # lang tokens[0] shape: torch.Size([1277, 5120]) - - -# combined_tokens = [(torch.tensor(token), torch.tensor(0)) for token in im_tok_list] + [(torch.tensor(token), torch.tensor(1)) for token in lang_tok_list] - -# print("im_tok: ", im_tok[0].shape) -# print("lang_tok: ", lang_tok_list[0].shape) - -# # Optionally shuffle the combined list to randomize the order -# random.shuffle(combined_tokens) - -# # testing code... if our embeddings are the wrong side we are doing something wrong. 
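# Editor's note: a compact sketch of what the commented-out training helper above is doing:
# flatten projected image features and language embeddings into individual hidden-size vectors,
# attach modality labels, and shuffle them into one list for discriminator training. The helper
# name is hypothetical, and the labels follow the live Discriminator convention (1 = image,
# 0 = language) rather than the older commented-out one.
import random
import torch

def build_token_dataset(image_feats, lang_feats, hidden=5120):
    # image_feats: list of [batch, patches, hidden] tensors; lang_feats: list of [tokens, hidden]
    examples = [(tok, 1) for t in image_feats for tok in t.reshape(-1, hidden)]
    examples += [(tok, 0) for t in lang_feats for tok in t.reshape(-1, hidden)]
    random.shuffle(examples)
    assert examples[0][0].numel() == hidden, "token width must match the discriminator input"
    return examples

# tiny usage example with random features; shapes are purely illustrative
pairs = build_token_dataset([torch.randn(2, 576, 5120)], [torch.randn(40, 5120)])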
-# print("combined_tokens[0][0].flatten().size(): ", combined_tokens[0][0].flatten().size()) -# assert combined_tokens[0][0].flatten().size() == torch.Size([1024*5]), ("flattened image tokens fed to discriminator do not match the size of " -# "disc first layer") -# print("combined_tokens[-1][0].flatten().size(): ", combined_tokens[-1][0].flatten().size()) -# assert combined_tokens[-1][0].flatten().size() == torch.Size([1024*5]), ("flattened language tokens fed to discriminator do not match the size " -# "of disc first layer") - -# # train network -# epochs_acc, model = self.train(combined_tokens, device=device) - - -# if( len(epochs_acc) > 0 ): -# print("-----------final epochs acc--------------: ", epochs_acc[-1]) - -# # not gonna do any eval for now -# # test_acc = test() - -# # save the model -# # PATH = 'models/desc_v1_llava.pth' -# # torch.save(model, PATH) - -# return model - + \ No newline at end of file diff --git a/llava/model/language_model/llava_llama.py b/llava/model/language_model/llava_llama.py index 683009b8e..64c48993f 100644 --- a/llava/model/language_model/llava_llama.py +++ b/llava/model/language_model/llava_llama.py @@ -1,4 +1,4 @@ -# Copyright 2023 Haotian Liu + # Copyright 2023 Haotian Liu # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -114,12 +114,17 @@ def forward( ) d_loss = discrim_dict["loss"] + + data = {'disc loss': d_loss.item()} + with open('/home/smirrashidi/loss_9-24.json', 'a') as f: + json.dump(data, f) + f.write('\n') model_output.loss = d_loss # returning only discriminator loss return model_output else: - discrim_dict = self.discriminator.forward(self.disc_data, d_mode=True) # d loss is sum of disc loss on images and lang; same call in both if and else + d_loss = self.discriminator.forward(self.disc_data, d_mode=False) # d loss is sum of disc loss on images and lang; same call in both if and else model_output = super().forward( input_ids=input_ids, attention_mask=attention_mask, @@ -132,11 +137,14 @@ def forward( output_hidden_states=output_hidden_states, return_dict=return_dict ) - - d_loss = discrim_dict["loss"] model_output.loss = model_output.loss + d_loss # returning sum of model and discriminator loss + data = {'model loss': model_output.loss.item()} + with open('/home/smirrashidi/loss_9-24.json', 'a') as f: + json.dump(data, f) + f.write('\n') + return model_output def forward_eval_discrim( @@ -230,4 +238,4 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs['images'] = images if image_sizes is not None: inputs['image_sizes'] = image_sizes - return inputs + return inputs \ No newline at end of file diff --git a/llava/model/llava_arch.py b/llava/model/llava_arch.py index 333df217c..39c1afd1e 100644 --- a/llava/model/llava_arch.py +++ b/llava/model/llava_arch.py @@ -138,7 +138,6 @@ def get_vision_tower(self): return self.get_model().get_vision_tower() def encode_images(self, images): - # images = images.to(torch.bfloat16) if images.dtype != torch.bfloat16 else images # added for testing the discriminator, not part of source code image_features = self.get_model().get_vision_tower()(images) image_features = self.get_model().mm_projector(image_features) return image_features @@ -147,7 +146,6 @@ def prepare_inputs_labels_for_multimodal( self, input_ids, position_ids, attention_mask, past_key_values, labels, images, image_sizes=None ): - torch.cuda.set_device(0) self.disc_data['image'] = [] self.disc_data['lang'] = [] diff --git 
a/llava/train/llava_trainer.py b/llava/train/llava_trainer.py index 324ee36b0..fe05d601d 100644 --- a/llava/train/llava_trainer.py +++ b/llava/train/llava_trainer.py @@ -1,3 +1,4 @@ +llava trainer import os import torch import torch.nn as nn @@ -6,6 +7,7 @@ from packaging import version import time import deepspeed +import random import sys import json @@ -599,8 +601,8 @@ def _inner_training_loop( rng_to_sync = True step = -1 - for step, inputs in enumerate(epoch_iterator): - inputs['d_mode'] = True if (step % 2 == 0) else False # set d_mode + for step, inputs in enumerate(epoch_iterator): + inputs["d_mode"] = True if step % 2 == 0 else False total_batched_samples += 1 if self.args.include_num_input_tokens_seen: @@ -700,6 +702,24 @@ def _inner_training_loop( self.state.epoch = epoch + (step + 1 + steps_skipped) / steps_in_epoch self.control = self.callback_handler.on_step_end(args, self.state, self.control) + print(self.state.epoch) + + if abs(self.state.epoch - 0.01) < 1e-4: + print(f"Saving checkpoint at epoch {self.state.epoch}") + self._save_checkpoint(model, trial=None) # only saves the mm_projector weights, which can be passed into the model later + + elif abs(self.state.epoch - 0.25) < 1e-4: + print(f"Saving checkpoint at epoch {self.state.epoch}") + self._save_checkpoint(model, trial=None) + + elif abs(self.state.epoch - 0.5) < 1e-4: + print(f"Saving checkpoint at epoch {self.state.epoch}") + self._save_checkpoint(model, trial=None) + + elif abs(self.state.epoch - 0.75) < 1e-4: + print(f"Saving checkpoint at epoch {self.state.epoch}") + self._save_checkpoint(model, trial=None) + self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval) else: self.control = self.callback_handler.on_substep_end(args, self.state, self.control) @@ -788,6 +808,4 @@ def _inner_training_loop( if self.neftune_noise_alpha is not None: self._deactivate_neftune(self.model) - print(f"printing from inner_training_loop:{model.module.base_model.model.discriminator.fc1.weight}") - - return TrainOutput(self.state.global_step, train_loss, metrics) + return TrainOutput(self.state.global_step, train_loss, metrics) \ No newline at end of file diff --git a/scripts/v1_5/finetune_lora.sh b/scripts/v1_5/finetune_lora.sh index 086694ced..c2cdf24d6 100644 --- a/scripts/v1_5/finetune_lora.sh +++ b/scripts/v1_5/finetune_lora.sh @@ -16,7 +16,7 @@ deepspeed llava/train/train_mem.py \ --image_aspect_ratio pad \ --group_by_modality_length True \ --bf16 True \ - --output_dir ./checkpoints/llava-v1.5-13b-lora-9-23 \ + --output_dir ./checkpoints/llava-v1.5-13b-lora-9-26 \ --num_train_epochs 1 \ --per_device_train_batch_size 2 \ --per_device_eval_batch_size 4 \ From b1ebae6f1ed446ad477e212b47c978d0f6b0df87 Mon Sep 17 00:00:00 2001 From: smirrashidi Date: Fri, 27 Sep 2024 21:36:57 +0000 Subject: [PATCH 38/41] set up intermittent checkpointing and logic fixes --- llava/VLLMSafety/discriminator.py | 2 +- llava/train/llava_trainer.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/llava/VLLMSafety/discriminator.py b/llava/VLLMSafety/discriminator.py index 9700df0a5..71f14a38b 100644 --- a/llava/VLLMSafety/discriminator.py +++ b/llava/VLLMSafety/discriminator.py @@ -20,7 +20,7 @@ def linear(self, x): return x def forward(self, data, d_mode): - device = 'cuda' # CHANGE TO JUST CUDA WHEN NOT USING VLMEVAL + device = 'cuda' loss_function = nn.BCELoss() # from DCGAN img_tok = data["image"] diff --git a/llava/train/llava_trainer.py b/llava/train/llava_trainer.py index fe05d601d..a294f7f77 100644 --- 
a/llava/train/llava_trainer.py +++ b/llava/train/llava_trainer.py @@ -1,4 +1,3 @@ -llava trainer import os import torch import torch.nn as nn From 14e2b1800da9aa3746802c35863620c49780bb0b Mon Sep 17 00:00:00 2001 From: Laya Date: Thu, 31 Oct 2024 17:42:09 -0700 Subject: [PATCH 39/41] parallel rewrite for multigpu training --- llava/VLLMSafety/discriminator.py | 62 +++++++++-------- llava/model/language_model/llava_llama.py | 14 ++-- llava/model/llava_arch.py | 8 ++- llava/train/llava_trainer.py | 85 ++++++++++++++++++++--- 4 files changed, 121 insertions(+), 48 deletions(-) diff --git a/llava/VLLMSafety/discriminator.py b/llava/VLLMSafety/discriminator.py index 71f14a38b..0a20a7027 100644 --- a/llava/VLLMSafety/discriminator.py +++ b/llava/VLLMSafety/discriminator.py @@ -21,43 +21,49 @@ def linear(self, x): def forward(self, data, d_mode): device = 'cuda' - loss_function = nn.BCELoss() # from DCGAN + loss_function = nn.BCELoss() # follow DCgan - img_tok = data["image"] - lang_tok = data["lang"] - - img_tok = img_tok.view(-1, 5120) # image tokens have dim=3 + image_batch = data['image'][0].view(-1, 5120).to(device) + img_tok = image_batch.view(-1, 5120) # flatten the lists img_pred = self.linear(img_tok) - lang_pred = self.linear(lang_tok) + img_label = torch.full((img_tok.size(0), 1), 1, dtype=torch.bfloat16, device=device) # use label 1 for imgs + img_loss = loss_function(img_pred, img_label) + + total_lang_loss = 0 + lang_correct_count = 0 + total_lang_preds = 0 + img_correct_count = torch.eq(torch.ge(img_pred, 0.5).float().to(torch.bfloat16), img_label).sum().item() + img_accuracy = img_correct_count / img_tok.size(0) * 100 + + for lang_tensor in data["lang"]: + lang_tensor = lang_tensor.to(device) + lang_pred = self.linear(lang_tensor.view(-1, 5120)) # Process each lang tensor independently + lang_label = torch.full((lang_pred.size(0), 1), 0, dtype=torch.bfloat16, device=device) # Label 0 for language - if d_mode == True: + lang_loss = loss_function(lang_pred, lang_label) + total_lang_loss += lang_loss - img_label = torch.full((img_tok.size(0), 1), 1, dtype=torch.bfloat16, device=device) # 1 for images - lang_label = torch.full((lang_tok.size(0), 1), 0, dtype=torch.bfloat16, device=device) # 0 for lang + #for accuracy calculations + lang_correct = torch.eq(torch.ge(lang_pred, 0.5).float().to(torch.bfloat16), lang_label).sum().item() + lang_correct_count += lang_correct + total_lang_preds += lang_pred.size(0) - img_loss = loss_function(img_pred, img_label) - lang_loss = loss_function(lang_pred, lang_label) + if d_mode: + lang_accuracy = lang_correct_count / total_lang_preds * 100 + print(f"Image Accuracy: {img_accuracy:.2f}%") + print(f"Language Accuracy: {lang_accuracy:.2f}%") - loss = img_loss + lang_loss + loss = img_loss + total_lang_loss - img_pred_binary = torch.ge(img_pred, 0.5).float().to(torch.bfloat16) - lang_pred_binary = torch.ge(lang_pred, 0.5).float().to(torch.bfloat16) # >= because we want the tensor to be all 0s if each value is less than 0.5 - - img_is_correct = torch.eq(img_pred_binary, img_label) - lang_is_correct = torch.eq(lang_pred_binary, lang_label) - - return_dict = { + return { "loss": loss, - "img_is_correct" : img_is_correct, - "lang_is_correct": lang_is_correct, + "img_is_correct": img_correct_count, + "lang_is_correct": lang_correct_count, + "img_accuracy": img_accuracy, + "lang_accuracy": lang_accuracy, } - - return return_dict else: - lang_label = torch.full((img_tok.size(0), 1), 0, dtype=torch.bfloat16, device=device) # 0 for lang - 
img_with_lang_label_loss = loss_function(img_pred, lang_label) # trying to follow DCGAN - - return img_with_lang_label_loss # returning image loss to maximize disc loss when training generator - \ No newline at end of file + img_with_lang_label_loss = loss_function(img_pred, torch.full((img_tok.size(0), 1), 0, dtype=torch.bfloat16, device=device)) + return img_with_lang_label_loss \ No newline at end of file diff --git a/llava/model/language_model/llava_llama.py b/llava/model/language_model/llava_llama.py index 64c48993f..63c51cb3a 100644 --- a/llava/model/language_model/llava_llama.py +++ b/llava/model/language_model/llava_llama.py @@ -29,6 +29,7 @@ from transformers.modeling_utils import * from transformers.modeling_utils import _add_variant +import wandb class LlavaConfig(LlamaConfig): @@ -116,9 +117,9 @@ def forward( d_loss = discrim_dict["loss"] data = {'disc loss': d_loss.item()} - with open('/home/smirrashidi/loss_9-24.json', 'a') as f: - json.dump(data, f) - f.write('\n') + # with open('/home/smirrashidi/loss_9-24.json', 'a') as f: + # json.dump(data, f) + # f.write('\n') model_output.loss = d_loss # returning only discriminator loss @@ -139,11 +140,12 @@ def forward( ) model_output.loss = model_output.loss + d_loss # returning sum of model and discriminator loss + wandb.log({"generator_disc loss": d_loss}) data = {'model loss': model_output.loss.item()} - with open('/home/smirrashidi/loss_9-24.json', 'a') as f: - json.dump(data, f) - f.write('\n') + # with open('/home/smirrashidi/loss_9-24.json', 'a') as f: + # json.dump(data, f) + # f.write('\n') return model_output diff --git a/llava/model/llava_arch.py b/llava/model/llava_arch.py index 39c1afd1e..808d1a9f4 100644 --- a/llava/model/llava_arch.py +++ b/llava/model/llava_arch.py @@ -157,7 +157,8 @@ def prepare_inputs_labels_for_multimodal( if type(images) is list: images = [x.unsqueeze(0) if x.ndim == 3 else x for x in images] concat_images = torch.cat([image for image in images], dim=0) - image_features = self.encode_images(concat_images) + raw_image_features = self.encode_images(concat_images) + image_features = raw_image_features split_sizes = [image.shape[0] for image in images] image_features = torch.split(image_features, split_sizes, dim=0) mm_patch_merge_type = getattr(self.config, 'mm_patch_merge_type', 'flat') @@ -203,8 +204,9 @@ def prepare_inputs_labels_for_multimodal( raise ValueError(f"Unexpected mm_patch_merge_type: {self.config.mm_patch_merge_type}") else: image_features = self.encode_images(images) + raw_image_features = image_features - self.disc_data['image'] = image_features + self.disc_data['image'].append(raw_image_features) # TODO: image start / end is not implemented here to support pretraining. 
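# Editor's note: a hedged sketch of the pattern the llava_arch.py hunk above introduces. During
# multimodal input preparation the projector outputs are stashed on the model (and, a few lines
# further down, the matching language embeddings are appended as well) so the discriminator can
# later be run on exactly the features the generator produced. Class and attribute names below
# are simplified assumptions, not the repository's API.
import torch
import torch.nn as nn

class TinyMultimodalStub(nn.Module):
    def __init__(self, vision_dim=1024, hidden=5120):
        super().__init__()
        self.mm_projector = nn.Linear(vision_dim, hidden)
        self.disc_data = {"image": [], "lang": []}    # cleared at the top of each forward pass

    def encode_images(self, vision_feats):
        projected = self.mm_projector(vision_feats)   # [batch, patches, hidden]
        self.disc_data["image"].append(projected)     # side channel read by the discriminator
        return projected

stub = TinyMultimodalStub()
feats = stub.encode_images(torch.randn(1, 576, 1024))  # stub.disc_data["image"] now holds them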
if getattr(self.config, 'tune_mm_mlp_adapter', False) and getattr(self.config, 'mm_use_im_start_end', False): @@ -258,7 +260,7 @@ def prepare_inputs_labels_for_multimodal( #curr input embeds is coming from cur_input_ids_noim which means its already filitered - self.disc_data['lang'] = cur_input_embeds + self.disc_data['lang'].append(cur_input_embeds_no_im[1]) #print(f'self.disc_data: {self.disc_data}\n') cur_new_input_embeds = [] diff --git a/llava/train/llava_trainer.py b/llava/train/llava_trainer.py index a294f7f77..2968ea915 100644 --- a/llava/train/llava_trainer.py +++ b/llava/train/llava_trainer.py @@ -47,11 +47,16 @@ ) from typing import List, Optional, Union +import wandb TRAINER_STATE_NAME = "trainer_state.json" lr = 0.0002 beta1 = 0.5 - + +#os.environ['WANDB_MODE'] = 'disabled' +wandb.init( + project="llava_safety" +) def maybe_zero_3(param, ignore_status=False, name=None): from deepspeed import zero @@ -218,18 +223,14 @@ def create_optimizer(self): decay_parameters = [name for name in decay_parameters if "bias" not in name] if self.args.mm_projector_lr is not None: projector_parameters = [name for name, _ in opt_model.named_parameters() if "mm_projector" in name] + discriminator_parameters = [name for name, _ in opt_model.named_parameters() if "discriminator" in name] optimizer_grouped_parameters = [ { "params": [ - p for n, p in opt_model.named_parameters() if (n in decay_parameters and n not in projector_parameters and p.requires_grad) - ], - "weight_decay": self.args.weight_decay, - }, - { - "params": [ - p for n, p in opt_model.named_parameters() if (n not in decay_parameters and n not in projector_parameters and p.requires_grad) + p for n, p in opt_model.named_parameters() if (n in discriminator_parameters and p.requires_grad) ], - "weight_decay": 0.0, + "weight_decay": 0, # TODO: this can be a hyperparameter + "lr": lr, }, { "params": [ @@ -246,7 +247,7 @@ def create_optimizer(self): "lr": self.args.mm_projector_lr, }, ] - else: + else: # our code will never go here optimizer_grouped_parameters = [ { "params": [ @@ -262,6 +263,7 @@ def create_optimizer(self): }, ] + optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(self.args) self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs) @@ -280,6 +282,12 @@ def create_optimizer(self): logger.info(f"skipped: {skipped/2**20}M params") self.d_optimizer = optim.Adam(opt_model.discriminator.parameters(), lr= lr, betas=(beta1, 0.999)) # how to get discriminator parameters? 
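# Editor's note: a minimal sketch of the two-optimizer arrangement assembled in this
# create_optimizer hunk (together with the freezing loop that follows right after it), assuming
# a model that exposes `discriminator` and `mm_projector` submodules: DCGAN-style Adam for the
# discriminator, a separate optimizer for the projector acting as the generator, and every
# other parameter frozen. The helper name and learning rates are illustrative assumptions.
import torch.nn as nn
import torch.optim as optim

def build_gan_optimizers(model: nn.Module, d_lr=2e-4, g_lr=2e-5, beta1=0.5):
    for name, param in model.named_parameters():
        param.requires_grad = ("discriminator" in name) or ("mm_projector" in name)
    d_optimizer = optim.Adam(model.discriminator.parameters(), lr=d_lr, betas=(beta1, 0.999))
    g_optimizer = optim.AdamW(model.mm_projector.parameters(), lr=g_lr)
    return d_optimizer, g_optimizer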
+ + for name, param in opt_model.named_parameters(): + if 'mm_projector' not in name and 'discriminator' not in name: + param.requires_grad = False + + # turn off all the params in the model that are not part of the projector or discriminator return self.optimizer @@ -807,4 +815,59 @@ def _inner_training_loop( if self.neftune_noise_alpha is not None: self._deactivate_neftune(self.model) - return TrainOutput(self.state.global_step, train_loss, metrics) \ No newline at end of file + return TrainOutput(self.state.global_step, train_loss, metrics) + + def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor: + """ + gan style, compute d_loss and g_loss and update optimizers accordingly + """ + model.train() + inputs = self._prepare_inputs(inputs) + + # get d loss + d_loss = self._compute_loss_for_discriminator(model, inputs) + self._backward_pass(d_loss, self.d_optimizer, update_optimizer=True, loss_name="discriminator_loss") + + # get g loss + g_loss = self._compute_loss_for_generator(model, inputs) + self._backward_pass(g_loss, self.optimizer, update_optimizer=False, loss_name="generator_loss") + + + total_loss = d_loss.detach() + g_loss.detach() + return total_loss / self.args.gradient_accumulation_steps + + def _compute_loss_for_discriminator(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor: + inputs['d_mode'] = True # enable discriminator mode + with self.compute_loss_context_manager(): + d_loss = self.compute_loss(model, inputs) + + if self.args.n_gpu > 1: + d_loss = d_loss.mean() # average loss across multiple GPUs + + return d_loss + + def _compute_loss_for_generator(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor: + inputs['d_mode'] = False # enable generator mode + with self.compute_loss_context_manager(): + g_loss = self.compute_loss(model, inputs) + + if self.args.n_gpu > 1: + g_loss = g_loss.mean() # Average loss across multiple GPUs + + return g_loss + + def _backward_pass(self, loss: torch.Tensor, optimizer, update_optimizer: bool, loss_name: str): + + if self.use_apex: + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + else: + self.accelerator.backward(loss) # backwards pass + + # only update d_optimizer (we want g_optimizer to go through grad clips) + if update_optimizer: + optimizer.step() + optimizer.zero_grad() + + # Log the loss using WandB + wandb.log({loss_name: loss.item()}) \ No newline at end of file From bfae3425497929f65d12b85129673ccd4904c6aa Mon Sep 17 00:00:00 2001 From: Laya Date: Thu, 31 Oct 2024 18:14:57 -0700 Subject: [PATCH 40/41] eval params --- llava/model/language_model/llava_llama.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/llava/model/language_model/llava_llama.py b/llava/model/language_model/llava_llama.py index 63c51cb3a..a5185b63d 100644 --- a/llava/model/language_model/llava_llama.py +++ b/llava/model/language_model/llava_llama.py @@ -56,11 +56,16 @@ def __init__(self, config): "image": None, "lang": None, } - self.discriminator = Discriminator(5120) # hard coding in sizes for now + + self.eval_mode = False + + if not self.eval_mode: + self.discriminator = Discriminator(5120) # hard coding in sizes for now # Initialize weights and apply final processing self.post_init() + def get_model(self): return self.model @@ -98,6 +103,20 @@ def forward( images, image_sizes ) + + if self.eval_mode: + return super().forward( + input_ids=input_ids, + 
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                inputs_embeds=inputs_embeds,
+                labels=labels,
+                use_cache=use_cache,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
 
         if d_mode == True:
             discrim_dict = self.discriminator.forward(self.disc_data, d_mode=True) # d loss is sum of disc loss on images and lang

From 0740ec736945df2357d05f24cdfa2f47eab9b3f7 Mon Sep 17 00:00:00 2001
From: Laya
Date: Tue, 5 Nov 2024 12:49:45 -0800
Subject: [PATCH 41/41] dmode off for testing

---
 llava/model/language_model/llava_llama.py | 17 +++++++++--------
 llava/train/llava_trainer.py              | 14 ++++----------
 2 files changed, 13 insertions(+), 18 deletions(-)

diff --git a/llava/model/language_model/llava_llama.py b/llava/model/language_model/llava_llama.py
index a5185b63d..897428bff 100644
--- a/llava/model/language_model/llava_llama.py
+++ b/llava/model/language_model/llava_llama.py
@@ -117,6 +117,9 @@ def forward(
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
         )
+
+        d_mode=False # if you want to turn off the disc completely
+
         if d_mode == True:
             discrim_dict = self.discriminator.forward(self.disc_data, d_mode=True) # d loss is sum of disc loss on images and lang
@@ -144,7 +147,7 @@ def forward(
             return model_output
 
         else:
-            d_loss = self.discriminator.forward(self.disc_data, d_mode=False) # d loss is sum of disc loss on images and lang; same call in both if and else
+            # d_loss = self.discriminator.forward(self.disc_data, d_mode=False) # d loss is sum of disc loss on images and lang; same call in both if and else
             model_output = super().forward(
                 input_ids=input_ids,
                 attention_mask=attention_mask,
                 position_ids=position_ids,
                 past_key_values=past_key_values,
                 inputs_embeds=inputs_embeds,
                 labels=labels,
                 use_cache=use_cache,
                 output_attentions=output_attentions,
                 output_hidden_states=output_hidden_states,
                 return_dict=return_dict
             )
-
-            model_output.loss = model_output.loss + d_loss # returning sum of model and discriminator loss
-            wandb.log({"generator_disc loss": d_loss})
+
+            #wandb.log({"generator_disc loss": d_loss})
             wandb.log({"generator loss": model_output.loss})
-            data = {'model loss': model_output.loss.item()}
-            # with open('/home/smirrashidi/loss_9-24.json', 'a') as f:
-            #     json.dump(data, f)
-            #     f.write('\n')
+            model_output.loss = model_output.loss #+ d_loss # returning sum of model and discriminator loss
+            wandb.log({"summed loss": model_output.loss})
 
             return model_output
 
diff --git a/llava/train/llava_trainer.py b/llava/train/llava_trainer.py
index 2968ea915..0ee1022da 100644
--- a/llava/train/llava_trainer.py
+++ b/llava/train/llava_trainer.py
@@ -690,13 +690,7 @@ def _inner_training_loop(
         )
 
         # Optimizer step
-
-        if inputs["d_mode"] == True:
-            self.d_optimizer.step()
-            model.module.base_model.model.discriminator.zero_grad()
-
-        else:
-            self.optimizer.step()
+        self.optimizer.step()
 
         optimizer_was_run = not self.accelerator.optimizer_step_was_skipped
         if optimizer_was_run:
@@ -825,8 +819,8 @@ def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor,
         inputs = self._prepare_inputs(inputs)
 
         # get d loss
-        d_loss = self._compute_loss_for_discriminator(model, inputs)
-        self._backward_pass(d_loss, self.d_optimizer, update_optimizer=True, loss_name="discriminator_loss")
+        #d_loss = self._compute_loss_for_discriminator(model, inputs)
+        #self._backward_pass(d_loss, self.d_optimizer, update_optimizer=True, loss_name="discriminator_loss")
 
         # get g loss
         g_loss = self._compute_loss_for_generator(model, inputs)
@@ -870,4 +864,4 @@ def _backward_pass(self, loss: torch.Tensor, optimizer, update_optimizer: bool,
optimizer.zero_grad() # Log the loss using WandB - wandb.log({loss_name: loss.item()}) \ No newline at end of file + #wandb.log({loss_name: loss.item()}) \ No newline at end of file
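
For readers following the patches above, the sketch below illustrates, outside the LLaVA and Trainer machinery, the alternating update that training_step and the separate d_optimizer implement: a discriminator learns to tell projected image embeddings from language embeddings, while the projector (the "generator") is pushed to make its outputs indistinguishable from language. This is a minimal, self-contained approximation, not the repository's code: the toy projector and discriminator modules, the BCE real/fake objective, and the random tensors are assumptions; only the Adam settings (lr = 0.0002, beta1 = 0.5) are taken from the constants in llava_trainer.py.

# Minimal sketch of the alternating GAN update (assumed BCE real/fake objective;
# the real Discriminator, mm_projector, and embeddings in the patches differ).
import torch
import torch.nn as nn
import torch.optim as optim

torch.manual_seed(0)
embed_dim = 64

projector = nn.Linear(32, embed_dim)            # stand-in for the mm_projector ("generator")
discriminator = nn.Sequential(                  # stand-in for the Discriminator module
    nn.Linear(embed_dim, 128), nn.ReLU(), nn.Linear(128, 1))
bce = nn.BCEWithLogitsLoss()

# Same Adam settings as llava_trainer.py: lr = 0.0002, beta1 = 0.5
d_optimizer = optim.Adam(discriminator.parameters(), lr=0.0002, betas=(0.5, 0.999))
g_optimizer = optim.Adam(projector.parameters(), lr=0.0002, betas=(0.5, 0.999))

for step in range(200):
    clip_feats = torch.randn(16, 32)             # placeholder CLIP features
    lang_embeds = torch.randn(16, embed_dim) + 1 # placeholder language embeddings

    # d_mode=True: update only the discriminator (projector outputs detached),
    # mirroring the step taken with d_optimizer in training_step.
    img_embeds = projector(clip_feats).detach()
    d_loss = bce(discriminator(img_embeds), torch.zeros(16, 1)) + \
             bce(discriminator(lang_embeds), torch.ones(16, 1))
    d_optimizer.zero_grad()
    d_loss.backward()
    d_optimizer.step()

    # d_mode=False: update the projector so its image embeddings look "language-like"
    # to the discriminator, mirroring the d_loss term earlier patches added to model_output.loss.
    img_embeds = projector(clip_feats)
    g_loss = bce(discriminator(img_embeds), torch.ones(16, 1))
    g_optimizer.zero_grad()
    g_loss.backward()
    g_optimizer.step()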