diff --git a/finetune.ipynb b/finetune.ipynb new file mode 100644 index 000000000..1422d990d --- /dev/null +++ b/finetune.ipynb @@ -0,0 +1,288 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Tested in Colab (2024.03.23) on a T4 GPU, without bf16 or tf32" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Author: https://github.com/TongmengXie/" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QkGSpvuMcCjQ" + }, + "source": [ + "# Pre-run" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "WlYzUFFmSlXd", + "outputId": "9d6178cb-6710-45a5-96e9-3c3a9dd5816d" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting flash-attn\n", + " Downloading flash_attn-2.5.6.tar.gz (2.5 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.5/2.5 MB\u001b[0m \u001b[31m17.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (from flash-attn) (2.2.1+cu121)\n", + "Collecting einops (from flash-attn)\n", + " Downloading einops-0.7.0-py3-none-any.whl (44 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m44.6/44.6 kB\u001b[0m \u001b[31m7.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from flash-attn) (24.0)\n", + "Requirement already satisfied: ninja in /usr/local/lib/python3.10/dist-packages (from flash-attn) (1.11.1.1)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (3.13.1)\n", + "Requirement already satisfied: typing-extensions>=4.8.0 in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (4.10.0)\n", + "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (1.12)\n", + "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (3.2.1)\n", + "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (3.1.3)\n", + "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (2023.6.0)\n", + "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (12.1.105)\n", + "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (12.1.105)\n", + "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (12.1.105)\n", + "Requirement already satisfied: nvidia-cudnn-cu12==8.9.2.26 in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (8.9.2.26)\n", + "Requirement already satisfied: nvidia-cublas-cu12==12.1.3.1 in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (12.1.3.1)\n", + "Requirement already satisfied: nvidia-cufft-cu12==11.0.2.54 in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (11.0.2.54)\n", + "Requirement already satisfied: nvidia-curand-cu12==10.3.2.106 in /usr/local/lib/python3.10/dist-packages (from 
torch->flash-attn) (10.3.2.106)\n", + "Requirement already satisfied: nvidia-cusolver-cu12==11.4.5.107 in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (11.4.5.107)\n", + "Requirement already satisfied: nvidia-cusparse-cu12==12.1.0.106 in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (12.1.0.106)\n", + "Requirement already satisfied: nvidia-nccl-cu12==2.19.3 in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (2.19.3)\n", + "Requirement already satisfied: nvidia-nvtx-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (12.1.105)\n", + "Requirement already satisfied: triton==2.2.0 in /usr/local/lib/python3.10/dist-packages (from torch->flash-attn) (2.2.0)\n", + "Requirement already satisfied: nvidia-nvjitlink-cu12 in /usr/local/lib/python3.10/dist-packages (from nvidia-cusolver-cu12==11.4.5.107->torch->flash-attn) (12.4.99)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch->flash-attn) (2.1.5)\n", + "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch->flash-attn) (1.3.0)\n", + "Building wheels for collected packages: flash-attn\n", + " Building wheel for flash-attn (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for flash-attn: filename=flash_attn-2.5.6-cp310-cp310-linux_x86_64.whl size=120592258 sha256=d8cf54adda65f59820221d329d274e124972d7fdc05ab3b1130253c64eee6c8a\n", + " Stored in directory: /root/.cache/pip/wheels/a8/1c/88/b959d6818b98a46d61ba231683abb7523b89ac1a7ed1e0c206\n", + "Successfully built flash-attn\n", + "Installing collected packages: einops, flash-attn\n", + "Successfully installed einops-0.7.0 flash-attn-2.5.6\n" + ] + } + ], + "source": [ + "# get ollama\n", + "!sudo apt-get install -y pciutils\n", + "!curl https://ollama.ai/install.sh | sh\n", + "import pandas as pd\n", + "\n", + "# dependencies for fine-tuning\n", + "!pip install accelerate -U -q\n", + "!pip install deepspeed -q\n", + "!pip install flash-attn --no-build-isolation -q # flash attention, also needed for the vision tower" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "3EQAh7C5LIjP" + }, + "outputs": [], + "source": [ + "# clone the author's fork (contains the Colab fixes)\n", + "!git clone https://github.com/Tongmengxie/LLaVA.git\n", + "\n", + "# optional: clone the upstream repo once the pull request is merged\n", + "# !git clone https://github.com/haotian-liu/LLaVA.git\n", + "# %cd LLaVA\n", + "# !git checkout colab\n", + "# %cd ..\n", + "# %pwd\n", + "!mkdir -p LLaVA/checkpoints/llava-v1.5-13b-task-lora" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "r29HVHeBcEes" + }, + "source": [ + "# Input & Output Directories" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zE4YhiPTcOlP" + }, + "outputs": [], + "source": [ + "# data_path\n", + "# DATA_PATH = 'LLaVA/playground/data/llava_v1_5_mix665k.json'\n", + "DATA_PATH = 'path_to_data' # see https://github.com/haotian-liu/LLaVA/blob/main/docs/Finetune_Custom_Data.md for the expected format\n", + "\n", + "# image_folder\n", + "# IMAGE_FOLDER = 'LLaVA/playground/data'\n", + "IMAGE_FOLDER = 'path_to_images'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ssSg2wZ9M9zc", + "outputId": "fc1eded3-71c4-467c-e3b0-07ed208f70fb" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-03-23 
17:49:06,369] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[2024-03-23 17:49:10,197] [WARNING] [runner.py:202:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only.\n", + "[2024-03-23 17:49:10,211] [INFO] [runner.py:568:main] cmd = /usr/bin/python3 -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMF19 --master_addr=127.0.0.1 --master_port=29500 --enable_each_rank_log=None LLaVA/llava/train/train_mem.py --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 --deepspeed LLaVA/scripts/zero3.json --model_name_or_path liuhaotian/llava-v1.5-13b --version v1 --data_path LLaVA/playground/data/llava_v1_5_mix665k.json --image_folder LLaVA/playground/data --mm_projector_type mlp2x_gelu --mm_vision_select_layer -2 --mm_use_im_start_end False --mm_use_im_patch_token False --image_aspect_ratio pad --group_by_modality_length True --bf16 False --output_dir LLaVA/checkpoints/llava-v1.5-13b-task-lora --num_train_epochs 1 --per_device_train_batch_size 16 --per_device_eval_batch_size 4 --gradient_accumulation_steps 1 --evaluation_strategy no --save_strategy steps --save_steps 50000 --save_total_limit 1 --learning_rate 2e-4 --weight_decay 0. --warmup_ratio 0.03 --lr_scheduler_type cosine --logging_steps 1 --tf32 False --model_max_length 2048 --gradient_checkpointing True --dataloader_num_workers 4 --lazy_preprocess True --report_to wandb\n", + "[2024-03-23 17:49:15,047] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[2024-03-23 17:49:18,960] [INFO] [launch.py:138:main] 0 NV_LIBNCCL_DEV_PACKAGE=libnccl-dev=2.19.3-1+cuda12.2\n", + "[2024-03-23 17:49:18,960] [INFO] [launch.py:138:main] 0 NV_LIBNCCL_DEV_PACKAGE_VERSION=2.19.3-1\n", + "[2024-03-23 17:49:18,960] [INFO] [launch.py:138:main] 0 NCCL_VERSION=2.19.3-1\n", + "[2024-03-23 17:49:18,961] [INFO] [launch.py:138:main] 0 NV_LIBNCCL_DEV_PACKAGE_NAME=libnccl-dev\n", + "[2024-03-23 17:49:18,961] [INFO] [launch.py:138:main] 0 NV_LIBNCCL_PACKAGE=libnccl2=2.19.3-1+cuda12.2\n", + "[2024-03-23 17:49:18,961] [INFO] [launch.py:138:main] 0 NV_LIBNCCL_PACKAGE_NAME=libnccl2\n", + "[2024-03-23 17:49:18,961] [INFO] [launch.py:138:main] 0 NV_LIBNCCL_PACKAGE_VERSION=2.19.3-1\n", + "[2024-03-23 17:49:18,961] [INFO] [launch.py:145:main] WORLD INFO DICT: {'localhost': [0]}\n", + "[2024-03-23 17:49:18,961] [INFO] [launch.py:151:main] nnodes=1, num_local_procs=1, node_rank=0\n", + "[2024-03-23 17:49:18,961] [INFO] [launch.py:162:main] global_rank_mapping=defaultdict(<class 'list'>, {'localhost': [0]})\n", + "[2024-03-23 17:49:18,961] [INFO] [launch.py:163:main] dist_world_size=1\n", + "[2024-03-23 17:49:18,961] [INFO] [launch.py:165:main] Setting CUDA_VISIBLE_DEVICES=0\n", + "[2024-03-23 17:49:18,962] [INFO] [launch.py:253:main] process 20970 spawned with command: ['/usr/bin/python3', '-u', 'LLaVA/llava/train/train_mem.py', '--local_rank=0', '--lora_enable', 'True', '--lora_r', '128', '--lora_alpha', '256', '--mm_projector_lr', '2e-5', '--deepspeed', 'LLaVA/scripts/zero3.json', '--model_name_or_path', 'liuhaotian/llava-v1.5-13b', '--version', 'v1', '--data_path', 'LLaVA/playground/data/llava_v1_5_mix665k.json', '--image_folder', 'LLaVA/playground/data', '--mm_projector_type', 'mlp2x_gelu', '--mm_vision_select_layer', '-2', '--mm_use_im_start_end', 'False', '--mm_use_im_patch_token', 'False', '--image_aspect_ratio', 'pad', '--group_by_modality_length', 'True', '--bf16', 'False', '--output_dir', 
'LLaVA/checkpoints/llava-v1.5-13b-task-lora', '--num_train_epochs', '1', '--per_device_train_batch_size', '16', '--per_device_eval_batch_size', '4', '--gradient_accumulation_steps', '1', '--evaluation_strategy', 'no', '--save_strategy', 'steps', '--save_steps', '50000', '--save_total_limit', '1', '--learning_rate', '2e-4', '--weight_decay', '0.', '--warmup_ratio', '0.03', '--lr_scheduler_type', 'cosine', '--logging_steps', '1', '--tf32', 'False', '--model_max_length', '2048', '--gradient_checkpointing', 'True', '--dataloader_num_workers', '4', '--lazy_preprocess', 'True', '--report_to', 'wandb']\n", + "2024-03-23 17:49:23.673937: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", + "2024-03-23 17:49:23.673984: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", + "2024-03-23 17:49:23.675371: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", + "2024-03-23 17:49:24.858169: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", + "[2024-03-23 17:49:26,104] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "[2024-03-23 17:49:26,349] [INFO] [comm.py:637:init_distributed] cdb=None\n", + "[2024-03-23 17:49:26,349] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl\n", + "You are using a model of type llava to instantiate a model of type llama. This is not supported for all configurations of models and can yield errors.\n", + "You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour\n", + "You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.\n", + "Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaForCausalLM is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained(\"openai/whisper-tiny\", attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16)`\n", + "Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. 
Example: `model = AutoModel.from_pretrained(\"openai/whisper-tiny\", attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16)`\n", + "[2024-03-23 17:49:27,941] [INFO] [partition_parameters.py:343:__exit__] finished initializing model - num_params = 97, num_elems = 3.58B\n", + "Traceback (most recent call last):\n", + " File \"/content/LLaVA/llava/train/train_mem.py\", line 4, in <module>\n", + " train(attn_implementation=\"flash_attention_2\")\n", + " File \"/content/LLaVA/llava/train/train.py\", line 837, in train\n", + " model = transformers.LlamaForCausalLM.from_pretrained(\n", + " File \"/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py\", line 3375, in from_pretrained\n", + " model = cls(config, *model_args, **model_kwargs)\n", + " File \"/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/partition_parameters.py\", line 503, in wrapper\n", + " f(module, *args, **kwargs)\n", + " File \"/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py\", line 1103, in __init__\n", + " self.model = LlamaModel(config)\n", + " File \"/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/partition_parameters.py\", line 503, in wrapper\n", + " f(module, *args, **kwargs)\n", + " File \"/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py\", line 924, in __init__\n", + " [LlamaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]\n", + " File \"/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py\", line 924, in <listcomp>\n", + " [LlamaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]\n", + " File \"/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/partition_parameters.py\", line 503, in wrapper\n", + " f(module, *args, **kwargs)\n", + " File \"/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py\", line 701, in __init__\n", + " self.mlp = LlamaMLP(config)\n", + " File \"/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/partition_parameters.py\", line 503, in wrapper\n", + " f(module, *args, **kwargs)\n", + " File \"/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py\", line 219, in __init__\n", + " self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)\n", + " File \"/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/partition_parameters.py\", line 513, in wrapper\n", + " self._post_init_method(module)\n", + " File \"/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/partition_parameters.py\", line 1056, in _post_init_method\n", + " self._zero_init_param(param)\n", + " File \"/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/partition_parameters.py\", line 1015, in _zero_init_param\n", + " param.partition()\n", + " File \"/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/partition_parameters.py\", line 1326, in partition\n", + " self._partition(param_list, has_been_updated=has_been_updated)\n", + " File \"/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/partition_parameters.py\", line 1475, in _partition\n", + " self._partition_param(param, has_been_updated=has_been_updated)\n", + " File \"/usr/local/lib/python3.10/dist-packages/deepspeed/utils/nvtx.py\", line 15, in wrapped_fn\n", + " ret_val = func(*args, **kwargs)\n", + " File \"/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/partition_parameters.py\", line 1540, in 
_partition_param\n", + " partitioned_tensor = torch.empty(partition_size, dtype=param.dtype, device=device)\n", + " File \"/usr/local/lib/python3.10/dist-packages/deepspeed/runtime/zero/partition_parameters.py\", line 238, in wrapped_fn\n", + " tensor: Tensor = fn(*args, **kwargs)\n", + "torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 270.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 65.06 MiB is free. Process 192148 has 14.68 GiB memory in use. Of the allocated memory 13.39 GiB is allocated by PyTorch, and 883.60 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)\n", + "[2024-03-23 17:49:29,972] [INFO] [launch.py:316:sigkill_handler] Killing subprocess 20970\n", + "[2024-03-23 17:49:29,972] [ERROR] [launch.py:322:sigkill_handler] ['/usr/bin/python3', '-u', 'LLaVA/llava/train/train_mem.py', '--local_rank=0', '--lora_enable', 'True', '--lora_r', '128', '--lora_alpha', '256', '--mm_projector_lr', '2e-5', '--deepspeed', 'LLaVA/scripts/zero3.json', '--model_name_or_path', 'liuhaotian/llava-v1.5-13b', '--version', 'v1', '--data_path', 'LLaVA/playground/data/llava_v1_5_mix665k.json', '--image_folder', 'LLaVA/playground/data', '--mm_projector_type', 'mlp2x_gelu', '--mm_vision_select_layer', '-2', '--mm_use_im_start_end', 'False', '--mm_use_im_patch_token', 'False', '--image_aspect_ratio', 'pad', '--group_by_modality_length', 'True', '--bf16', 'False', '--output_dir', 'LLaVA/checkpoints/llava-v1.5-13b-task-lora', '--num_train_epochs', '1', '--per_device_train_batch_size', '16', '--per_device_eval_batch_size', '4', '--gradient_accumulation_steps', '1', '--evaluation_strategy', 'no', '--save_strategy', 'steps', '--save_steps', '50000', '--save_total_limit', '1', '--learning_rate', '2e-4', '--weight_decay', '0.', '--warmup_ratio', '0.03', '--lr_scheduler_type', 'cosine', '--logging_steps', '1', '--tf32', 'False', '--model_max_length', '2048', '--gradient_checkpointing', 'True', '--dataloader_num_workers', '4', '--lazy_preprocess', 'True', '--report_to', 'wandb'] exits with return code = 1\n" + ] + } + ], + "source": [ + "!deepspeed LLaVA/llava/train/train_mem.py \\\n", + " --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 2e-5 \\\n", + " --deepspeed LLaVA/scripts/zero3.json \\\n", + " --model_name_or_path liuhaotian/llava-v1.5-13b \\\n", + " --version v1 \\\n", + " --data_path DATA_PATH \\\n", + " --image_folder IMAGE_FOLDER \\\n", + " --mm_projector_type mlp2x_gelu \\\n", + " --mm_vision_select_layer -2 \\\n", + " --mm_use_im_start_end False \\\n", + " --mm_use_im_patch_token False \\\n", + " --image_aspect_ratio pad \\\n", + " --group_by_modality_length True \\\n", + " --bf16 False \\\n", + " --output_dir LLaVA/checkpoints/llava-v1.5-13b-task-lora \\\n", + " --num_train_epochs 1 \\\n", + " --per_device_train_batch_size 16 \\\n", + " --per_device_eval_batch_size 4 \\\n", + " --gradient_accumulation_steps 1 \\\n", + " --evaluation_strategy \"no\" \\\n", + " --save_strategy \"steps\" \\\n", + " --save_steps 50000 \\\n", + " --save_total_limit 1 \\\n", + " --learning_rate 2e-4 \\\n", + " --weight_decay 0. 
\\\n", + " --warmup_ratio 0.03 \\\n", + " --lr_scheduler_type \"cosine\" \\\n", + " --logging_steps 1 \\\n", + " --tf32 False \\\n", + " --model_max_length 2048 \\\n", + " --gradient_checkpointing True \\\n", + " --dataloader_num_workers 4 \\\n", + " --lazy_preprocess True \\\n", + " --report_to wandb\n", + " # --vision_tower openai/clip-vit-large-patch14-336 \\" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/llava/mm_utils.py b/llava/mm_utils.py index de97345cf..374829843 100644 --- a/llava/mm_utils.py +++ b/llava/mm_utils.py @@ -5,9 +5,14 @@ import math import ast -from transformers import StoppingCriteria -from llava.constants import IMAGE_TOKEN_INDEX +import sys +try: + sys.path.append('/content/LLaVA/llava/') +except: + pass +from transformers import StoppingCriteria +from constants import IMAGE_TOKEN_INDEX def select_best_resolution(original_size, possible_resolutions): """ diff --git a/llava/train/train.py b/llava/train/train.py index 477c668b6..b2552360f 100644 --- a/llava/train/train.py +++ b/llava/train/train.py @@ -15,6 +15,11 @@ # limitations under the License. import os +import sys +try: + sys.path.append('/content/LLaVA/llava/') +except: + pass import copy from dataclasses import dataclass, field import json @@ -27,13 +32,13 @@ import transformers import tokenizers -from llava.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN +from constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN from torch.utils.data import Dataset -from llava.train.llava_trainer import LLaVATrainer +from llava_trainer import LLaVATrainer -from llava import conversation as conversation_lib -from llava.model import * -from llava.mm_utils import tokenizer_image_token +import conversation as conversation_lib +from model import * +from mm_utils import tokenizer_image_token from PIL import Image diff --git a/llava/train/train_mem.py b/llava/train/train_mem.py index 29ea06170..048913afc 100644 --- a/llava/train/train_mem.py +++ b/llava/train/train_mem.py @@ -1,4 +1,8 @@ -from llava.train.train import train + +try: + from llava.train.train import train +except: # e.g., on colab + from train import train if __name__ == "__main__": train(attn_implementation="flash_attention_2") diff --git a/scripts/finetune_lora.sh b/scripts/finetune_lora.sh index fc02e09d7..c2ed4c030 100644 --- a/scripts/finetune_lora.sh +++ b/scripts/finetune_lora.sh @@ -17,6 +17,7 @@ deepspeed llava/train/train_mem.py \ --deepspeed ./scripts/zero2.json \ --lora_enable True \ + --bits 4 \ --model_name_or_path ./checkpoints/$MODEL_VERSION \ --version $PROMPT_VERSION \ --data_path ./playground/data/llava_instruct_80k.json \