
Commit eb8887e

Enhance qwen2-vl finetune and add python file/readme in xtune (#1862)
* fix transformers==4.51.0 to avoid qwen2-vl finetune issue
  Signed-off-by: jilongwa <[email protected]>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* enable optuna in qwen2-vl finetune
  Signed-off-by: jilongwa <[email protected]>
* delete extra file
  Signed-off-by: jilongwa <[email protected]>
* update readme
  Signed-off-by: jilongwa <[email protected]>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* fix type-o
  Signed-off-by: jilongwa <[email protected]>

Signed-off-by: jilongwa <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 64b7b04 commit eb8887e

File tree

8 files changed: +1408 −5 lines changed

comps/finetuning/src/integrations/xtune/README.md

Lines changed: 75 additions & 0 deletions
@@ -161,6 +161,81 @@ cd src/llamafactory/adaclip_finetune
# Please see README.md in src/llamafactory/adaclip_finetune for detail
```

### Qwen2-VL Training and Hyperparameter Optimization

```bash
# Please see Qwen2-VL_README.md in doc for detail; below is simple usage
```

#### Step 1: Finetune qwen2-vl with logging eval loss

If you want to finetune with plotting of eval loss, please set eval_strategy to steps and set eval_steps and eval_dataset:

```
# Finetune qwen2-vl with logging eval loss
export DATA='where you can find dataset_info.json'
export dataset=activitynet_qa_2000_limit_20s  # which dataset llamafactory will use
export eval_dataset=activitynet_qa_val_500_limit_20s
llamafactory-cli train \
    --stage sft \
    --do_train True \
    --model_name_or_path $models/Qwen2-VL-7B-Instruct-GPTQ-Int8 \
    --preprocessing_num_workers 16 \
    --finetuning_type lora \
    --template qwen2_vl \
    --flash_attn auto \
    --dataset_dir $DATA \
    --dataset $dataset \
    --cutoff_len 2048 \
    --learning_rate 5e-05 \
    --num_train_epochs 20.0 \
    --max_samples 100000 \
    --per_device_train_batch_size 2 \
    --gradient_accumulation_steps 8 \
    --lr_scheduler_type cosine \
    --max_grad_norm 1.0 \
    --logging_steps 10 \
    --save_steps 100 \
    --warmup_steps 100 \
    --packing False \
    --report_to none \
    --output_dir saves/Qwen2-VL-7B-Instruct-GPTQ-Int8/lora/finetune_test_valmetrics_evalstep8 \
    --bf16 True \
    --plot_loss True \
    --ddp_timeout 180000000 \
    --optim adamw_torch \
    --video_fps 0.1 \
    --per_device_eval_batch_size 1 \
    --eval_strategy steps \
    --eval_steps 100 \
    --eval_dataset ${eval_dataset} \
    --predict_with_generate true \
    --lora_rank 8 \
    --lora_alpha 16 \
    --lora_dropout 0 \
    --lora_target all
```

#### Step 2: Evaluation metrics calculation and plotting

If you want to plot eval metrics, change `MODEL_NAME`, `EXPERIENT_NAME`, and `EVAL_DATASET` as you need and run the evaluation metrics calculation script:

```
export MODEL_DIR="where you can find the eval model"
export MODEL_NAME="Qwen2-VL-2B-Instruct"
export EXPERIENT_NAME="finetune_onlyplot_evalloss_5e-6"
export EVAL_DATASET=activitynet_qa_val_500_limit_20s
chmod a+x ./doc/run_eval.sh
./doc/run_eval.sh
```

Then change `model_name` and `experiment_name` and run:

```
python plot_metrics.py --model_name your_model_name --experiment_name your_experiment_name
```

### DeepSeek-R1 Distillation (not main function)

Please see [doc](./doc/DeepSeek-R1_distillation_best_practice-v1.3.pdf) for details
Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-25.05-dev
+25.07-dev

comps/finetuning/src/integrations/xtune/doc/Prepare_dataset.md

Lines changed: 151 additions & 0 deletions
@@ -133,3 +133,154 @@ wget https://cs.stanford.edu/people/ranjaykrishna/densevid/captions.zip
```

- DiDeMo annotations have two components: annotations from the [original author](https://github.com/LisaAnne/LocalizingMoments/tree/master/data) and the split used by [Collaborative Experts](https://github.com/albanie/collaborative-experts/tree/master/misc/datasets/didemo).

## Dataset for Qwen2-VL Finetune

### ActivityNet-QA

Please follow https://github.com/MILVLG/activitynet-qa/tree/master to download and separate the train/val dataset.

Then use the generate_llama_json_limit_frames.py file below to generate our train and test datasets:

python generate_llama_json_limit_frames.py -name val_q -type val -n 500 -seconds 20

generate_llama_json_limit_frames.py:
```python
import json
import os
import argparse
import ffmpeg

# Define the path to the directory where the video files are stored
video_directory = "where to find dataset"


def get_video_duration(video_path):
    try:
        probe = ffmpeg.probe(video_path)
        video_stream = next(stream for stream in probe["streams"] if stream["codec_type"] == "video")
        return float(video_stream["duration"])
    except Exception as e:
        print(f"Error getting duration for video {video_path}: {e}")
        return 0


if __name__ == "__main__":
    # Parse command line arguments
    parser = argparse.ArgumentParser(description="Generate LLaMA JSON")
    parser.add_argument("-name", type=str, default="train_q_3000", help="name of the question JSON file (without .json)")
    parser.add_argument("-type", type=str, default="train", help="dataset split (train/val)")
    parser.add_argument("-fps", type=float, default=0.2, help="video sampling fps")
    parser.add_argument("-n", type=int, default=250, help="maximum number of QA pairs to keep")
    parser.add_argument("-seconds", type=int, default=20, help="minimum video duration in seconds")
    args = parser.parse_args()
    fps = args.fps
    basic_seconds = args.seconds
    question_json = "../activitynet-qa/dataset/{}.json".format(args.name)
    answer_json = "../activitynet-qa/dataset/{}_a.json".format(args.type)
    combine_json = "../data/activitynet_qa_{}_{}_limit_{}s.json".format(args.type, args.n, basic_seconds)
    print("combine_json:", combine_json)

    # Supported video file extensions
    video_extensions = (".mp4", ".mkv", ".webm")

    # Load the questions and answers JSON files
    with open(question_json, "r") as question_file:
        questions = json.load(question_file)

    with open(answer_json, "r") as answer_file:
        answers = json.load(answer_file)

    # Create a dictionary to map question_id to answer for quick lookup
    answer_lookup = {answer["question_id"]: answer for answer in answers}

    combined_data = []
    len_pairs = len(questions)
    # Process each question and look for a corresponding answer
    for question in questions:
        question_id = question["question_id"]
        if question_id in answer_lookup:
            answer = answer_lookup[question_id]

            # Extract the video name, i.e. everything before the last underscore of the question_id
            video_name_without_path = ("_").join(question_id.split("_")[:-1])
            # Search for the video file that matches the extracted name
            video_path = None
            find_flag = False
            # Walk through the directory to find matching video files
            for root, dirs, files in os.walk(video_directory):
                for file in files:
                    if file.startswith(video_name_without_path) and file.endswith(video_extensions):
                        video_path = os.path.join(root, file)
                        find_flag = True
                        break
                if video_path:
                    break
            if not find_flag:
                print("!!not found:", video_name_without_path)
            if video_path:
                video_duration = get_video_duration(video_path)
                if video_duration > basic_seconds:
                    combined_entry = {
                        "messages": [
                            {"content": f"<video>{question['question']}?", "role": "user"},
                            {"content": answer["answer"], "role": "assistant"},
                        ],
                        "videos": [video_path],
                    }
                    combined_data.append(combined_entry)
                    if len(combined_data) % 100 == 0:
                        print(f"Processed {len(combined_data)} entries")
                    if len(combined_data) >= args.n:
                        break
                else:
                    print("video_duration < basic_seconds", video_duration, video_path)
    # Write the combined data to the output JSON file
    with open(combine_json, "w") as combine_file:
        json.dump(combined_data, combine_file, indent=4)
```
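
A similar invocation can be used to build the training split. This is only a sketch: the `-name` value below is an assumption (it depends on which question file you prepared from the ActivityNet-QA repo), and the output filename follows the script's `activitynet_qa_{type}_{n}_limit_{seconds}s` pattern, so you may need to rename the result to match the dataset name registered in dataset_info.json:

```
python generate_llama_json_limit_frames.py -name train_q -type train -n 2000 -seconds 20
```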

## Update dataset_info.json

### dataset_info.json

```json
{
  "caltech101": {
    "file_name": "caltech101.json"
  },
  "ActivityNet": {
    "file_name": "ActivityNet.json"
  },
  "activitynet_qa_2000_limit_20s": {
    "file_name": "activitynet_qa_2000_limit_20s.json",
    "formatting": "sharegpt",
    "columns": {
      "messages": "messages",
      "videos": "videos"
    },
    "tags": {
      "role_tag": "role",
      "content_tag": "content",
      "user_tag": "user",
      "assistant_tag": "assistant"
    }
  }
}
```

### caltech101.json

```json
[]
```

### ActivityNet.json

```json
[]
```

### activitynet_qa_2000_limit_20s.json

Generated by generate_llama_json_limit_frames.py.
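
For reference, each entry in the generated file follows the `combined_entry` structure built by the script above (the `messages`/`videos` columns referenced in dataset_info.json). The video path and QA text below are placeholders, not real dataset values:

```json
[
  {
    "messages": [
      { "content": "<video>what is the person doing?", "role": "user" },
      { "content": "cooking", "role": "assistant" }
    ],
    "videos": ["/path/to/activitynet/videos/v_example.mp4"]
  }
]
```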
