Refine finetune/README.md and fix related scripts/configs #123

Merged · 2 commits · Jun 4, 2025

2 changes: 2 additions & 0 deletions .gitignore
@@ -165,3 +165,5 @@ output
inference/xcodec_mini_infer
inference/run_infer.sh
finetune/wandb
+*.bin
+*.idx
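
The new ignore rules cover the mmap dataset artifacts: each preprocessed dataset comes as a paired `.bin`/`.idx` file set (the usual indexed mmap layout), so both extensions stay out of version control. A quick sanity check that a dataset prefix is complete might look like this (a sketch; the prefix is taken from the example config further down):

```bash
# Hypothetical completeness check for one mmap dataset prefix:
prefix="finetune/example/mmap/dummy.msa.xcodec_16k_stage_1_token_level_interleave_cot_xcodec_textfirst_text_document"
if [ -f "${prefix}.bin" ] && [ -f "${prefix}.idx" ]; then
  echo "dataset ${prefix} has both .bin and .idx"
fi
```
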
13 changes: 10 additions & 3 deletions finetune/README.md
@@ -95,7 +95,7 @@ cd finetune/

2. Run the token counting script:
```bash
-bash scripts/count_tokens.sh
+bash scripts/count_tokens.sh ./example/mmap/
```

The results will be saved in `finetune/count_token_logs/`. This process may take several minutes for large datasets.
@@ -129,13 +129,20 @@ YuE supports finetuning using LoRA (Low-Rank Adaptation), which significantly re

```bash
# Update data paths
DATA_PATH="/path/to/your/data1 /path/to/your/data2"
# Accepted formats for DATA_PATH:
# 1) a single path: "/path/to/data"
# 2) multiple datasets with weights: "100 /path/to/data1 200 /path/to/data2 ..."
# You can copy DATA_PATH from the output of core/parse_mixture.py in Step 2
DATA_PATH="data1-weight /path/to/data1 data2-weight /path/to/data2"
DATA_CACHE_PATH="/path/to/your/cache"

+# Set comma-separated list of proportions for train/val/test split
+DATA_SPLIT="900,50,50"

# Set model paths
TOKENIZER_MODEL_PATH="/path/to/tokenizer"
MODEL_NAME="m-a-p/YuE-s1-7B-anneal-en-cot" # or your local model path
-CACHE_DIR="/path/to/model/cache"
+MODEL_CACHE_DIR="/path/to/model/cache"
OUTPUT_DIR="/path/to/save/finetuned/model"

# Configure LoRA parameters (optional)
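
The weighted `DATA_PATH` format documented above is the main behavioral change in this README. As a hedged illustration (the paths and weights below are made up; the weights are relative sampling proportions used to blend datasets, not percentages):

```bash
# Blend two illustrative datasets at a 1:2 sampling ratio:
DATA_PATH="100 /data/mmap/pop_textfirst_text_document 200 /data/mmap/rock_textfirst_text_document"

# A single dataset still works without an explicit weight:
DATA_PATH="/data/mmap/pop_textfirst_text_document"
```
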
10 changes: 5 additions & 5 deletions finetune/example/dummy_data_mixture_cfg.yml
@@ -3,8 +3,8 @@ GLOBAL_BATCH_SIZE: 8
SEQ_LEN: 8192

1_ROUND:
-- /aifs4su/mmcode/codeclm/opensuno_publish/YuE/finetune/example/mmap/dummy.msa.xcodec_16k_stage_1_token_level_interleave_cot_xcodec_textfirst_text_document.bin
-- /aifs4su/mmcode/codeclm/opensuno_publish/YuE/finetune/example/mmap/dummy.msa.xcodec_16k_stage_1_token_level_interleave_long_prompt_msa_textfirst_dual_text_document.bin
-- /aifs4su/mmcode/codeclm/opensuno_publish/YuE/finetune/example/mmap/dummy.msa.xcodec_16k_stage_1_token_level_interleave_long_prompt_msa_textfirst_inst_text_document.bin
-- /aifs4su/mmcode/codeclm/opensuno_publish/YuE/finetune/example/mmap/dummy.msa.xcodec_16k_stage_1_token_level_interleave_long_prompt_msa_textfirst_mixture_text_document.bin
-- /aifs4su/mmcode/codeclm/opensuno_publish/YuE/finetune/example/mmap/dummy.msa.xcodec_16k_stage_1_token_level_interleave_long_prompt_msa_textfirst_vocal_text_document.bin
+- ./example/mmap/dummy.msa.xcodec_16k_stage_1_token_level_interleave_cot_xcodec_textfirst_text_document.bin
+- ./example/mmap/dummy.msa.xcodec_16k_stage_1_token_level_interleave_long_prompt_msa_textfirst_dual_text_document.bin
+- ./example/mmap/dummy.msa.xcodec_16k_stage_1_token_level_interleave_long_prompt_msa_textfirst_inst_text_document.bin
+- ./example/mmap/dummy.msa.xcodec_16k_stage_1_token_level_interleave_long_prompt_msa_textfirst_mixture_text_document.bin
+- ./example/mmap/dummy.msa.xcodec_16k_stage_1_token_level_interleave_long_prompt_msa_textfirst_vocal_text_document.bin
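
The relative paths above assume the trainer is launched from `finetune/`. To hand these same five datasets to `run_finetune.sh`, a weighted `DATA_PATH` can be assembled from the config entries, for example (a sketch assuming an equal weight of 1 per dataset and the common convention of referencing mmap datasets by their extension-less prefix; `core/parse_mixture.py` remains the authoritative way to generate this string):

```bash
# Build DATA_PATH from the 1_ROUND entries above (illustrative, equal weights):
DATA_PATH=""
for f in ./example/mmap/dummy.msa.xcodec_16k_stage_1_token_level_interleave_*_text_document.bin; do
  DATA_PATH="$DATA_PATH 1 ${f%.bin}"   # strip .bin if the trainer expects prefixes
done
echo "$DATA_PATH"
```
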
2 changes: 2 additions & 0 deletions finetune/requirements.txt
@@ -21,6 +21,7 @@ mpmath==1.3.0
msgpack==1.1.0
networkx==3.3
ninja==1.11.1.4
+nltk==3.9.1
numpy==2.1.2
nvidia-cublas-cu12==12.1.3.1
nvidia-cuda-cupti-cu12==12.1.105
@@ -50,6 +51,7 @@ PyYAML==6.0.2
regex==2024.11.6
requests==2.32.3
safetensors==0.5.3
+scipy==1.15.3
sentencepiece==0.2.0
sentry-sdk==2.28.0
setproctitle==1.3.6
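
The two new pins (`nltk`, `scipy`) slot into the existing lockfile alphabetically. A clean install is the easiest way to confirm they resolve alongside the rest of the pinned set (standard pip usage, no assumptions beyond the file itself):

```bash
python -m venv .venv && source .venv/bin/activate
pip install -r finetune/requirements.txt
# Expect "3.9.1 1.15.3" per the pins above:
python -c "import nltk, scipy; print(nltk.__version__, scipy.__version__)"
```
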
16 changes: 11 additions & 5 deletions finetune/scripts/run_finetune.sh
@@ -12,7 +12,7 @@ print_help() {
echo "Before running this script, please update the following variables:"
echo ""
echo "1. Data paths:"
echo " DATA_PATH - Replace <path_to_data_X> with actual paths to your data files"
echo " DATA_PATH - Replace <weight_and_path_to_data_X> with actual weights and data paths"
echo " DATA_CACHE_PATH - Replace <path_to_data_cache> with actual cache directory"
echo ""
echo "2. Model configuration:"
@@ -24,7 +24,7 @@ print_help() {
echo " WANDB_API_KEY - Replace <your_wandb_api_key> with your actual API key"
echo ""
echo "Example usage:"
echo " DATA_PATH=\"/path/to/data1 /path/to/data2\""
echo " DATA_PATH=\"data1-weight /path/to/data1 data2-weight /path/to/data2\""
echo " DATA_CACHE_PATH=\"/path/to/cache\""
echo " TOKENIZER_MODEL_PATH=\"/path/to/tokenizer\""
echo " MODEL_CACHE_DIR=\"/path/to/model/cache\""
@@ -43,8 +43,8 @@ fi
check_placeholders() {
local has_placeholders=false

if [[ "$DATA_PATH" == *"<path_to_data"* ]]; then
echo "Error: Please set actual data paths in DATA_PATH variable."
if [[ "$DATA_PATH" == *"<weight_and_path_to_data"* ]]; then
echo "Error: Please set actual weight and data paths in DATA_PATH variable."
has_placeholders=true
fi

@@ -113,9 +113,12 @@ TRAIN_ITERS=150
NUM_TRAIN_EPOCHS=10

# Data paths (replace with your actual paths)
DATA_PATH="<path_to_data_X>"
DATA_PATH="<weight_and_path_to_data_X>"
DATA_CACHE_PATH="<path_to_tokenizer_model>"

+# Set comma-separated list of proportions for training, validation, and test split
+DATA_SPLIT="900,50,50"

# Model configuration
TOKENIZER_MODEL_PATH="<path_to_tokenizer_model>"
MODEL_NAME="m-a-p/YuE-s1-7B-anneal-en-cot"
@@ -166,6 +169,7 @@ CMD="torchrun --nproc_per_node=$NUM_GPUS --master_port=$MASTER_PORT scripts/trai
--seq-length $SEQ_LENGTH \
--data-path $DATA_PATH \
--data-cache-path $DATA_CACHE_PATH \
+--split $DATA_SPLIT \
--tokenizer-model $TOKENIZER_MODEL_PATH \
--global-batch-size $GLOBAL_BATCH_SIZE \
--per-device-train-batch-size $PER_DEVICE_TRAIN_BATCH_SIZE \
@@ -179,6 +183,8 @@ CMD="torchrun --nproc_per_node=$NUM_GPUS --master_port=$MASTER_PORT scripts/trai
# Add conditional arguments
if [ "$USE_WANDB" = true ]; then
CMD="$CMD --report-to wandb --run-name \"$RUN_NAME\""
elif [ "$USE_WANDB" = false ]; then
CMD="$CMD --report-to none"
fi

CMD="$CMD \
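
Putting the script changes together: after this PR the variable block near the top of `run_finetune.sh` is edited in place (the placeholder check flags any leftover `<...>` markers), and `USE_WANDB=false` now explicitly passes `--report-to none` instead of silently omitting the flag. An edited block might look like this (illustrative values only; substitute real paths):

```bash
# Inside finetune/scripts/run_finetune.sh, after replacing the placeholders:
DATA_PATH="100 /data/mmap/pop_textfirst_text_document 200 /data/mmap/rock_textfirst_text_document"
DATA_CACHE_PATH="/data/cache"
DATA_SPLIT="900,50,50"          # train/val/test proportions, passed via --split
TOKENIZER_MODEL_PATH="/models/yue/tokenizer"
MODEL_CACHE_DIR="/models/cache"
OUTPUT_DIR="/models/yue-finetuned"
USE_WANDB=false                 # reports to none via the new elif branch
```

Then launch with `bash finetune/scripts/run_finetune.sh`.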