Commit 8e20394

Author: Joe Cummings
Merge remote-tracking branch 'upstream/main' into multi-node-support
2 parents: a3aaeb4 + d4465c8

File tree

4 files changed: +14 insertions, -5 deletions


recipes/configs/code_llama2/evaluation.yaml (+2)

@@ -3,6 +3,8 @@
 # To launch, run the following command:
 # tune run eleuther_eval --config code_llama2/evaluation

+output_dir: ./ # Not needed
+
 # Model arguments
 model:
   _component_: torchtune.models.code_llama2.code_llama2_7b

recipes/configs/llama3_2/evaluation.yaml (+2)

@@ -3,6 +3,8 @@
 # To launch, run the following command:
 # tune run eleuther_eval --config llama3_2/evaluation

+output_dir: ./ # Not needed
+
 # Model Arguments
 model:
   _component_: torchtune.models.llama3_2.llama3_2_3b
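Both evaluation configs gain the same two lines: a top-level output_dir placeholder that, per its own "# Not needed" comment, the eval recipe never writes to; it presumably only satisfies a now-required config field. Since it is an ordinary config key, it can also be overridden at launch like any other value (a usage sketch; the key=value override syntax follows the example already present in the vision config below, and ./eval_out is an arbitrary illustrative path, not something from this commit):

tune run eleuther_eval --config llama3_2/evaluation output_dir=./eval_out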

recipes/configs/llama3_2_vision/11B_lora_multi_dataset.yaml (+7, -5)

@@ -1,20 +1,22 @@
-# Config for multi-device LoRA finetuning in lora_finetune_distributed_td.py
+# Config for multi-device LoRA finetuning in lora_finetune_distributed_multi_dataset.py
 # using a Llama3.2 11B Vision Instruct model
 #
 # This config assumes that you've run the following command before launching:
 # tune download meta-llama/Llama-3.2-11B-Vision-Instruct --output-dir /tmp/Llama-3.2-11B-Vision-Instruct --ignore-patterns "original/consolidated*"
 #
 # To launch on 2 devices, run the following command from root:
-# tune run --nproc_per_node 2 lora_finetune_distributed_td --config llama3_2_vision/11B_lora_td
+# tune run --nproc_per_node 2 lora_finetune_distributed_multi_dataset --config llama3_2_vision/11B_lora_multi_dataset
 #
 # You can add specific overrides through the command line. For example
 # to override the checkpointer directory while launching training:
-# tune run --nproc_per_node 2 lora_finetune_distributed_td --config llama3_2_vision/11B_lora_td checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
+# tune run --nproc_per_node 2 lora_finetune_distributed_multi_dataset --config llama3_2_vision/11B_lora_multi_dataset checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
 #
 # This config works best when the model is being fine-tuned on 2+ GPUs.
 # For single device LoRA finetuning please use 11B_lora_single_device.yaml
 # or 11B_qlora_single_device.yaml

+output_dir: /tmp/torchtune/llama3_2_vision_11B/lora_multi_dataset # /tmp may be deleted by your system. Change it to your preference.
+
 # Model arguments
 model:
   _component_: torchtune.models.llama3_2_vision.lora_llama3_2_vision_11b

@@ -44,7 +46,7 @@ checkpointer:
   filename_format: model-{}-of-{}.safetensors
   max_filename: "00005"
   recipe_checkpoint: null
-  output_dir: /tmp/Llama-3.2-11B-Vision-Instruct/
+  output_dir: ${output_dir}
   model_type: LLAMA3_VISION
 resume_from_checkpoint: False
 save_adapter_weights_only: False # PeFT formatting not available yet. This will save it in torchtune format only.

@@ -117,6 +119,6 @@ dtype: bf16
 output_dir: /tmp/lora-llama3.2-vision-finetune
 metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
-  log_dir: /tmp/Llama-3.2-11B-Vision-Instruct/logs
+  log_dir: ${output_dir}/logs
 log_every_n_steps: 1
 log_peak_memory_stats: True
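The substance of this config change: the checkpointer and metric logger paths now reference the single top-level output_dir through ${output_dir} interpolation instead of hardcoded /tmp paths. A minimal sketch of how the references resolve, assuming torchtune configs support OmegaConf-style variable interpolation (which the ${...} syntax suggests):

output_dir: /tmp/torchtune/llama3_2_vision_11B/lora_multi_dataset

checkpointer:
  output_dir: ${output_dir}    # resolves to /tmp/torchtune/llama3_2_vision_11B/lora_multi_dataset

metric_logger:
  log_dir: ${output_dir}/logs  # resolves to .../lora_multi_dataset/logs

Changing the one top-level value then repoints every derived path at once.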

torchtune/utils/_logging.py (+3)

@@ -67,6 +67,9 @@ def deprecated(msg: str = "") -> Callable[[T], T]:

     @lru_cache(maxsize=1)
     def warn(obj):
+        rank = dist.get_rank() if dist.is_available() and dist.is_initialized() else 0
+        if rank != 0:
+            return
         warnings.warn(
             f"{obj.__name__} is deprecated and will be removed in future versions. "
             + msg,
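This edit dovetails with multi-node runs: the deprecation warning in warn is now gated on the distributed rank, so it fires once per job instead of once per process. For context, a self-contained sketch of the surrounding decorator, assuming torch.distributed is imported as dist in this module; the warning category, stacklevel, and the wrapping logic are placeholders for parts outside the hunk:

import functools
import warnings
from functools import lru_cache
from typing import Any, Callable, TypeVar

import torch.distributed as dist  # assumed import; the diff calls dist.get_rank() etc.

T = TypeVar("T")


def deprecated(msg: str = "") -> Callable[[T], T]:
    @lru_cache(maxsize=1)  # the cache makes each deprecated object warn at most once
    def warn(obj):
        # Only rank 0 emits the warning, so a distributed run prints it once
        # rather than once per process; non-distributed runs default to rank 0.
        rank = dist.get_rank() if dist.is_available() and dist.is_initialized() else 0
        if rank != 0:
            return
        warnings.warn(
            f"{obj.__name__} is deprecated and will be removed in future versions. " + msg,
            category=FutureWarning,  # assumed; the category lies outside the hunk
            stacklevel=3,            # assumed; also outside the hunk
        )

    def decorator(obj: T) -> T:
        # Sketch of the wrapping step: warn on first use, then delegate.
        @functools.wraps(obj)
        def wrapper(*args: Any, **kwargs: Any):
            warn(obj)
            return obj(*args, **kwargs)

        return wrapper  # type: ignore[return-value]

    return decorator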
