awslabs
diff --git a/‎README.md‎
Lines changed: 1 addition & 0 deletions b/‎README.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/launch_instance.md‎
Lines changed: 15 additions & 1 deletion b/‎docs/launch_instance.md‎
Lines changed: 15 additions & 1 deletion
diff --git a/‎keys_values/finetune/args.py‎
Lines changed: 4 additions & 26 deletions b/‎keys_values/finetune/args.py‎
Lines changed: 4 additions & 26 deletions
diff --git a/‎keys_values/finetune/longcon_offload_full.py‎
Lines changed: 0 additions & 1 deletion b/‎keys_values/finetune/longcon_offload_full.py‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎keys_values/finetune/longcon_offload_lora.py‎
Lines changed: 0 additions & 1 deletion b/‎keys_values/finetune/longcon_offload_lora.py‎
Lines changed: 0 additions & 1 deletion
@@ -107,6 +107,7 @@ pip install --upgrade pip
 pip install torch==2.10.0 torchmetrics==1.8.2 torchvision==0.25.0
 echo "torch==2.10.0" >constraints.txt
 pip install flashinfer-python==0.6.7 -c constraints.txt
+rm constraints.txt
 pip install 'litgpt[all,test,extra]'
 cd keys_values
 pip install -e .
 
@@ -17,7 +17,7 @@ ssh -i "matthis_deeplearning_uswest2.pem" ubuntu@ec2-35-85-224-176.us-west-2.com
 
 ### Instance `P4Research2`
 
-* Instance ID: `??`
+* Instance ID: `i-01dee00f42a643ea7`
 * File system ID: `fs-0186b686e7dffc35b`
 * VPC: `vpc-0619b17e`
 * Subnet ID: `subnet-124f5848`
@@ -27,6 +27,20 @@ ssh -i "matthis_deeplearning_uswest2.pem" ubuntu@ec2-35-85-224-176.us-west-2.com
 ssh -i "matthis_deeplearning_uswest2.pem" ubuntu@ec2-34-209-209-37.us-west-2.compute.amazonaws.com
 ```
 
+### Instance `P4Research3`
+
+Note: `P4Research2` and `P4Research3` share the same EFS volume.
+
+* Instance ID: `i-0eb0cdd4eb6d6a6e3`
+* File system ID: `fs-0186b686e7dffc35b`
+* VPC: `vpc-0619b17e`
+* Subnet ID: `subnet-124f5848`
+* AZ: `us-west-2c`
+
+```bash
+ssh -i "matthis_deeplearning_uswest2.pem" ubuntu@ec2-16-147-216-186.us-west-2.compute.amazonaws.com
+```
+
 
 ## Launch and start instance
 
 
@@ -406,9 +406,8 @@ class TrainArgs:
     """
     Modified training-related arguments in :class:`litgpt.args.TrainArgs`.
 
-    Here, `global_batch_size` does not have a default value. If not given,
-    it should be set to the product of `micro_batch_size` and the number of
-    devices, unless sequential gradient averaging is desired.
+    `global_batch_size` is a legacy argument. If given, it must be equal to
+    the product of `micro_batch_size` and the number of devices.
 
     Storing intermediate checkpoints: Normal checkpoints are stored whenever
     `state["step_count"] % train.save_interval == 0`. If
@@ -437,7 +436,7 @@ class TrainArgs:
     log_interval: int = 1
     """Number of iterations between logging calls"""
     global_batch_size: Optional[int] = None
-    """Number of samples between optimizer steps across data-parallel ranks"""
+    """Legacy argument: Do not use"""
     micro_batch_size: int = 4
     """Number of samples per data-parallel rank"""
     lr_warmup_steps: Optional[int] = 100
@@ -506,23 +505,6 @@ def __post_init__(self) -> None:
         if self.max_grad_norm is not None and self.max_grad_norm <= 0:
             raise ValueError("max_grad_norm must be positive (or `None` to disable)")
 
-    def gradient_accumulation_iters(self, devices: int, num_nodes: int = 1) -> int:
-        """Number of iterations between gradient synchronizations"""
-        gradient_accumulation_iters = (
-            self.batch_size(devices, num_nodes) // self.micro_batch_size
-        )
-        assert gradient_accumulation_iters > 0
-        return gradient_accumulation_iters
-
-    def batch_size(self, devices: int, num_nodes: int = 1) -> int:
-        """Number of samples between optimizer steps per data-parallel rank"""
-        if self.global_batch_size is None:
-            batch_size = self.micro_batch_size
-        else:
-            batch_size = self.global_batch_size // (devices * num_nodes)
-        assert batch_size > 0
-        return batch_size
-
     def warmup_iters(
         self, devices: int, num_nodes: int, max_iters: int, train_dataloader
     ) -> int:
@@ -532,11 +514,7 @@ def warmup_iters(
                 max_iters, math.ceil(self.lr_warmup_fraction * len(train_dataloader))
             )
         if self.lr_warmup_steps:
-            return min(
-                max_iters,
-                self.lr_warmup_steps
-                * self.gradient_accumulation_iters(devices, num_nodes),
-            )
+            return min(max_iters, self.lr_warmup_steps)
         return 0
 
 
 
@@ -216,7 +216,6 @@ def setup(
         out_dir,
         precision,
         devices,
-        1,
         resume,
         data,
         train,
 
@@ -231,7 +231,6 @@ def setup(
         out_dir,
         precision,
         devices,
-        1,
         resume,
         data,
         train,