awslabs
diff --git a/keys_values/data/iterators.py b/keys_values/data/iterators.py
Lines changed: 0 additions & 4 deletions
diff --git a/keys_values/data/load_helmet_dev_eval.py b/keys_values/data/load_helmet_dev_eval.py
Lines changed: 7 additions & 7 deletions
diff --git a/keys_values/finetune/args.py b/keys_values/finetune/args.py
Lines changed: 12 additions & 0 deletions
diff --git a/keys_values/finetune/longcon_offload_full.py b/keys_values/finetune/longcon_offload_full.py
Lines changed: 1 addition & 0 deletions
diff --git a/keys_values/finetune/longcon_offload_lora.py b/keys_values/finetune/longcon_offload_lora.py
Lines changed: 1 addition & 0 deletions
diff --git a/keys_values/finetune/longcontext_eval.py b/keys_values/finetune/longcontext_eval.py
Lines changed: 6 additions & 4 deletions
@@ -203,10 +203,6 @@ def __init__(
     ):
         assert micro_batch_size >= 1
         assert num_devices >= 1
-        if micro_batch_size == 1 and num_devices == 1:
-            raise ValueError(
-                "This sampler requires micro_batch_size > 1 or num_devices > 1"
-            )
         if shortest_first and longest_first:
             raise ValueError("Cannot set both shortest_first and longest_first")
         if num_devices > 1:
 
@@ -456,7 +456,7 @@ def load_rag(
             "8k": "kilt/popqa_test_1000_k50_dep6.jsonl",
         },
     }  # the load paths can only be stored in this way, as they are hard-coded from the original code
-    instruction_template = get_instruction_template(dataset_key)
+    instruction_template = get_instruction_template(dataset_key)[0]
 
     instance_path = str(
         Path(dataset_parent_dir) / Path(file_paths[dataset_key][max_length])
@@ -589,7 +589,7 @@ def load_cited_generation(
         "16k": 75,
         "8k": 30,
     }
-    instruction_template = get_instruction_template(dataset_key)
+    instruction_template = get_instruction_template(dataset_key)[0]
     demo_template = "Instruction: Write an accurate, engaging, and concise answer for the given question using only the provided search results (some of which might be irrelevant) and cite them properly. Use an unbiased and journalistic tone. Always cite for any factual claim. When citing a document, surround its ID with square brackets, such as [x] to cite document x. To cite multiple documents, simply concatenate the citation markers; for example, use [x][y][z] to cite the documents with ID x, y, and z. Cite at least one document and at most three documents in each sentence. If multiple documents support the sentence, only cite a minimum sufficient subset of the documents.\n\nQuestion: {question}\n\n{context}\n\nAnswer: {answer}"
     doc_template = "Document [{ID}](Title: {title}): {text}"
 
@@ -658,7 +658,7 @@ def load_rerank(
             "8k": "msmarco/test_reranking_data_k50_dep3.jsonl",
         },
     }
-    instruction_template = get_instruction_template(dataset_key)
+    instruction_template = get_instruction_template(dataset_key)[0]
 
     instance_path = str(
         Path(dataset_parent_dir) / Path(file_paths[dataset_key][max_length])
@@ -765,7 +765,7 @@ def load_icl(
         "banking77": 77,
         "clinc150": 151,
     }
-    instruction_template = get_instruction_template(dataset_key)
+    instruction_template = get_instruction_template(dataset_key)[0]
     demo_template = "{text}\nlabel: {label}"
 
     if dataset_key == "trec_coarse":
@@ -938,7 +938,7 @@ def load_long_doc_qa(
     )
     eval_questions_num = 100
 
-    instruction_template = get_instruction_template(dataset_key)
+    instruction_template = get_instruction_template(dataset_key)[0]
     if dataset_key == "narrative_qa":
         all_data = load_dataset("narrativeqa")
         instance_data = all_data["test"].shuffle(seed=seed)
@@ -1114,7 +1114,7 @@ def load_summarization(
         "meta-llama/Llama-2-7b-hf", token=HF_TOKEN
     )
 
-    instruction_template = get_instruction_template(dataset_key)
+    instruction_template = get_instruction_template(dataset_key)[0]
     if dataset_key == "infinite_bench_sum":
         eval_questions_num = 50  # different from HELMET
         ft = Features(
@@ -1302,7 +1302,7 @@ def load_synthetic(
     )
     instance_data = load_dataset("json", data_files=data_path)["train"]
 
-    instruction_template = get_instruction_template(dataset_key)
+    instruction_template = get_instruction_template(dataset_key)[0]
     if dataset_key == "json_kv":
         demo_template = "Key: {key}\nCorresponding value:{value}"
 
 
@@ -421,6 +421,14 @@ class TrainArgs:
     Args:
         intermed_save_interval: See above
         intermed_save_num: See above
+        max_grad_norm: If not `None`, gradient clipping is applied (via
+            `torch.nn.utils.clip_grad_norm_`) with this maximum norm.
+            Defaults to 1.0.
+        average_loss_per_batch: If `True`, the sum of loss values for a batch
+            is normalized by the number of non-masked target tokens in that
+            batch. Otherwise (`False`, the default), we average the sum of loss
+            values per data case (by the number of non-masked target tokens),
+            then use the uniform average over the batch.
     """
 
     save_interval: Optional[int] = 1000
@@ -450,6 +458,8 @@ class TrainArgs:
     """Whether to tie the embedding weights with the language modeling head weights"""
     intermed_save_interval: Optional[int] = None
     intermed_save_num: Optional[int] = None
+    max_grad_norm: Optional[float] = 1.0
+    average_loss_per_batch: Optional[bool] = False
 
     def __post_init__(self) -> None:
         if self.lr_warmup_fraction and self.lr_warmup_steps:
@@ -492,6 +502,8 @@ def __post_init__(self) -> None:
             raise ValueError(
                 "intermed_save_num only needed if intermed_save_interval is given"
             )
+        if self.max_grad_norm is not None and self.max_grad_norm <= 0:
+            raise ValueError("max_grad_norm must be positive (or `None` to disable)")
 
     def gradient_accumulation_iters(self, devices: int, num_nodes: int = 1) -> int:
         """Number of iterations between gradient synchronizations"""
 
@@ -49,6 +49,7 @@ def setup(
         max_seq_length=None,
         intermed_save_interval=None,
         intermed_save_num=None,
+        max_grad_norm=1.0,
     ),
     eval: EvalArgs = EvalArgs(
         interval=100,
 
@@ -50,6 +50,7 @@ def setup(
         max_seq_length=None,
         intermed_save_interval=None,
         intermed_save_num=None,
+        max_grad_norm=1.0,
     ),
     lora: LoRAArgs = LoRAArgs(
         r=8,
 
@@ -419,6 +419,7 @@ def main(
             attention_backward_temp_size_gb=None,
             max_batch_size=batch_size,
             dtype=dtype,
+            average_loss_per_batch=False,
             fabric=fabric,
             model_kwargs=None,
         )
@@ -502,16 +503,17 @@ def main(
             t0 = time.perf_counter()
             # One entry per batch dimension:
             input_ids = batch[INPUT_IDS_NAME]
+            targets = batch["targets"]
             if evaluator is None:
                 with torch.no_grad():
-                    metric_values = model(input_ids, batch["targets"])
+                    metric_values = model(input_ids, targets)
                 metric_name = "eval_loss"
             else:
                 metric_name = evaluator.metrics[0]
-                targets = batch[TARGETS_STRINGS_NAME]
-                prompt_len = input_ids.shape[1] - batch["targets"].shape[1] + 1
+                prompt_len = input_ids.shape[1] - targets.shape[1] + 1
                 prompts = input_ids[:, :prompt_len]
-                metric_values = evaluator(model, prompts, targets)[metric_name]
+                raw_targets = batch[TARGETS_STRINGS_NAME]
+                metric_values = evaluator(model, prompts, raw_targets)[metric_name]
             eval_time = time.perf_counter() - t0
             print_with_rank_and_timestamp(
                 f"Batch {task}, {orig_idxs}: {metric_name} = {metric_values.mean().item():.3f}, eval_time = {eval_time * 1000:.2f} ms",