From 476f600be931e77b1d819ff05cc78709608d5269 Mon Sep 17 00:00:00 2001 From: Shijie <821898965@qq.com> Date: Tue, 24 Dec 2024 22:55:45 +0800 Subject: [PATCH 1/3] remove-redundant-code (#947) --- applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py b/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py index 7e3e6776b..e53042075 100644 --- a/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py +++ b/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py @@ -211,12 +211,7 @@ def create_dataset_split(current_dataset, raw_dataset, train_phase, tokenizer, padding="max_length", truncation=True, return_tensors="pt") - chosen_token["input_ids"] = chosen_token["input_ids"] - chosen_token["attention_mask"] = chosen_token["attention_mask"] chosen_dataset.append(chosen_token) - - reject_token["input_ids"] = reject_token["input_ids"] - reject_token["attention_mask"] = reject_token["attention_mask"] reject_dataset.append(reject_token) print( f'Creating dataset {raw_dataset.dataset_name_clean} for {train_phase=} size={len(chosen_dataset)}' From 1842b4fddfcfaedffbb697f53ffe3a57aea718c4 Mon Sep 17 00:00:00 2001 From: stceum <50257864+stceum@users.noreply.github.com> Date: Tue, 7 Jan 2025 03:11:44 +0800 Subject: [PATCH 2/3] Add DPO support for DeepSpeed-Chat (#828) * Add label_smoothing while calculating step2 DPO loss in DeepSpeed-Chat. * Add training scripts for step2 DPO in DeepSpeed-Chat. * Remove unused packages and format the code of step2 DPO in DeepSpeed-Chat. * Update training scripts of step2 DPO in DeepSpeed-Chat. * Follow upstream fixes. * Update README.md for Step2 DPO finetuning. * Add opt 350M training log demo for step 2 dpo finetuning in DeepSpeed-Chat. * Address the formatting issue in step2 dpo finetuning in DeepSpeed-Chat. 
--------- Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> Co-authored-by: Olatunji Ruwase --- .../training/step2_dpo_finetuning/README.md | 26 + .../training/step2_dpo_finetuning/main.py | 528 ++ .../opt-350M_globalBatchSize-32.log | 6409 +++++++++++++++++ .../training_scripts/README.md | 6 + .../training_scripts/llama2/run_llama2_7b.sh | 35 + .../llama2/run_llama2_7b_lora.sh | 37 + .../opt/multi_node/run_350m.sh | 34 + .../opt/single_gpu/run_350m.sh | 20 + .../opt/single_node/run_350m.sh | 34 + .../opt/single_node/sweep/README.md | 20 + .../opt/single_node/sweep/run_single.sh | 46 + .../opt/single_node/sweep/run_step2_sweep.sh | 21 + 12 files changed, 7216 insertions(+) create mode 100644 applications/DeepSpeed-Chat/training/step2_dpo_finetuning/README.md create mode 100755 applications/DeepSpeed-Chat/training/step2_dpo_finetuning/main.py create mode 100644 applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_log_output/opt-350M_globalBatchSize-32.log create mode 100644 applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/README.md create mode 100755 applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/llama2/run_llama2_7b.sh create mode 100755 applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh create mode 100644 applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/multi_node/run_350m.sh create mode 100644 applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_gpu/run_350m.sh create mode 100644 applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/run_350m.sh create mode 100644 applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/sweep/README.md create mode 100644 applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/sweep/run_single.sh create mode 100644 
applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/sweep/run_step2_sweep.sh diff --git a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/README.md b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/README.md new file mode 100644 index 000000000..9a086003c --- /dev/null +++ b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/README.md @@ -0,0 +1,26 @@ +# 🐕 Direct Preference Optimization (DPO) finetuning +[Direct Preference Optimization (DPO)](https://arxiv.org/abs/2305.18290) is a novel approach to preference learning, which directly optimizes the policy without explicit reward modeling or reinforcement learning. It leverages a specific parameterization of the reward model that enables the extraction of the corresponding optimal policy in closed form. By using a simple classification loss, DPO aligns language models with human preferences, avoiding the complexity and instability often associated with RLHF. + +As the paper says, "Your Language Model is Secretly a Reward Model." Therefore, the training arguments and the training process of DPO are mostly the same as those of the reward model, as shown in [step2 "Reward Model (RM) finetuning"](../step2_reward_model_finetuning/README.md). After DPO training, you will get a model that has been aligned with human preferences. + +## 🏃 How to train the model + +We provide the script for OPT-350m, which you can test by launching the command + +```bash +training_scripts/opt/single_node/run_350m.sh +``` + +We also provide the script for llama2, which you can test by launching the command + +```bash +training_scripts/llama2/run_llama2_7b.sh +``` + +## 🏃 How to evaluate the DPO checkpoint? + +The DPO checkpoint is itself a language model, so it can be evaluated in the same way as described in [step1 "Supervised Finetuning"](../step1_supervised_finetuning/README.md). 
+ +## πŸ’ Datasets + +Because DPO treats the language model as a reward model, the dataset for DPO is in the same format as that used for reward model fine-tuning. Each item in the dataset includes one "chosen" and one "rejected" output for the same input. diff --git a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/main.py b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/main.py new file mode 100755 index 000000000..f36e7a69b --- /dev/null +++ b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/main.py @@ -0,0 +1,528 @@ +#!/usr/bin/env python +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +import argparse +import math + +import torch +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler +from torch.utils.data.distributed import DistributedSampler + +from transformers import ( + AutoModelForCausalLM, + SchedulerType, + get_scheduler, +) + +import deepspeed +from deepspeed.ops.adam import DeepSpeedCPUAdam, FusedAdam +from deepspeed import get_accelerator + +from dschat.utils.data.data_utils import create_prompt_dataset, DataCollatorReward +from dschat.utils.utils import print_rank_0, to_device, save_hf_format, set_random_seed, get_all_reduce_mean, get_optimizer_grouped_parameters, save_zero_three_model, load_hf_tokenizer +from dschat.utils.ds_utils import get_train_ds_config, get_eval_ds_config +from dschat.utils.module.lora import convert_linear_layer_to_lora, convert_lora_to_linear_layer, only_optimize_lora_parameters, make_model_gradient_checkpointing_compatible +from dschat.utils.model.model_utils import create_hf_model, causal_lm_model_to_fp32_loss +from dschat.utils.perf import print_throughput + + +def parse_args(): + parser = argparse.ArgumentParser( + description= + "Finetune a transformers model on a causal language modeling task") + parser.add_argument('--data_path', + nargs='*', + default=['Dahoas/rm-static'], + help='Path to the training dataset. 
Accepted format:' + '1) a single data path, 2) multiple datasets in the' + 'form: dataset1-path dataset2-path ...') + parser.add_argument('--data_split', + type=str, + default='2,4,4', + help='Comma-separated list of proportions for training' + 'phase 1, 2, and 3 data. For example the split `6,2,2`' + 'will use 60%% of data for phase 1, 20%% for phase 2' + 'and 20%% for phase 3.') + parser.add_argument( + '--data_output_path', + type=str, + default='/tmp/data_files/', + help= + 'Where to store the data-related files such as shuffle index. This needs to be on a local storage of a node (not on a shared storage)' + ) + parser.add_argument( + "--model_name_or_path", + type=str, + help= + "Path to pretrained model or model identifier from huggingface.co/models.", + required=True, + ) + parser.add_argument( + "--per_device_train_batch_size", + type=int, + default=16, + help="Batch size (per device) for the training dataloader.", + ) + parser.add_argument( + "--per_device_eval_batch_size", + type=int, + default=16, + help="Batch size (per device) for the evaluation dataloader.", + ) + parser.add_argument( + "--max_seq_len", + type=int, + default=512, + help="The maximum sequence length.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=1e-3, + help= + "Initial learning rate (after the potential warmup period) to use.", + ) + parser.add_argument("--weight_decay", + type=float, + default=0., + help="Weight decay to use.") + parser.add_argument("--num_train_epochs", + type=int, + default=1, + help="Total number of training epochs to perform.") + # Reference: https://github.com/eric-mitchell/direct-preference-optimization/blob/main/trainers.py + parser.add_argument( + "--beta", + type=float, + default=1e-1, + help= + "Temperature parameter for the DPO loss, typically something in the range of 0.1 to 0.5. We ignore the reference model as beta -> 0." 
+ ) + parser.add_argument( + "--label_smoothing", + type=float, + default=0.0, + help= + "conservativeness for DPO loss, which assumes that preferences are noisy (flipped with probability label_smoothing)" + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help= + "Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--lr_scheduler_type", + type=SchedulerType, + default="cosine", + help="The scheduler type to use.", + choices=[ + "linear", "cosine", "cosine_with_restarts", "polynomial", + "constant", "constant_with_warmup" + ], + ) + parser.add_argument( + "--num_warmup_steps", + type=int, + default=0, + help="Number of steps for the warmup in the lr scheduler.") + parser.add_argument("--output_dir", + type=str, + default=None, + help="Where to store the model.") + parser.add_argument("--seed", + type=int, + default=1234, + help="A seed for reproducible training.") + parser.add_argument("--local_rank", + type=int, + default=-1, + help="local_rank for distributed training on gpus") + parser.add_argument('--gradient_checkpointing', + action='store_true', + help='Enable HF gradient checkpointing for model.') + parser.add_argument( + "--dropout", + type=float, + default=None, + help="If dropout configured, use it. 
" + "Otherwise, keep the default dropout configuration of the model.") + # deepspeed features + parser.add_argument('--offload', + action='store_true', + help='Enable ZeRO Offload techniques.') + parser.add_argument('--dtype', + type=str, + default='fp16', + choices=['fp16', 'bf16'], + help='Training data type') + parser.add_argument( + '--offload_reference_model', + action='store_true', + help='Enable ZeRO Offload techniques for reference model.') + parser.add_argument( + '--zero_stage', + type=int, + default=0, + help='ZeRO optimization stage for Actor model (and clones).') + ## LoRA for efficient training setting + parser.add_argument("--lora_dim", + type=int, + default=0, + help="If > 0, use LoRA for efficient training.") + parser.add_argument("--lora_module_name", + type=str, + default="decoder.layers.", + help="The scope of LoRA.") + parser.add_argument('--only_optimize_lora', + action='store_true', + help='Only optimize the LoRA parameters.') + parser.add_argument( + "--lora_learning_rate", + type=float, + default=5e-4, + help= + "Initial LoRA learning rate (after the potential warmup period) to use." + ) + ## low precision + parser.add_argument( + '--compute_fp32_loss', + action='store_true', + help='Relevant for low precision dtypes (fp16, bf16, etc.). 
' + 'If specified, loss is calculated in fp32.') + ## Tensorboard logging + parser.add_argument('--enable_tensorboard', + action='store_true', + help='Enable tensorboard logging') + parser.add_argument('--tensorboard_path', + type=str, + default="step2_tensorboard") + ## Tokenizer + parser.add_argument( + "--add_eot_token", + action='store_true', + help="Add <|endoftext|> as additional special token to tokenizer") + ## Print loss + parser.add_argument('--print_loss', + action='store_true', + help='Prints loss at each step.') + parser = deepspeed.add_config_arguments(parser) + args = parser.parse_args() + + return args + + +# Reference: https://github.com/huggingface/trl/blob/main/trl/trainer/dpo_trainer.py +def get_batch_logps(logits, input_ids, label_mask): + labels = input_ids.clone() * label_mask + assert logits.shape[:-1] == labels.shape, \ + "Logits (batch and sequence length dim) and labels must have the same shape." + labels = labels[:, 1:] + label_mask = label_mask[:, 1:] + logits = logits[:, :-1, :] + per_token_logps = torch.gather(logits.log_softmax(-1), + dim=2, + index=labels.unsqueeze(2)).squeeze(2) + return (per_token_logps * label_mask).sum(-1) + + +def main(): + args = parse_args() + + if args.local_rank == -1: + device = torch.device(get_accelerator().device_name()) + else: + get_accelerator().set_device(args.local_rank) + device = torch.device(get_accelerator().device_name(), args.local_rank) + # Initializes the distributed backend which will take care of sychronizing nodes/GPUs + # torch.distributed.init_process_group(backend='nccl') + deepspeed.init_distributed() + + args.global_rank = torch.distributed.get_rank() + + ds_config = get_train_ds_config(offload=args.offload, + dtype=args.dtype, + stage=args.zero_stage, + enable_tensorboard=args.enable_tensorboard, + tb_path=args.tensorboard_path, + tb_name="step2_model") + ds_config[ + 'train_micro_batch_size_per_gpu'] = args.per_device_train_batch_size + ds_config[ + 'train_batch_size'] = 
args.per_device_train_batch_size * torch.distributed.get_world_size( + ) * args.gradient_accumulation_steps + + # If passed along, set the training seed now. + set_random_seed(args.seed) + + torch.distributed.barrier() + + # load_hf_tokenizer will get the correct tokenizer and set padding tokens based on the model family + args.end_of_conversation_token = "<|endoftext|>" + additional_special_tokens = args.end_of_conversation_token if args.add_eot_token else None + tokenizer = load_hf_tokenizer(args.model_name_or_path, + fast_tokenizer=True, + add_special_tokens=additional_special_tokens) + + model = create_hf_model(AutoModelForCausalLM, + args.model_name_or_path, + tokenizer, + ds_config, + dropout=args.dropout) + + # DS Config for ref model + ref_zero_stage = args.zero_stage + if ref_zero_stage != 3: + # If it is ZeRO-3 then we use it for everything, otherwise assume we have enough memory for ref model + ref_zero_stage = 0 + ref_ds_config = get_eval_ds_config(args.offload_reference_model, + args.dtype, ref_zero_stage) + ref_ds_config[ + 'train_micro_batch_size_per_gpu'] = args.per_device_train_batch_size + ref_ds_config[ + 'train_batch_size'] = args.per_device_train_batch_size * torch.distributed.get_world_size( + ) * args.gradient_accumulation_steps + ref_ds_eval_config = get_eval_ds_config(offload=False, + dtype=args.dtype, + stage=ref_zero_stage) + ref_ds_eval_config[ + 'train_micro_batch_size_per_gpu'] = args.per_device_train_batch_size + ref_ds_eval_config[ + 'train_batch_size'] = args.per_device_train_batch_size * torch.distributed.get_world_size( + ) * args.gradient_accumulation_steps + ref_model = create_hf_model(AutoModelForCausalLM, + args.model_name_or_path, + tokenizer, + ref_ds_eval_config, + dropout=args.dropout) + # End of DS config for ref model + + if args.compute_fp32_loss: + print_rank_0( + f"Using model {model.__class__.__name__} with loss in fp32", + args.global_rank) + causal_lm_model_to_fp32_loss(model) + + # Copied from 
../step2_reward_model_finetuning/main.py. + # Model bigscience/bloom-560m has large variance at ln_f.weight parameter + # This makes bf16 finetuning hard. + # In general, since we are replacing the model head, it makes sense to reset + # the LN that precedes it. + force_optimize_params = [] + if "bigscience/bloom-" in args.model_name_or_path: + zero_init_enabled = (args.zero_stage == 3) + params = [ + model.rwtranrsformer.ln_f.weight, model.rwtranrsformer.ln_f.bias + ] + with deepspeed.zero.GatheredParameters(params, + modifier_rank=0, + enabled=zero_init_enabled): + if deepspeed.comm.get_rank() == 0 or not zero_init_enabled: + torch.nn.init.ones_(model.rwtransformer.ln_f.weight) + torch.nn.init.zeros_(model.rwtransformer.ln_f.bias) + force_optimize_params.extend( + ['rwtransformer.ln_f.weight', 'rwtransformer.ln_f.bias']) + + if args.lora_dim > 0: + model = convert_linear_layer_to_lora(model, args.lora_module_name, + args.lora_dim) + if args.only_optimize_lora: + model = only_optimize_lora_parameters(model) + model = make_model_gradient_checkpointing_compatible(model) + + # Prepare the data + train_phase = 2 + train_dataset, eval_dataset = create_prompt_dataset( + args.local_rank, args.data_path, args.data_split, + args.data_output_path, train_phase, args.seed, tokenizer, + args.max_seq_len) + + # DataLoaders creation: + data_collator = DataCollatorReward() + if args.local_rank == -1: + train_sampler = RandomSampler(train_dataset) + eval_sampler = SequentialSampler(eval_dataset) + else: + train_sampler = DistributedSampler(train_dataset) + eval_sampler = DistributedSampler(eval_dataset) + train_dataloader = DataLoader(train_dataset, + collate_fn=data_collator, + sampler=train_sampler, + batch_size=args.per_device_train_batch_size) + eval_dataloader = DataLoader(eval_dataset, + collate_fn=data_collator, + sampler=eval_sampler, + batch_size=args.per_device_eval_batch_size) + + def evaluation(model, ref_model, tokenizer, eval_dataloader): + model.eval() + losses = 0 
+ for step, batch in enumerate(eval_dataloader): + batch = to_device(batch, device) + batch_size = batch['input_ids'].shape[0] // 2 + chosen_input_ids = batch['input_ids'][:batch_size] + rejected_input_ids = batch['input_ids'][batch_size:] + label_mask = (batch['input_ids'] != tokenizer.pad_token_id).int() + for i in range(batch_size): + divergence_ind = (chosen_input_ids[i] != + rejected_input_ids[i]).nonzero().squeeze(-1) + if len(divergence_ind) > 0: + divergence_ind = divergence_ind[0] + else: + divergence_ind = 0 + label_mask[i][:divergence_ind] = 0 + label_mask[i + batch_size][:divergence_ind] = 0 + with torch.no_grad(): + outputs = model(**batch) + ref_outputs = ref_model(**batch) + + logps = get_batch_logps(outputs.logits, batch['input_ids'], + label_mask) + ref_logps = get_batch_logps(ref_outputs.logits, batch['input_ids'], + label_mask) + + chosen_logps = logps[:batch_size] + rejected_logps = logps[batch_size:] + ref_chosen_logps = ref_logps[:batch_size] + ref_rejected_logps = ref_logps[batch_size:] + + logits = args.beta * ((chosen_logps - ref_chosen_logps) - + (rejected_logps - ref_rejected_logps)) + loss = (- torch.nn.functional.logsigmoid(logits) * (1 - args.label_smoothing) - \ + torch.nn.functional.logsigmoid(-logits) * args.label_smoothing).mean(0) + losses += loss.float() + losses = losses / (step + 1) + try: + losses = get_all_reduce_mean(losses) + except: + pass + chosen_rewards = args.beta * (chosen_logps - ref_chosen_logps).detach() + rejected_rewards = args.beta * (rejected_logps - + ref_rejected_logps).detach() + return chosen_rewards.mean().item(), rejected_rewards.mean().item( + ), losses.item() + + # Split weights in two groups, one with weight decay and the other not. 
+ optimizer_grouped_parameters = get_optimizer_grouped_parameters( + model, args.weight_decay, args.lora_learning_rate) + + AdamOptimizer = DeepSpeedCPUAdam if args.offload else FusedAdam + optimizer = AdamOptimizer(optimizer_grouped_parameters, + lr=args.learning_rate, + betas=(0.9, 0.95)) + + num_update_steps_per_epoch = math.ceil( + len(train_dataloader) / args.gradient_accumulation_steps) + lr_scheduler = get_scheduler( + name=args.lr_scheduler_type, + optimizer=optimizer, + num_warmup_steps=args.num_warmup_steps, + num_training_steps=args.num_train_epochs * num_update_steps_per_epoch, + ) + + model, optimizer, _, lr_scheduler = deepspeed.initialize( + model=model, + optimizer=optimizer, + args=args, + config=ds_config, + lr_scheduler=lr_scheduler, + dist_init_required=True) + ref_model, *_ = deepspeed.initialize(model=ref_model, config=ref_ds_config) + ref_model.eval() + + if args.gradient_checkpointing: + model.gradient_checkpointing_enable() + + # Train! + print_rank_0("***** Running training *****", args.global_rank) + print_rank_0( + f"***** Evaluating rewards, Epoch {1}/{args.num_train_epochs} *****", + args.global_rank) + chosen_rewards, rejected_rewards, eval_loss = evaluation( + model, ref_model, tokenizer, eval_dataloader) + print_rank_0( + f"chosen: {chosen_rewards}, rejected: {rejected_rewards}, loss: {eval_loss}", + args.global_rank) + + for epoch in range(args.num_train_epochs): + print_rank_0( + f"Beginning of Epoch {epoch+1}/{args.num_train_epochs}, Total Micro Batches {len(train_dataloader)}", + args.global_rank) + model.train() + import time + for step, batch in enumerate(train_dataloader): + start = time.time() + batch = to_device(batch, device) + batch_size = batch['input_ids'].shape[0] // 2 + chosen_input_ids = batch['input_ids'][:batch_size] + rejected_input_ids = batch['input_ids'][batch_size:] + label_mask = (batch['input_ids'] != tokenizer.pad_token_id).int() + for i in range(batch_size): + divergence_ind = (chosen_input_ids[i] != + 
rejected_input_ids[i]).nonzero().squeeze(-1) + if len(divergence_ind) > 0: + divergence_ind = divergence_ind[0] + else: + divergence_ind = 0 + label_mask[i][:divergence_ind] = 0 + label_mask[i + batch_size][:divergence_ind] = 0 + outputs = model(**batch, use_cache=False) + with torch.no_grad(): + ref_outputs = ref_model(**batch) + + logps = get_batch_logps(outputs.logits, batch['input_ids'], + label_mask) + ref_logps = get_batch_logps(ref_outputs.logits, batch['input_ids'], + label_mask) + + chosen_logps = logps[:batch_size] + rejected_logps = logps[batch_size:] + ref_chosen_logps = ref_logps[:batch_size] + ref_rejected_logps = ref_logps[batch_size:] + + logits = args.beta * ((chosen_logps - ref_chosen_logps) - + (rejected_logps - ref_rejected_logps)) + loss = (- torch.nn.functional.logsigmoid(logits) * (1 - args.label_smoothing) - \ + torch.nn.functional.logsigmoid(-logits) * args.label_smoothing).mean(0) + if args.print_loss: + print( + f"Epoch: {epoch}, Step: {step}, Rank: {torch.distributed.get_rank()}, loss = {loss}" + ) + model.backward(loss) + model.step() + end = time.time() + if torch.distributed.get_rank() == 0: + print_throughput(model.model, args, end - start, + args.global_rank) + + # Evaluate rewards on the validation set. 
+ print_rank_0( + f"***** Evaluating rewards, Epoch {epoch+1}/{args.num_train_epochs} *****", + args.global_rank) + chosen_rewards, rejected_rewards, eval_loss = evaluation( + model, ref_model, tokenizer, eval_dataloader) + print_rank_0( + f"chosen: {chosen_rewards}, rejected: {rejected_rewards}, loss: {eval_loss}", + args.global_rank) + model.tput_timer.update_epoch_count() + + if args.output_dir is not None: + print_rank_0('saving the final model ...', args.global_rank) + model = convert_lora_to_linear_layer(model) + + if args.global_rank == 0: + save_hf_format(model, tokenizer, args) + + if args.zero_stage == 3: + # For zero stage 3, each gpu only has a part of the model, so we need a special save function + save_zero_three_model(model, + args.global_rank, + args.output_dir, + zero_stage=args.zero_stage) + + +if __name__ == "__main__": + main() diff --git a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_log_output/opt-350M_globalBatchSize-32.log b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_log_output/opt-350M_globalBatchSize-32.log new file mode 100644 index 000000000..2f75b0a5d --- /dev/null +++ b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_log_output/opt-350M_globalBatchSize-32.log @@ -0,0 +1,6409 @@ +[2025-01-02 15:45:19,529] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-01-02 15:45:22,160] [WARNING] [runner.py:215:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only. 
+[2025-01-02 15:45:22,160] [INFO] [runner.py:607:main] cmd = .venv/dsexamples/bin/python3 -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMSwgMiwgMywgNCwgNSwgNiwgN119 --master_addr=127.0.0.1 --master_port=29500 --enable_each_rank_log=None main.py --data_path Dahoas/rm-static Dahoas/full-hh-rlhf --data_split 2,4,4 --model_name_or_path facebook/opt-350m --per_device_train_batch_size 2 --per_device_eval_batch_size 4 --max_seq_len 512 --learning_rate 5e-5 --weight_decay 0.1 --num_train_epochs 1 --dropout 0.0 --gradient_accumulation_steps 2 --lr_scheduler_type cosine --num_warmup_steps 0 --seed 1234 --zero_stage 0 --deepspeed --output_dir ./output +[2025-01-02 15:45:24,254] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-01-02 15:45:26,831] [INFO] [launch.py:146:main] WORLD INFO DICT: {'localhost': [0, 1, 2, 3, 4, 5, 6, 7]} +[2025-01-02 15:45:26,831] [INFO] [launch.py:152:main] nnodes=1, num_local_procs=8, node_rank=0 +[2025-01-02 15:45:26,831] [INFO] [launch.py:163:main] global_rank_mapping=defaultdict(, {'localhost': [0, 1, 2, 3, 4, 5, 6, 7]}) +[2025-01-02 15:45:26,832] [INFO] [launch.py:164:main] dist_world_size=8 +[2025-01-02 15:45:26,832] [INFO] [launch.py:168:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +[2025-01-02 15:45:26,832] [INFO] [launch.py:256:main] process 106031 spawned with command: ['.venv/dsexamples/bin/python3', '-u', 'main.py', '--local_rank=0', '--data_path', 'Dahoas/rm-static', 'Dahoas/full-hh-rlhf', '--data_split', '2,4,4', '--model_name_or_path', 'facebook/opt-350m', '--per_device_train_batch_size', '2', '--per_device_eval_batch_size', '4', '--max_seq_len', '512', '--learning_rate', '5e-5', '--weight_decay', '0.1', '--num_train_epochs', '1', '--dropout', '0.0', '--gradient_accumulation_steps', '2', '--lr_scheduler_type', 'cosine', '--num_warmup_steps', '0', '--seed', '1234', '--zero_stage', '0', '--deepspeed', '--output_dir', './output'] +[2025-01-02 15:45:26,833] 
[INFO] [launch.py:256:main] process 106032 spawned with command: ['.venv/dsexamples/bin/python3', '-u', 'main.py', '--local_rank=1', '--data_path', 'Dahoas/rm-static', 'Dahoas/full-hh-rlhf', '--data_split', '2,4,4', '--model_name_or_path', 'facebook/opt-350m', '--per_device_train_batch_size', '2', '--per_device_eval_batch_size', '4', '--max_seq_len', '512', '--learning_rate', '5e-5', '--weight_decay', '0.1', '--num_train_epochs', '1', '--dropout', '0.0', '--gradient_accumulation_steps', '2', '--lr_scheduler_type', 'cosine', '--num_warmup_steps', '0', '--seed', '1234', '--zero_stage', '0', '--deepspeed', '--output_dir', './output'] +[2025-01-02 15:45:26,833] [INFO] [launch.py:256:main] process 106033 spawned with command: ['.venv/dsexamples/bin/python3', '-u', 'main.py', '--local_rank=2', '--data_path', 'Dahoas/rm-static', 'Dahoas/full-hh-rlhf', '--data_split', '2,4,4', '--model_name_or_path', 'facebook/opt-350m', '--per_device_train_batch_size', '2', '--per_device_eval_batch_size', '4', '--max_seq_len', '512', '--learning_rate', '5e-5', '--weight_decay', '0.1', '--num_train_epochs', '1', '--dropout', '0.0', '--gradient_accumulation_steps', '2', '--lr_scheduler_type', 'cosine', '--num_warmup_steps', '0', '--seed', '1234', '--zero_stage', '0', '--deepspeed', '--output_dir', './output'] +[2025-01-02 15:45:26,834] [INFO] [launch.py:256:main] process 106034 spawned with command: ['.venv/dsexamples/bin/python3', '-u', 'main.py', '--local_rank=3', '--data_path', 'Dahoas/rm-static', 'Dahoas/full-hh-rlhf', '--data_split', '2,4,4', '--model_name_or_path', 'facebook/opt-350m', '--per_device_train_batch_size', '2', '--per_device_eval_batch_size', '4', '--max_seq_len', '512', '--learning_rate', '5e-5', '--weight_decay', '0.1', '--num_train_epochs', '1', '--dropout', '0.0', '--gradient_accumulation_steps', '2', '--lr_scheduler_type', 'cosine', '--num_warmup_steps', '0', '--seed', '1234', '--zero_stage', '0', '--deepspeed', '--output_dir', './output'] +[2025-01-02 15:45:26,834] 
[INFO] [launch.py:256:main] process 106035 spawned with command: ['.venv/dsexamples/bin/python3', '-u', 'main.py', '--local_rank=4', '--data_path', 'Dahoas/rm-static', 'Dahoas/full-hh-rlhf', '--data_split', '2,4,4', '--model_name_or_path', 'facebook/opt-350m', '--per_device_train_batch_size', '2', '--per_device_eval_batch_size', '4', '--max_seq_len', '512', '--learning_rate', '5e-5', '--weight_decay', '0.1', '--num_train_epochs', '1', '--dropout', '0.0', '--gradient_accumulation_steps', '2', '--lr_scheduler_type', 'cosine', '--num_warmup_steps', '0', '--seed', '1234', '--zero_stage', '0', '--deepspeed', '--output_dir', './output'] +[2025-01-02 15:45:26,835] [INFO] [launch.py:256:main] process 106036 spawned with command: ['.venv/dsexamples/bin/python3', '-u', 'main.py', '--local_rank=5', '--data_path', 'Dahoas/rm-static', 'Dahoas/full-hh-rlhf', '--data_split', '2,4,4', '--model_name_or_path', 'facebook/opt-350m', '--per_device_train_batch_size', '2', '--per_device_eval_batch_size', '4', '--max_seq_len', '512', '--learning_rate', '5e-5', '--weight_decay', '0.1', '--num_train_epochs', '1', '--dropout', '0.0', '--gradient_accumulation_steps', '2', '--lr_scheduler_type', 'cosine', '--num_warmup_steps', '0', '--seed', '1234', '--zero_stage', '0', '--deepspeed', '--output_dir', './output'] +[2025-01-02 15:45:26,835] [INFO] [launch.py:256:main] process 106037 spawned with command: ['.venv/dsexamples/bin/python3', '-u', 'main.py', '--local_rank=6', '--data_path', 'Dahoas/rm-static', 'Dahoas/full-hh-rlhf', '--data_split', '2,4,4', '--model_name_or_path', 'facebook/opt-350m', '--per_device_train_batch_size', '2', '--per_device_eval_batch_size', '4', '--max_seq_len', '512', '--learning_rate', '5e-5', '--weight_decay', '0.1', '--num_train_epochs', '1', '--dropout', '0.0', '--gradient_accumulation_steps', '2', '--lr_scheduler_type', 'cosine', '--num_warmup_steps', '0', '--seed', '1234', '--zero_stage', '0', '--deepspeed', '--output_dir', './output'] +[2025-01-02 15:45:26,836] 
[INFO] [launch.py:256:main] process 106038 spawned with command: ['.venv/dsexamples/bin/python3', '-u', 'main.py', '--local_rank=7', '--data_path', 'Dahoas/rm-static', 'Dahoas/full-hh-rlhf', '--data_split', '2,4,4', '--model_name_or_path', 'facebook/opt-350m', '--per_device_train_batch_size', '2', '--per_device_eval_batch_size', '4', '--max_seq_len', '512', '--learning_rate', '5e-5', '--weight_decay', '0.1', '--num_train_epochs', '1', '--dropout', '0.0', '--gradient_accumulation_steps', '2', '--lr_scheduler_type', 'cosine', '--num_warmup_steps', '0', '--seed', '1234', '--zero_stage', '0', '--deepspeed', '--output_dir', './output'] +[2025-01-02 15:45:30,732] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-01-02 15:45:30,910] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-01-02 15:45:31,042] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-01-02 15:45:31,049] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-01-02 15:45:31,103] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-01-02 15:45:31,135] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-01-02 15:45:31,144] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-01-02 15:45:31,147] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-01-02 15:45:33,071] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-01-02 15:45:33,584] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-01-02 15:45:33,584] [INFO] [comm.py:683:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +[rank3]:[W102 15:45:34.866823429 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 3] using GPU 3 to perform barrier as 
devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id. +[2025-01-02 15:45:34,383] [INFO] [comm.py:652:init_distributed] cdb=None +[rank4]:[W102 15:45:34.247342944 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 4] using GPU 4 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id. +[2025-01-02 15:45:34,450] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-01-02 15:45:34,472] [INFO] [comm.py:652:init_distributed] cdb=None +[2025-01-02 15:45:34,472] [INFO] [comm.py:652:init_distributed] cdb=None +[rank2]:[W102 15:45:34.334422354 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 2] using GPU 2 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id. +[rank1]:[W102 15:45:34.339768589 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 1] using GPU 1 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id. +[rank6]:[W102 15:45:34.340404849 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 6] using GPU 6 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id. 
+[2025-01-02 15:45:34,557] [INFO] [comm.py:652:init_distributed] cdb=None +[rank7]:[W102 15:45:34.425895009 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 7] using GPU 7 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id. +[2025-01-02 15:45:34,604] [INFO] [comm.py:652:init_distributed] cdb=None +[rank5]:[W102 15:45:34.470924200 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 5] using GPU 5 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id. +[rank0]:[W102 15:45:34.726812378 ProcessGroupNCCL.cpp:4115] [PG ID 0 PG GUID 0 Rank 0] using GPU 0 to perform barrier as devices used by this process are currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect.Specify device_ids in barrier() to force use of a particular device,or call init_process_group() with a device_id. 
+Setting model_config.dropout to 0.0 +Setting model_config.attention_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +Setting model_config.dropout to 0.0 +Setting model_config.attention_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +Setting model_config.dropout to 0.0Setting model_config.attention_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +Setting model_config.dropout to 0.0 +Setting model_config.attention_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +Setting model_config.dropout to 0.0 +Setting model_config.attention_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +Setting model_config.dropout to 0.0 +Setting model_config.attention_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +Setting model_config.dropout to 0.0 +Setting model_config.attention_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +Setting model_config.dropout to 0.0 +Setting model_config.attention_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +Setting model_config.dropout to 0.0 +Setting model_config.attention_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +Setting model_config.dropout to 0.0 +Setting model_config.attention_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +Setting model_config.dropout to 0.0 +Setting model_config.attention_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +Setting model_config.dropout to 0.0 +Setting model_config.attention_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +Setting model_config.dropout to 0.0 +Setting model_config.attention_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +Setting model_config.dropout to 0.0 +Setting model_config.attention_dropout to 0.0 +Setting model_config.activation_dropout to 0.0 +Setting model_config.dropout to 0.0 +Setting model_config.attention_dropout to 0.0 +Setting 
model_config.activation_dropout to 0.0 +DeepspeedExamples/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py:378: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + return torch.load(train_fname), torch.load(eval_fname) +DeepspeedExamples/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py:378: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature. + return torch.load(train_fname), torch.load(eval_fname) +DeepspeedExamples/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py:378: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + return torch.load(train_fname), torch.load(eval_fname) +DeepspeedExamples/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py:378: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. 
We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + return torch.load(train_fname), torch.load(eval_fname) +DeepspeedExamples/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py:378: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + return torch.load(train_fname), torch.load(eval_fname) +DeepspeedExamples/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py:378: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. 
Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + return torch.load(train_fname), torch.load(eval_fname) +DeepspeedExamples/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py:378: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + return torch.load(train_fname), torch.load(eval_fname) +DeepspeedExamples/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py:378: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. 
This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + return torch.load(train_fname), torch.load(eval_fname) +Using .cache/torch_extensions/py310_cu121 as PyTorch extensions root... +Detected CUDA files, patching ldflags +Emitting ninja build file .cache/torch_extensions/py310_cu121/fused_adam/build.ninja... +.venv/dsexamples/lib/python3.10/site-packages/torch/utils/cpp_extension.py:1964: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. +If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST']. + warnings.warn( +Building extension module fused_adam... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +ninja: no work to do. +Loading extension module fused_adam... +Time to load fused_adam op: 0.05063033103942871 seconds +[2025-01-02 15:46:44,140] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +Using .cache/torch_extensions/py310_cu121 as PyTorch extensions root... +Detected CUDA files, patching ldflags +Emitting ninja build file .cache/torch_extensions/py310_cu121/fused_adam/build.ninja... +.venv/dsexamples/lib/python3.10/site-packages/torch/utils/cpp_extension.py:1964: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. +If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST']. + warnings.warn( +Building extension module fused_adam... +Allowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N) +ninja: no work to do. +Loading extension module fused_adam... +Time to load fused_adam op: 0.04431414604187012 seconds +[2025-01-02 15:46:44,304] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +Using .cache/torch_extensions/py310_cu121 as PyTorch extensions root... +Detected CUDA files, patching ldflags +Emitting ninja build file .cache/torch_extensions/py310_cu121/fused_adam/build.ninja... +.venv/dsexamples/lib/python3.10/site-packages/torch/utils/cpp_extension.py:1964: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. +If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST']. + warnings.warn( +Building extension module fused_adam... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +ninja: no work to do. +Loading extension module fused_adam... +Time to load fused_adam op: 0.11332392692565918 seconds +[2025-01-02 15:46:44,637] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed info: version=0.16.2, git-hash=unknown, git-branch=unknown +[2025-01-02 15:46:44,637] [INFO] [comm.py:677:init_distributed] Distributed backend already initialized +[2025-01-02 15:46:44,637] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +Using .cache/torch_extensions/py310_cu121 as PyTorch extensions root... +Detected CUDA files, patching ldflags +Emitting ninja build file .cache/torch_extensions/py310_cu121/fused_adam/build.ninja... +.venv/dsexamples/lib/python3.10/site-packages/torch/utils/cpp_extension.py:1964: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. +If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST']. + warnings.warn( +Building extension module fused_adam... +Allowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N) +ninja: no work to do. +Loading extension module fused_adam... +Time to load fused_adam op: 0.10501360893249512 seconds +[2025-01-02 15:46:45,431] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +Using .cache/torch_extensions/py310_cu121 as PyTorch extensions root... +Detected CUDA files, patching ldflags +Emitting ninja build file .cache/torch_extensions/py310_cu121/fused_adam/build.ninja... +.venv/dsexamples/lib/python3.10/site-packages/torch/utils/cpp_extension.py:1964: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. +If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST']. + warnings.warn( +Building extension module fused_adam... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +ninja: no work to do. +Loading extension module fused_adam... +Time to load fused_adam op: 0.033315420150756836 seconds +[2025-01-02 15:46:48,423] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +Using .cache/torch_extensions/py310_cu121 as PyTorch extensions root... +Detected CUDA files, patching ldflags +Emitting ninja build file .cache/torch_extensions/py310_cu121/fused_adam/build.ninja... +.venv/dsexamples/lib/python3.10/site-packages/torch/utils/cpp_extension.py:1964: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. +If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST']. + warnings.warn( +Building extension module fused_adam... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +ninja: no work to do. +Loading extension module fused_adam... 
+Time to load fused_adam op: 0.1015481948852539 seconds +[2025-01-02 15:46:49,014] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +Using .cache/torch_extensions/py310_cu121 as PyTorch extensions root... +Using .cache/torch_extensions/py310_cu121 as PyTorch extensions root... +Detected CUDA files, patching ldflags +Emitting ninja build file .cache/torch_extensions/py310_cu121/fused_adam/build.ninja... +.venv/dsexamples/lib/python3.10/site-packages/torch/utils/cpp_extension.py:1964: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. +If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST']. + warnings.warn( +Building extension module fused_adam... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +ninja: no work to do. +Loading extension module fused_adam... +Time to load fused_adam op: 0.13011908531188965 seconds +[2025-01-02 15:46:50,464] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +Loading extension module fused_adam... 
+Time to load fused_adam op: 0.20447063446044922 seconds +[2025-01-02 15:46:50,545] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +[2025-01-02 15:46:55,765] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +[2025-01-02 15:46:55,766] [INFO] [logging.py:128:log_dist] [Rank 0] Using client Optimizer as basic optimizer +[2025-01-02 15:46:55,767] [INFO] [logging.py:128:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer +[2025-01-02 15:46:55,785] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam +[2025-01-02 15:46:55,785] [INFO] [logging.py:128:log_dist] [Rank 0] Creating fp16 optimizer with dynamic loss scale +[2025-01-02 15:46:55,967] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed Final Optimizer = FP16_Optimizer +[2025-01-02 15:46:55,967] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed using client LR scheduler +[2025-01-02 15:46:55,967] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed LR Scheduler = +[2025-01-02 15:46:55,967] [INFO] [logging.py:128:log_dist] [Rank 0] step=0, skipped=0, lr=[5e-05, 5e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:46:55,968] [INFO] [config.py:999:print] DeepSpeedEngine configuration: +[2025-01-02 15:46:55,969] [INFO] [config.py:1003:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + "synchronize_checkpoint_boundary": false, + "profile": false +} +[2025-01-02 15:46:55,969] [INFO] [config.py:1003:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True, 'use_gds': False} +[2025-01-02 15:46:55,969] [INFO] [config.py:1003:print] amp_enabled .................. False +[2025-01-02 15:46:55,969] [INFO] [config.py:1003:print] amp_params ................... 
False +[2025-01-02 15:46:55,969] [INFO] [config.py:1003:print] autotuning_config ............ { + "enabled": false, + "start_step": null, + "end_step": null, + "metric_path": null, + "arg_mappings": null, + "metric": "throughput", + "model_info": null, + "results_dir": "autotuning_results", + "exps_dir": "autotuning_exps", + "overwrite": true, + "fast": true, + "start_profile_step": 3, + "end_profile_step": 5, + "tuner_type": "gridsearch", + "tuner_early_stopping": 5, + "tuner_num_trials": 50, + "model_info_path": null, + "mp_size": 1, + "max_train_batch_size": null, + "min_train_batch_size": 1, + "max_train_micro_batch_size_per_gpu": 1.024000e+03, + "min_train_micro_batch_size_per_gpu": 1, + "num_tuning_micro_batch_sizes": 3 +} +[2025-01-02 15:46:55,969] [INFO] [config.py:1003:print] bfloat16_enabled ............. False +[2025-01-02 15:46:55,969] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +[2025-01-02 15:46:55,969] [INFO] [config.py:1003:print] checkpoint_parallel_write_pipeline False +[2025-01-02 15:46:55,969] [INFO] [config.py:1003:print] checkpoint_tag_validation_enabled True +[2025-01-02 15:46:55,969] [INFO] [config.py:1003:print] checkpoint_tag_validation_fail False +[2025-01-02 15:46:55,969] [INFO] [config.py:1003:print] comms_config ................. +[2025-01-02 15:46:55,969] [INFO] [config.py:1003:print] communication_data_type ...... None +[2025-01-02 15:46:55,969] [INFO] [config.py:1003:print] compression_config ........... 
{'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] curriculum_enabled_legacy .... False +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] curriculum_params_legacy ..... False +[2025-01-02 15:46:55,969] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}} +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] data_efficiency_enabled ...... False +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] dataloader_drop_last ......... False +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] disable_allgather ............ 
False +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] dump_state ................... False +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] dynamic_loss_scale_args ...... {'init_scale': 65536, 'scale_window': 100, 'delayed_shift': 2, 'consecutive_hysteresis': False, 'min_scale': 1} +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] eigenvalue_enabled ........... False +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] eigenvalue_gas_boundary_resolution 1 +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] eigenvalue_layer_name ........ bert.encoder.layer +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] eigenvalue_layer_num ......... 0 +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] eigenvalue_max_iter .......... 100 +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] eigenvalue_stability ......... 1e-06 +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] eigenvalue_tol ............... 0.01 +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] eigenvalue_verbose ........... False +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] elasticity_enabled ........... False +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] flops_profiler_config ........ { + "enabled": false, + "recompute_fwd_factor": 0.0, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] fp16_auto_cast ............... False +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] fp16_enabled ................. True +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] fp16_master_weights_and_gradients False +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] global_rank .................. 0 +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] grad_accum_dtype ............. None +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] gradient_accumulation_steps .. 
2 +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] gradient_clipping ............ 1.0 +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] gradient_predivide_factor .... 1.0 +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] graph_harvesting ............. False +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8 +[2025-01-02 15:46:55,970] [INFO] [config.py:1003:print] initial_dynamic_scale ........ 65536 +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] load_universal_checkpoint .... False +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] loss_scale ................... 0 +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] memory_breakdown ............. False +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] mics_hierarchial_params_gather False +[2025-01-02 15:46:55,971] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='step2_tensorboard/ds_tensorboard_logs/', job_name='step2_model_tensorboard') comet=CometConfig(enabled=False, samples_log_interval=100, project=None, workspace=None, api_key=None, experiment_name=None, experiment_key=None, online=None, mode=None) wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] nebula_config ................ { + "enabled": false, + "persistent_storage_path": null, + "persistent_time_interval": 100, + "num_of_version_in_retention": 2, + "enable_nebula_load": true, + "load_path": null +} +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] optimizer_legacy_fusion ...... 
False +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] optimizer_name ............... None +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] optimizer_params ............. None +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True} +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] pld_enabled .................. False +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] pld_params ................... False +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] prescale_gradients ........... False +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] scheduler_name ............... None +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] scheduler_params ............. None +[2025-01-02 15:46:55,971] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] sparse_attention ............. None +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] sparse_gradients_enabled ..... False +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] steps_per_print .............. 10 +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] timers_config ................ enabled=True synchronized=True +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] train_batch_size ............. 32 +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] train_micro_batch_size_per_gpu 2 +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] use_data_before_expert_parallel_ False +[2025-01-02 15:46:55,971] [INFO] [config.py:1003:print] use_node_local_storage ....... False +[2025-01-02 15:46:55,972] [INFO] [config.py:1003:print] wall_clock_breakdown ......... False +[2025-01-02 15:46:55,972] [INFO] [config.py:1003:print] weight_quantization_config ... 
None +[2025-01-02 15:46:55,972] [INFO] [config.py:1003:print] world_size ................... 8 +[2025-01-02 15:46:55,972] [INFO] [config.py:1003:print] zero_allow_untested_optimizer False +[2025-01-02 15:46:55,972] [INFO] [config.py:1003:print] zero_config .................. stage=0 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=DeepSpeedZeroOffloadParamConfig(device='none', nvme_path=None, buffer_count=5, buffer_size=100000000, max_in_cpu=1000000000, pin_memory=False) offload_optimizer=DeepSpeedZeroOffloadOptimizerConfig(device='none', nvme_path=None, buffer_count=4, pin_memory=False, pipeline_read=False, pipeline_write=False, fast_init=False, ratio=1.0) sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=30000000 param_persistence_threshold=10000 model_persistence_threshold=9223372036854775807 max_live_parameters=30000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False module_granularity_threshold=0 use_all_reduce_for_fetch_params=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False zeropp_loco_param=None mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=False pipeline_loading_checkpoint=False override_module_apply=True +[2025-01-02 15:46:55,972] [INFO] [config.py:1003:print] zero_enabled ................. False +[2025-01-02 15:46:55,972] [INFO] [config.py:1003:print] zero_force_ds_cpu_optimizer .. True +[2025-01-02 15:46:55,972] [INFO] [config.py:1003:print] zero_optimization_stage ...... 
0 +[2025-01-02 15:46:55,972] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +[2025-01-02 15:46:55,972] [INFO] [config.py:989:print_user_config] json = { + "train_batch_size": 32, + "train_micro_batch_size_per_gpu": 2, + "steps_per_print": 10, + "zero_optimization": { + "stage": 0, + "overlap_comm": true, + "offload_param": { + "device": "none" + }, + "offload_optimizer": { + "device": "none" + }, + "stage3_param_persistence_threshold": 1.000000e+04, + "stage3_max_live_parameters": 3.000000e+07, + "stage3_prefetch_bucket_size": 3.000000e+07, + "memory_efficient_linear": false + }, + "fp16": { + "enabled": true, + "loss_scale_window": 100 + }, + "gradient_clipping": 1.0, + "prescale_gradients": false, + "wall_clock_breakdown": false, + "hybrid_engine": { + "enabled": false, + "max_out_tokens": 512, + "inference_tp_size": 1, + "release_inference_cache": false, + "pin_parameters": true, + "tp_gather_partition_size": 8 + }, + "tensorboard": { + "enabled": false, + "output_path": "step2_tensorboard/ds_tensorboard_logs/", + "job_name": "step2_model_tensorboard" + } +} +[2025-01-02 15:46:55,972] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed info: version=0.16.2, git-hash=unknown, git-branch=unknown +[2025-01-02 15:46:55,972] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +[2025-01-02 15:46:55,978] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +[2025-01-02 15:46:56,023] [INFO] [config.py:733:__init__] Config mesh_device None world_size = 8 +[2025-01-02 15:47:02,563] [INFO] [logging.py:128:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +[2025-01-02 15:47:02,565] [INFO] [config.py:999:print] DeepSpeedEngine configuration: +[2025-01-02 15:47:02,565] [INFO] [config.py:1003:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + "synchronize_checkpoint_boundary": 
false, + "profile": false +} +[2025-01-02 15:47:02,565] [INFO] [config.py:1003:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True, 'use_gds': False} +[2025-01-02 15:47:02,565] [INFO] [config.py:1003:print] amp_enabled .................. False +[2025-01-02 15:47:02,565] [INFO] [config.py:1003:print] amp_params ................... False +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] autotuning_config ............ { + "enabled": false, + "start_step": null, + "end_step": null, + "metric_path": null, + "arg_mappings": null, + "metric": "throughput", + "model_info": null, + "results_dir": "autotuning_results", + "exps_dir": "autotuning_exps", + "overwrite": true, + "fast": true, + "start_profile_step": 3, + "end_profile_step": 5, + "tuner_type": "gridsearch", + "tuner_early_stopping": 5, + "tuner_num_trials": 50, + "model_info_path": null, + "mp_size": 1, + "max_train_batch_size": null, + "min_train_batch_size": 1, + "max_train_micro_batch_size_per_gpu": 1.024000e+03, + "min_train_micro_batch_size_per_gpu": 1, + "num_tuning_micro_batch_sizes": 3 +} +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] bfloat16_enabled ............. False +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] bfloat16_immediate_grad_update False +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] checkpoint_parallel_write_pipeline False +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] checkpoint_tag_validation_enabled True +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] checkpoint_tag_validation_fail False +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] comms_config ................. +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] communication_data_type ...... None +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] compression_config ........... 
{'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] curriculum_enabled_legacy .... False +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] curriculum_params_legacy ..... False +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}} +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] data_efficiency_enabled ...... False +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] dataloader_drop_last ......... False +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] disable_allgather ............ False +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] dump_state ................... 
False +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] dynamic_loss_scale_args ...... None +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] eigenvalue_enabled ........... False +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] eigenvalue_gas_boundary_resolution 1 +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] eigenvalue_layer_name ........ bert.encoder.layer +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] eigenvalue_layer_num ......... 0 +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] eigenvalue_max_iter .......... 100 +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] eigenvalue_stability ......... 1e-06 +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] eigenvalue_tol ............... 0.01 +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] eigenvalue_verbose ........... False +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] elasticity_enabled ........... False +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] flops_profiler_config ........ { + "enabled": false, + "recompute_fwd_factor": 0.0, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] fp16_auto_cast ............... False +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] fp16_enabled ................. True +[2025-01-02 15:47:02,566] [INFO] [config.py:1003:print] fp16_master_weights_and_gradients False +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] global_rank .................. 0 +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] grad_accum_dtype ............. None +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] gradient_accumulation_steps .. 2 +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] gradient_clipping ............ 1.0 +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] gradient_predivide_factor .... 
1.0 +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] graph_harvesting ............. False +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8 +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] initial_dynamic_scale ........ 65536 +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] load_universal_checkpoint .... False +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] loss_scale ................... 0 +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] memory_breakdown ............. False +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] mics_hierarchial_params_gather False +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] mics_shard_size .............. -1 +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') comet=CometConfig(enabled=False, samples_log_interval=100, project=None, workspace=None, api_key=None, experiment_name=None, experiment_key=None, online=None, mode=None) wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] nebula_config ................ { + "enabled": false, + "persistent_storage_path": null, + "persistent_time_interval": 100, + "num_of_version_in_retention": 2, + "enable_nebula_load": true, + "load_path": null +} +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] optimizer_legacy_fusion ...... False +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] optimizer_name ............... None +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] optimizer_params ............. 
None +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True} +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] pld_enabled .................. False +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] pld_params ................... False +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] prescale_gradients ........... False +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] scheduler_name ............... None +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] scheduler_params ............. None +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] seq_parallel_communication_data_type torch.float32 +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] sparse_attention ............. None +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] sparse_gradients_enabled ..... False +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] steps_per_print .............. 10 +[2025-01-02 15:47:02,567] [INFO] [config.py:1003:print] timers_config ................ enabled=True synchronized=True +[2025-01-02 15:47:02,568] [INFO] [config.py:1003:print] train_batch_size ............. 32 +[2025-01-02 15:47:02,568] [INFO] [config.py:1003:print] train_micro_batch_size_per_gpu 2 +[2025-01-02 15:47:02,568] [INFO] [config.py:1003:print] use_data_before_expert_parallel_ False +[2025-01-02 15:47:02,568] [INFO] [config.py:1003:print] use_node_local_storage ....... False +[2025-01-02 15:47:02,568] [INFO] [config.py:1003:print] wall_clock_breakdown ......... False +[2025-01-02 15:47:02,568] [INFO] [config.py:1003:print] weight_quantization_config ... None +[2025-01-02 15:47:02,568] [INFO] [config.py:1003:print] world_size ................... 
8 +[2025-01-02 15:47:02,568] [INFO] [config.py:1003:print] zero_allow_untested_optimizer False +[2025-01-02 15:47:02,568] [INFO] [config.py:1003:print] zero_config .................. stage=0 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=DeepSpeedZeroOffloadParamConfig(device='none', nvme_path=None, buffer_count=5, buffer_size=100000000, max_in_cpu=1000000000, pin_memory=False) offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=10000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False module_granularity_threshold=0 use_all_reduce_for_fetch_params=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False zeropp_loco_param=None mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=False pipeline_loading_checkpoint=False override_module_apply=True +[2025-01-02 15:47:02,568] [INFO] [config.py:1003:print] zero_enabled ................. False +[2025-01-02 15:47:02,568] [INFO] [config.py:1003:print] zero_force_ds_cpu_optimizer .. True +[2025-01-02 15:47:02,568] [INFO] [config.py:1003:print] zero_optimization_stage ...... 
0 +[2025-01-02 15:47:02,568] [INFO] [config.py:989:print_user_config] json = { + "train_batch_size": 32, + "train_micro_batch_size_per_gpu": 2, + "steps_per_print": 10, + "zero_optimization": { + "stage": 0, + "stage3_param_persistence_threshold": 1.000000e+04, + "offload_param": { + "device": "none" + }, + "memory_efficient_linear": false + }, + "fp16": { + "enabled": true + }, + "gradient_clipping": 1.0, + "prescale_gradients": false, + "wall_clock_breakdown": false +} +***** Running training ***** +***** Evaluating rewards, Epoch 1/1 ***** +chosen: 0.0, rejected: 0.0, loss: 0.693359375 +Beginning of Epoch 1/1, Total Micro Batches 4708 +Model Parameters: 0.331 B, Latency: 0.27s, TFLOPs: 1.02, Samples/sec: 7.28, Time/seq 0.14s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:47:32,711] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 0 +[2025-01-02 15:47:32,711] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 0 +[2025-01-02 15:47:32,712] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 65536 to 32768.0 +[2025-01-02 15:47:32,712] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale fr[2025-01-02 15:47:32,712] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 65536 to 32768.0 +[2025-01-02 15:47:32,712] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 0 +[2025-01-02 15:47:32,712] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 65536 to 32768.0 +[2025-01-02 15:47:32,712] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 0 +[2025-01-02 15:47:32,712] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 0 +[2025-01-02 15:47:32,712] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 0 +[2025-01-02 15:47:32,712] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 65536 to 32768.0 +[2025-01-02 
15:47:32,712] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 65536 to 32768.0 +[2025-01-02 15:47:32,712] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 65536 to 32768.0 +[2025-01-02 15:47:32,713] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 65536, reducing to 32768.0 +[2025-01-02 15:47:32,712] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 0 +[2025-01-02 15:47:32,713] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 65536 to 32768.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.57, Samples/sec: 4.06, Time/seq 0.25s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.10s, TFLOPs: 2.70, Samples/sec: 19.26, Time/seq 0.05s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:47:33,343] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1 +[2025-01-02 15:47:33,343] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 32768.0 to 16384.0 +[2025-01-02 15:47:33,343] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1 +[2025-01-02 15:47:33,343] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 32768.0 to 16384.0 +[2025-01-02 15:47:33,343] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 32768.0 to 16384.0 +[2025-01-02 15:47:33,343] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1 +[2025-01-02 15:47:33,343] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1 +[2025-01-02 15:47:33,343] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 32768.0 to 16384.0 +[2025-01-02 15:47:33,343] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1 +[2025-01-02 15:47:33,343] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. 
Attempted loss scale: 32768.0, reducing to 16384.0 +[2025-01-02 15:47:33,343] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1 +[2025-01-02 15:47:33,343] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale fr[2025-01-02 15:47:33,343] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 32768.0 to 16384.0 +Model Parameters: 0.331 B, Latency: 0.52s, TFLOPs: 0.54, Samples/sec: 3.84, Time/seq 0.26s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:47:33,974] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2 +[2025-01-02 15:47:33,974] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2 +[2025-01-02 15:47:33,974] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 16384.0 to 8192.0 +[2025-01-02 15:47:33,974] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 16384.0 to 8192.0 +[2025-01-02 15:47:33,974] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2 +[2025-01-02 15:47:33,975] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 16384.0 to 8192.0 +[2025-01-02 15:47:33,975] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2 +[2025-01-02 15:47:33,975] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 16384.0 to 8192.0 +[2025-01-02 15:47:33,975] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 16384.0 to 8192.0 +[2025-01-02 15:47:33,975] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. 
Attempted loss scale: 16384.0, reducing to 8192.0 +[2025-01-02 15:47:33,975] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2 +[2025-01-02 15:47:33,975] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2 +[2025-01-02 15:47:33,975] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 16384.0 to 8192.0 +[2025-01-02 15:47:33,975] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 16384.0 to 8192.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.58, Samples/sec: 4.11, Time/seq 0.24s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.62s, TFLOPs: 0.46, Samples/sec: 3.25, Time/seq 0.31s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 
14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:47:38,839] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 9 +[2025-01-02 15:47:38,839] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 15:47:38,839] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 9 +[2025-01-02 15:47:38,839] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 15:47:38,839] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 9 +[2025-01-02 15:47:38,840] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 15:47:38,840] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 9 +[2025-01-02 15:47:38,840] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 9 +[2025-01-02 15:47:38,840] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 9 +[2025-01-02 15:47:38,840] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 15:47:38,840] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 15:47:38,840] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 15:47:38,840] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. 
Attempted loss scale: 8192.0, reducing to 4096.0 +[2025-01-02 15:47:38,840] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 9 +[2025-01-02 15:47:38,840] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 15:47:38,841] [INFO] [logging.py:128:log_dist] [Rank 0] step=10, skipped=4, lr=[4.999919851200522e-05, 4.999919851200522e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:47:38,842] [INFO] [timer.py:264:stop] epoch=0/micro_step=20/global_step=10, RunningAvgSamplesPerSec=46.98077573584925, CurrSamplesPerSec=50.86885115471782, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.57, Samples/sec: 4.09, Time/seq 0.24s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 
0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:47:45,728] [INFO] [logging.py:128:log_dist] [Rank 0] step=20, skipped=4, lr=[4.999430071591966e-05, 4.999430071591966e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:47:45,746] [INFO] [timer.py:264:stop] epoch=0/micro_step=40/global_step=20, RunningAvgSamplesPerSec=46.82057321463456, CurrSamplesPerSec=46.3240790591193, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, 
Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.00, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.58, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 
0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:47:52,687] [INFO] [logging.py:128:log_dist] [Rank 0] step=30, skipped=4, lr=[4.998495126612987e-05, 4.998495126612987e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:47:52,708] [INFO] [timer.py:264:stop] epoch=0/micro_step=60/global_step=30, RunningAvgSamplesPerSec=46.657677501465244, CurrSamplesPerSec=46.740073109032714, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 
3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:47:59,588] [INFO] [logging.py:128:log_dist] [Rank 0] step=40, skipped=4, lr=[4.9971151827835975e-05, 4.9971151827835975e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:47:59,608] [INFO] [timer.py:264:stop] epoch=0/micro_step=80/global_step=40, RunningAvgSamplesPerSec=46.67284201254282, CurrSamplesPerSec=46.70549424381561, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 
512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 
0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:48:06,507] [INFO] [logging.py:128:log_dist] [Rank 0] step=50, skipped=4, lr=[4.995290485881111e-05, 4.995290485881111e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:48:06,528] [INFO] [timer.py:264:stop] epoch=0/micro_step=100/global_step=50, RunningAvgSamplesPerSec=46.658721893844756, CurrSamplesPerSec=46.780295098825675, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:48:13,403] [INFO] [logging.py:128:log_dist] [Rank 0] step=60, skipped=4, lr=[4.993021360896366e-05, 4.993021360896366e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:48:13,424] [INFO] [timer.py:264:stop] epoch=0/micro_step=120/global_step=60, RunningAvgSamplesPerSec=46.67496477850472, CurrSamplesPerSec=46.74603117536787, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.99, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 
0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, 
Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:48:20,321] [INFO] [logging.py:128:log_dist] [Rank 0] step=70, skipped=4, lr=[4.99030821197584e-05, 4.99030821197584e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:48:20,340] [INFO] [timer.py:264:stop] epoch=0/micro_step=140/global_step=70, RunningAvgSamplesPerSec=46.66552508644222, CurrSamplesPerSec=46.58816877398781, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 
0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:48:25,109] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 76 +[2025-01-02 15:48:25,109] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:48:25,109] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 76 +[2025-01-02 15:48:25,109] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:48:25,109] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 76 +[2025-01-02 15:48:25,109] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:48:25,109] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 76 +[2025-01-02 15:48:25,110] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 76 +[2025-01-02 15:48:25,110] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:48:25,110] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 76 +[2025-01-02 15:48:25,110] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. 
Attempted loss scale: 4096.0, reducing to 2048.0 +[2025-01-02 15:48:25,110] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:48:25,110] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:48:25,110] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 76 +[2025-01-02 15:48:25,110] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.57, Samples/sec: 4.08, Time/seq 0.24s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:48:27,157] [INFO] [logging.py:128:log_dist] [Rank 0] step=80, skipped=5, lr=[4.987487135239265e-05, 4.987487135239265e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:48:27,178] [INFO] [timer.py:264:stop] epoch=0/micro_step=160/global_step=80, RunningAvgSamplesPerSec=46.72639900113086, CurrSamplesPerSec=46.69866909880398, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 
0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:48:34,057] [INFO] [logging.py:128:log_dist] [Rank 0] step=90, skipped=5, lr=[4.983931737433311e-05, 4.983931737433311e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:48:34,078] [INFO] [timer.py:264:stop] epoch=0/micro_step=180/global_step=90, RunningAvgSamplesPerSec=46.72344397631967, CurrSamplesPerSec=46.670138855517386, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, 
Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:48:40,995] [INFO] [logging.py:128:log_dist] [Rank 0] step=100, skipped=5, lr=[4.979933934614882e-05, 4.979933934614882e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:48:41,016] [INFO] [timer.py:264:stop] epoch=0/micro_step=200/global_step=100, RunningAvgSamplesPerSec=46.69784876342507, CurrSamplesPerSec=46.6644759276489, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, 
Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 
+Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:48:47,897] [INFO] [logging.py:128:log_dist] [Rank 0] step=110, skipped=5, lr=[4.9754944388196535e-05, 4.9754944388196535e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:48:47,918] [INFO] [timer.py:264:stop] epoch=0/micro_step=220/global_step=110, RunningAvgSamplesPerSec=46.697522740072756, CurrSamplesPerSec=46.6885325659028, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 
0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:48:54,803] [INFO] [logging.py:128:log_dist] [Rank 0] step=120, skipped=5, lr=[4.970614040751798e-05, 4.970614040751798e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:48:54,823] [INFO] [timer.py:264:stop] epoch=0/micro_step=240/global_step=120, RunningAvgSamplesPerSec=46.69541297477131, CurrSamplesPerSec=46.75664876248707, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, 
Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:49:01,714] [INFO] [logging.py:128:log_dist] [Rank 0] step=130, skipped=5, lr=[4.96529360964316e-05, 4.96529360964316e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:49:01,734] [INFO] [timer.py:264:stop] epoch=0/micro_step=260/global_step=130, RunningAvgSamplesPerSec=46.69230036582147, CurrSamplesPerSec=46.71986606957944, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 
0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:49:05,845] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 135 +[2025-01-02 15:49:05,845] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:49:05,845] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 135 +[2025-01-02 15:49:05,846] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:49:05,846] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 135 +[2025-01-02 15:49:05,846] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:49:05,846] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 2048.0, reducing to 1024.0 +[2025-01-02 15:49:05,846] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 135 +[2025-01-02 15:49:05,846] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 135 +[2025-01-02 15:49:05,846] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:49:05,846] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 135 +[2025-01-02 15:49:05,846] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:49:05,846] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:49:05,846] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 135 +[2025-01-02 15:49:05,846] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.57, Samples/sec: 4.09, 
Time/seq 0.24s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:49:08,600] [INFO] [logging.py:128:log_dist] [Rank 0] step=140, skipped=6, lr=[4.9601297749741036e-05, 4.9601297749741036e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:49:08,620] [INFO] [timer.py:264:stop] epoch=0/micro_step=280/global_step=140, RunningAvgSamplesPerSec=46.701067188567556, CurrSamplesPerSec=46.34668162363741, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:49:15,555] [INFO] 
[logging.py:128:log_dist] [Rank 0] step=150, skipped=6, lr=[4.9539759563783176e-05, 4.9539759563783176e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:49:15,575] [INFO] [timer.py:264:stop] epoch=0/micro_step=300/global_step=150, RunningAvgSamplesPerSec=46.676625606909724, CurrSamplesPerSec=46.37509004775261, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 
0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:49:22,519] [INFO] [logging.py:128:log_dist] [Rank 0] step=160, skipped=6, lr=[4.947385068096907e-05, 4.947385068096907e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:49:22,537] [INFO] [timer.py:264:stop] epoch=0/micro_step=320/global_step=160, RunningAvgSamplesPerSec=46.652045629756806, CurrSamplesPerSec=45.754688762495974, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.57, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model 
Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.13, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.57, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:49:29,477] [INFO] [logging.py:128:log_dist] 
[Rank 0] step=170, skipped=6, lr=[4.940358284011574e-05, 4.940358284011574e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:49:29,498] [INFO] [timer.py:264:stop] epoch=0/micro_step=340/global_step=170, RunningAvgSamplesPerSec=46.63235423832134, CurrSamplesPerSec=46.33578549044944, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.58, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:49:36,430] [INFO] [logging.py:128:log_dist] [Rank 0] step=180, skipped=6, lr=[4.9328968556400026e-05, 4.9328968556400026e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:49:36,451] [INFO] [timer.py:264:stop] epoch=0/micro_step=360/global_step=180, RunningAvgSamplesPerSec=46.617390821416194, CurrSamplesPerSec=46.339240977849904, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.93, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, 
Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:49:43,383] [INFO] [logging.py:128:log_dist] [Rank 0] step=190, 
skipped=6, lr=[4.9250021119129636e-05, 4.9250021119129636e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:49:43,403] [INFO] [timer.py:264:stop] epoch=0/micro_step=380/global_step=190, RunningAvgSamplesPerSec=46.60359803968563, CurrSamplesPerSec=46.34333703776871, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:49:50,336] [INFO] [logging.py:128:log_dist] [Rank 0] step=200, skipped=6, lr=[4.916675458937614e-05, 4.916675458937614e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:49:50,356] [INFO] [timer.py:264:stop] epoch=0/micro_step=400/global_step=200, RunningAvgSamplesPerSec=46.59140057519339, CurrSamplesPerSec=46.578597409153254, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 
1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:49:56,557] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 208 +[2025-01-02 15:49:56,557] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 208 +[2025-01-02 15:49:56,557] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 1024.0 to 512.0 +[2025-01-02 15:49:56,557] [INFO] 
[fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 1024.0 to 512.0 +[2025-01-02 15:49:56,557] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 208 +[2025-01-02 15:49:56,557] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 1024.0 to 512.0 +[2025-01-02 15:49:56,557] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 208 +[2025-01-02 15:49:56,557] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 1024.0 to 512.0 +[2025-01-02 15:49:56,557] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 208 +[2025-01-02 15:49:56,557] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 208 +[2025-01-02 15:49:56,558] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 1024.0 to 512.0 +[2025-01-02 15:49:56,557] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 208 +[2025-01-02 15:49:56,558] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 1024.0 to 512.0 +[2025-01-02 15:49:56,558] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. 
Attempted loss scale: 1024.0, reducing to 512.0 +[2025-01-02 15:49:56,558] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 208 +[2025-01-02 15:49:56,558] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 1024.0 to 512.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.57, Samples/sec: 4.05, Time/seq 0.25s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:49:57,226] [INFO] [logging.py:128:log_dist] [Rank 0] step=210, skipped=7, lr=[4.908813412994094e-05, 4.908813412994094e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:49:57,247] [INFO] [timer.py:264:stop] epoch=0/micro_step=420/global_step=210, RunningAvgSamplesPerSec=46.60008012821061, CurrSamplesPerSec=46.86441592162194, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.57s, TFLOPs: 0.49, Samples/sec: 3.51, Time/seq 0.29s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 
0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.57, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.96, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.97, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:50:04,214] [INFO] [logging.py:128:log_dist] [Rank 0] step=220, skipped=7, lr=[4.899670281569845e-05, 4.899670281569845e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:50:04,232] [INFO] [timer.py:264:stop] epoch=0/micro_step=440/global_step=220, RunningAvgSamplesPerSec=46.580701736622736, CurrSamplesPerSec=45.654661323911306, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.56, Time/seq 
0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model 
Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:50:11,163] [INFO] [logging.py:128:log_dist] [Rank 0] step=230, skipped=7, lr=[4.890099752667294e-05, 4.890099752667294e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:50:11,183] [INFO] [timer.py:264:stop] epoch=0/micro_step=460/global_step=230, RunningAvgSamplesPerSec=46.57159456525261, CurrSamplesPerSec=46.32703709314177, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, 
Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.99, Samples/sec: 14.13, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:50:18,110] [INFO] [logging.py:128:log_dist] [Rank 0] step=240, skipped=7, lr=[4.880103530862256e-05, 4.880103530862256e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:50:18,131] [INFO] [timer.py:264:stop] epoch=0/micro_step=480/global_step=240, RunningAvgSamplesPerSec=46.563629676576625, CurrSamplesPerSec=46.309614118584825, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, 
Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:50:25,065] [INFO] [logging.py:128:log_dist] [Rank 0] step=250, skipped=7, lr=[4.86968339654932e-05, 4.86968339654932e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:50:25,086] [INFO] [timer.py:264:stop] epoch=0/micro_step=500/global_step=250, RunningAvgSamplesPerSec=46.554810311450424, CurrSamplesPerSec=46.38267044164213, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, 
Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.56, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.96, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:50:32,025] [INFO] [logging.py:128:log_dist] [Rank 0] step=260, skipped=7, lr=[4.858841205624759e-05, 4.858841205624759e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:50:32,046] [INFO] [timer.py:264:stop] epoch=0/micro_step=520/global_step=260, RunningAvgSamplesPerSec=46.54628059347242, CurrSamplesPerSec=46.90926209286344, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.56, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.97, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 
0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:50:38,997] [INFO] [logging.py:128:log_dist] [Rank 0] step=270, skipped=7, lr=[4.8475788891559783e-05, 4.8475788891559783e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:50:39,018] [INFO] [timer.py:264:stop] epoch=0/micro_step=540/global_step=270, RunningAvgSamplesPerSec=46.535565503469485, CurrSamplesPerSec=46.32256021546509, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 
2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:50:45,948] [INFO] [logging.py:128:log_dist] [Rank 0] step=280, skipped=7, lr=[4.835898453037574e-05, 4.835898453037574e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:50:45,969] [INFO] [timer.py:264:stop] epoch=0/micro_step=560/global_step=280, RunningAvgSamplesPerSec=46.52910793247543, CurrSamplesPerSec=46.34376908630242, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, 
Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, 
Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:50:52,901] [INFO] [logging.py:128:log_dist] [Rank 0] step=290, skipped=7, lr=[4.823801977634082e-05, 4.823801977634082e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:50:52,922] [INFO] [timer.py:264:stop] epoch=0/micro_step=580/global_step=290, RunningAvgSamplesPerSec=46.52320489698547, CurrSamplesPerSec=46.32582185506885, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.13, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.58, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:50:59,855] [INFO] [logging.py:128:log_dist] [Rank 0] step=300, skipped=7, lr=[4.811291617409437e-05, 4.811291617409437e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:50:59,875] [INFO] [timer.py:264:stop] epoch=0/micro_step=600/global_step=300, RunningAvgSamplesPerSec=46.51706229660043, CurrSamplesPerSec=46.36179428641033, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 
1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:51:06,787] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:51:06,788] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 512.0 to 1024.0 +[2025-01-02 15:51:06,788] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:51:06,788] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:51:06,788] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 512.0 to 1024.0 +[2025-01-02 15:51:06,788] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:51:06,788] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 512.0 to 1024.0 +[2025-01-02 15:51:06,788] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 512.0 to 1024.0 +[2025-01-02 15:51:06,788] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:51:06,788] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:51:06,788] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 512.0 to 1024.0 +[2025-01-02 15:51:06,789] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:51:06,789] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 512.0 to 1024.0 +[2025-01-02 15:51:06,794] [INFO] [fused_optimizer.py:400:_update_scale] No Grad 
overflow for 100 iterations +[2025-01-02 15:51:06,794] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 512.0 to 1024.0 +[2025-01-02 15:51:06,807] [INFO] [logging.py:128:log_dist] [Rank 0] step=310, skipped=7, lr=[4.7983696005432587e-05, 4.7983696005432587e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:51:06,827] [INFO] [timer.py:264:stop] epoch=0/micro_step=620/global_step=310, RunningAvgSamplesPerSec=46.511774321108796, CurrSamplesPerSec=46.36275517013998, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, 
Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:51:13,760] [INFO] [logging.py:128:log_dist] [Rank 0] step=320, skipped=7, lr=[4.7850382285339924e-05, 4.7850382285339924e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:51:13,780] [INFO] [timer.py:264:stop] epoch=0/micro_step=640/global_step=320, RunningAvgSamplesPerSec=46.50668977864453, CurrSamplesPerSec=46.34026492494737, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, 
Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:51:20,713] [INFO] [logging.py:128:log_dist] [Rank 0] step=330, skipped=7, lr=[4.771299875788999e-05, 4.771299875788999e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:51:20,734] [INFO] [timer.py:264:stop] epoch=0/micro_step=660/global_step=330, RunningAvgSamplesPerSec=46.502571796260625, CurrSamplesPerSec=46.319203124992384, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 
1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:51:27,664] [INFO] [logging.py:128:log_dist] [Rank 0] step=340, skipped=7, lr=[4.7571569892016555e-05, 4.7571569892016555e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:51:27,685] [INFO] [timer.py:264:stop] epoch=0/micro_step=680/global_step=340, RunningAvgSamplesPerSec=46.49864772289161, CurrSamplesPerSec=46.32430289713105, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 
2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, 
Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:51:34,611] [INFO] [logging.py:128:log_dist] [Rank 0] step=350, skipped=7, lr=[4.742612087715547e-05, 4.742612087715547e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:51:34,631] [INFO] [timer.py:264:stop] epoch=0/micro_step=700/global_step=350, RunningAvgSamplesPerSec=46.495692632814894, CurrSamplesPerSec=46.390253314093556, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, 
Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:51:41,565] [INFO] [logging.py:128:log_dist] [Rank 0] step=360, skipped=7, lr=[4.727667761875828e-05, 4.727667761875828e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:51:41,586] [INFO] [timer.py:264:stop] epoch=0/micro_step=720/global_step=360, RunningAvgSamplesPerSec=46.491304442529376, CurrSamplesPerSec=46.325166292582765, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 
+Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, 
Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:51:48,518] [INFO] [logging.py:128:log_dist] [Rank 0] step=370, skipped=7, lr=[4.712326673367824e-05, 4.712326673367824e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:51:48,538] [INFO] [timer.py:264:stop] epoch=0/micro_step=740/global_step=370, RunningAvgSamplesPerSec=46.487412684576384, CurrSamplesPerSec=46.33804109742901, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:51:55,472] [INFO] [logging.py:128:log_dist] [Rank 0] step=380, skipped=7, lr=[4.696591554542973e-05, 4.696591554542973e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:51:55,493] [INFO] [timer.py:264:stop] epoch=0/micro_step=760/global_step=380, RunningAvgSamplesPerSec=46.48344060347751, CurrSamplesPerSec=46.31380082968855, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, 
Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.58, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.97, 
Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:52:02,420] [INFO] [logging.py:128:log_dist] [Rank 0] step=390, skipped=7, lr=[4.6804652079321726e-05, 4.6804652079321726e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:52:02,441] [INFO] [timer.py:264:stop] epoch=0/micro_step=780/global_step=390, RunningAvgSamplesPerSec=46.48118928657953, CurrSamplesPerSec=46.933735963905434, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 
+Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:52:09,377] [INFO] [logging.py:128:log_dist] [Rank 0] step=400, skipped=7, lr=[4.663950505746629e-05, 4.663950505746629e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:52:09,398] [INFO] [timer.py:264:stop] epoch=0/micro_step=800/global_step=400, RunningAvgSamplesPerSec=46.47709234422624, CurrSamplesPerSec=46.35471700350897, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 
0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, 
Sequence Length: 512 +[2025-01-02 15:52:16,305] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:52:16,306] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:52:16,306] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:52:16,306] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:52:16,306] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:52:16,306] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:52:16,306] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:52:16,306] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:52:16,306] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:52:16,306] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:52:16,307] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:52:16,309] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:52:16,309] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:52:16,312] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:52:16,312] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:52:16,325] [INFO] [logging.py:128:log_dist] [Rank 0] step=410, skipped=7, lr=[4.6470503893662995e-05, 4.6470503893662995e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:52:16,345] [INFO] 
[timer.py:264:stop] epoch=0/micro_step=820/global_step=410, RunningAvgSamplesPerSec=46.47536079054126, CurrSamplesPerSec=46.36307547356772, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.13, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 
+Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:52:23,279] [INFO] [logging.py:128:log_dist] [Rank 0] step=420, skipped=7, lr=[4.6297678688160096e-05, 4.6297678688160096e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:52:23,299] [INFO] [timer.py:264:stop] epoch=0/micro_step=840/global_step=420, RunningAvgSamplesPerSec=46.47249330297309, CurrSamplesPerSec=46.40106279118714, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 
0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:52:30,233] [INFO] [logging.py:128:log_dist] [Rank 0] step=430, skipped=7, lr=[4.612106022229352e-05, 4.612106022229352e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:52:30,254] [INFO] [timer.py:264:stop] 
epoch=0/micro_step=860/global_step=430, RunningAvgSamplesPerSec=46.469326490503796, CurrSamplesPerSec=46.32311977785527, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 
0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:52:37,190] [INFO] [logging.py:128:log_dist] [Rank 0] step=440, skipped=7, lr=[4.594067995300447e-05, 4.594067995300447e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:52:37,210] [INFO] [timer.py:264:stop] epoch=0/micro_step=880/global_step=440, RunningAvgSamplesPerSec=46.466282683995, CurrSamplesPerSec=46.33493769496381, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, 
Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:52:44,142] [INFO] [logging.py:128:log_dist] [Rank 0] step=450, skipped=7, lr=[4.57565700072367e-05, 4.57565700072367e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:52:44,162] [INFO] [timer.py:264:stop] epoch=0/micro_step=900/global_step=450, 
RunningAvgSamplesPerSec=46.463773986613354, CurrSamplesPerSec=46.314376162125654, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, 
Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:52:51,093] [INFO] [logging.py:128:log_dist] [Rank 0] step=460, skipped=7, lr=[4.556876317621458e-05, 4.556876317621458e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:52:51,114] [INFO] [timer.py:264:stop] epoch=0/micro_step=920/global_step=460, RunningAvgSamplesPerSec=46.46132522763121, CurrSamplesPerSec=46.33345012880701, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.57, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:52:58,047] [INFO] [logging.py:128:log_dist] [Rank 0] step=470, skipped=7, lr=[4.5377292909602656e-05, 4.5377292909602656e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:52:58,068] [INFO] [timer.py:264:stop] epoch=0/micro_step=940/global_step=470, 
RunningAvgSamplesPerSec=46.45878779560237, CurrSamplesPerSec=46.43445319841673, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:53:00,097] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 472 +[2025-01-02 15:53:00,097] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:53:00,097] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:53:00,097] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 472 +[2025-01-02 15:53:00,097] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:53:00,097] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 472 +[2025-01-02 15:53:00,097] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:53:00,097] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 472 +[2025-01-02 15:53:00,097] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. 
Attempted loss scale: 2048.0, reducing to 1024.0 +[2025-01-02 15:53:00,097] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:53:00,097] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 472 +[2025-01-02 15:53:00,098] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:53:00,098] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:53:00,098] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 472 +[2025-01-02 15:53:00,098] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.57, Samples/sec: 4.05, Time/seq 0.25s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, 
Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:53:04,937] [INFO] [logging.py:128:log_dist] [Rank 0] step=480, skipped=8, lr=[4.520186560426292e-05, 4.520186560426292e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:53:04,958] [INFO] [timer.py:264:stop] epoch=0/micro_step=960/global_step=480, RunningAvgSamplesPerSec=46.4652574508203, CurrSamplesPerSec=46.38100350402843, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 
+Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.58, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.71, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:53:11,893] [INFO] [logging.py:128:log_dist] [Rank 0] step=490, skipped=8, lr=[4.5003529295830075e-05, 4.5003529295830075e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:53:11,913] [INFO] [timer.py:264:stop] epoch=0/micro_step=980/global_step=490, RunningAvgSamplesPerSec=46.46303714998046, CurrSamplesPerSec=46.34392910632221, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 
0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:53:18,845] [INFO] [logging.py:128:log_dist] [Rank 0] step=500, skipped=8, lr=[4.4801630223777665e-05, 4.4801630223777665e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:53:18,865] [INFO] [timer.py:264:stop] epoch=0/micro_step=1000/global_step=500, RunningAvgSamplesPerSec=46.46079008466478, CurrSamplesPerSec=46.36059323775675, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, 
Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:53:25,798] [INFO] [logging.py:128:log_dist] [Rank 0] step=510, skipped=8, lr=[4.459620434769351e-05, 4.459620434769351e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:53:25,819] [INFO] [timer.py:264:stop] epoch=0/micro_step=1020/global_step=510, RunningAvgSamplesPerSec=46.45906494240357, CurrSamplesPerSec=46.75834281138947, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, 
Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.99, Samples/sec: 14.13, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 
+Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:53:32,751] [INFO] [logging.py:128:log_dist] [Rank 0] step=520, skipped=8, lr=[4.438728825531305e-05, 4.438728825531305e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:53:32,772] [INFO] [timer.py:264:stop] epoch=0/micro_step=1040/global_step=520, RunningAvgSamplesPerSec=46.45683775721524, CurrSamplesPerSec=46.369770828201965, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 
0.50, Samples/sec: 3.58, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.13, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:53:39,703] [INFO] [logging.py:128:log_dist] [Rank 0] step=530, skipped=8, lr=[4.417491915600285e-05, 4.417491915600285e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:53:39,724] [INFO] [timer.py:264:stop] epoch=0/micro_step=1060/global_step=530, RunningAvgSamplesPerSec=46.454981721927275, CurrSamplesPerSec=46.67327109354972, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 
2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, 
Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:53:46,661] [INFO] [logging.py:128:log_dist] [Rank 0] step=540, skipped=8, lr=[4.395913487413324e-05, 4.395913487413324e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:53:46,681] [INFO] [timer.py:264:stop] epoch=0/micro_step=1080/global_step=540, RunningAvgSamplesPerSec=46.45267445975501, CurrSamplesPerSec=46.34778592305895, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, 
Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:53:53,612] [INFO] [logging.py:128:log_dist] [Rank 0] step=550, skipped=8, lr=[4.37399738423417e-05, 4.37399738423417e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:53:53,632] [INFO] [timer.py:264:stop] epoch=0/micro_step=1100/global_step=550, RunningAvgSamplesPerSec=46.45093776947002, CurrSamplesPerSec=46.396587634057866, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 
0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:54:00,566] [INFO] [logging.py:128:log_dist] [Rank 0] step=560, skipped=8, lr=[4.351747509468763e-05, 4.351747509468763e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:54:00,586] [INFO] [timer.py:264:stop] epoch=0/micro_step=1120/global_step=560, RunningAvgSamplesPerSec=46.44909549888697, CurrSamplesPerSec=46.45555564637373, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 13.99, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 
2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:54:07,514] [INFO] [logging.py:128:log_dist] [Rank 0] step=570, skipped=8, lr=[4.3291678259700163e-05, 4.3291678259700163e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:54:07,534] [INFO] [timer.py:264:stop] epoch=0/micro_step=1140/global_step=570, RunningAvgSamplesPerSec=46.44810234293339, CurrSamplesPerSec=46.34565739295321, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, 
Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.13, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:54:10,277] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:54:10,277] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:54:10,277] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:54:10,278] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:54:10,278] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:54:10,278] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:54:10,278] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:54:10,278] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:54:10,278] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:54:10,278] [INFO] [fused_optimizer.py:401:_update_scale] 
Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:54:10,278] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:54:10,279] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:54:10,284] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:54:10,284] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 
1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:54:14,467] [INFO] [logging.py:128:log_dist] [Rank 0] step=580, skipped=8, lr=[4.306262355332006e-05, 4.306262355332006e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:54:14,487] [INFO] [timer.py:264:stop] epoch=0/micro_step=1160/global_step=580, RunningAvgSamplesPerSec=46.446247391835065, CurrSamplesPerSec=46.379192448009945, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch 
Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:54:21,419] [INFO] [logging.py:128:log_dist] [Rank 0] step=590, skipped=8, lr=[4.2830351771736965e-05, 4.2830351771736965e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:54:21,440] [INFO] [timer.py:264:stop] epoch=0/micro_step=1180/global_step=590, RunningAvgSamplesPerSec=46.444511706837304, CurrSamplesPerSec=46.33538558305289, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 
0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 
14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:54:28,372] [INFO] [logging.py:128:log_dist] [Rank 0] step=600, skipped=8, lr=[4.259490428412335e-05, 4.259490428412335e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:54:28,392] [INFO] [timer.py:264:stop] epoch=0/micro_step=1200/global_step=600, RunningAvgSamplesPerSec=46.44315240764414, CurrSamplesPerSec=46.37826293436061, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 
512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:54:35,326] [INFO] [logging.py:128:log_dist] [Rank 0] step=610, skipped=8, lr=[4.235632302526635e-05, 4.235632302526635e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:54:35,347] [INFO] [timer.py:264:stop] epoch=0/micro_step=1220/global_step=610, RunningAvgSamplesPerSec=46.4417772135784, CurrSamplesPerSec=46.33301827265184, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 
0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, 
Sequence Length: 512 +[2025-01-02 15:54:42,279] [INFO] [logging.py:128:log_dist] [Rank 0] step=620, skipped=8, lr=[4.2114650488098936e-05, 4.2114650488098936e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:54:42,299] [INFO] [timer.py:264:stop] epoch=0/micro_step=1240/global_step=620, RunningAvgSamplesPerSec=46.44041818234045, CurrSamplesPerSec=46.349610533055575, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, 
Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:54:49,231] [INFO] [logging.py:128:log_dist] [Rank 0] step=630, skipped=8, lr=[4.1869929716131605e-05, 4.1869929716131605e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:54:49,251] [INFO] [timer.py:264:stop] epoch=0/micro_step=1260/global_step=630, RunningAvgSamplesPerSec=46.43896915559678, CurrSamplesPerSec=46.32174487732559, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, 
Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:54:54,756] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 637 +[2025-01-02 15:54:54,756] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 637 +[2025-01-02 15:54:54,757] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:54:54,757] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:54:54,756] [INFO] 
[fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 637 +[2025-01-02 15:54:54,757] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:54:54,757] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 637 +[2025-01-02 15:54:54,757] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 637 +[2025-01-02 15:54:54,757] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 637 +[2025-01-02 15:54:54,757] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:54:54,757] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:54:54,757] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:54:54,757] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 637 +[2025-01-02 15:54:54,757] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. 
Attempted loss scale: 2048.0, reducing to 1024.0 +[2025-01-02 15:54:54,757] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:54:54,757] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 637 +[2025-01-02 15:54:54,757] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.57, Samples/sec: 4.05, Time/seq 0.25s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:54:56,121] [INFO] [logging.py:128:log_dist] [Rank 0] step=640, skipped=9, lr=[4.164711079369153e-05, 4.164711079369153e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:54:56,141] [INFO] [timer.py:264:stop] epoch=0/micro_step=1280/global_step=640, RunningAvgSamplesPerSec=46.44426552356086, CurrSamplesPerSec=46.362723140040615, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 
0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:55:03,067] [INFO] [logging.py:128:log_dist] [Rank 0] step=650, skipped=9, 
lr=[4.1396718898658025e-05, 4.1396718898658025e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:55:03,087] [INFO] [timer.py:264:stop] epoch=0/micro_step=1300/global_step=650, RunningAvgSamplesPerSec=46.443367108838785, CurrSamplesPerSec=46.33520962598524, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model 
Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:55:10,025] [INFO] [logging.py:128:log_dist] [Rank 0] step=660, skipped=9, lr=[4.1143406637287735e-05, 4.1143406637287735e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:55:10,045] [INFO] [timer.py:264:stop] epoch=0/micro_step=1320/global_step=660, RunningAvgSamplesPerSec=46.44134794762593, CurrSamplesPerSec=46.33818507979863, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, 
Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.57, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:55:16,979] [INFO] [logging.py:128:log_dist] [Rank 0] step=670, skipped=9, 
lr=[4.088721912620461e-05, 4.088721912620461e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:55:17,000] [INFO] [timer.py:264:stop] epoch=0/micro_step=1340/global_step=670, RunningAvgSamplesPerSec=46.43998496782961, CurrSamplesPerSec=47.037639814383006, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.71, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model 
Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.13s, TFLOPs: 2.19, Samples/sec: 15.57, Time/seq 0.06s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.56, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:55:23,929] [INFO] [logging.py:128:log_dist] [Rank 0] step=680, skipped=9, lr=[4.0628201994134016e-05, 4.0628201994134016e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:55:23,949] [INFO] [timer.py:264:stop] epoch=0/micro_step=1360/global_step=680, RunningAvgSamplesPerSec=46.43896855112754, CurrSamplesPerSec=46.311851198280706, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, 
Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.15s, TFLOPs: 1.93, Samples/sec: 13.76, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.71, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:55:30,886] [INFO] [logging.py:128:log_dist] [Rank 0] step=690, skipped=9, 
lr=[4.036640137377588e-05, 4.036640137377588e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:55:30,907] [INFO] [timer.py:264:stop] epoch=0/micro_step=1380/global_step=690, RunningAvgSamplesPerSec=46.437531826930254, CurrSamplesPerSec=45.93531883513, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model 
Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:55:37,833] [INFO] [logging.py:128:log_dist] [Rank 0] step=700, skipped=9, lr=[4.010186389358825e-05, 4.010186389358825e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:55:37,854] [INFO] [timer.py:264:stop] epoch=0/micro_step=1400/global_step=700, RunningAvgSamplesPerSec=46.436798376271994, CurrSamplesPerSec=46.37616365270356, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, 
Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.58, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:55:44,791] [INFO] [logging.py:128:log_dist] [Rank 0] step=710, skipped=9, 
lr=[3.983463666948233e-05, 3.983463666948233e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:55:44,811] [INFO] [timer.py:264:stop] epoch=0/micro_step=1420/global_step=710, RunningAvgSamplesPerSec=46.43521967480529, CurrSamplesPerSec=46.30605121647706, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model 
Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:55:51,744] [INFO] [logging.py:128:log_dist] [Rank 0] step=720, skipped=9, lr=[3.9564767296430877e-05, 3.9564767296430877e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:55:51,765] [INFO] [timer.py:264:stop] epoch=0/micro_step=1440/global_step=720, RunningAvgSamplesPerSec=46.434081332236666, CurrSamplesPerSec=46.35162737441071, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.13s, TFLOPs: 2.19, 
Samples/sec: 15.57, Time/seq 0.06s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.55, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:55:58,693] [INFO] [logging.py:128:log_dist] [Rank 0] step=730, skipped=9, 
lr=[3.929230383999124e-05, 3.929230383999124e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:55:58,714] [INFO] [timer.py:264:stop] epoch=0/micro_step=1460/global_step=730, RunningAvgSamplesPerSec=46.43326436780265, CurrSamplesPerSec=46.40451197798106, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.97, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.54, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.00, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model 
Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:56:04,939] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:56:04,939] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:56:04,939] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:56:04,939] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:56:04,940] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:56:04,940] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:56:04,940] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:56:04,940] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:56:04,940] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:56:04,941] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:56:04,941] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:56:04,942] [INFO] [fused_optimizer.py:400:_update_scale] No 
Grad overflow for 100 iterations +[2025-01-02 15:56:04,942] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 15:56:04,945] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:56:04,946] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:56:05,667] [INFO] [logging.py:128:log_dist] [Rank 0] step=740, skipped=9, lr=[3.901729482774453e-05, 3.901729482774453e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:56:05,684] [INFO] [timer.py:264:stop] epoch=0/micro_step=1480/global_step=740, RunningAvgSamplesPerSec=46.43108873755449, CurrSamplesPerSec=45.89287960413895, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 
1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.58, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.95, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:56:12,611] [INFO] [logging.py:128:log_dist] [Rank 0] step=750, skipped=9, lr=[3.8739789240652524e-05, 3.8739789240652524e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:56:12,632] [INFO] [timer.py:264:stop] epoch=0/micro_step=1500/global_step=750, RunningAvgSamplesPerSec=46.430940342580826, CurrSamplesPerSec=46.33469775847788, 
MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.55, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model 
Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:56:19,572] [INFO] [logging.py:128:log_dist] [Rank 0] step=760, skipped=9, lr=[3.845983650433384e-05, 3.845983650433384e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:56:19,593] [INFO] [timer.py:264:stop] epoch=0/micro_step=1520/global_step=760, RunningAvgSamplesPerSec=46.42927334106034, CurrSamplesPerSec=46.41226248648767, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, 
Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:56:26,522] [INFO] [logging.py:128:log_dist] [Rank 0] step=770, skipped=9, lr=[3.817748648026087e-05, 3.817748648026087e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:56:26,542] [INFO] [timer.py:264:stop] epoch=0/micro_step=1540/global_step=770, RunningAvgSamplesPerSec=46.42870842332397, CurrSamplesPerSec=46.3644207962945, MemAllocated=5.53GB, 
MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, 
Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:56:33,474] [INFO] [logging.py:128:log_dist] [Rank 0] step=780, skipped=9, lr=[3.78927894568792e-05, 3.78927894568792e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:56:33,494] [INFO] [timer.py:264:stop] epoch=0/micro_step=1560/global_step=780, RunningAvgSamplesPerSec=46.427907498021575, CurrSamplesPerSec=46.35952035358448, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, 
Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:56:40,426] [INFO] [logging.py:128:log_dist] [Rank 0] step=790, skipped=9, lr=[3.7605796140650764e-05, 3.7605796140650764e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:56:40,447] [INFO] [timer.py:264:stop] epoch=0/micro_step=1580/global_step=790, RunningAvgSamplesPerSec=46.42688505692321, CurrSamplesPerSec=46.38398484232087, MemAllocated=5.53GB, MaxMemAllocated=8.61GB 
+Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, 
Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:56:47,379] [INFO] [logging.py:128:log_dist] [Rank 0] step=800, skipped=9, lr=[3.73165576470228e-05, 3.73165576470228e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:56:47,399] [INFO] [timer.py:264:stop] epoch=0/micro_step=1600/global_step=800, RunningAvgSamplesPerSec=46.42593307965039, CurrSamplesPerSec=46.31824404565779, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:56:54,330] [INFO] [logging.py:128:log_dist] [Rank 0] step=810, skipped=9, lr=[3.70251254913238e-05, 3.70251254913238e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:56:54,350] [INFO] [timer.py:264:stop] epoch=0/micro_step=1620/global_step=810, RunningAvgSamplesPerSec=46.42510739233998, CurrSamplesPerSec=46.36661512111462, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, 
Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, 
Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:57:01,282] [INFO] [logging.py:128:log_dist] [Rank 0] step=820, skipped=9, lr=[3.673155157958827e-05, 3.673155157958827e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:57:01,303] [INFO] [timer.py:264:stop] epoch=0/micro_step=1640/global_step=820, RunningAvgSamplesPerSec=46.42418767305092, CurrSamplesPerSec=46.3146478518564, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 
+Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:57:08,234] [INFO] [logging.py:128:log_dist] [Rank 0] step=830, skipped=9, lr=[3.6435888199311916e-05, 3.6435888199311916e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:57:08,254] [INFO] [timer.py:264:stop] epoch=0/micro_step=1660/global_step=830, RunningAvgSamplesPerSec=46.42334132276314, CurrSamplesPerSec=46.34514529458703, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 
0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:57:14,472] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:57:14,473] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 15:57:14,473] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:57:14,473] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 15:57:14,473] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 15:57:14,473] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:57:14,473] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 15:57:14,473] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:57:14,473] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 15:57:14,474] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:57:14,475] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 15:57:14,479] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:57:14,479] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, 
Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:57:15,187] [INFO] [logging.py:128:log_dist] [Rank 0] step=840, skipped=9, lr=[3.6138188010138916e-05, 3.6138188010138916e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:57:15,208] [INFO] [timer.py:264:stop] epoch=0/micro_step=1680/global_step=840, RunningAvgSamplesPerSec=46.42245676253584, CurrSamplesPerSec=46.35514926425546, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:57:22,140] [INFO] [logging.py:128:log_dist] [Rank 0] step=850, skipped=9, lr=[3.583850403448287e-05, 3.583850403448287e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:57:22,161] [INFO] [timer.py:264:stop] epoch=0/micro_step=1700/global_step=850, RunningAvgSamplesPerSec=46.42177760625656, CurrSamplesPerSec=46.53673661663851, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, 
Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, 
Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:57:29,092] [INFO] [logging.py:128:log_dist] [Rank 0] step=860, skipped=9, lr=[3.5536889648083114e-05, 3.5536889648083114e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:57:29,113] [INFO] [timer.py:264:stop] epoch=0/micro_step=1720/global_step=860, RunningAvgSamplesPerSec=46.421073151354676, CurrSamplesPerSec=46.339704948271745, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.98, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 
512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:57:36,046] [INFO] [logging.py:128:log_dist] [Rank 0] step=870, skipped=9, lr=[3.523339857049819e-05, 3.523339857049819e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:57:36,066] [INFO] [timer.py:264:stop] epoch=0/micro_step=1740/global_step=870, RunningAvgSamplesPerSec=46.420195916054574, CurrSamplesPerSec=46.33840905415208, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:57:37,399] [INFO] 
[fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 871 +[2025-01-02 15:57:37,399] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 871 +[2025-01-02 15:57:37,399] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:57:37,399] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:57:37,399] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 871 +[2025-01-02 15:57:37,399] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:57:37,399] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 871 +[2025-01-02 15:57:37,399] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:57:37,399] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:57:37,399] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. 
Attempted loss scale: 4096.0, reducing to 2048.0 +[2025-01-02 15:57:37,399] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 871 +[2025-01-02 15:57:37,399] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 871 +[2025-01-02 15:57:37,399] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.57, Samples/sec: 4.05, Time/seq 0.25s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.95, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, 
Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:57:42,936] [INFO] [logging.py:128:log_dist] [Rank 0] step=880, skipped=10, lr=[3.495869669843086e-05, 3.495869669843086e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:57:42,956] [INFO] [timer.py:264:stop] epoch=0/micro_step=1760/global_step=880, RunningAvgSamplesPerSec=46.42414880906899, CurrSamplesPerSec=46.31806821875464, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:57:49,887] [INFO] [logging.py:128:log_dist] [Rank 0] step=890, skipped=10, lr=[3.4651789094342044e-05, 3.4651789094342044e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:57:49,907] [INFO] [timer.py:264:stop] epoch=0/micro_step=1780/global_step=890, RunningAvgSamplesPerSec=46.42345409193999, CurrSamplesPerSec=46.35346829552468, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, 
Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, 
Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:57:56,836] [INFO] [logging.py:128:log_dist] [Rank 0] step=900, skipped=10, lr=[3.434316244145236e-05, 3.434316244145236e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:57:56,857] [INFO] [timer.py:264:stop] epoch=0/micro_step=1800/global_step=900, RunningAvgSamplesPerSec=46.42303656570756, CurrSamplesPerSec=46.4751966952545, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.00, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:58:03,784] [INFO] [logging.py:128:log_dist] [Rank 0] step=910, skipped=10, lr=[3.403287170825234e-05, 3.403287170825234e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:58:03,805] [INFO] [timer.py:264:stop] epoch=0/micro_step=1820/global_step=910, RunningAvgSamplesPerSec=46.42259453897555, CurrSamplesPerSec=46.36161812871359, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 
1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:58:10,733] [INFO] [logging.py:128:log_dist] [Rank 0] step=920, skipped=10, lr=[3.3720972159616496e-05, 3.3720972159616496e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:58:10,751] [INFO] [timer.py:264:stop] epoch=0/micro_step=1840/global_step=920, RunningAvgSamplesPerSec=46.42241240368338, CurrSamplesPerSec=46.487559281055454, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.58, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.96, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.55, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, 
Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:58:17,702] [INFO] [logging.py:128:log_dist] [Rank 0] step=930, skipped=10, lr=[3.340751934696017e-05, 3.340751934696017e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:58:17,722] [INFO] [timer.py:264:stop] epoch=0/micro_step=1860/global_step=930, RunningAvgSamplesPerSec=46.420715482541, CurrSamplesPerSec=46.42123576943527, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, 
Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:58:24,619] [INFO] [logging.py:128:log_dist] [Rank 0] step=940, skipped=10, lr=[3.309256909834556e-05, 3.309256909834556e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:58:24,639] [INFO] [timer.py:264:stop] epoch=0/micro_step=1880/global_step=940, RunningAvgSamplesPerSec=46.42255350808398, CurrSamplesPerSec=46.7765126862402, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 
1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:58:31,518] [INFO] [logging.py:128:log_dist] [Rank 0] step=950, skipped=10, lr=[3.2776177508538304e-05, 3.2776177508538304e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:58:31,538] [INFO] [timer.py:264:stop] epoch=0/micro_step=1900/global_step=950, RunningAvgSamplesPerSec=46.42571670265899, CurrSamplesPerSec=46.77811035486935, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch 
Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.98, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, 
Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:58:38,418] [INFO] [logging.py:128:log_dist] [Rank 0] step=960, skipped=10, lr=[3.245840092901662e-05, 3.245840092901662e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:58:38,439] [INFO] [timer.py:264:stop] epoch=0/micro_step=1920/global_step=960, RunningAvgSamplesPerSec=46.428712246785175, CurrSamplesPerSec=46.81064173887472, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.13, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, 
Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:58:45,316] [INFO] [logging.py:128:log_dist] [Rank 0] step=970, skipped=10, lr=[3.213929595793479e-05, 3.213929595793479e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:58:45,336] [INFO] [timer.py:264:stop] epoch=0/micro_step=1940/global_step=970, RunningAvgSamplesPerSec=46.431706559142256, CurrSamplesPerSec=46.806527953026816, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 
+Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:58:47,365] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:58:47,365] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 15:58:47,366] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:58:47,366] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:58:47,366] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 15:58:47,366] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:58:47,366] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 15:58:47,366] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 15:58:47,366] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:58:47,366] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 15:58:47,366] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:58:47,366] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 15:58:47,366] [INFO] [fused_optimizer.py:400:_update_scale] No 
Grad overflow for 100 iterations +[2025-01-02 15:58:47,366] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 15:58:47,366] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 15:58:47,367] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.13s, TFLOPs: 2.19, Samples/sec: 15.59, Time/seq 0.06s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 
0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:58:52,204] [INFO] [logging.py:128:log_dist] [Rank 0] step=980, skipped=10, lr=[3.1818919430042524e-05, 3.1818919430042524e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:58:52,225] [INFO] [timer.py:264:stop] epoch=0/micro_step=1960/global_step=980, RunningAvgSamplesPerSec=46.43535870004616, CurrSamplesPerSec=46.78776389902927, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch 
Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:58:59,091] [INFO] [logging.py:128:log_dist] [Rank 0] step=990, skipped=10, lr=[3.1497328406562476e-05, 3.1497328406562476e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:58:59,112] [INFO] [timer.py:264:stop] epoch=0/micro_step=1980/global_step=990, RunningAvgSamplesPerSec=46.43904306339881, CurrSamplesPerSec=46.83297000125497, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 
0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.94, Samples/sec: 13.81, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 
3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:59:05,986] [INFO] [logging.py:128:log_dist] [Rank 0] step=1000, skipped=10, lr=[3.117458016502711e-05, 3.117458016502711e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:59:06,006] [INFO] [timer.py:264:stop] epoch=0/micro_step=2000/global_step=1000, RunningAvgSamplesPerSec=46.44231158064333, CurrSamplesPerSec=46.81330302519387, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.57s, TFLOPs: 0.50, Samples/sec: 3.53, Time/seq 0.28s, Batch Size: 2, Sequence 
Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:59:12,898] [INFO] [logging.py:128:log_dist] [Rank 0] step=1010, skipped=10, lr=[3.0850732189077236e-05, 3.0850732189077236e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:59:12,918] [INFO] [timer.py:264:stop] epoch=0/micro_step=2020/global_step=1010, RunningAvgSamplesPerSec=46.444083453859605, CurrSamplesPerSec=46.8065442761449, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 
0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, 
Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:59:19,784] [INFO] [logging.py:128:log_dist] [Rank 0] step=1020, skipped=10, lr=[3.05258421582238e-05, 3.05258421582238e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:59:19,804] [INFO] [timer.py:264:stop] epoch=0/micro_step=2040/global_step=1020, RunningAvgSamplesPerSec=46.447488410641995, CurrSamplesPerSec=46.75251188976779, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model 
Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:59:26,682] [INFO] [logging.py:128:log_dist] [Rank 0] step=1030, skipped=10, lr=[3.0199967937574774e-05, 3.0199967937574774e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:59:26,699] [INFO] [timer.py:264:stop] epoch=0/micro_step=2060/global_step=1030, RunningAvgSamplesPerSec=46.45030464457745, CurrSamplesPerSec=46.19032410349784, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.97, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:59:27,333] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1030 +[2025-01-02 15:59:27,334] [INFO] [fused_optimizer.py:393:_update_scale] 
Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:59:27,333] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1030 +[2025-01-02 15:59:27,334] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:59:27,334] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1030 +[2025-01-02 15:59:27,334] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:59:27,334] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1030 +[2025-01-02 15:59:27,334] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:59:27,334] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1030 +[2025-01-02 15:59:27,334] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:59:27,334] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1030 +[2025-01-02 15:59:27,334] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1030 +[2025-01-02 15:59:27,334] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:59:27,334] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:59:27,335] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1030 +[2025-01-02 15:59:27,335] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 15:59:27,335] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. 
Attempted loss scale: 4096.0, reducing to 2048.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.58, Samples/sec: 4.10, Time/seq 0.24s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model 
Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:59:33,531] [INFO] [logging.py:128:log_dist] [Rank 0] step=1040, skipped=11, lr=[2.9905887623649602e-05, 2.9905887623649602e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:59:33,551] [INFO] [timer.py:264:stop] epoch=0/micro_step=2080/global_step=1040, RunningAvgSamplesPerSec=46.45609527814709, CurrSamplesPerSec=46.61336089809432, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 
1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:59:40,420] [INFO] [logging.py:128:log_dist] [Rank 0] step=1050, skipped=11, lr=[2.9578303480235774e-05, 2.9578303480235774e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:59:40,441] [INFO] [timer.py:264:stop] epoch=0/micro_step=2100/global_step=1050, RunningAvgSamplesPerSec=46.45922477354279, CurrSamplesPerSec=46.80225168829745, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch 
Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, 
Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:59:47,310] [INFO] [logging.py:128:log_dist] [Rank 0] step=1060, skipped=11, lr=[2.9249903910062116e-05, 2.9249903910062116e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:59:47,330] [INFO] [timer.py:264:stop] epoch=0/micro_step=2120/global_step=1060, RunningAvgSamplesPerSec=46.46228296302194, CurrSamplesPerSec=46.73650876277751, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, 
Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.54, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 15:59:54,242] [INFO] [logging.py:128:log_dist] [Rank 0] step=1070, skipped=11, lr=[2.8920747403309247e-05, 2.8920747403309247e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 15:59:54,262] [INFO] [timer.py:264:stop] epoch=0/micro_step=2140/global_step=1070, RunningAvgSamplesPerSec=46.46284313048135, CurrSamplesPerSec=46.67312502129054, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 
512 +[2025-01-02 15:59:54,893] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1070 +[2025-01-02 15:59:54,893] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:59:54,893] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:59:54,893] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1070 +[2025-01-02 15:59:54,893] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:59:54,893] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1070 +[2025-01-02 15:59:54,893] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:59:54,893] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 2048.0, reducing to 1024.0 +[2025-01-02 15:59:54,893] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1070 +[2025-01-02 15:59:54,894] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:59:54,894] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1070 +[2025-01-02 15:59:54,893] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1070 +[2025-01-02 15:59:54,894] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:59:54,894] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 15:59:54,894] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1070 +[2025-01-02 15:59:54,894] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.58, Samples/sec: 4.11, Time/seq 0.24s, Batch Size: 2, Sequence 
Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, 
TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:00:01,069] [INFO] [logging.py:128:log_dist] [Rank 0] step=1080, skipped=12, lr=[2.8623907817398308e-05, 2.8623907817398308e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:00:01,090] [INFO] [timer.py:264:stop] epoch=0/micro_step=2160/global_step=1080, RunningAvgSamplesPerSec=46.469588801539444, CurrSamplesPerSec=46.82241573242371, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, 
Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:00:07,965] [INFO] [logging.py:128:log_dist] [Rank 0] step=1090, skipped=12, lr=[2.8293474746020472e-05, 2.8293474746020472e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:00:07,984] [INFO] [timer.py:264:stop] epoch=0/micro_step=2180/global_step=1090, RunningAvgSamplesPerSec=46.47228214300523, CurrSamplesPerSec=46.75736546050338, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.71, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model 
Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.71, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.57, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, 
Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:00:14,870] [INFO] [logging.py:128:log_dist] [Rank 0] step=1100, skipped=12, lr=[2.7962455084554778e-05, 2.7962455084554778e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:00:14,890] [INFO] [timer.py:264:stop] epoch=0/micro_step=2200/global_step=1100, RunningAvgSamplesPerSec=46.47416601952744, CurrSamplesPerSec=46.769764555584786, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.98, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 
2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.00, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:00:21,778] [INFO] [logging.py:128:log_dist] [Rank 0] step=1110, skipped=12, lr=[2.763090778983777e-05, 2.763090778983777e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:00:21,798] [INFO] [timer.py:264:stop] epoch=0/micro_step=2220/global_step=1110, RunningAvgSamplesPerSec=46.47592055429421, CurrSamplesPerSec=46.91998677298789, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, 
Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, 
Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:00:28,694] [INFO] [logging.py:128:log_dist] [Rank 0] step=1120, skipped=12, lr=[2.729889191268107e-05, 2.729889191268107e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:00:28,714] [INFO] [timer.py:264:stop] epoch=0/micro_step=2240/global_step=1120, RunningAvgSamplesPerSec=46.477148202453265, CurrSamplesPerSec=46.72287486730245, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 
512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:00:35,589] [INFO] [logging.py:128:log_dist] [Rank 0] step=1130, skipped=12, lr=[2.696646658735396e-05, 2.696646658735396e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:00:35,609] [INFO] [timer.py:264:stop] epoch=0/micro_step=2260/global_step=1130, RunningAvgSamplesPerSec=46.479514044879025, CurrSamplesPerSec=46.664411030997236, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, 
TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch 
Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:00:42,488] [INFO] [logging.py:128:log_dist] [Rank 0] step=1140, skipped=12, lr=[2.6633691021051226e-05, 2.6633691021051226e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:00:42,509] [INFO] [timer.py:264:stop] epoch=0/micro_step=2280/global_step=1140, RunningAvgSamplesPerSec=46.481591664999485, CurrSamplesPerSec=46.77589321144054, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 
0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.56, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:00:49,420] [INFO] [logging.py:128:log_dist] [Rank 0] step=1150, skipped=12, lr=[2.6300624483347926e-05, 2.6300624483347926e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:00:49,440] [INFO] [timer.py:264:stop] epoch=0/micro_step=2300/global_step=1150, RunningAvgSamplesPerSec=46.481755312094066, CurrSamplesPerSec=46.77967552383947, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, 
Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:00:56,307] [INFO] [logging.py:128:log_dist] [Rank 0] step=1160, skipped=12, lr=[2.596732629564309e-05, 2.596732629564309e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:00:56,328] [INFO] [timer.py:264:stop] epoch=0/micro_step=2320/global_step=1160, RunningAvgSamplesPerSec=46.48439949209591, CurrSamplesPerSec=46.79316314116429, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, 
Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:01:03,200] [INFO] [logging.py:128:log_dist] [Rank 0] step=1170, skipped=12, lr=[2.56338558205942e-05, 2.56338558205942e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:01:03,221] [INFO] [timer.py:264:stop] epoch=0/micro_step=2340/global_step=1170, RunningAvgSamplesPerSec=46.48707192862625, CurrSamplesPerSec=46.97179330281597, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, 
Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:01:04,561] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:01:04,561] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 16:01:04,562] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:01:04,562] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 16:01:04,562] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:01:04,562] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:01:04,562] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 16:01:04,562] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 16:01:04,562] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:01:04,562] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 16:01:04,562] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:01:04,562] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 16:01:04,563] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:01:04,563] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 16:01:04,564] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:01:04,564] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 
3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 
512 +[2025-01-02 16:01:10,094] [INFO] [logging.py:128:log_dist] [Rank 0] step=1180, skipped=12, lr=[2.5300272451544234e-05, 2.5300272451544234e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:01:10,114] [INFO] [timer.py:264:stop] epoch=0/micro_step=2360/global_step=1180, RunningAvgSamplesPerSec=46.48953479190863, CurrSamplesPerSec=46.803459409078194, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, 
TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:01:16,986] [INFO] [logging.py:128:log_dist] [Rank 0] step=1190, skipped=12, lr=[2.496663560194338e-05, 2.496663560194338e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:01:17,006] [INFO] [timer.py:264:stop] epoch=0/micro_step=2380/global_step=1190, RunningAvgSamplesPerSec=46.49180947433954, CurrSamplesPerSec=46.80528742936384, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, 
Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.57s, TFLOPs: 0.50, Samples/sec: 3.53, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:01:19,727] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1193 +[2025-01-02 16:01:19,728] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1193 +[2025-01-02 16:01:19,728] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 16:01:19,728] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 16:01:19,728] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1193 +[2025-01-02 16:01:19,728] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1193 +[2025-01-02 16:01:19,728] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1193 +[2025-01-02 16:01:19,728] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 16:01:19,728] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 16:01:19,728] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 16:01:19,728] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1193 +[2025-01-02 16:01:19,728] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. 
Attempted loss scale: 2048.0, reducing to 1024.0 +[2025-01-02 16:01:19,728] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1193 +[2025-01-02 16:01:19,728] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 16:01:19,728] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +[2025-01-02 16:01:19,728] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1193 +[2025-01-02 16:01:19,728] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 2048.0 to 1024.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.57, Samples/sec: 4.09, Time/seq 0.24s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 
3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:01:23,847] [INFO] [logging.py:128:log_dist] [Rank 0] step=1200, skipped=13, lr=[2.4666365824494565e-05, 2.4666365824494565e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:01:23,868] [INFO] [timer.py:264:stop] epoch=0/micro_step=2400/global_step=1200, RunningAvgSamplesPerSec=46.49591524184215, CurrSamplesPerSec=46.686599976696954, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence 
Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.94, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:01:30,757] [INFO] [logging.py:128:log_dist] [Rank 0] step=1210, skipped=13, lr=[2.4332791071488294e-05, 2.4332791071488294e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:01:30,777] [INFO] [timer.py:264:stop] epoch=0/micro_step=2420/global_step=1210, RunningAvgSamplesPerSec=46.49732960112968, CurrSamplesPerSec=46.76973196064543, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 
0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, 
Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:01:37,682] [INFO] [logging.py:128:log_dist] [Rank 0] step=1220, skipped=13, lr=[2.3999335152896784e-05, 2.3999335152896784e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:01:37,703] [INFO] [timer.py:264:stop] epoch=0/micro_step=2440/global_step=1220, RunningAvgSamplesPerSec=46.497692834200365, CurrSamplesPerSec=46.50880673295993, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.99, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model 
Parameters: 0.331 B, Latency: 0.13s, TFLOPs: 2.18, Samples/sec: 15.49, Time/seq 0.06s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.57s, TFLOPs: 0.49, Samples/sec: 3.50, Time/seq 0.29s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.00, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.71, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:01:44,633] [INFO] [logging.py:128:log_dist] [Rank 0] step=1230, skipped=13, lr=[2.3666057459470436e-05, 2.3666057459470436e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:01:44,654] [INFO] [timer.py:264:stop] epoch=0/micro_step=2460/global_step=1230, RunningAvgSamplesPerSec=46.49675848239761, CurrSamplesPerSec=46.493952415474645, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 
1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.58, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:01:51,580] [INFO] [logging.py:128:log_dist] [Rank 0] step=1240, skipped=13, lr=[2.3333017350216558e-05, 2.3333017350216558e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:01:51,600] [INFO] [timer.py:264:stop] epoch=0/micro_step=2480/global_step=1240, RunningAvgSamplesPerSec=46.49602569155278, CurrSamplesPerSec=46.5161407241932, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, 
Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:01:58,509] [INFO] [logging.py:128:log_dist] [Rank 0] step=1250, skipped=13, lr=[2.300027414182708e-05, 2.300027414182708e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:01:58,529] [INFO] [timer.py:264:stop] epoch=0/micro_step=2500/global_step=1250, RunningAvgSamplesPerSec=46.49621684629403, CurrSamplesPerSec=46.497528184225175, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, 
Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:02:05,438] [INFO] [logging.py:128:log_dist] [Rank 0] step=1260, skipped=13, lr=[2.2667887098113915e-05, 2.2667887098113915e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:02:05,458] [INFO] [timer.py:264:stop] epoch=0/micro_step=2520/global_step=1260, RunningAvgSamplesPerSec=46.496336564272966, CurrSamplesPerSec=46.48627120484043, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, 
TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:02:12,370] [INFO] [logging.py:128:log_dist] [Rank 0] step=1270, skipped=13, lr=[2.233591541945361e-05, 2.233591541945361e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:02:12,390] [INFO] [timer.py:264:stop] epoch=0/micro_step=2540/global_step=1270, RunningAvgSamplesPerSec=46.49631392383694, CurrSamplesPerSec=46.47926853952197, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, 
Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.13s, TFLOPs: 2.19, Samples/sec: 15.56, Time/seq 0.06s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.57, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 
0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:02:19,299] [INFO] [logging.py:128:log_dist] [Rank 0] step=1280, skipped=13, lr=[2.2004418232243425e-05, 2.2004418232243425e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:02:19,320] [INFO] [timer.py:264:stop] epoch=0/micro_step=2560/global_step=1280, RunningAvgSamplesPerSec=46.496336834420134, CurrSamplesPerSec=46.493743040656476, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, 
Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:02:26,231] [INFO] [logging.py:128:log_dist] [Rank 0] step=1290, skipped=13, lr=[2.1673454578370484e-05, 2.1673454578370484e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:02:26,251] [INFO] [timer.py:264:stop] epoch=0/micro_step=2580/global_step=1290, RunningAvgSamplesPerSec=46.49628521737437, CurrSamplesPerSec=46.5082749070585, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:02:29,678] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:02:29,678] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:02:29,678] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 16:02:29,678] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 16:02:29,679] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:02:29,679] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:02:29,679] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 16:02:29,679] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:02:29,679] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:02:29,679] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 16:02:29,679] [INFO] 
[fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +[2025-01-02 16:02:29,684] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:02:29,684] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 1024.0 to 2048.0 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.94, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.97, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:02:33,187] [INFO] [logging.py:128:log_dist] [Rank 0] step=1300, skipped=13, lr=[2.1343083404695983e-05, 2.1343083404695983e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:02:33,205] [INFO] [timer.py:264:stop] epoch=0/micro_step=2600/global_step=1300, RunningAvgSamplesPerSec=46.49535814691071, CurrSamplesPerSec=46.25004359522548, MemAllocated=5.53GB, 
MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.94, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, 
Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:02:40,113] [INFO] [logging.py:128:log_dist] [Rank 0] step=1310, skipped=13, lr=[2.101336355255645e-05, 2.101336355255645e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:02:40,134] [INFO] [timer.py:264:stop] epoch=0/micro_step=2620/global_step=1310, RunningAvgSamplesPerSec=46.495657776731484, CurrSamplesPerSec=46.5038918355933, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, 
Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.94, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:02:47,050] [INFO] [logging.py:128:log_dist] [Rank 0] step=1320, skipped=13, lr=[2.0684353747283626e-05, 2.0684353747283626e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:02:47,070] [INFO] [timer.py:264:stop] epoch=0/micro_step=2640/global_step=1320, RunningAvgSamplesPerSec=46.49561196265634, CurrSamplesPerSec=46.49445170149724, MemAllocated=5.53GB, 
MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, 
Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:02:53,981] [INFO] [logging.py:128:log_dist] [Rank 0] step=1330, skipped=13, lr=[2.035611258774508e-05, 2.035611258774508e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:02:54,002] [INFO] [timer.py:264:stop] epoch=0/micro_step=2660/global_step=1330, RunningAvgSamplesPerSec=46.495620556823845, CurrSamplesPerSec=46.64968416069277, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, 
Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:03:00,913] [INFO] [logging.py:128:log_dist] [Rank 0] step=1340, skipped=13, lr=[2.0028698535907454e-05, 2.0028698535907454e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:03:00,934] [INFO] [timer.py:264:stop] epoch=0/micro_step=2680/global_step=1340, RunningAvgSamplesPerSec=46.49557111930108, CurrSamplesPerSec=46.49372693497934, MemAllocated=5.53GB, 
MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, 
Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:03:07,851] [INFO] [logging.py:128:log_dist] [Rank 0] step=1350, skipped=13, lr=[1.970216990642385e-05, 1.970216990642385e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:03:07,871] [INFO] [timer.py:264:stop] epoch=0/micro_step=2700/global_step=1350, RunningAvgSamplesPerSec=46.4952625195588, CurrSamplesPerSec=46.51009605835737, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, 
Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:03:14,772] [INFO] [logging.py:128:log_dist] [Rank 0] step=1360, skipped=13, lr=[1.9376584856247734e-05, 1.9376584856247734e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:03:14,793] [INFO] [timer.py:264:stop] epoch=0/micro_step=2720/global_step=1360, RunningAvgSamplesPerSec=46.49589953343763, CurrSamplesPerSec=46.493340402075496, MemAllocated=5.53GB, 
MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, 
Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:03:21,676] [INFO] [logging.py:128:log_dist] [Rank 0] step=1370, skipped=13, lr=[1.9052001374274694e-05, 1.9052001374274694e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:03:21,696] [INFO] [timer.py:264:stop] epoch=0/micro_step=2740/global_step=1370, RunningAvgSamplesPerSec=46.497266423476795, CurrSamplesPerSec=46.77222560471587, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 
14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.97, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:03:28,578] [INFO] [logging.py:128:log_dist] [Rank 0] step=1380, skipped=13, lr=[1.8728477271014252e-05, 1.8728477271014252e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:03:28,598] [INFO] [timer.py:264:stop] epoch=0/micro_step=2760/global_step=1380, RunningAvgSamplesPerSec=46.498741093520614, CurrSamplesPerSec=46.613199012037946, MemAllocated=5.53GB, 
MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, 
Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:03:35,473] [INFO] [logging.py:128:log_dist] [Rank 0] step=1390, skipped=13, lr=[1.8406070168293457e-05, 1.8406070168293457e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:03:35,494] [INFO] [timer.py:264:stop] epoch=0/micro_step=2780/global_step=1390, RunningAvgSamplesPerSec=46.500554058102814, CurrSamplesPerSec=46.81554004591409, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 
14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:03:38,900] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:03:38,901] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:03:38,901] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:03:38,901] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:03:38,901] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:03:38,901] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:03:38,902] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:03:38,902] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:03:38,901] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:03:38,902] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:03:38,902] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:03:38,902] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 
+Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:03:42,364] [INFO] [logging.py:128:log_dist] [Rank 0] step=1400, skipped=13, lr=[1.8084837488994006e-05, 1.8084837488994006e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:03:42,384] [INFO] [timer.py:264:stop] epoch=0/micro_step=2800/global_step=1400, RunningAvgSamplesPerSec=46.50243920837039, CurrSamplesPerSec=46.78748663020288, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, 
TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.57s, TFLOPs: 0.49, Samples/sec: 3.49, Time/seq 0.29s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.93, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch 
Size: 2, Sequence Length: 512 +[2025-01-02 16:03:49,284] [INFO] [logging.py:128:log_dist] [Rank 0] step=1410, skipped=13, lr=[1.7764836446824833e-05, 1.7764836446824833e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:03:49,304] [INFO] [timer.py:264:stop] epoch=0/micro_step=2820/global_step=1410, RunningAvgSamplesPerSec=46.50316609477529, CurrSamplesPerSec=46.82542141733676, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 
0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:03:56,173] [INFO] [logging.py:128:log_dist] [Rank 0] step=1420, skipped=13, lr=[1.7446124036132035e-05, 1.7446124036132035e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:03:56,194] [INFO] [timer.py:264:stop] epoch=0/micro_step=2840/global_step=1420, RunningAvgSamplesPerSec=46.50514626952885, CurrSamplesPerSec=46.80305138832927, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, 
Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.96, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, 
Sequence Length: 512 +[2025-01-02 16:04:03,071] [INFO] [logging.py:128:log_dist] [Rank 0] step=1430, skipped=13, lr=[1.71287570217477e-05, 1.71287570217477e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:04:03,091] [INFO] [timer.py:264:stop] epoch=0/micro_step=2860/global_step=1430, RunningAvgSamplesPerSec=46.50676320685188, CurrSamplesPerSec=46.848777623724516, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, 
Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:04:07,853] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1436 +[2025-01-02 16:04:07,853] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:04:07,853] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1436 +[2025-01-02 16:04:07,854] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:04:07,854] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1436 +[2025-01-02 16:04:07,854] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:04:07,854] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1436 +[2025-01-02 16:04:07,854] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1436 +[2025-01-02 16:04:07,854] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:04:07,854] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:04:07,854] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1436 +[2025-01-02 16:04:07,854] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:04:07,854] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1436 +[2025-01-02 16:04:07,854] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.58, Samples/sec: 4.11, Time/seq 0.24s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 
0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:04:09,908] [INFO] [logging.py:128:log_dist] [Rank 0] step=1440, skipped=14, lr=[1.684432374584351e-05, 1.684432374584351e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:04:09,929] [INFO] [timer.py:264:stop] epoch=0/micro_step=2880/global_step=1440, RunningAvgSamplesPerSec=46.51114454430859, CurrSamplesPerSec=46.99939345444054, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 
0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:04:16,798] [INFO] [logging.py:128:log_dist] [Rank 0] step=1450, skipped=14, lr=[1.6529668505230238e-05, 1.6529668505230238e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:04:16,818] [INFO] [timer.py:264:stop] epoch=0/micro_step=2900/global_step=1450, RunningAvgSamplesPerSec=46.51307636486264, CurrSamplesPerSec=46.83207123269369, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model 
Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.00, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, 
Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:04:23,735] [INFO] [logging.py:128:log_dist] [Rank 0] step=1460, skipped=14, lr=[1.6216521887842863e-05, 1.6216521887842863e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:04:23,756] [INFO] [timer.py:264:stop] epoch=0/micro_step=2920/global_step=1460, RunningAvgSamplesPerSec=46.51272734068831, CurrSamplesPerSec=46.46213298539395, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 
2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:04:30,666] [INFO] [logging.py:128:log_dist] [Rank 0] step=1470, skipped=14, lr=[1.59049396672081e-05, 1.59049396672081e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:04:30,687] [INFO] [timer.py:264:stop] epoch=0/micro_step=2940/global_step=1470, RunningAvgSamplesPerSec=46.51279988022624, CurrSamplesPerSec=46.513899984808106, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, 
Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, 
Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:04:37,603] [INFO] [logging.py:128:log_dist] [Rank 0] step=1480, skipped=14, lr=[1.5594977338223077e-05, 1.5594977338223077e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:04:37,622] [INFO] [timer.py:264:stop] epoch=0/micro_step=2960/global_step=1480, RunningAvgSamplesPerSec=46.512692069553914, CurrSamplesPerSec=46.78629604259554, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.97, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 
512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:04:44,541] [INFO] [logging.py:128:log_dist] [Rank 0] step=1490, skipped=14, lr=[1.528669010727125e-05, 1.528669010727125e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:04:44,562] [INFO] [timer.py:264:stop] epoch=0/micro_step=2980/global_step=1490, RunningAvgSamplesPerSec=46.51222927903836, CurrSamplesPerSec=46.50253840695117, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, 
TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch 
Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:04:51,473] [INFO] [logging.py:128:log_dist] [Rank 0] step=1500, skipped=14, lr=[1.4980132882389835e-05, 1.4980132882389835e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:04:51,494] [INFO] [timer.py:264:stop] epoch=0/micro_step=3000/global_step=1500, RunningAvgSamplesPerSec=46.51210125057641, CurrSamplesPerSec=46.50220006209979, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 
0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:04:58,403] [INFO] [logging.py:128:log_dist] [Rank 0] step=1510, skipped=14, lr=[1.4675360263490295e-05, 1.4675360263490295e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:04:58,424] [INFO] [timer.py:264:stop] epoch=0/micro_step=3020/global_step=1510, RunningAvgSamplesPerSec=46.51201324992706, CurrSamplesPerSec=46.546904191841676, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, 
Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.58, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:05:05,351] [INFO] [logging.py:128:log_dist] [Rank 0] step=1520, skipped=14, lr=[1.4372426532633664e-05, 1.4372426532633664e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:05:05,372] [INFO] [timer.py:264:stop] epoch=0/micro_step=3040/global_step=1520, RunningAvgSamplesPerSec=46.51129958829241, CurrSamplesPerSec=46.45904510287717, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, 
Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.95, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:05:12,297] [INFO] [logging.py:128:log_dist] [Rank 0] step=1530, skipped=14, lr=[1.4071385644362672e-05, 1.4071385644362672e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:05:12,318] [INFO] [timer.py:264:stop] epoch=0/micro_step=3060/global_step=1530, RunningAvgSamplesPerSec=46.51070949284205, CurrSamplesPerSec=46.50381127215948, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, 
Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 
+[2025-01-02 16:05:17,818] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:05:17,819] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:05:17,819] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:05:17,819] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:05:17,819] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:05:17,819] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:05:17,819] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:05:17,819] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:05:17,819] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:05:17,819] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:05:17,820] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:05:17,820] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:05:17,820] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:05:17,824] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:05:17,824] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 
2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:05:19,220] [INFO] [logging.py:128:log_dist] [Rank 0] step=1540, skipped=14, lr=[1.3772291216091954e-05, 1.3772291216091954e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:05:19,241] [INFO] [timer.py:264:stop] epoch=0/micro_step=3080/global_step=1540, RunningAvgSamplesPerSec=46.5110599747054, CurrSamplesPerSec=46.807442065172594, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 
B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:05:26,152] [INFO] [logging.py:128:log_dist] [Rank 0] step=1550, skipped=14, lr=[1.347519651855848e-05, 1.347519651855848e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:05:26,173] [INFO] [timer.py:264:stop] epoch=0/micro_step=3100/global_step=1550, RunningAvgSamplesPerSec=46.51086523978462, CurrSamplesPerSec=46.47960655075998, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, 
Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 
+Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:05:33,087] [INFO] [logging.py:128:log_dist] [Rank 0] step=1560, skipped=14, lr=[1.3180154466333705e-05, 1.3180154466333705e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:05:33,108] [INFO] [timer.py:264:stop] epoch=0/micro_step=3120/global_step=1560, RunningAvgSamplesPerSec=46.51062351613674, CurrSamplesPerSec=46.46309803286058, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.95, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.94, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, 
TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:05:40,050] [INFO] [logging.py:128:log_dist] [Rank 0] step=1570, skipped=14, lr=[1.2887217608399083e-05, 1.2887217608399083e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:05:40,071] [INFO] [timer.py:264:stop] epoch=0/micro_step=3140/global_step=1570, RunningAvgSamplesPerSec=46.50958519849321, CurrSamplesPerSec=46.503472908785604, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, 
Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 
0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:05:46,982] [INFO] [logging.py:128:log_dist] [Rank 0] step=1580, skipped=14, lr=[1.2596438118786732e-05, 1.2596438118786732e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:05:47,003] [INFO] [timer.py:264:stop] epoch=0/micro_step=3160/global_step=1580, RunningAvgSamplesPerSec=46.50945444674376, CurrSamplesPerSec=46.49416179217857, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.57s, TFLOPs: 0.50, Samples/sec: 3.52, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.95, Samples/sec: 13.88, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, 
Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:05:53,931] [INFO] [logging.py:128:log_dist] [Rank 0] step=1590, skipped=14, lr=[1.2307867787286942e-05, 1.2307867787286942e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:05:53,951] [INFO] [timer.py:264:stop] epoch=0/micro_step=3180/global_step=1590, RunningAvgSamplesPerSec=46.50879849785462, CurrSamplesPerSec=46.54303030602095, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, 
Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:06:00,889] [INFO] [logging.py:128:log_dist] [Rank 0] step=1600, skipped=14, lr=[1.2021558010224001e-05, 1.2021558010224001e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:06:00,908] [INFO] [timer.py:264:stop] epoch=0/micro_step=3200/global_step=1600, RunningAvgSamplesPerSec=46.50777483497731, CurrSamplesPerSec=46.43972299579574, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.13s, TFLOPs: 2.18, Samples/sec: 15.53, Time/seq 0.06s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.57, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, 
Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:06:07,814] [INFO] [logging.py:128:log_dist] [Rank 0] step=1610, skipped=14, lr=[1.1737559781302185e-05, 1.1737559781302185e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:06:07,835] [INFO] [timer.py:264:stop] epoch=0/micro_step=3220/global_step=1610, RunningAvgSamplesPerSec=46.50800788060955, CurrSamplesPerSec=46.49611069658042, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 
512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 
0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:06:14,759] [INFO] [logging.py:128:log_dist] [Rank 0] step=1620, skipped=14, lr=[1.1455923682523475e-05, 1.1455923682523475e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:06:14,778] [INFO] [timer.py:264:stop] epoch=0/micro_step=3240/global_step=1620, RunningAvgSamplesPerSec=46.507528936833005, CurrSamplesPerSec=45.83951018076546, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.58, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.96, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch 
Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.97, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:06:21,714] [INFO] [logging.py:128:log_dist] [Rank 0] step=1630, skipped=14, lr=[1.1176699875178485e-05, 1.1176699875178485e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:06:21,735] [INFO] [timer.py:264:stop] epoch=0/micro_step=3260/global_step=1630, RunningAvgSamplesPerSec=46.5067150409607, CurrSamplesPerSec=46.487349963813216, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 
0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:06:27,245] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:06:27,245] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +[2025-01-02 16:06:27,245] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 
iterations +[2025-01-02 16:06:27,245] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:06:27,245] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +[2025-01-02 16:06:27,245] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +[2025-01-02 16:06:27,245] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:06:27,245] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +[2025-01-02 16:06:27,245] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:06:27,246] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +[2025-01-02 16:06:27,246] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:06:27,246] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +[2025-01-02 16:06:27,246] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:06:27,246] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +[2025-01-02 16:06:27,250] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:06:27,251] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 
14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:06:28,650] [INFO] [logging.py:128:log_dist] [Rank 0] step=1640, skipped=14, lr=[1.0899938090912464e-05, 1.0899938090912464e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:06:28,671] [INFO] [timer.py:264:stop] epoch=0/micro_step=3280/global_step=1640, RunningAvgSamplesPerSec=46.50646437840029, CurrSamplesPerSec=46.486061899197516, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence 
Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:06:35,580] [INFO] [logging.py:128:log_dist] [Rank 0] step=1650, skipped=14, lr=[1.0625687622867731e-05, 1.0625687622867731e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:06:35,601] [INFO] [timer.py:264:stop] epoch=0/micro_step=3300/global_step=1650, RunningAvgSamplesPerSec=46.506480546092504, CurrSamplesPerSec=46.49532145114602, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 
0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:06:40,393] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1656 +[2025-01-02 16:06:40,394] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:06:40,394] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1656 +[2025-01-02 16:06:40,394] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:06:40,394] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1656 +[2025-01-02 16:06:40,394] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:06:40,394] [INFO] 
[fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:06:40,394] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 8192.0, reducing to 4096.0 +[2025-01-02 16:06:40,394] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1656 +[2025-01-02 16:06:40,394] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1656 +[2025-01-02 16:06:40,394] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:06:40,394] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:06:40,394] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1656 +[2025-01-02 16:06:40,395] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.57, Samples/sec: 4.08, Time/seq 0.25s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:06:42,454] [INFO] [logging.py:128:log_dist] [Rank 0] step=1660, skipped=15, lr=[1.038104975748232e-05, 1.038104975748232e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:06:42,473] [INFO] [timer.py:264:stop] epoch=0/micro_step=3320/global_step=1660, 
RunningAvgSamplesPerSec=46.5088144037295, CurrSamplesPerSec=46.2695112142726, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, 
Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.58, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.94, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:06:49,394] [INFO] [logging.py:128:log_dist] [Rank 0] step=1670, skipped=15, lr=[1.011170498391135e-05, 1.011170498391135e-05], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:06:49,413] [INFO] [timer.py:264:stop] epoch=0/micro_step=3340/global_step=1670, RunningAvgSamplesPerSec=46.508717776579374, CurrSamplesPerSec=46.62398308028986, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:06:56,325] [INFO] [logging.py:128:log_dist] [Rank 0] step=1680, skipped=15, lr=[9.845011916199696e-06, 9.845011916199696e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:06:56,346] [INFO] [timer.py:264:stop] epoch=0/micro_step=3360/global_step=1680, 
RunningAvgSamplesPerSec=46.50875869239154, CurrSamplesPerSec=46.45526622191193, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, 
Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:07:03,255] [INFO] [logging.py:128:log_dist] [Rank 0] step=1690, skipped=15, lr=[9.581018054183269e-06, 9.581018054183269e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:07:03,276] [INFO] [timer.py:264:stop] epoch=0/micro_step=3380/global_step=1690, RunningAvgSamplesPerSec=46.50870012606984, CurrSamplesPerSec=46.556898566121795, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:07:10,185] [INFO] [logging.py:128:log_dist] [Rank 0] step=1700, skipped=15, lr=[9.31977041695123e-06, 9.31977041695123e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:07:10,206] [INFO] [timer.py:264:stop] epoch=0/micro_step=3400/global_step=1700, 
RunningAvgSamplesPerSec=46.508622175904215, CurrSamplesPerSec=46.45679378061621, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, 
Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:07:17,116] [INFO] [logging.py:128:log_dist] [Rank 0] step=1710, skipped=15, lr=[9.061315534471568e-06, 9.061315534471568e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:07:17,137] [INFO] [timer.py:264:stop] epoch=0/micro_step=3420/global_step=1710, RunningAvgSamplesPerSec=46.50870541176424, CurrSamplesPerSec=46.7551991450285, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.15s, TFLOPs: 1.86, Samples/sec: 13.23, Time/seq 0.08s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.71, Time/seq 0.27s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:07:24,056] [INFO] [logging.py:128:log_dist] [Rank 0] step=1720, skipped=15, lr=[8.805699439303772e-06, 8.805699439303772e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:07:24,077] [INFO] [timer.py:264:stop] epoch=0/micro_step=3440/global_step=1720, 
RunningAvgSamplesPerSec=46.50831570128103, CurrSamplesPerSec=46.47883396086827, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, 
Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:07:30,991] [INFO] [logging.py:128:log_dist] [Rank 0] step=1730, skipped=15, lr=[8.552967658400174e-06, 8.552967658400174e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:07:31,012] [INFO] [timer.py:264:stop] epoch=0/micro_step=3460/global_step=1730, RunningAvgSamplesPerSec=46.50817874207723, CurrSamplesPerSec=46.92904261360235, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:07:37,918] [INFO] [logging.py:128:log_dist] [Rank 0] step=1740, skipped=15, lr=[8.303165204997231e-06, 8.303165204997231e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:07:37,939] [INFO] [timer.py:264:stop] epoch=0/micro_step=3480/global_step=1740, 
RunningAvgSamplesPerSec=46.5082478561663, CurrSamplesPerSec=46.53083177923073, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, 
Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.13s, TFLOPs: 2.18, Samples/sec: 15.55, Time/seq 0.06s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.56, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:07:44,850] [INFO] [logging.py:128:log_dist] [Rank 0] step=1750, skipped=15, lr=[8.056336570598434e-06, 8.056336570598434e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:07:44,871] [INFO] [timer.py:264:stop] epoch=0/micro_step=3500/global_step=1750, RunningAvgSamplesPerSec=46.50811673391772, CurrSamplesPerSec=46.45566820130503, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:07:50,375] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:07:50,375] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +[2025-01-02 16:07:50,375] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:07:50,375] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +[2025-01-02 16:07:50,375] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +[2025-01-02 16:07:50,375] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:07:50,375] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations 
+[2025-01-02 16:07:50,376] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +[2025-01-02 16:07:50,376] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +[2025-01-02 16:07:50,375] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:07:50,376] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +[2025-01-02 16:07:50,376] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:07:50,376] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +[2025-01-02 16:07:50,380] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:07:50,380] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:07:51,779] [INFO] [logging.py:128:log_dist] [Rank 0] step=1760, skipped=15, lr=[7.812525717049999e-06, 7.812525717049999e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:07:51,799] [INFO] [timer.py:264:stop] epoch=0/micro_step=3520/global_step=1760, RunningAvgSamplesPerSec=46.50828314448669, CurrSamplesPerSec=46.52642832694709, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model 
Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, 
Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:07:58,711] [INFO] [logging.py:128:log_dist] [Rank 0] step=1770, skipped=15, lr=[7.571776068710998e-06, 7.571776068710998e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:07:58,731] [INFO] [timer.py:264:stop] epoch=0/micro_step=3540/global_step=1770, RunningAvgSamplesPerSec=46.50829869356961, CurrSamplesPerSec=46.50495529908099, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:08:05,651] [INFO] [logging.py:128:log_dist] [Rank 0] step=1780, skipped=15, lr=[7.334130504719211e-06, 7.334130504719211e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:08:05,671] [INFO] [timer.py:264:stop] epoch=0/micro_step=3560/global_step=1780, RunningAvgSamplesPerSec=46.507906608593274, CurrSamplesPerSec=46.49524091740414, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, 
Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, 
Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:08:12,577] [INFO] [logging.py:128:log_dist] [Rank 0] step=1790, skipped=15, lr=[7.099631351354036e-06, 7.099631351354036e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:08:12,597] [INFO] [timer.py:264:stop] epoch=0/micro_step=3580/global_step=1790, RunningAvgSamplesPerSec=46.50816179436258, CurrSamplesPerSec=46.53826953800675, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:08:19,505] [INFO] [logging.py:128:log_dist] [Rank 0] step=1800, skipped=15, lr=[6.868320374497869e-06, 6.868320374497869e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:08:19,526] [INFO] [timer.py:264:stop] epoch=0/micro_step=3600/global_step=1800, RunningAvgSamplesPerSec=46.50819381179044, CurrSamplesPerSec=46.46337147026462, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 
1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:08:26,432] [INFO] [logging.py:128:log_dist] [Rank 0] step=1810, skipped=15, lr=[6.64023877219738e-06, 6.64023877219738e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:08:26,452] [INFO] [timer.py:264:stop] epoch=0/micro_step=3620/global_step=1810, RunningAvgSamplesPerSec=46.5083076047805, CurrSamplesPerSec=46.491520562684975, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.95, Samples/sec: 13.91, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, 
Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.92, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:08:33,369] [INFO] [logging.py:128:log_dist] [Rank 0] step=1820, skipped=15, lr=[6.415427167325794e-06, 6.415427167325794e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:08:33,389] [INFO] [timer.py:264:stop] epoch=0/micro_step=3640/global_step=1820, RunningAvgSamplesPerSec=46.50816781684397, CurrSamplesPerSec=46.81537675249691, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, 
Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:08:40,298] [INFO] [logging.py:128:log_dist] [Rank 0] step=1830, skipped=15, lr=[6.19392560034775e-06, 6.19392560034775e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:08:40,318] [INFO] [timer.py:264:stop] epoch=0/micro_step=3660/global_step=1830, RunningAvgSamplesPerSec=46.50820709570212, CurrSamplesPerSec=46.5024095130939, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 
1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:08:47,228] [INFO] [logging.py:128:log_dist] [Rank 0] step=1840, skipped=15, lr=[5.975773522187763e-06, 5.975773522187763e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:08:47,248] [INFO] [timer.py:264:stop] epoch=0/micro_step=3680/global_step=1840, RunningAvgSamplesPerSec=46.508121168997306, CurrSamplesPerSec=46.481586429598636, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch 
Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:08:48,576] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1841 +[2025-01-02 16:08:48,576] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1841 +[2025-01-02 16:08:48,576] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:08:48,576] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:08:48,576] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1841 +[2025-01-02 16:08:48,576] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:08:48,576] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1841 +[2025-01-02 16:08:48,576] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:08:48,576] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1841 +[2025-01-02 16:08:48,576] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:08:48,576] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. 
Attempted loss scale: 8192.0, reducing to 4096.0 +[2025-01-02 16:08:48,576] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1841 +[2025-01-02 16:08:48,577] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:08:48,577] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1841 +[2025-01-02 16:08:48,577] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:08:48,577] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.57, Samples/sec: 4.07, Time/seq 0.25s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1843 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1843 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1843 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1843 +[2025-01-02 16:08:49,900] [INFO] 
[fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1843 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:08:49,900] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 4096.0, reducing to 2048.0 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1843 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1843 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1843 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:08:49,900] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.57, Samples/sec: 4.07, Time/seq 0.25s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, 
Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:08:54,034] [INFO] [logging.py:128:log_dist] [Rank 0] step=1850, skipped=17, lr=[5.803689621959219e-06, 5.803689621959219e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:08:54,054] [INFO] [timer.py:264:stop] epoch=0/micro_step=3700/global_step=1850, RunningAvgSamplesPerSec=46.51260312244167, CurrSamplesPerSec=46.48744657153694, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:09:00,972] [INFO] [logging.py:128:log_dist] [Rank 0] step=1860, skipped=17, lr=[5.5916641313404325e-06, 5.5916641313404325e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:09:00,991] [INFO] [timer.py:264:stop] epoch=0/micro_step=3720/global_step=1860, 
RunningAvgSamplesPerSec=46.51238471774673, CurrSamplesPerSec=46.237791067849386, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, 
Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:09:07,897] [INFO] [logging.py:128:log_dist] [Rank 0] step=1870, skipped=17, lr=[5.383095396373447e-06, 5.383095396373447e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:09:07,918] [INFO] [timer.py:264:stop] epoch=0/micro_step=3740/global_step=1870, RunningAvgSamplesPerSec=46.51254589556029, CurrSamplesPerSec=46.49611069658042, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:09:14,834] [INFO] [logging.py:128:log_dist] [Rank 0] step=1880, skipped=17, lr=[5.178020564558106e-06, 5.178020564558106e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:09:14,855] [INFO] [timer.py:264:stop] epoch=0/micro_step=3760/global_step=1880, 
RunningAvgSamplesPerSec=46.51221216844289, CurrSamplesPerSec=46.48469340875124, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.97, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.95, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, 
Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.96, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:09:21,777] [INFO] [logging.py:128:log_dist] [Rank 0] step=1890, skipped=17, lr=[4.976476161106478e-06, 4.976476161106478e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:09:21,798] [INFO] [timer.py:264:stop] epoch=0/micro_step=3780/global_step=1890, RunningAvgSamplesPerSec=46.512024339691905, CurrSamplesPerSec=46.72973962932564, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:09:28,720] [INFO] [logging.py:128:log_dist] [Rank 0] step=1900, skipped=17, lr=[4.778498082437544e-06, 4.778498082437544e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:09:28,738] [INFO] [timer.py:264:stop] epoch=0/micro_step=3800/global_step=1900, 
RunningAvgSamplesPerSec=46.511614340757966, CurrSamplesPerSec=45.800076335051905, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.58, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.96, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.96, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, 
Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:09:35,653] [INFO] [logging.py:128:log_dist] [Rank 0] step=1910, skipped=17, lr=[4.584121589783738e-06, 4.584121589783738e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:09:35,674] [INFO] [timer.py:264:stop] epoch=0/micro_step=3820/global_step=1910, RunningAvgSamplesPerSec=46.51156047823584, CurrSamplesPerSec=46.491182378142085, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:09:42,583] [INFO] [logging.py:128:log_dist] [Rank 0] step=1920, skipped=17, lr=[4.39338130291071e-06, 4.39338130291071e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:09:42,604] [INFO] [timer.py:264:stop] epoch=0/micro_step=3840/global_step=1920, 
RunningAvgSamplesPerSec=46.51154169085933, CurrSamplesPerSec=46.508935662282894, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.97, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.96, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, 
Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.96, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.96, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.94, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:09:49,550] [INFO] [logging.py:128:log_dist] [Rank 0] step=1930, skipped=17, lr=[4.206311193951332e-06, 4.206311193951332e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:09:49,571] [INFO] [timer.py:264:stop] epoch=0/micro_step=3860/global_step=1930, RunningAvgSamplesPerSec=46.51063862305707, CurrSamplesPerSec=46.89421645226965, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.95, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:09:56,483] [INFO] [logging.py:128:log_dist] [Rank 0] step=1940, skipped=17, lr=[4.022944581354981e-06, 4.022944581354981e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:09:56,503] [INFO] [timer.py:264:stop] epoch=0/micro_step=3880/global_step=1940, 
RunningAvgSamplesPerSec=46.51063920775755, CurrSamplesPerSec=46.44960709050509, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:09:59,929] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:09:59,929] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:09:59,929] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:09:59,929] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:09:59,930] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 
+[2025-01-02 16:09:59,930] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:09:59,930] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:09:59,930] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:09:59,930] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:09:59,930] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:09:59,930] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:09:59,930] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:09:59,930] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:09:59,930] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:09:59,935] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:09:59,935] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, 
Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:10:03,413] [INFO] [logging.py:128:log_dist] [Rank 0] step=1950, skipped=17, lr=[3.843314123953354e-06, 3.843314123953354e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:10:03,434] [INFO] [timer.py:264:stop] epoch=0/micro_step=3900/global_step=1950, RunningAvgSamplesPerSec=46.51056111915757, CurrSamplesPerSec=46.50838771759699, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:10:10,344] [INFO] [logging.py:128:log_dist] [Rank 0] step=1960, skipped=17, lr=[3.6674518151436744e-06, 3.6674518151436744e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:10:10,364] [INFO] [timer.py:264:stop] epoch=0/micro_step=3920/global_step=1960, RunningAvgSamplesPerSec=46.51048027244313, CurrSamplesPerSec=46.48772029560139, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, 
Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, 
Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:10:17,273] [INFO] [logging.py:128:log_dist] [Rank 0] step=1970, skipped=17, lr=[3.4953889771904475e-06, 3.4953889771904475e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:10:17,293] [INFO] [timer.py:264:stop] epoch=0/micro_step=3940/global_step=1970, RunningAvgSamplesPerSec=46.51043553683364, CurrSamplesPerSec=46.44023718641015, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 
512 +[2025-01-02 16:10:20,011] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1973 +[2025-01-02 16:10:20,011] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:10:20,011] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1973 +[2025-01-02 16:10:20,011] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:10:20,011] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1973 +[2025-01-02 16:10:20,011] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:10:20,011] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1973 +[2025-01-02 16:10:20,011] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:10:20,011] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1973 +[2025-01-02 16:10:20,011] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. 
Attempted loss scale: 4096.0, reducing to 2048.0 +[2025-01-02 16:10:20,012] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1973 +[2025-01-02 16:10:20,012] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:10:20,012] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 1973 +[2025-01-02 16:10:20,012] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.57, Samples/sec: 4.05, Time/seq 0.25s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, 
Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:10:24,143] [INFO] [logging.py:128:log_dist] [Rank 0] step=1980, skipped=18, lr=[3.343806313235337e-06, 3.343806313235337e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:10:24,163] [INFO] [timer.py:264:stop] epoch=0/micro_step=3960/global_step=1980, RunningAvgSamplesPerSec=46.51242203854685, CurrSamplesPerSec=46.519187831305196, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:10:31,073] [INFO] [logging.py:128:log_dist] [Rank 0] step=1990, skipped=18, lr=[3.1790463347058543e-06, 3.1790463347058543e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:10:31,094] [INFO] [timer.py:264:stop] epoch=0/micro_step=3980/global_step=1990, RunningAvgSamplesPerSec=46.512401921343596, CurrSamplesPerSec=46.516979041899226, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 
B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, 
Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:10:38,016] [INFO] [logging.py:128:log_dist] [Rank 0] step=2000, skipped=18, lr=[3.0181728153463233e-06, 3.0181728153463233e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:10:38,035] [INFO] [timer.py:264:stop] epoch=0/micro_step=4000/global_step=2000, RunningAvgSamplesPerSec=46.51207795653655, CurrSamplesPerSec=45.77321075988233, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.58, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.96, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.95, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.95, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.95, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 
512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.94, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:10:44,970] [INFO] [logging.py:128:log_dist] [Rank 0] step=2010, skipped=18, lr=[2.8612144078166593e-06, 2.8612144078166593e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:10:44,990] [INFO] [timer.py:264:stop] epoch=0/micro_step=4020/global_step=2010, RunningAvgSamplesPerSec=46.511671133770086, CurrSamplesPerSec=46.50012176602313, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, 
TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch 
Size: 2, Sequence Length: 512 +[2025-01-02 16:10:51,899] [INFO] [logging.py:128:log_dist] [Rank 0] step=2020, skipped=18, lr=[2.708199067468939e-06, 2.708199067468939e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:10:51,919] [INFO] [timer.py:264:stop] epoch=0/micro_step=4040/global_step=2020, RunningAvgSamplesPerSec=46.5116397897557, CurrSamplesPerSec=46.49754429253596, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.56, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.96, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 
0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:10:58,848] [INFO] [logging.py:128:log_dist] [Rank 0] step=2030, skipped=18, lr=[2.5591540473683453e-06, 2.5591540473683453e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:10:58,868] [INFO] [timer.py:264:stop] epoch=0/micro_step=4060/global_step=2030, RunningAvgSamplesPerSec=46.511126243282945, CurrSamplesPerSec=46.50880673295993, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, 
Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, 
Sequence Length: 512 +[2025-01-02 16:11:05,780] [INFO] [logging.py:128:log_dist] [Rank 0] step=2040, skipped=18, lr=[2.414105893439225e-06, 2.414105893439225e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:11:05,801] [INFO] [timer.py:264:stop] epoch=0/micro_step=4080/global_step=2040, RunningAvgSamplesPerSec=46.5110025907912, CurrSamplesPerSec=46.43461384529517, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.00, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.55, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, 
Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.97, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:11:12,744] [INFO] [logging.py:128:log_dist] [Rank 0] step=2050, skipped=18, lr=[2.2730804397370688e-06, 2.2730804397370688e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:11:12,764] [INFO] [timer.py:264:stop] epoch=0/micro_step=4100/global_step=2050, RunningAvgSamplesPerSec=46.51005633195471, CurrSamplesPerSec=46.5211065850435, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, 
Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 
+[2025-01-02 16:11:19,691] [INFO] [logging.py:128:log_dist] [Rank 0] step=2060, skipped=18, lr=[2.1361028038473034e-06, 2.1361028038473034e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:11:19,707] [INFO] [timer.py:264:stop] epoch=0/micro_step=4120/global_step=2060, RunningAvgSamplesPerSec=46.509589050477125, CurrSamplesPerSec=45.671672575913625, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.57, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.98, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, 
TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:11:26,611] [INFO] [logging.py:128:log_dist] [Rank 0] step=2070, skipped=18, lr=[2.003197382411673e-06, 2.003197382411673e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:11:26,631] [INFO] [timer.py:264:stop] epoch=0/micro_step=4140/global_step=2070, RunningAvgSamplesPerSec=46.509808656833094, CurrSamplesPerSec=46.522928741415484, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, 
Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:11:30,057] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:11:30,057] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:11:30,057] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:11:30,057] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:11:30,057] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:11:30,058] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:11:30,057] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:11:30,058] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:11:30,058] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:11:30,058] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:11:30,058] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:11:30,058] [INFO] 
[fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:11:30,058] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:11:30,058] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:11:30,062] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:11:30,063] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:11:33,543] [INFO] [logging.py:128:log_dist] [Rank 0] step=2080, skipped=18, lr=[1.8743878467830294e-06, 1.8743878467830294e-06], 
mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:11:33,564] [INFO] [timer.py:264:stop] epoch=0/micro_step=4160/global_step=2080, RunningAvgSamplesPerSec=46.50968619314036, CurrSamplesPerSec=46.45285449163494, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.00, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.71, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.12, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:11:36,281] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2083 +[2025-01-02 16:11:36,281] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:11:36,281] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2083 +[2025-01-02 16:11:36,281] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:11:36,281] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2083 +[2025-01-02 16:11:36,281] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 
16:11:36,282] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2083 +[2025-01-02 16:11:36,282] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2083 +[2025-01-02 16:11:36,282] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:11:36,282] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:11:36,282] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 4096.0, reducing to 2048.0 +[2025-01-02 16:11:36,282] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2083 +[2025-01-02 16:11:36,282] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from [2025-01-02 16:11:36,282] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2083 +[2025-01-02 16:11:36,282] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:11:36,282] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.57, Samples/sec: 4.07, Time/seq 0.25s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, 
Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.00, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:11:40,414] [INFO] [logging.py:128:log_dist] [Rank 0] step=2090, skipped=19, lr=[1.7619802236591875e-06, 1.7619802236591875e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:11:40,434] [INFO] [timer.py:264:stop] epoch=0/micro_step=4180/global_step=2090, RunningAvgSamplesPerSec=46.51158895789503, CurrSamplesPerSec=46.4917138132042, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:11:47,345] [INFO] [logging.py:128:log_dist] [Rank 0] step=2100, skipped=19, lr=[1.6410154696242603e-06, 1.6410154696242603e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:11:47,365] [INFO] [timer.py:264:stop] epoch=0/micro_step=4200/global_step=2100, 
RunningAvgSamplesPerSec=46.51148902071231, CurrSamplesPerSec=46.4236281604466, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, 
Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:11:54,277] [INFO] [logging.py:128:log_dist] [Rank 0] step=2110, skipped=19, lr=[1.5242111084402238e-06, 1.5242111084402238e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:11:54,297] [INFO] [timer.py:264:stop] epoch=0/micro_step=4220/global_step=2110, RunningAvgSamplesPerSec=46.511339445793595, CurrSamplesPerSec=46.41922892018923, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 
2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:12:01,207] [INFO] [logging.py:128:log_dist] [Rank 0] step=2120, skipped=19, lr=[1.4115879437524043e-06, 1.4115879437524043e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:12:01,228] [INFO] [timer.py:264:stop] epoch=0/micro_step=4240/global_step=2120, 
RunningAvgSamplesPerSec=46.51130295982599, CurrSamplesPerSec=46.511304865842796, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.13s, TFLOPs: 2.19, Samples/sec: 15.57, Time/seq 0.06s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.57, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, 
Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:12:08,136] [INFO] [logging.py:128:log_dist] [Rank 0] step=2130, skipped=19, lr=[1.3031660345068002e-06, 1.3031660345068002e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:12:08,156] [INFO] [timer.py:264:stop] epoch=0/micro_step=4260/global_step=2130, RunningAvgSamplesPerSec=46.51131404937642, CurrSamplesPerSec=46.51030558048654, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:12:15,059] [INFO] [logging.py:128:log_dist] [Rank 0] step=2140, skipped=19, lr=[1.1989646913774466e-06, 1.1989646913774466e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:12:15,080] [INFO] [timer.py:264:stop] epoch=0/micro_step=4280/global_step=2140, 
RunningAvgSamplesPerSec=46.5114519203876, CurrSamplesPerSec=46.79242903203494, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, 
Samples/sec: 3.57, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.98, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.55, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:12:22,020] [INFO] [logging.py:128:log_dist] [Rank 0] step=2150, skipped=19, lr=[1.0990024733270572e-06, 1.0990024733270572e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:12:22,040] [INFO] [timer.py:264:stop] epoch=0/micro_step=4300/global_step=2150, RunningAvgSamplesPerSec=46.51052237053515, CurrSamplesPerSec=46.532267517101225, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 
2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:12:28,947] [INFO] [logging.py:128:log_dist] [Rank 0] step=2160, skipped=19, lr=[1.0032971843015576e-06, 1.0032971843015576e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:12:28,967] [INFO] [timer.py:264:stop] epoch=0/micro_step=4320/global_step=2160, 
RunningAvgSamplesPerSec=46.5105816299913, CurrSamplesPerSec=46.438469703874325, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, 
Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:12:35,881] [INFO] [logging.py:128:log_dist] [Rank 0] step=2170, skipped=19, lr=[9.118658700590616e-07, 9.118658700590616e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:12:35,902] [INFO] [timer.py:264:stop] epoch=0/micro_step=4340/global_step=2170, RunningAvgSamplesPerSec=46.510394643211036, CurrSamplesPerSec=46.47535762417345, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:12:42,807] [INFO] [logging.py:128:log_dist] [Rank 0] step=2180, skipped=19, lr=[8.247248151339343e-07, 8.247248151339343e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:12:42,828] [INFO] [timer.py:264:stop] epoch=0/micro_step=4360/global_step=2180, 
RunningAvgSamplesPerSec=46.510493742010965, CurrSamplesPerSec=46.50587378396359, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:12:46,253] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:12:46,254] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:12:46,254] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:12:46,254] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:12:46,254] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations 
+[2025-01-02 16:12:46,254] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:12:46,254] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:12:46,254] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:12:46,254] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:12:46,254] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:12:46,254] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:12:46,254] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:12:46,254] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +[2025-01-02 16:12:46,259] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:12:46,259] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 
0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 13.99, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:12:49,737] [INFO] [logging.py:128:log_dist] [Rank 0] step=2190, skipped=19, lr=[7.418895399363746e-07, 7.418895399363746e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:12:49,758] [INFO] [timer.py:264:stop] epoch=0/micro_step=4380/global_step=2190, RunningAvgSamplesPerSec=46.510429443817735, CurrSamplesPerSec=46.49342092923379, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 
0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:12:56,670] [INFO] [logging.py:128:log_dist] [Rank 0] step=2200, skipped=19, lr=[6.633747979881533e-07, 6.633747979881533e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:12:56,690] [INFO] [timer.py:264:stop] epoch=0/micro_step=4400/global_step=2200, RunningAvgSamplesPerSec=46.51038262533071, CurrSamplesPerSec=46.55770605348971, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model 
Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.03, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, 
Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:13:03,604] [INFO] [logging.py:128:log_dist] [Rank 0] step=2210, skipped=19, lr=[5.891945732949017e-07, 5.891945732949017e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:13:03,623] [INFO] [timer.py:264:stop] epoch=0/micro_step=4420/global_step=2210, RunningAvgSamplesPerSec=46.51031258811781, CurrSamplesPerSec=46.33170675886358, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.63, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:13:10,546] [INFO] [logging.py:128:log_dist] [Rank 0] step=2220, skipped=19, lr=[5.193620778554536e-07, 5.193620778554536e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:13:10,567] [INFO] [timer.py:264:stop] epoch=0/micro_step=4440/global_step=2220, RunningAvgSamplesPerSec=46.509899108741436, CurrSamplesPerSec=46.37364797007833, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, 
Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.61, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, 
Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:13:17,446] [INFO] [logging.py:128:log_dist] [Rank 0] step=2230, skipped=19, lr=[4.538897493087113e-07, 4.538897493087113e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:13:17,467] [INFO] [timer.py:264:stop] epoch=0/micro_step=4460/global_step=2230, RunningAvgSamplesPerSec=46.51082145039772, CurrSamplesPerSec=46.73201748416546, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.96, Samples/sec: 13.99, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.64, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.65, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:13:24,357] [INFO] [logging.py:128:log_dist] [Rank 0] step=2240, skipped=19, lr=[3.927892487184254e-07, 3.927892487184254e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:13:24,377] [INFO] [timer.py:264:stop] epoch=0/micro_step=4480/global_step=2240, RunningAvgSamplesPerSec=46.51141811493153, CurrSamplesPerSec=46.78347475499735, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 
1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.01, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, 
Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:13:31,255] [INFO] [logging.py:128:log_dist] [Rank 0] step=2250, skipped=19, lr=[3.360714584962621e-07, 3.360714584962621e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:13:31,275] [INFO] [timer.py:264:stop] epoch=0/micro_step=4500/global_step=2250, RunningAvgSamplesPerSec=46.512457104244405, CurrSamplesPerSec=46.71571944298993, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, 
Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:13:38,143] [INFO] [logging.py:128:log_dist] [Rank 0] step=2260, skipped=19, lr=[2.83746480463587e-07, 2.83746480463587e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:13:38,164] [INFO] [timer.py:264:stop] epoch=0/micro_step=4520/global_step=2260, RunningAvgSamplesPerSec=46.51361995388357, CurrSamplesPerSec=46.76160094278518, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, 
Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 
+Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:13:45,038] [INFO] [logging.py:128:log_dist] [Rank 0] step=2270, skipped=19, lr=[2.3582363405225405e-07, 2.3582363405225405e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:13:45,059] [INFO] [timer.py:264:stop] epoch=0/micro_step=4540/global_step=2270, RunningAvgSamplesPerSec=46.514651946554, CurrSamplesPerSec=46.784420582288035, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, 
TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.67, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.02, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:13:51,929] [INFO] [logging.py:128:log_dist] [Rank 0] step=2280, skipped=19, lr=[1.9231145464475297e-07, 1.9231145464475297e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:13:51,949] [INFO] [timer.py:264:stop] epoch=0/micro_step=4560/global_step=2280, RunningAvgSamplesPerSec=46.51576697162558, CurrSamplesPerSec=46.77840381607327, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, 
Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:13:55,353] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:13:55,353] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +[2025-01-02 16:13:55,354] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:13:55,354] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:13:55,354] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +[2025-01-02 16:13:55,354] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +[2025-01-02 16:13:55,354] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:13:55,354] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations 
+[2025-01-02 16:13:55,354] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +[2025-01-02 16:13:55,354] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:13:55,355] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +[2025-01-02 16:13:55,355] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:13:55,355] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +[2025-01-02 16:13:55,357] [INFO] [fused_optimizer.py:400:_update_scale] No Grad overflow for 100 iterations +[2025-01-02 16:13:55,357] [INFO] [fused_optimizer.py:401:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:13:56,027] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2285 +[2025-01-02 16:13:56,027] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2285 +[2025-01-02 16:13:56,027] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:13:56,027] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:13:56,027] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2285 +[2025-01-02 16:13:56,027] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:13:56,028] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2285 +[2025-01-02 16:13:56,028] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2285 +[2025-01-02 
16:13:56,028] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:13:56,028] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2285 +[2025-01-02 16:13:56,028] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 8192.0, reducing to 4096.0 +[2025-01-02 16:13:56,028] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:13:56,028] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2285 +[2025-01-02 16:13:56,028] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +[2025-01-02 16:13:56,028] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2285 +[2025-01-02 16:13:56,028] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.57, Samples/sec: 4.07, Time/seq 0.25s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 
16:13:58,763] [INFO] [logging.py:128:log_dist] [Rank 0] step=2290, skipped=20, lr=[1.5692803494115337e-07, 1.5692803494115337e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:13:58,783] [INFO] [timer.py:264:stop] epoch=0/micro_step=4580/global_step=2290, RunningAvgSamplesPerSec=46.51851714180861, CurrSamplesPerSec=46.79658928835533, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.60, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.62, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, 
Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:14:05,670] [INFO] [logging.py:128:log_dist] [Rank 0] step=2300, skipped=20, lr=[1.2181682318773424e-07, 1.2181682318773424e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:14:05,691] [INFO] [timer.py:264:stop] epoch=0/micro_step=4600/global_step=2300, RunningAvgSamplesPerSec=46.519098342938314, CurrSamplesPerSec=46.81861017418855, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 
2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:14:12,559] 
[INFO] [logging.py:128:log_dist] [Rank 0] step=2310, skipped=20, lr=[9.11365838208722e-08, 9.11365838208722e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:14:12,579] [INFO] [timer.py:264:stop] epoch=0/micro_step=4620/global_step=2310, RunningAvgSamplesPerSec=46.520263336208856, CurrSamplesPerSec=46.77643117546071, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, 
Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:14:18,029] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2317 +[2025-01-02 16:14:18,029] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:14:18,029] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2317 +[2025-01-02 16:14:18,029] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2317 +[2025-01-02 16:14:18,030] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:14:18,030] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:14:18,030] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2317 +[2025-01-02 16:14:18,030] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:14:18,030] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:14:18,030] [INFO] [logging.py:128:log_dist] [Rank 0] Overflow detected. Skipping step. 
Attempted loss scale: 4096.0, reducing to 2048.0 +[2025-01-02 16:14:18,030] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2317 +[2025-01-02 16:14:18,030] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:14:18,030] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2317 +[2025-01-02 16:14:18,030] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +[2025-01-02 16:14:18,030] [INFO] [fused_optimizer.py:392:_update_scale] +Grad overflow on iteration 2317 +[2025-01-02 16:14:18,030] [INFO] [fused_optimizer.py:393:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0 +Model Parameters: 0.331 B, Latency: 0.49s, TFLOPs: 0.58, Samples/sec: 4.11, Time/seq 0.24s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:14:19,386] [INFO] [logging.py:128:log_dist] [Rank 0] step=2320, skipped=21, lr=[6.731738239555418e-08, 6.731738239555418e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:14:19,407] [INFO] [timer.py:264:stop] epoch=0/micro_step=4640/global_step=2320, RunningAvgSamplesPerSec=46.52321387815545, CurrSamplesPerSec=46.80081556093043, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, 
Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.13s, TFLOPs: 2.19, Samples/sec: 15.57, Time/seq 0.06s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 
0.331 B, Latency: 0.56s, TFLOPs: 0.50, Samples/sec: 3.59, Time/seq 0.28s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:14:26,272] [INFO] [logging.py:128:log_dist] [Rank 0] step=2330, skipped=21, lr=[4.507039181994299e-08, 4.507039181994299e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:14:26,293] [INFO] [timer.py:264:stop] epoch=0/micro_step=4660/global_step=2330, RunningAvgSamplesPerSec=46.52443592782957, CurrSamplesPerSec=46.81611158184509, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 
3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.55s, TFLOPs: 0.51, Samples/sec: 3.66, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.05, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.70, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:14:33,160] [INFO] [logging.py:128:log_dist] [Rank 0] step=2340, skipped=21, lr=[2.726804268846084e-08, 2.726804268846084e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:14:33,181] [INFO] [timer.py:264:stop] epoch=0/micro_step=4680/global_step=2340, RunningAvgSamplesPerSec=46.52560854377931, CurrSamplesPerSec=46.75790299843455, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence 
Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.11, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.07, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.10, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, 
TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.97, Samples/sec: 14.04, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +[2025-01-02 16:14:40,046] [INFO] [logging.py:128:log_dist] [Rank 0] step=2350, skipped=21, lr=[1.3913505719678755e-08, 1.3913505719678755e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2025-01-02 16:14:40,067] [INFO] [timer.py:264:stop] epoch=0/micro_step=4700/global_step=2350, RunningAvgSamplesPerSec=46.526791452423346, CurrSamplesPerSec=46.81642185005165, MemAllocated=5.53GB, MaxMemAllocated=8.61GB +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.06, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.69, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.09, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.14s, TFLOPs: 1.98, Samples/sec: 14.08, Time/seq 0.07s, Batch Size: 2, Sequence Length: 512 +Model Parameters: 0.331 B, Latency: 0.54s, TFLOPs: 0.52, Samples/sec: 3.68, Time/seq 0.27s, Batch Size: 2, Sequence Length: 512 +***** Evaluating rewards, Epoch 1/1 ***** +chosen: -31.9375, rejected: -33.96875, loss: 0.6650807857513428 +saving the final model ... 
+[rank0]:[W102 16:15:18.696314488 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator()) +[2025-01-02 16:15:21,891] [INFO] [launch.py:351:main] Process 106033 exits successfully. +[2025-01-02 16:15:22,894] [INFO] [launch.py:351:main] Process 106032 exits successfully. +[2025-01-02 16:15:22,895] [INFO] [launch.py:351:main] Process 106038 exits successfully. +[2025-01-02 16:15:22,896] [INFO] [launch.py:351:main] Process 106031 exits successfully. +[2025-01-02 16:15:22,896] [INFO] [launch.py:351:main] Process 106037 exits successfully. +[2025-01-02 16:15:23,898] [INFO] [launch.py:351:main] Process 106034 exits successfully. +[2025-01-02 16:15:23,898] [INFO] [launch.py:351:main] Process 106036 exits successfully. +[2025-01-02 16:15:23,898] [INFO] [launch.py:351:main] Process 106035 exits successfully. diff --git a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/README.md b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/README.md new file mode 100644 index 000000000..ca2d5eb70 --- /dev/null +++ b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/README.md @@ -0,0 +1,6 @@ +### 💁 For each folder, the bash scripts are examples for the "facebook/opt" model family. + +If you want to use a different model, such as EleutherAI/gpt-neo-125m, simply replace +`` --model_name_or_path facebook/opt-350m`` with ``--model_name_or_path EleutherAI/gpt-neo-125m ``. 
+ +For the models we support, please see [our landing page](./../../../README.md#-supported-models-) diff --git a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/llama2/run_llama2_7b.sh b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/llama2/run_llama2_7b.sh new file mode 100755 index 000000000..aaa059bbe --- /dev/null +++ b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/llama2/run_llama2_7b.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +OUTPUT=$1 +ZERO_STAGE=$2 +if [ "$OUTPUT" == "" ]; then + OUTPUT=./output_step2_llama_7b_epoch1_lr9.65e-6 +fi +if [ "$ZERO_STAGE" == "" ]; then + ZERO_STAGE=3 +fi +mkdir -p $OUTPUT + +deepspeed main.py \ + --data_path Dahoas/rm-static \ + --data_split 2,4,4 \ + --model_name_or_path meta-llama/Llama-2-7b-hf \ + --per_device_train_batch_size 8 \ + --per_device_eval_batch_size 8 \ + --max_seq_len 512 \ + --learning_rate 9.65e-6 \ + --weight_decay 0.1 \ + --num_train_epochs 1 \ + --gradient_accumulation_steps 1 \ + --lr_scheduler_type cosine \ + --num_warmup_steps 0 \ + --seed 1234 \ + --gradient_checkpointing \ + --zero_stage $ZERO_STAGE \ + --deepspeed \ + --offload \ + --output_dir $OUTPUT \ + &> $OUTPUT/training.log diff --git a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh new file mode 100755 index 000000000..ec48de78a --- /dev/null +++ b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +OUTPUT=$1 +ZERO_STAGE=$2 +if [ "$OUTPUT" == "" ]; then + OUTPUT=./output_step2_llama_7b_epoch1_lr9.65e-6 +fi +if [ "$ZERO_STAGE" == "" ]; then + ZERO_STAGE=3 +fi +mkdir -p $OUTPUT + +deepspeed main.py \ + --data_path Dahoas/rm-static \ + --data_split 2,4,4 \ + --model_name_or_path meta-llama/Llama-2-7b-hf \ + --per_device_train_batch_size 8 \ + --per_device_eval_batch_size 8 \ + --max_seq_len 512 \ + --learning_rate 9.65e-6 \ + --weight_decay 0.1 \ + --num_train_epochs 1 \ + --gradient_accumulation_steps 1 \ + --lr_scheduler_type cosine \ + --num_warmup_steps 0 \ + --seed 1234 \ + --gradient_checkpointing \ + --zero_stage $ZERO_STAGE \ + --deepspeed \ + --offload \ + --lora_dim 128 \ + --lora_module_name "layers." \ + --output_dir $OUTPUT \ + &> $OUTPUT/training.log diff --git a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/multi_node/run_350m.sh b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/multi_node/run_350m.sh new file mode 100644 index 000000000..b55beef98 --- /dev/null +++ b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/multi_node/run_350m.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +OUTPUT=$1 +ZERO_STAGE=$2 +if [ "$OUTPUT" == "" ]; then + OUTPUT=./output +fi +if [ "$ZERO_STAGE" == "" ]; then + ZERO_STAGE=0 +fi +mkdir -p $OUTPUT + +deepspeed main.py \ + --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \ + --data_split 2,4,4 \ + --model_name_or_path facebook/opt-350m \ + --per_device_train_batch_size 2 \ + --per_device_eval_batch_size 2 \ + --max_seq_len 512 \ + --learning_rate 5e-5 \ + --weight_decay 0.1 \ + --dropout 0.0 \ + --num_train_epochs 1 \ + --gradient_accumulation_steps 1 \ + --lr_scheduler_type cosine \ + --num_warmup_steps 0 \ + --seed 1234 \ + --zero_stage $ZERO_STAGE \ + --deepspeed \ + --output_dir $OUTPUT \ + &> $OUTPUT/training.log diff --git a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_gpu/run_350m.sh b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_gpu/run_350m.sh new file mode 100644 index 000000000..8157865a5 --- /dev/null +++ b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_gpu/run_350m.sh @@ -0,0 +1,20 @@ +#!/bin/bash +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +OUTPUT=$1 +ZERO_STAGE=$2 +if [ "$OUTPUT" == "" ]; then + OUTPUT=./output +fi +if [ "$ZERO_STAGE" == "" ]; then + ZERO_STAGE=0 +fi +mkdir -p $OUTPUT + +deepspeed --num_gpus 1 main.py --model_name_or_path facebook/opt-350m \ + --weight_decay 0.1 --dropout 0.0 --gradient_accumulation_steps 4 --zero_stage $ZERO_STAGE \ + --enable_tensorboard \ + --tensorboard_path $OUTPUT \ + --deepspeed --output_dir $OUTPUT &> $OUTPUT/training.log diff --git a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/run_350m.sh b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/run_350m.sh new file mode 100644 index 000000000..16aed6a42 --- /dev/null +++ b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/run_350m.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +OUTPUT=$1 +ZERO_STAGE=$2 +if [ "$OUTPUT" == "" ]; then + OUTPUT=./output +fi +if [ "$ZERO_STAGE" == "" ]; then + ZERO_STAGE=0 +fi +mkdir -p $OUTPUT + +deepspeed main.py \ + --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \ + --data_split 2,4,4 \ + --model_name_or_path facebook/opt-350m \ + --per_device_train_batch_size 4 \ + --per_device_eval_batch_size 4 \ + --max_seq_len 512 \ + --learning_rate 5e-5 \ + --weight_decay 0.1 \ + --num_train_epochs 1 \ + --dropout 0.0 \ + --gradient_accumulation_steps 1 \ + --lr_scheduler_type cosine \ + --num_warmup_steps 0 \ + --seed 1234 \ + --zero_stage $ZERO_STAGE \ + --deepspeed \ + --output_dir $OUTPUT \ + &> $OUTPUT/training.log diff --git a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/sweep/README.md b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/sweep/README.md new 
file mode 100644 index 000000000..1f90b9f65 --- /dev/null +++ b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/sweep/README.md @@ -0,0 +1,20 @@ +# DeepSpeed Characterization Script + +# Contents + * [Introduction](#introduction) + * [Usage](#usage) + +# Introduction +The step 2 characterization script sweeps across various training parameters. Currently, the following parameters are swept: +
+Zero Stage: 2, 3
+Offload: True, False
+
+ +The `run_step2_sweep.sh` script passes configuration arguments to `run_single.sh`, which can be extended to sweep beyond the parameters listed above (e.g. learning rate, weight decay, etc.). + +# Usage +The sweep script can be run as follows: +
+DeepSpeedExamples/applications/DeepSpeed-Chat/training/step2_dpo_finetuning$ bash training_scripts/opt/single_node/sweep/run_step2_sweep.sh
+
diff --git a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/sweep/run_single.sh b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/sweep/run_single.sh new file mode 100644 index 000000000..6f5453af1 --- /dev/null +++ b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/sweep/run_single.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +ZERO_STAGE=$1 +OFFLOAD=$2 +OUTPUT=$3 +if [ "$ZERO_STAGE" == "" ]; then + ZERO_STAGE=0 +fi +if [ "$OFFLOAD" == true ]; then + OFFLOAD="--offload" +else + OFFLOAD="" +fi +if [ "$OUTPUT" == "" ]; then + OUTPUT=./output +fi +mkdir -p $OUTPUT + +cmd="deepspeed main.py \ + --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \ + --data_split 2,4,4 \ + --model_name_or_path facebook/opt-350m \ + --num_padding_at_beginning 1 \ + --per_device_train_batch_size 4 \ + --per_device_eval_batch_size 4 \ + --max_seq_len 512 \ + --learning_rate 5e-5 \ + --weight_decay 0.1 \ + --num_train_epochs 1 \ + --dropout 0.0 \ + --gradient_accumulation_steps 1 \ + --lr_scheduler_type cosine \ + --num_warmup_steps 0 \ + --seed 1234 \ + --zero_stage $ZERO_STAGE \ + --deepspeed \ + --output_dir $OUTPUT \ + $OFFLOAD" + +echo "----------------------------- DS COMMAND -----------------------------" +echo $cmd + +$cmd &> $OUTPUT/${OUTPUT}.log diff --git a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/sweep/run_step2_sweep.sh b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/sweep/run_step2_sweep.sh new file mode 100644 index 000000000..ad9849e38 --- /dev/null +++ b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/sweep/run_step2_sweep.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# 
Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +for z in {2..3} +do + for offload in true false + do + cmd="bash training_scripts/opt/single_node/sweep/run_single.sh \ + ${z} \ + ${offload} \ + z${z}_offload_${offload}" + echo "----------------------------- CALLING SHELL SCRIPT -----------------------------" + echo $cmd + $cmd + pkill -9 python + sleep 60 + echo "" + done +done From b965b9cd827f096a24e382dcd3b4cd810c9595da Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Mon, 20 Jan 2025 16:24:13 -0800 Subject: [PATCH 3/3] Update references to torchvision (#949) --- training/cifar/cifar10_deepspeed.py | 2 +- training/cifar/cifar10_tutorial.py | 2 +- .../vit_finetuning/main_imagenet.py | 46 +++++++++---------- .../vit_finetuning/utils/get_data.py | 22 ++++----- training/gan/gan_baseline_train.py | 2 +- training/gan/gan_deepspeed_train.py | 2 +- training/imagenet/main.py | 16 +++---- training/pipeline_parallelism/train.py | 2 +- 8 files changed, 47 insertions(+), 47 deletions(-) diff --git a/training/cifar/cifar10_deepspeed.py b/training/cifar/cifar10_deepspeed.py index 9888544d5..e92c43937 100755 --- a/training/cifar/cifar10_deepspeed.py +++ b/training/cifar/cifar10_deepspeed.py @@ -6,7 +6,7 @@ import torch.nn as nn import torch.nn.functional as F import torchvision -import torchvision.transforms as transforms +from torchvision import transforms from deepspeed.accelerator import get_accelerator from deepspeed.moe.utils import split_params_into_different_moe_groups_for_optimizer diff --git a/training/cifar/cifar10_tutorial.py b/training/cifar/cifar10_tutorial.py index 114e8c5fa..b7c7e01bd 100644 --- a/training/cifar/cifar10_tutorial.py +++ b/training/cifar/cifar10_tutorial.py @@ -57,7 +57,7 @@ """ import torch import torchvision -import torchvision.transforms as transforms +from torchvision import transforms 
######################################################################## # The output of torchvision datasets are PILImage images of range [0, 1]. diff --git a/training/data_efficiency/vit_finetuning/main_imagenet.py b/training/data_efficiency/vit_finetuning/main_imagenet.py index 4d39ac9af..0042b49b4 100644 --- a/training/data_efficiency/vit_finetuning/main_imagenet.py +++ b/training/data_efficiency/vit_finetuning/main_imagenet.py @@ -19,8 +19,8 @@ import torch.multiprocessing as mp import torch.utils.data import torch.utils.data.distributed -import torchvision.transforms as transforms -import torchvision.datasets as datasets +from torchvision import transforms +from torchvision import datasets import torchvision.models as models from torch.utils.data import Subset import models @@ -105,7 +105,7 @@ def _get_model(args): nchannels = 3 model = models.__dict__[args.arch](num_classes=nclasses, nchannels=nchannels) return model - + def _get_dist_model(gpu, args): ngpus_per_node = torch.cuida.device_count() if args.distributed: @@ -149,9 +149,9 @@ def _get_dist_model(gpu, args): else: model = torch.nn.DataParallel(model).cuda() return model - + def main(): - + args = parser.parse_args() if args.seed is not None: @@ -190,7 +190,7 @@ def main(): def main_worker(gpu, ngpus_per_node, args): global best_acc1 global history - + if args.deepspeed: gpu = args.local_rank args.gpu = gpu @@ -205,7 +205,7 @@ def main_worker(gpu, ngpus_per_node, args): deepspeed.init_distributed() print(f'created model on gpu {gpu}') # exit () - + # define loss function (criterion), optimizer, and learning rate scheduler criterion = nn.CrossEntropyLoss().cuda(args.gpu) @@ -284,14 +284,14 @@ def main_worker(gpu, ngpus_per_node, args): validate(val_loader, model, criterion, args) # return args.completed_step = 0 - + optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum, weight_decay=args.weight_decay) - + """Sets the learning rate to the initial LR decayed by 10 every 30 
epochs""" scheduler = StepLR(optimizer, step_size=int(len(train_loader)*args.epochs//3), gamma=0.1)# None # - + model, optimizer, _, scheduler = deepspeed.initialize( model=model, @@ -311,17 +311,17 @@ def main_worker(gpu, ngpus_per_node, args): time_epoch = time.time() - start_time # evaluate on validation set top5_val, top1_val, losses_val = validate(val_loader, model, criterion, args) - if args.gpu==0: + if args.gpu==0: history["epoch"].append(epoch) history["val_loss"].append(losses_val) - history["val_acc1"].append(top1_val) - history["val_acc5"].append(top5_val) + history["val_acc1"].append(top1_val) + history["val_acc5"].append(top5_val) history["train_loss"].append(losses_train) - history["train_acc1"].append(top1_train) + history["train_acc1"].append(top1_train) history["train_acc5"].append(top5_train) - torch.save(history,f"{args.out_dir}/stat.pt") + torch.save(history,f"{args.out_dir}/stat.pt") try: - print (f'{epoch} epoch at time {time_epoch}s and learning rate {scheduler.get_last_lr()}') + print (f'{epoch} epoch at time {time_epoch}s and learning rate {scheduler.get_last_lr()}') except: print (f'{epoch} epoch at time {time_epoch}s and learning rate {args.lr}') print (f"finish epoch {epoch} or iteration {args.completed_step}, train_accuracy is {top1_train}, val_accuracy {top1_val}") @@ -393,14 +393,14 @@ def train(scheduler, train_loader, model, criterion, optimizer, epoch, args): loss.backward() optimizer.step() scheduler.step() - + # measure elapsed time batch_time.update(time.time() - end) end = time.time() - if i % args.print_freq == 0 and args.gpu==0: + if i % args.print_freq == 0 and args.gpu==0: progress.display(i + 1) - + if args.distributed: losses.all_reduce() top1.all_reduce() @@ -432,7 +432,7 @@ def run_validate(loader, base_progress=0): batch_time.update(time.time() - end) end = time.time() - if i % args.print_freq == 0 and args.gpu==0: + if i % args.print_freq == 0 and args.gpu==0: progress.display(i + 1) batch_time = AverageMeter('Time', 
':6.3f', Summary.NONE) @@ -509,7 +509,7 @@ def all_reduce(self): def __str__(self): fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' return fmtstr.format(**self.__dict__) - + def summary(self): fmtstr = '' if self.summary_type is Summary.NONE: @@ -522,7 +522,7 @@ def summary(self): fmtstr = '{name} {count:.3f}' else: raise ValueError('invalid summary type %r' % self.summary_type) - + return fmtstr.format(**self.__dict__) @@ -536,7 +536,7 @@ def display(self, batch): entries = [self.prefix + self.batch_fmtstr.format(batch)] entries += [str(meter) for meter in self.meters] print ('\t'.join(entries)) - + def display_summary(self): entries = [" *"] entries += [meter.summary() for meter in self.meters] diff --git a/training/data_efficiency/vit_finetuning/utils/get_data.py b/training/data_efficiency/vit_finetuning/utils/get_data.py index dfad5f3ba..c2505fd17 100644 --- a/training/data_efficiency/vit_finetuning/utils/get_data.py +++ b/training/data_efficiency/vit_finetuning/utils/get_data.py @@ -13,18 +13,18 @@ # limitations under the License. 
import torch import os -import torchvision.transforms as transforms -import torchvision.datasets as datasets +from torchvision import transforms +from torchvision import datasets def get_dataset(dataset_name, data_dir, split, rand_fraction=None,clean=False, transform=None, imsize=None, bucket='pytorch-data', **kwargs): if dataset_name in [ 'cifar10', 'cifar100']: - dataset = globals()[f'get_{dataset_name}'](dataset_name, data_dir, split, imsize=imsize, bucket=bucket, **kwargs) + dataset = globals()[f'get_{dataset_name}'](dataset_name, data_dir, split, imsize=imsize, bucket=bucket, **kwargs) elif dataset_name in [ 'cifar10vit224', 'cifar100vit224','cifar10vit384', 'cifar100vit384',]: imsize = int(dataset_name.split('vit')[-1]) dataset_name = dataset_name.split('vit')[0] #print ('here') - dataset = globals()['get_cifar_vit'](dataset_name, data_dir, split, imsize=imsize, bucket=bucket, **kwargs) + dataset = globals()['get_cifar_vit'](dataset_name, data_dir, split, imsize=imsize, bucket=bucket, **kwargs) else: assert 'cifar' in dataset_name print (dataset_name) @@ -59,10 +59,10 @@ def get_transform(split, normalize=None, transform=None, imsize=None, aug='large if transform is None: if normalize is None: if aug == 'large': - + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) else: - normalize = transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]) + normalize = transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.2023, 0.1994, 0.2010]) transform = transforms.Compose(get_aug(split, imsize=imsize, aug=aug) + [transforms.ToTensor(), normalize]) return transform @@ -71,7 +71,7 @@ def get_transform(split, normalize=None, transform=None, imsize=None, aug='large def get_cifar10(dataset_name, data_dir, split, transform=None, imsize=None, bucket='pytorch-data', **kwargs): if imsize==224: transform = get_transform(split, transform=transform, imsize=imsize, aug='large') - else: + else: transform = 
get_transform(split, transform=transform, imsize=imsize, aug='small') return datasets.CIFAR10(data_dir, train=(split=='train'), transform=transform, download=True, **kwargs) @@ -88,7 +88,7 @@ def get_cifar100N(dataset_name, data_dir, split, rand_fraction=None,transform=No if split=='train': return CIFAR100N(root=data_dir, train=(split=='train'), transform=transform, download=True, rand_fraction=rand_fraction) else: - return datasets.CIFAR100(data_dir, train=(split=='train'), transform=transform, download=True, **kwargs) + return datasets.CIFAR100(data_dir, train=(split=='train'), transform=transform, download=True, **kwargs) def get_cifar_vit(dataset_name, data_dir, split, transform=None, imsize=None, bucket='pytorch-data', **kwargs): if imsize==224: @@ -111,12 +111,12 @@ def get_cifar_vit(dataset_name, data_dir, split, transform=None, imsize=None, bu if dataset_name =='cifar10': return datasets.CIFAR10(data_dir, train=(split=='train'), transform=transform_data, download=True, **kwargs) elif dataset_name =='cifar100': - + return datasets.CIFAR100(data_dir, train=(split=='train'), transform=transform_data, download=True, **kwargs) else: assert dataset_name in ['cifar10', 'cifar100'] else: - + if split=='train': transform_data = transforms.Compose([# transforms.ColorJitter(brightness= 0.4, contrast= 0.4, saturation= 0.4, hue= 0.1), transforms.Resize(imsize), @@ -164,4 +164,4 @@ def get_imagenet_vit(dataset_name, data_dir, split, transform=None, imsize=None, #return torch.utils.data.distributed.DistributedSampler(train_dataset) else: return datasets.ImageFolder(valdir, transform_data) - #Ereturn torch.utils.data.distributed.DistributedSampler(val_dataset, shuffle=False, drop_last=True) \ No newline at end of file + #Ereturn torch.utils.data.distributed.DistributedSampler(val_dataset, shuffle=False, drop_last=True) diff --git a/training/gan/gan_baseline_train.py b/training/gan/gan_baseline_train.py index 3d223542f..ab2d67740 100755 --- 
a/training/gan/gan_baseline_train.py +++ b/training/gan/gan_baseline_train.py @@ -3,7 +3,7 @@ import torch.nn as nn import torch.utils.data import torchvision.datasets as dset -import torchvision.transforms as transforms +from torchvision import transforms import torchvision.utils as vutils from torch.utils.tensorboard import SummaryWriter from time import time diff --git a/training/gan/gan_deepspeed_train.py b/training/gan/gan_deepspeed_train.py index f209a4273..ffaf59375 100755 --- a/training/gan/gan_deepspeed_train.py +++ b/training/gan/gan_deepspeed_train.py @@ -3,7 +3,7 @@ import torch.nn as nn import torch.utils.data import torchvision.datasets as dset -import torchvision.transforms as transforms +from torchvision import transforms import torchvision.utils as vutils from torch.utils.tensorboard import SummaryWriter from time import time diff --git a/training/imagenet/main.py b/training/imagenet/main.py index 1558e6ae0..414d152f1 100644 --- a/training/imagenet/main.py +++ b/training/imagenet/main.py @@ -18,9 +18,9 @@ import torch.optim import torch.utils.data import torch.utils.data.distributed -import torchvision.datasets as datasets import torchvision.models as models -import torchvision.transforms as transforms +from torchvision import transforms +from torchvision import datasets from torch.optim.lr_scheduler import StepLR from torch.utils.data import Subset @@ -94,7 +94,7 @@ def main(): 'which can slow down your training considerably! ' 'You may see unexpected behavior when restarting ' 'from checkpoints.') - + if args.gpu is not None: warnings.warn('You have chosen a specific GPU. 
This will completely ' 'disable data parallelism.') @@ -112,7 +112,7 @@ def main(): args.world_size = ngpus_per_node * args.world_size t_losses, t_acc1s = main_worker(args.gpu, ngpus_per_node, args) #dist.barrier() - + # Write the losses to an excel file if dist.get_rank() ==0: all_losses = [torch.empty_like(t_losses) for _ in range(ngpus_per_node)] @@ -278,7 +278,7 @@ def print_rank_0(msg): acc1s[epoch] = acc1 scheduler.step() - + # remember best acc@1 and save checkpoint is_best = acc1 > best_acc1 best_acc1 = max(acc1, best_acc1) @@ -449,7 +449,7 @@ def all_reduce(self): def __str__(self): fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' return fmtstr.format(**self.__dict__) - + def summary(self): fmtstr = '' if self.summary_type is Summary.NONE: @@ -462,7 +462,7 @@ def summary(self): fmtstr = '{name} {count:.3f}' else: raise ValueError('invalid summary type %r' % self.summary_type) - + return fmtstr.format(**self.__dict__) @@ -476,7 +476,7 @@ def display(self, batch): entries = [self.prefix + self.batch_fmtstr.format(batch)] entries += [str(meter) for meter in self.meters] print('\t'.join(entries)) - + def display_summary(self): entries = [" *"] entries += [meter.summary() for meter in self.meters] diff --git a/training/pipeline_parallelism/train.py b/training/pipeline_parallelism/train.py index 1a418b427..b4bc49bf6 100755 --- a/training/pipeline_parallelism/train.py +++ b/training/pipeline_parallelism/train.py @@ -7,7 +7,7 @@ import torch.distributed as dist import torchvision -import torchvision.transforms as transforms +from torchvision import transforms from torchvision.models import AlexNet from torchvision.models import vgg19