Open
Labels: bug (Something isn't working)
Description
Hello,
I am encountering an issue with the GPT-NeoX library: when I set either pipe_parallel_size or model_parallel_size to 2, training aborts with an assertion error.
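For context, the parallelism block at the top of configs/2-7B.yml (both config files are echoed in full in the log below) is:

  "pipe_parallel_size": 2,
  "model_parallel_size": 1,

and the run is launched in the usual way (a sketch, assuming the repo's standard deepy.py wrapper):

  python ./deepy.py train.py configs/2-7B.yml configs/local_setup.yml

The full launcher output follows: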
[2024-07-11 06:28:59,215] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[WARNING] async_io requires the dev libaio .so object and headers but these were not found.
[WARNING] async_io: please install the libaio-dev package with apt
[WARNING] If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
[WARNING] Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
[WARNING] sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.0
[WARNING] using untested triton version (2.0.0), only 1.0.0 is known to be compatible
fatal: not a git repository (or any parent up to mount point /)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).
NeoXArgs.from_ymls() ['configs/2-7B.yml', 'configs/local_setup.yml']
INFO:root:NeoXArgs.calculate_derived() Total number of GPUs determined to be: 4
-------------------- arguments --------------------
attention_config ................ ['global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global', 'global']updated
batch_size ...................... 4...........................updated
checkpoint_activations .......... True........................updated
checkpoint_factor ............... 10000.......................updated
config_files .................... {'2-7B.yml': '# GPT-2 pretraining setup\n{\n # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages\n # across the node boundaries )\n "pipe_parallel_size": 2,\n "model_parallel_size": 1,\n #"gradient_accumulation_steps": 2,\n\n # model settings\n "num_layers": 32,\n "hidden_size": 2560,\n "num_attention_heads": 32,\n "seq_length": 2048,\n "max_position_embeddings": 2048,\n "norm": "layernorm",\n "pos_emb": "rotary",\n "no_weight_tying": true,\n "gpt_j_residual": false,\n "output_layer_parallelism": "column",\n\n # these should provide some speedup but takes a while to build, set to true if desired\n "scaled_upper_triang_masked_softmax_fusion": false,\n "bias_gelu_fusion": false,\n "rope_fusion": false,\n "layernorm_fusion": false,\n\n # init methods\n "init_method": "small_init",\n "output_layer_init_method": "wang_init",\n\n # optimizer settings\n "optimizer": {\n "type": "Adam",\n "params": {\n "lr": 0.00016,\n "betas": [0.9, 0.95],\n "eps": 1.0e-8,\n }\n },\n "min_lr": 0.000016,\n\n # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training\n "zero_optimization": {\n "stage": 3,\n "allgather_partitions": True,\n "allgather_bucket_size": 500000000,\n "overlap_comm": True,\n "reduce_scatter": True,\n "reduce_bucket_size": 500000000,\n "contiguous_gradients": True,\n },\n\n # batch / data settings\n "train_micro_batch_size_per_gpu": 4,\n "data_impl": "mmap",\n\n # activation checkpointing\n "checkpoint_activations": true,\n "checkpoint_num_layers": 1,\n "partition_activations": true,\n "synchronize_each_layer": true,\n\n # regularization\n "gradient_clipping": 1.0,\n "weight_decay": 0.1,\n "hidden_dropout": 0,\n "attention_dropout": 0,\n\n # precision settings\n "fp16": {\n "fp16": true,\n "enabled": true,\n "loss_scale": 0,\n "loss_scale_window": 1000,\n "hysteresis": 2,\n "min_loss_scale": 1\n },\n\n # misc. 
training settings\n "train_iters": 320000,\n "lr_decay_iters": 320000,\n "distributed_backend": "nccl",\n "lr_decay_style": "cosine",\n "warmup": 0.01,\n "checkpoint_factor": 10000,\n "eval_interval": 1000,\n "eval_iters": 10,\n\n # logging\n "log_interval": 100,\n "steps_per_print": 10,\n "keep_last_n_checkpoints": 4,\n "wall_clock_breakdown": true,\n}\n', 'local_setup.yml': '# Suggested data paths when using GPT-NeoX locally\n{\n "data_path": "data/processed_data/mydataset_text_document",\n "tokenizer_type":"HFTokenizer",\n # or for weighted datasets:\n # "train-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"],\n # "test-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"],\n # "valid-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"],\n # "train-data-weights": [1., 2.],\n # "test-data-weights": [2., 1.],\n # "valid-data-weights": [0.5, 0.4],\n\n # If weight_by_num_documents is True, Builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group.\n # WARNING: setting this to True will override any user provided weights\n # "weight_by_num_documents": false,\n # "weighted_sampler_alpha": 0.3,\n\n "vocab_file": "ckpts/20B_tokenizer.json",\n "merge_file": "data/gpt2-merges.txt",\n\n "save": "checkpoints",\n "load": "checkpoints",\n "checkpoint_validation_with_forward_pass": False,\n\n "tensorboard_dir": "tensorboard",\n "log_dir": "logs",\n "use_wandb": False,\n "wandb_host": "https://api.wandb.ai",\n "wandb_project": "neox"\n}\n'}updated
data_impl ....................... mmap........................updated
data_path ....................... data/processed_data/mydataset_text_documentupdated
dynamic_loss_scale .............. True........................updated
eval_iters ...................... 10..........................updated
fp16 ............................ {'fp16': True, 'enabled': True, 'loss_scale': 0, 'loss_scale_window': 1000, 'hysteresis': 2, 'min_loss_scale': 1}updated
global_num_gpus ................. 4...........................updated
hidden_size ..................... 2560........................updated
init_method ..................... small_init..................updated
is_pipe_parallel ................ True........................updated
keep_last_n_checkpoints ......... 4...........................updated
load ............................ checkpoints.................updated
log_dir ......................... logs........................updated
lr .............................. 0.00016.....................updated
lr_decay_iters .................. 320000......................updated
lr_decay_style .................. cosine......................updated
max_position_embeddings ......... 2048........................updated
merge_file ...................... data/gpt2-merges.txt........updated
min_lr .......................... 1.6e-05.....................updated
no_weight_tying ................. True........................updated
num_attention_heads ............. 32..........................updated
num_layers ...................... 32..........................updated
optimizer ....................... {'type': 'Adam', 'params': {'lr': 0.00016, 'betas': [0.9, 0.95], 'eps': 1e-08}}updated
optimizer_type .................. Adam........................updated
output_layer_init_method ........ wang_init...................updated
partition_activations ........... True........................updated
pipe_parallel_size .............. 2...........................updated
pos_emb ......................... rotary......................updated
precision ....................... fp16........................updated
save ............................ checkpoints.................updated
save_iters ...................... [10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000, 90000, 100000, 110000, 120000, 130000, 140000, 150000, 160000, 170000, 180000, 190000, 200000, 210000, 220000, 230000, 240000, 250000, 260000, 270000, 280000, 290000, 300000, 310000]updated
seq_length ...................... 2048........................updated
sparsity_config ................. {}..........................updated
synchronize_each_layer .......... True........................updated
tensorboard_dir ................. tensorboard.................updated
text_gen_type ................... unconditional...............updated
tokenizer_type .................. HFTokenizer.................updated
train_batch_size ................ 8...........................updated
train_iters ..................... 320000......................updated
train_micro_batch_size_per_gpu .. 4...........................updated
use_wandb ....................... False.......................updated
user_script ..................... train.py....................updated
vocab_file ...................... ckpts/20B_tokenizer.json....updated
wall_clock_breakdown ............ True........................updated
zero_allgather_bucket_size ...... 500000000...................updated
zero_contiguous_gradients ....... True........................updated
zero_optimization ............... {'stage': 3, 'allgather_partitions': True, 'allgather_bucket_size': 500000000, 'overlap_comm': True, 'reduce_scatter': True, 'reduce_bucket_size': 500000000, 'contiguous_gradients': True}updated
zero_reduce_bucket_size ......... 500000000...................updated
zero_reduce_scatter ............. True........................updated
zero_stage ...................... 3...........................updated
account ......................... None........................default
activation ...................... gelu........................default
activation_checkpointing ........ None........................default
adlr_autoresume ................. False.......................default
adlr_autoresume_interval ........ 1000........................default
amp ............................. None........................default
apply_query_key_layer_scaling ... False.......................default
attention_dropout ............... 0...........................default
attention_softmax_in_fp32 ....... False.......................default
autotuning ...................... None........................default
autotuning_run .................. None........................default
base_shapes_file ................ None........................default
bf16 ............................ None........................default
bias_dropout_fusion ............. False.......................default
bias_gelu_fusion ................ False.......................default
char_level_ppl .................. False.......................default
checkpoint ...................... None........................default
checkpoint_in_cpu ............... False.......................default
checkpoint_num_layers ........... 1...........................default
checkpoint_scale ................ linear......................default
checkpoint_validation_with_forward_pass False................default
clip_grad ....................... 1.0.........................default
comment ......................... None........................default
comms_logger .................... None........................default
communication_data_type ......... None........................default
compression_training ............ None........................default
contiguous_checkpointing ........ False.......................default
coord_check ..................... False.......................default
create_moe_param_group .......... True........................default
csv_monitor ..................... None........................default
curriculum_learning ............. None........................default
curriculum_seqlen ............... 0...........................default
data_efficiency ................. None........................default
data_types ...................... None........................default
deepscale ....................... False.......................default
deepscale_config ................ None........................default
deepspeed ....................... True........................default
deepspeed_activation_checkpointing True......................default
deepspeed_extra_args ............ None........................default
deepspeed_mpi ................... False.......................default
deepspeed_slurm ................. False.......................default
detect_nvlink_pairs ............. False.......................default
distributed_backend ............. nccl........................default
do_test ......................... None........................default
do_train ........................ None........................default
do_valid ........................ None........................default
dump_state ...................... False.......................default
elasticity ...................... None........................default
enable_expert_tensor_parallelism False.......................default
eod_mask_loss ................... False.......................default
eval_interval ................... 1000........................default
eval_results_prefix ............. ............................default
eval_tasks ...................... None........................default
exclude ......................... None........................default
exit_interval ................... None........................default
expert_interval ................. 2...........................default
extra_save_iters ................ None........................default
finetune ........................ False.......................default
flops_profiler .................. None........................default
force_multi ..................... False.......................default
fp16_lm_cross_entropy ........... False.......................default
fp32_allreduce .................. False.......................default
git_hash ........................ None........................default
gmlp_attn_dim ................... 64..........................default
gpt_j_residual .................. False.......................default
gpt_j_tied ...................... False.......................default
gradient_accumulation_steps ..... 1...........................default
gradient_clipping ............... 1.0.........................default
gradient_noise_scale_cpu_offload False.......................default
gradient_noise_scale_n_batches .. 5...........................default
gradient_predivide_factor ....... 1.0.........................default
hidden_dropout .................. 0...........................default
hostfile ........................ None........................default
hysteresis ...................... 2...........................default
include ......................... None........................default
init_method_std ................. 0.02........................default
intermediate_size ............... None........................default
iteration ....................... None........................default
label_data_paths ................ None........................default
launcher ........................ pdsh........................default
layernorm_epsilon ............... 1e-05.......................default
layernorm_fusion ................ False.......................default
lazy_mpu_init ................... False.......................default
local_rank ...................... None........................default
log_grad_norm ................... False.......................default
log_grad_pct_zeros .............. False.......................default
log_gradient_noise_scale ........ False.......................default
log_interval .................... 100.........................default
log_optimizer_states ............ False.......................default
log_param_norm .................. False.......................default
loss_scale ...................... None........................default
loss_scale_window ............... 1000.0......................default
make_vocab_size_divisible_by .... 128.........................default
mamba_causal_conv_fusion ........ False.......................default
mamba_inner_func_fusion ......... False.......................default
mamba_selective_fp32_params ..... True........................default
mamba_selective_scan_fusion ..... False.......................default
mamba_use_bias_in_conv .......... True........................default
mamba_use_bias_in_linears ....... False.......................default
master_addr ..................... None........................default
master_port ..................... 29500.......................default
maximum_tokens .................. 64..........................default
memory_profiling ................ False.......................default
memory_profiling_path ........... None........................default
min_scale ....................... 1.0.........................default
mlp_type ........................ regular.....................default
mmap_warmup ..................... False.......................default
model_parallel_size ............. 1...........................default
moe_eval_capacity_factor ........ 1.0.........................default
moe_expert_parallel_size ........ 1...........................default
moe_glu ......................... False.......................default
moe_jitter_eps .................. None........................default
moe_lbl_in_fp32 ................. False.......................default
moe_loss_coeff .................. 0.1.........................default
moe_min_capacity ................ 4...........................default
moe_num_experts ................. 1...........................default
moe_token_dropping .............. False.......................default
moe_top_k ....................... 1...........................default
moe_train_capacity_factor ....... 1.0.........................default
moe_type ........................ megablocks..................default
moe_use_residual ................ True........................default
mup_attn_temp ................... 1.0.........................default
mup_embedding_mult .............. 1.0.........................default
mup_init_scale .................. 1.0.........................default
mup_output_temp ................. 1.0.........................default
mup_rp_embedding_mult ........... 1.0.........................default
mup_width_scale ................. 2...........................default
no_load_optim ................... False.......................default
no_load_rng ..................... False.......................default
no_save_optim ................... False.......................default
no_save_rng ..................... False.......................default
no_ssh_check .................... False.......................default
norm ............................ layernorm...................default
num_gpus ........................ None........................default
num_kv_heads .................... None........................default
num_nodes ....................... -1..........................default
num_samples ..................... 1...........................default
num_unique_layers ............... None........................default
num_workers ..................... 2...........................default
onnx_safe ....................... False.......................default
opt_pos_emb_offset .............. 0...........................default
output_layer_parallelism ........ column......................default
override_lr_scheduler ........... False.......................default
padded_vocab_size ............... None........................default
param_sharing_style ............. grouped.....................default
pipe_partition_method ........... type:transformer|mlp........default
prescale_gradients .............. False.......................default
profile ......................... False.......................default
profile_backward ................ False.......................default
profile_step_start .............. 10..........................default
profile_step_stop ............... 12..........................default
prompt_end ......................
...........................default
rank ............................ None........................default
recompute ....................... False.......................default
return_logits ................... False.......................default
rms_norm_epsilon ................ 1e-08.......................default
rope_fusion ..................... False.......................default
rotary_emb_base ................. 10000.......................default
rotary_pct ...................... 1.0.........................default
rotary_save_freqs_buffer ........ False.......................default
rpe_max_distance ................ 128.........................default
rpe_num_buckets ................. 32..........................default
s3_chunk_size ................... 104857600...................default
s3_path ......................... None........................default
sample_input_file ............... None........................default
sample_output_file .............. samples.txt.................default
save_base_shapes ................ False.......................default
scaled_masked_softmax_fusion .... False.......................default
scaled_upper_triang_masked_softmax_fusion False..............default
scalenorm_epsilon ............... 1e-08.......................default
scheduler ....................... None........................default
seed ............................ 1234........................default
short_seq_prob .................. 0.1.........................default
sliding_window_width ............ None........................default
soft_prompt_tuning .............. None........................default
sparse_attention ................ None........................default
sparse_gradients ................ False.......................default
split ........................... 969, 30, 1..................default
steps_per_print ................. 10..........................default
temperature ..................... 0.0.........................default
tensorboard ..................... None........................default
test_data_paths ................. None........................default
test_data_weights ............... None........................default
top_k ........................... 0...........................default
top_p ........................... 0.0.........................default
train_data_paths ................ None........................default
train_data_weights .............. None........................default
use_bias_in_attn_linear ......... True........................default
use_bias_in_norms ............... True........................default
use_bnb_optimizer ............... False.......................default
use_checkpoint_lr_scheduler ..... False.......................default
use_cpu_initialization .......... False.......................default
use_mup ......................... False.......................default
use_qk_layernorm ................ False.......................default
use_shared_fs ................... True........................default
use_tutel ....................... False.......................default
valid_data_paths ................ None........................default
valid_data_weights .............. None........................default
wandb ........................... None........................default
wandb_group ..................... None........................default
wandb_host ...................... https://api.wandb.ai........default
wandb_init_all_ranks ............ False.......................default
wandb_project ................... neox........................default
wandb_team ...................... None........................default
warmup .......................... 0.01........................default
weight_by_num_documents ......... False.......................default
weight_decay .................... 0.1.........................default
weighted_sampler_alpha .......... 1.0.........................default
world_size ...................... None........................default
---------------- end of arguments ----------------
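(As a sanity check, the derived batch settings above are self-consistent under the usual DeepSpeed relation; plugging in the values from this run:

  dp_world_size    = global_num_gpus / (pipe_parallel_size * model_parallel_size) = 4 / (2 * 1) = 2
  train_batch_size = train_micro_batch_size_per_gpu * gradient_accumulation_steps * dp_world_size = 4 * 1 * 2 = 8

so the reported train_batch_size of 8 is expected.)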
NeoXArgs.configure_distributed_args() using world size: 1 and model-parallel size: 1
[2024-07-11 06:29:02,427] [WARNING] [runner.py:202:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only.
[2024-07-11 06:29:02,427] [INFO] [runner.py:568:main] cmd = /usr/local/miniconda3/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMSwgMiwgM119 --master_addr=127.0.0.1 --master_port=29500 --enable_each_rank_log=None train.py --deepspeed_config eyJ0cmFpbl9iYXRjaF9zaXplIjogOCwgInRyYWluX21pY3JvX2JhdGNoX3NpemVfcGVyX2dwdSI6IDQsICJvcHRpbWl6ZXIiOiB7InR5cGUiOiAiQWRhbSIsICJwYXJhbXMiOiB7ImxyIjogMC4wMDAxNiwgImJldGFzIjogWzAuOSwgMC45NV0sICJlcHMiOiAxZS0wOH19LCAiZnAxNiI6IHsiZnAxNiI6IHRydWUsICJlbmFibGVkIjogdHJ1ZSwgImxvc3Nfc2NhbGUiOiAwLCAibG9zc19zY2FsZV93aW5kb3ciOiAxMDAwLCAiaHlzdGVyZXNpcyI6IDIsICJtaW5fbG9zc19zY2FsZSI6IDF9LCAiemVyb19vcHRpbWl6YXRpb24iOiB7InN0YWdlIjogMywgImFsbGdhdGhlcl9wYXJ0aXRpb25zIjogdHJ1ZSwgImFsbGdhdGhlcl9idWNrZXRfc2l6ZSI6IDUwMDAwMDAwMCwgIm92ZXJsYXBfY29tbSI6IHRydWUsICJyZWR1Y2Vfc2NhdHRlciI6IHRydWUsICJyZWR1Y2VfYnVja2V0X3NpemUiOiA1MDAwMDAwMDAsICJjb250aWd1b3VzX2dyYWRpZW50cyI6IHRydWV9LCAid2FsbF9jbG9ja19icmVha2Rvd24iOiB0cnVlfQ== --megatron_config {"train_batch_size": 8, "train_micro_batch_size_per_gpu": 4, "optimizer": {"type": "Adam", "params": {"lr": 0.00016, "betas": [0.9, 0.95], "eps": 1e-08}}, "fp16": {"fp16": true, "enabled": true, "loss_scale": 0, "loss_scale_window": 1000, "hysteresis": 2, "min_loss_scale": 1}, "zero_optimization": {"stage": 3, "allgather_partitions": true, "allgather_bucket_size": 500000000, "overlap_comm": true, "reduce_scatter": true, "reduce_bucket_size": 500000000, "contiguous_gradients": true}, "wall_clock_breakdown": true, "precision": "fp16", "num_layers": 32, "hidden_size": 2560, "num_attention_heads": 32, "seq_length": 2048, "max_position_embeddings": 2048, "pos_emb": "rotary", "no_weight_tying": true, "attention_config": ["global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global"], "sparsity_config": {}, "init_method": "small_init", "output_layer_init_method": "wang_init", "lr_decay_style": "cosine", "lr_decay_iters": 320000, "min_lr": 1.6e-05, "optimizer_type": "Adam", "zero_stage": 3, "zero_reduce_scatter": true, "zero_contiguous_gradients": true, "zero_reduce_bucket_size": 500000000, "zero_allgather_bucket_size": 500000000, "lr": 0.00016, "tokenizer_type": "HFTokenizer", "data_path": "data/processed_data/mydataset_text_document", "data_impl": "mmap", "save": "checkpoints", "config_files": {"2-7B.yml": "# GPT-2 pretraining setup\n{\n   # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages\n   # across the node boundaries )\n   \"pipe_parallel_size\": 2,\n   \"model_parallel_size\": 1,\n   #\"gradient_accumulation_steps\": 2,\n\n   # model settings\n   \"num_layers\": 32,\n   \"hidden_size\": 2560,\n   \"num_attention_heads\": 32,\n   \"seq_length\": 2048,\n   \"max_position_embeddings\": 2048,\n   \"norm\": \"layernorm\",\n   \"pos_emb\": \"rotary\",\n   \"no_weight_tying\": true,\n   \"gpt_j_residual\": false,\n   \"output_layer_parallelism\": \"column\",\n\n   # these should provide some speedup but takes a while to build, set to true if desired\n   \"scaled_upper_triang_masked_softmax_fusion\": false,\n   \"bias_gelu_fusion\": false,\n   \"rope_fusion\": false,\n   \"layernorm_fusion\": false,\n\n   # init methods\n   \"init_method\": \"small_init\",\n   \"output_layer_init_method\": \"wang_init\",\n\n   # 
optimizer settings\n   \"optimizer\": {\n     \"type\": \"Adam\",\n     \"params\": {\n       \"lr\": 0.00016,\n       \"betas\": [0.9, 0.95],\n       \"eps\": 1.0e-8,\n     }\n   },\n   \"min_lr\": 0.000016,\n\n   # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training\n   \"zero_optimization\": {\n    \"stage\": 3,\n    \"allgather_partitions\": True,\n    \"allgather_bucket_size\": 500000000,\n    \"overlap_comm\": True,\n    \"reduce_scatter\": True,\n    \"reduce_bucket_size\": 500000000,\n    \"contiguous_gradients\": True,\n  },\n\n   # batch / data settings\n   \"train_micro_batch_size_per_gpu\": 4,\n   \"data_impl\": \"mmap\",\n\n   # activation checkpointing\n   \"checkpoint_activations\": true,\n   \"checkpoint_num_layers\": 1,\n   \"partition_activations\": true,\n   \"synchronize_each_layer\": true,\n\n   # regularization\n   \"gradient_clipping\": 1.0,\n   \"weight_decay\": 0.1,\n   \"hidden_dropout\": 0,\n   \"attention_dropout\": 0,\n\n   # precision settings\n   \"fp16\": {\n     \"fp16\": true,\n     \"enabled\": true,\n     \"loss_scale\": 0,\n     \"loss_scale_window\": 1000,\n     \"hysteresis\": 2,\n     \"min_loss_scale\": 1\n   },\n\n   # misc. training settings\n   \"train_iters\": 320000,\n   \"lr_decay_iters\": 320000,\n   \"distributed_backend\": \"nccl\",\n   \"lr_decay_style\": \"cosine\",\n   \"warmup\": 0.01,\n   \"checkpoint_factor\": 10000,\n   \"eval_interval\": 1000,\n   \"eval_iters\": 10,\n\n   # logging\n   \"log_interval\": 100,\n   \"steps_per_print\": 10,\n   \"keep_last_n_checkpoints\": 4,\n   \"wall_clock_breakdown\": true,\n}\n", "local_setup.yml": "# Suggested data paths when using GPT-NeoX locally\n{\n  \"data_path\": \"data/processed_data/mydataset_text_document\",\n  \"tokenizer_type\":\"HFTokenizer\",\n  # or for weighted datasets:\n  # \"train-data-paths\": [\"data/enwik8/enwik8_text_document\", \"data/enwik8/enwik8_text_document\"],\n  # \"test-data-paths\": [\"data/enwik8/enwik8_text_document\", \"data/enwik8/enwik8_text_document\"],\n  # \"valid-data-paths\": [\"data/enwik8/enwik8_text_document\", \"data/enwik8/enwik8_text_document\"],\n  # \"train-data-weights\": [1., 2.],\n  # \"test-data-weights\": [2., 1.],\n  # \"valid-data-weights\": [0.5, 0.4],\n\n  # If weight_by_num_documents is True, Builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group.\n  # WARNING: setting this to True will override any user provided weights\n  # \"weight_by_num_documents\": false,\n  # \"weighted_sampler_alpha\": 0.3,\n\n  \"vocab_file\": \"ckpts/20B_tokenizer.json\",\n  \"merge_file\": \"data/gpt2-merges.txt\",\n\n  \"save\": \"checkpoints\",\n  \"load\": \"checkpoints\",\n  \"checkpoint_validation_with_forward_pass\": False,\n\n  \"tensorboard_dir\": \"tensorboard\",\n  \"log_dir\": \"logs\",\n  \"use_wandb\": False,\n  \"wandb_host\": \"https://api.wandb.ai\",\n  \"wandb_project\": \"neox\"\n}\n"}, "load": "checkpoints", "checkpoint_factor": 10000, "batch_size": 4, "train_iters": 320000, "eval_iters": 10, "keep_last_n_checkpoints": 4, "vocab_file": "ckpts/20B_tokenizer.json", "merge_file": "data/gpt2-merges.txt", "checkpoint_activations": true, "synchronize_each_layer": true, "partition_activations": true, "dynamic_loss_scale": true, "pipe_parallel_size": 2, "world_size": 1, "is_pipe_parallel": true, "use_wandb": false, "log_dir": "logs", "tensorboard_dir": "tensorboard", "text_gen_type": "unconditional", "local_rank": 
0, "rank": 0, "user_script": "train.py", "save_iters": [10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000, 90000, 100000, 110000, 120000, 130000, 140000, 150000, 160000, 170000, 180000, 190000, 200000, 210000, 220000, 230000, 240000, 250000, 260000, 270000, 280000, 290000, 300000, 310000], "global_num_gpus": 4}
[2024-07-11 06:29:04,147] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[WARNING] async_io requires the dev libaio .so object and headers but these were not found.
[WARNING] async_io: please install the libaio-dev package with apt
[WARNING] If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
[WARNING] Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
[WARNING] sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.0
[WARNING] using untested triton version (2.0.0), only 1.0.0 is known to be compatible
[2024-07-11 06:29:06,944] [INFO] [launch.py:139:main] 0 NV_LIBNCCL_PACKAGE_VERSION=2.16.2-1
[2024-07-11 06:29:06,944] [INFO] [launch.py:139:main] 0 NV_LIBNCCL_DEV_PACKAGE_VERSION=2.16.2-1
[2024-07-11 06:29:06,944] [INFO] [launch.py:139:main] 0 NV_LIBNCCL_PACKAGE_NAME=libnccl2
[2024-07-11 06:29:06,944] [INFO] [launch.py:139:main] 0 NV_LIBNCCL_DEV_PACKAGE_NAME=libnccl-dev
[2024-07-11 06:29:06,944] [INFO] [launch.py:139:main] 0 NV_LIBNCCL_PACKAGE=libnccl2=2.16.2-1+cuda11.8
[2024-07-11 06:29:06,944] [INFO] [launch.py:139:main] 0 NV_LIBNCCL_DEV_PACKAGE=libnccl-dev=2.16.2-1+cuda11.8
[2024-07-11 06:29:06,944] [INFO] [launch.py:139:main] 0 NCCL_VERSION=2.16.2-1
[2024-07-11 06:29:06,944] [INFO] [launch.py:146:main] WORLD INFO DICT: {'localhost': [0, 1, 2, 3]}
[2024-07-11 06:29:06,944] [INFO] [launch.py:152:main] nnodes=1, num_local_procs=4, node_rank=0
[2024-07-11 06:29:06,944] [INFO] [launch.py:163:main] global_rank_mapping=defaultdict(<class 'list'>, {'localhost': [0, 1, 2, 3]})
[2024-07-11 06:29:06,944] [INFO] [launch.py:164:main] dist_world_size=4
[2024-07-11 06:29:06,944] [INFO] [launch.py:168:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3
[2024-07-11 06:29:06,954] [INFO] [launch.py:256:main] process 2159 spawned with command: ['/usr/local/miniconda3/bin/python', '-u', 'train.py', '--local_rank=0', '--deepspeed_config', 'eyJ0cmFpbl9iYXRjaF9zaXplIjogOCwgInRyYWluX21pY3JvX2JhdGNoX3NpemVfcGVyX2dwdSI6IDQsICJvcHRpbWl6ZXIiOiB7InR5cGUiOiAiQWRhbSIsICJwYXJhbXMiOiB7ImxyIjogMC4wMDAxNiwgImJldGFzIjogWzAuOSwgMC45NV0sICJlcHMiOiAxZS0wOH19LCAiZnAxNiI6IHsiZnAxNiI6IHRydWUsICJlbmFibGVkIjogdHJ1ZSwgImxvc3Nfc2NhbGUiOiAwLCAibG9zc19zY2FsZV93aW5kb3ciOiAxMDAwLCAiaHlzdGVyZXNpcyI6IDIsICJtaW5fbG9zc19zY2FsZSI6IDF9LCAiemVyb19vcHRpbWl6YXRpb24iOiB7InN0YWdlIjogMywgImFsbGdhdGhlcl9wYXJ0aXRpb25zIjogdHJ1ZSwgImFsbGdhdGhlcl9idWNrZXRfc2l6ZSI6IDUwMDAwMDAwMCwgIm92ZXJsYXBfY29tbSI6IHRydWUsICJyZWR1Y2Vfc2NhdHRlciI6IHRydWUsICJyZWR1Y2VfYnVja2V0X3NpemUiOiA1MDAwMDAwMDAsICJjb250aWd1b3VzX2dyYWRpZW50cyI6IHRydWV9LCAid2FsbF9jbG9ja19icmVha2Rvd24iOiB0cnVlfQ==', '--megatron_config', '{"train_batch_size": 8, "train_micro_batch_size_per_gpu": 4, "optimizer": {"type": "Adam", "params": {"lr": 0.00016, "betas": [0.9, 0.95], "eps": 1e-08}}, "fp16": {"fp16": true, "enabled": true, "loss_scale": 0, "loss_scale_window": 1000, "hysteresis": 2, "min_loss_scale": 1}, "zero_optimization": {"stage": 3, "allgather_partitions": true, "allgather_bucket_size": 500000000, "overlap_comm": true, "reduce_scatter": true, "reduce_bucket_size": 500000000, "contiguous_gradients": true}, "wall_clock_breakdown": true, "precision": "fp16", "num_layers": 32, "hidden_size": 2560, "num_attention_heads": 32, "seq_length": 2048, "max_position_embeddings": 2048, "pos_emb": "rotary", "no_weight_tying": true, "attention_config": ["global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global", "global"], "sparsity_config": {}, "init_method": "small_init", "output_layer_init_method": "wang_init", "lr_decay_style": "cosine", "lr_decay_iters": 320000, "min_lr": 1.6e-05, "optimizer_type": "Adam", "zero_stage": 3, "zero_reduce_scatter": true, "zero_contiguous_gradients": true, "zero_reduce_bucket_size": 500000000, "zero_allgather_bucket_size": 500000000, "lr": 0.00016, "tokenizer_type": "HFTokenizer", "data_path": "data/processed_data/mydataset_text_document", "data_impl": "mmap", "save": "checkpoints", "config_files": {"2-7B.yml": "# GPT-2 pretraining setup\n{\n   # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages\n   # across the node boundaries )\n   \"pipe_parallel_size\": 2,\n   \"model_parallel_size\": 1,\n   #\"gradient_accumulation_steps\": 2,\n\n   # model settings\n   \"num_layers\": 32,\n   \"hidden_size\": 2560,\n   \"num_attention_heads\": 32,\n   \"seq_length\": 2048,\n   \"max_position_embeddings\": 2048,\n   \"norm\": \"layernorm\",\n   \"pos_emb\": \"rotary\",\n   \"no_weight_tying\": true,\n   \"gpt_j_residual\": false,\n   \"output_layer_parallelism\": \"column\",\n\n   # these should provide some speedup but takes a while to build, set to true if desired\n   \"scaled_upper_triang_masked_softmax_fusion\": false,\n   \"bias_gelu_fusion\": false,\n   \"rope_fusion\": false,\n   \"layernorm_fusion\": false,\n\n   # init methods\n   \"init_method\": \"small_init\",\n   \"output_layer_init_method\": \"wang_init\",\n\n   # optimizer settings\n   \"optimizer\": {\n     \"type\": \"Adam\",\n     \"params\": 
{\n       \"lr\": 0.00016,\n       \"betas\": [0.9, 0.95],\n       \"eps\": 1.0e-8,\n     }\n   },\n   \"min_lr\": 0.000016,\n\n   # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training\n   \"zero_optimization\": {\n    \"stage\": 3,\n    \"allgather_partitions\": True,\n    \"allgather_bucket_size\": 500000000,\n    \"overlap_comm\": True,\n    \"reduce_scatter\": True,\n    \"reduce_bucket_size\": 500000000,\n    \"contiguous_gradients\": True,\n  },\n\n   # batch / data settings\n   \"train_micro_batch_size_per_gpu\": 4,\n   \"data_impl\": \"mmap\",\n\n   # activation checkpointing\n   \"checkpoint_activations\": true,\n   \"checkpoint_num_layers\": 1,\n   \"partition_activations\": true,\n   \"synchronize_each_layer\": true,\n\n   # regularization\n   \"gradient_clipping\": 1.0,\n   \"weight_decay\": 0.1,\n   \"hidden_dropout\": 0,\n   \"attention_dropout\": 0,\n\n   # precision settings\n   \"fp16\": {\n     \"fp16\": true,\n     \"enabled\": true,\n     \"loss_scale\": 0,\n     \"loss_scale_window\": 1000,\n     \"hysteresis\": 2,\n     \"min_loss_scale\": 1\n   },\n\n   # misc. training settings\n   \"train_iters\": 320000,\n   \"lr_decay_iters\": 320000,\n   \"distributed_backend\": \"nccl\",\n   \"lr_decay_style\": \"cosine\",\n   \"warmup\": 0.01,\n   \"checkpoint_factor\": 10000,\n   \"eval_interval\": 1000,\n   \"eval_iters\": 10,\n\n   # logging\n   \"log_interval\": 100,\n   \"steps_per_print\": 10,\n   \"keep_last_n_checkpoints\": 4,\n   \"wall_clock_breakdown\": true,\n}\n", "local_setup.yml": "# Suggested data paths when using GPT-NeoX locally\n{\n  \"data_path\": \"data/processed_data/mydataset_text_document\",\n  \"tokenizer_type\":\"HFTokenizer\",\n  # or for weighted datasets:\n  # \"train-data-paths\": [\"data/enwik8/enwik8_text_document\", \"data/enwik8/enwik8_text_document\"],\n  # \"test-data-paths\": [\"data/enwik8/enwik8_text_document\", \"data/enwik8/enwik8_text_document\"],\n  # \"valid-data-paths\": [\"data/enwik8/enwik8_text_document\", \"data/enwik8/enwik8_text_document\"],\n  # \"train-data-weights\": [1., 2.],\n  # \"test-data-weights\": [2., 1.],\n  # \"valid-data-weights\": [0.5, 0.4],\n\n  # If weight_by_num_documents is True, Builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group.\n  # WARNING: setting this to True will override any user provided weights\n  # \"weight_by_num_documents\": false,\n  # \"weighted_sampler_alpha\": 0.3,\n\n  \"vocab_file\": \"ckpts/20B_tokenizer.json\",\n  \"merge_file\": \"data/gpt2-merges.txt\",\n\n  \"save\": \"checkpoints\",\n  \"load\": \"checkpoints\",\n  \"checkpoint_validation_with_forward_pass\": False,\n\n  \"tensorboard_dir\": \"tensorboard\",\n  \"log_dir\": \"logs\",\n  \"use_wandb\": False,\n  \"wandb_host\": \"https://api.wandb.ai\",\n  \"wandb_project\": \"neox\"\n}\n"}, "load": "checkpoints", "checkpoint_factor": 10000, "batch_size": 4, "train_iters": 320000, "eval_iters": 10, "keep_last_n_checkpoints": 4, "vocab_file": "ckpts/20B_tokenizer.json", "merge_file": "data/gpt2-merges.txt", "checkpoint_activations": true, "synchronize_each_layer": true, "partition_activations": true, "dynamic_loss_scale": true, "pipe_parallel_size": 2, "world_size": 1, "is_pipe_parallel": true, "use_wandb": false, "log_dir": "logs", "tensorboard_dir": "tensorboard", "text_gen_type": "unconditional", "local_rank": 0, "rank": 0, "user_script": "train.py", "save_iters": [10000, 20000, 30000, 40000, 
50000, 60000, 70000, 80000, 90000, 100000, 110000, 120000, 130000, 140000, 150000, 160000, 170000, 180000, 190000, 200000, 210000, 220000, 230000, 240000, 250000, 260000, 270000, 280000, 290000, 300000, 310000], "global_num_gpus": 4}']
[2024-07-11 06:29:06,961] [INFO] [launch.py:256:main] process 2160 spawned with command: ['/usr/local/miniconda3/bin/python', '-u', 'train.py', '--local_rank=1', '--deepspeed_config', <same base64 config as above>, '--megatron_config', <same JSON config as above>]
[2024-07-11 06:29:06,967] [INFO] [launch.py:256:main] process 2161 spawned with command: ['/usr/local/miniconda3/bin/python', '-u', 'train.py', '--local_rank=2', '--deepspeed_config', <same base64 config as above>, '--megatron_config', <same JSON config as above>]
[2024-07-11 06:29:06,973] [INFO] [launch.py:256:main] process 2162 spawned with command: ['/usr/local/miniconda3/bin/python', '-u', 'train.py', '--local_rank=3', '--deepspeed_config', <same base64 config as above>, '--megatron_config', <same JSON config as above>]
50000, 60000, 70000, 80000, 90000, 100000, 110000, 120000, 130000, 140000, 150000, 160000, 170000, 180000, 190000, 200000, 210000, 220000, 230000, 240000, 250000, 260000, 270000, 280000, 290000, 300000, 310000], "global_num_gpus": 4}']
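(A side note for anyone reproducing this: the --deepspeed_config argument in the spawn command above appears to be base64-encoded JSON. A short decoding snippet, which I used to confirm it carries the same batch settings as --megatron_config, namely train_batch_size 8 and train_micro_batch_size_per_gpu 4:)

import base64
import json
import sys

# Usage: python decode_ds_config.py <the --deepspeed_config value from the log above>
# Decodes the base64 blob back into the DeepSpeed config dict and prints the
# two batch-size settings relevant to the assertion below.
cfg = json.loads(base64.b64decode(sys.argv[1]))
print(cfg["train_batch_size"], cfg["train_micro_batch_size_per_gpu"])  # -> 8 4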
[2024-07-11 06:29:09,055] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[WARNING] async_io requires the dev libaio .so object and headers but these were not found.
[WARNING] async_io: please install the libaio-dev package with apt
[WARNING] If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
[WARNING] Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
[WARNING] sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.0
[WARNING] using untested triton version (2.0.0), only 1.0.0 is known to be compatible
fatal: not a git repository (or any parent up to mount point /)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).
Unable to import Mamba kernels. Install them from our requirements/requirements-mamba.txt, or directly from https://github.com/state-spaces/mamba
For s3 checkpointing, please install boto3 either using requirements/requirements-s3.txt or https://github.com/boto/boto3
For s3 checkpointing, please install hf_transfer either using requirements/requirements-s3.txt or https://github.com/huggingface/hf_transfer
[2024-07-11 06:29:12,526] [INFO] [comm.py:637:init_distributed] cdb=None
(the warnings and INFO lines above are printed once per rank; the three duplicate copies from the other ranks are trimmed here)
[2024-07-11 06:29:12,664] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
NeoXArgs.configure_distributed_args() using world size: 4 and model-parallel size: 1
> building HFTokenizer tokenizer ...
> padded vocab (size: 50277) with 27 dummy tokens (new size: 50304)
> setting tensorboard ...
torch distributed is already initialized, skipping initialization ...
> initializing model parallel with size 1
MPU DP: [0, 1]
MPU DP: [2, 3]
MPU PP: [0, 2]
MPU PP: [1, 3]
MPU IO: [0, 1, 2, 3]
MPU MP: [0]
MPU MP: [1]
MPU MP: [2]
MPU MP: [3]
> setting random seeds to 1234 ...
[2024-07-11 06:29:13,784] [INFO] [checkpointing.py:229:model_parallel_cuda_manual_seed] > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 3952 and data parallel seed: 1234
make: Entering directory '/hy-tmp/gpt-neox-main/megatron/data'
make: Nothing to be done for 'default'.
make: Leaving directory '/hy-tmp/gpt-neox-main/megatron/data'
building GPT2 model ...
Traceback (most recent call last):
  File "train.py", line 35, in <module>
    main()
  File "train.py", line 31, in main
    pretrain(neox_args=neox_args)
  File "/hy-tmp/gpt-neox-main/megatron/training.py", line 195, in pretrain
    model, optimizer, lr_scheduler = setup_model_and_optimizer(
  File "/hy-tmp/gpt-neox-main/megatron/training.py", line 743, in setup_model_and_optimizer
    model = get_model(neox_args=neox_args, use_cache=use_cache)
  File "/hy-tmp/gpt-neox-main/megatron/training.py", line 486, in get_model
    with deepspeed.zero.Init(
  File "/usr/local/miniconda3/lib/python3.8/site-packages/deepspeed/runtime/zero/partition_parameters.py", line 933, in __init__
    _ds_config = deepspeed.runtime.config.DeepSpeedConfig(config_dict_or_path,
  File "/usr/local/miniconda3/lib/python3.8/site-packages/deepspeed/runtime/config.py", line 796, in __init__
    self._configure_train_batch_size()
  File "/usr/local/miniconda3/lib/python3.8/site-packages/deepspeed/runtime/config.py", line 979, in _configure_train_batch_size
    self._batch_assertion()
  File "/usr/local/miniconda3/lib/python3.8/site-packages/deepspeed/runtime/config.py", line 925, in _batch_assertion
    assert (grad_acc > 0), f"Gradient accumulation steps: {grad_acc} has to be greater than 0"
AssertionError: Gradient accumulation steps: 0 has to be greater than 0
(the identical traceback is raised on each of the four ranks)
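If it helps with triage: the numbers in the generated DeepSpeed config seem to explain the assertion on their own. DeepSpeed documents the relation train_batch_size = train_micro_batch_size_per_gpu × gradient_accumulation_steps × data-parallel world size. Below is a small sketch of that arithmetic with the values from this run (my reading of the failure, not DeepSpeed's actual code):

# Sketch of DeepSpeed's batch-size bookkeeping for this run (illustrative only).
# Documented relation:
#   train_batch_size = micro_batch_per_gpu * grad_acc_steps * dp_world_size
train_batch_size = 8      # "train_batch_size" in the generated config above
micro_batch_per_gpu = 4   # "train_micro_batch_size_per_gpu"

# What I would expect with pipe_parallel_size=2 on 4 GPUs: DP degree = 4 / 2 = 2
print(train_batch_size // (micro_batch_per_gpu * 2))  # 1 -> assertion would pass

# What the error implies actually happens: all 4 ranks counted as data-parallel
print(train_batch_size // (micro_batch_per_gpu * 4))  # 0 -> assertion fails

If that reading is right, the deepspeed.zero.Init(...) call in get_model appears to size the config against the full world size (4) rather than the data-parallel size (2), which would explain why the assertion only fires once pipe_parallel_size or model_parallel_size is set to 2.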
I am trying to enable pipeline (or model) parallelism, but this assertion error is preventing training from starting.
Here are some details about my setup:
- GPUs: 4, each with 24GB memory, on a single server
- Python version: 3.8.10
- DeepSpeed version: 0.14.4
- CUDA version: 11.8 (cu118)
With pipe_parallel_size set to 2 on these 4 GPUs, I would expect 2 pipeline stages × 2 data-parallel replicas, which matches the MPU DP/PP groups printed in the log above (DP: [0, 1] and [2, 3]; PP: [0, 2] and [1, 3]).
Below is the content of my 2-7B.yml configuration file:
# GPT-2 pretraining setup
{
   # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
   # across the node boundaries )
   "pipe_parallel_size": 2,
   "model_parallel_size": 1,

   # model settings
   "num_layers": 32,
   "hidden_size": 2560,
   "num_attention_heads": 32,
   "seq_length": 2048,
   "max_position_embeddings": 2048,
   "norm": "layernorm",
   "pos_emb": "rotary",
   "no_weight_tying": true,
   "gpt_j_residual": false,
   "output_layer_parallelism": "column",

   # these should provide some speedup but takes a while to build, set to true if desired
   "scaled_upper_triang_masked_softmax_fusion": false,
   "bias_gelu_fusion": false,
   "rope_fusion": false,
   "layernorm_fusion": false,

   # init methods
   "init_method": "small_init",
   "output_layer_init_method": "wang_init",

   # optimizer settings
   "optimizer": {
     "type": "Adam",
     "params": {
       "lr": 0.00016,
       "betas": [0.9, 0.95],
       "eps": 1.0e-8,
     }
   },
   "min_lr": 0.000016,

   # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
   "zero_optimization": {
     "stage": 3,
     "allgather_partitions": True,
     "allgather_bucket_size": 500000000,
     "overlap_comm": True,
     "reduce_scatter": True,
     "reduce_bucket_size": 500000000,
     "contiguous_gradients": True,
   },

   # batch / data settings
   "train_micro_batch_size_per_gpu": 4,
   "data_impl": "mmap",

   # activation checkpointing
   "checkpoint_activations": true,
   "checkpoint_num_layers": 1,
   "partition_activations": true,
   "synchronize_each_layer": true,

   # regularization
   "gradient_clipping": 1.0,
   "weight_decay": 0.1,
   "hidden_dropout": 0,
   "attention_dropout": 0,

   # precision settings
   "fp16": {
     "fp16": true,
     "enabled": true,
     "loss_scale": 0,
     "loss_scale_window": 1000,
     "hysteresis": 2,
     "min_loss_scale": 1
   },

   # misc. training settings
   "train_iters": 320000,
   "lr_decay_iters": 320000,
   "distributed_backend": "nccl",
   "lr_decay_style": "cosine",
   "warmup": 0.01,
   "checkpoint_factor": 10000,
   "eval_interval": 1000,
   "eval_iters": 10,

   # logging
   "log_interval": 100,
   "steps_per_print": 10,
   "keep_last_n_checkpoints": 4,
   "wall_clock_breakdown": true,
}

I am using the following command to start the training:
python ./deepy.py train.py -d configs 2-7B.yml local_setup.yml
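Two questions, in case they help narrow this down: should "gradient_accumulation_steps" be set explicitly in the config (the generated config dump in the log above shows it commented out), so that DeepSpeed does not have to derive it inside deepspeed.zero.Init? And is ZeRO stage 3 expected to work together with pipeline parallelism at all, or should I drop to a lower ZeRO stage whenever pipe_parallel_size > 1?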
I would appreciate any guidance or suggestions on how to resolve this issue.
Thank you!