Skip to content
This repository was archived by the owner on Feb 25, 2022. It is now read-only.

Add args for sampling & clean up configs #180

Draft
wants to merge 4 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,14 @@ Once you have a trained model, or you've downloaded one of our pre-trained model
python3 main.py --predict --prompt <example_prompt.txt> --tpu <tpu_name> --model <config_name>
```

(Optional) extra arguments for sampling:

- `--temperature` : Temperature for temperature sampling. Float between 0 and 1.
- `--top-k` : An optional integer - if not -1, only sample from the top k logits.
- `--max_steps` : An optional integer, the max number of steps to decode when sampling.
- `--sampling-stop-token` : An optional integer. Stop sampling when this token is produced. Defaults to EOS token if none is provided.
- `--remove-prompt` : Boolean. Whether to remove the prompt from the sampling output. Defaults to False.
- `--sample-save-path` : An optional string. Path to save the samples to. If None is provided, defaults to `predictions_{current_step}.txt`.
or, if using GPUs:

```bash
Expand Down
41 changes: 41 additions & 0 deletions configs/GPT3_2-7B_pretrained.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
{
"n_head" : 20,
"n_vocab" : 50257,
"embed_dropout" : 0,
"lr" : 0.00016,
"lr_decay" : "cosine",
"warmup_steps" : 3000,
"beta1" : 0.9,
"beta2" : 0.95,
"epsilon" : 1e-08,
"ada_epsilon1" : "1e-30",
"ada_epsilon2" : 0.001,
"opt_name" : "adam",
"weight_decay" : 0,
"train_batch_size" : 512,
"attn_dropout" : 0,
"train_steps" : 400000,
"lr_decay_end" : 300000,
"eval_steps" : 10,
"predict_steps" : 0,
"res_dropout" : 0,
"eval_batch_size" : 128,
"predict_batch_size" : 1,
"iterations" : 500,
"n_embd" : 2560,
"datasets" : [["pile", null, null, null]],
"model_path" : "gs://neo-d/models/GPT3_2-7B",
"n_ctx" : 2048,
"n_layer" : 32,
"scale_by_depth" : true,
"scale_by_in" : false,
"attention_types" : [[["global", "local"], 16]],
"mesh_shape" : "x:64,y:4",
"layout" : "batch:x,embd:y",
"activation_function" : "gelu",
"recompute_grad" : true,
"gradient_clipping" : 1.0,
"tokens_per_mb_per_replica" : 4096,
"padding_id" : 50257,
"eos_id" : 50256
}
40 changes: 40 additions & 0 deletions configs/GPT3_XL_pretrained.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
{
"n_head" : 16,
"n_vocab" : 50257,
"embed_dropout" : 0,
"lr" : 0.0002,
"lr_decay" : "cosine",
"warmup_steps" : 3000,
"beta1" : 0.9,
"beta2" : 0.95,
"epsilon" : 1e-08,
"opt_name" : "adam",
"weight_decay" : 0,
"train_batch_size" : 512,
"attn_dropout" : 0,
"train_steps" : 400000,
"lr_decay_end" : 300000,
"eval_steps" : 10,
"predict_steps" : 0,
"res_dropout" : 0,
"eval_batch_size" : 128,
"predict_batch_size" : 128,
"iterations" : 500,
"n_embd" : 2048,
"datasets" : [["pile", null, null, null]],
"model_path" : "gs://neo-d/models/GPT3_XL_Pile",
"n_ctx" : 2048,
"n_layer" : 24,
"scale_by_depth" : true,
"scale_by_in" : false,
"attention_types" : [[["global", "local"], 12]],
"mesh_shape" : "x:128,y:2",
"layout" : "batch:x,memory_length:y,embd:y",
"activation_function" : "gelu",
"recompute_grad" : true,
"gradient_clipping" : 1.0,
"tokens_per_mb_per_replica" : 4096,
"precision" : "bfloat16",
"padding_id" : 50257,
"eos_id" : 50256
}
5 changes: 3 additions & 2 deletions configs/gpt2_small.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
"predict_batch_size": 8,
"iterations": 2500,
"n_embd": 768,
"datasets": ["openwebtext2_new_inputs"],
"datasets": [["pile", null, null, null]],
"model_path": "gs://neo-models/GPT2_SMALL",
"n_ctx": 1024,
"n_layer": 12,
Expand All @@ -32,5 +32,6 @@
"mesh_shape": "all:64",
"layout": "batch:all",
"recompute_grad": false,
"gradient_clipping": 1.0
"gradient_clipping": 1.0,
"precision": "bfloat16"
}
2 changes: 1 addition & 1 deletion configs/gpt3_13B_256.json
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
"predict_batch_size": 1,
"iterations": 500,
"n_embd": 5120,
"datasets": [["openwebtext-documents", 25, "documents_random", 1.0]],
"datasets": [["pile", null, null, null]],
"model_path": "gs://neo-models/GPT3_13B",
"n_ctx": 2048,
"n_layer": 40,
Expand Down
38 changes: 0 additions & 38 deletions configs/gpt3_13B_256_Pile.json

This file was deleted.

5 changes: 3 additions & 2 deletions configs/gpt3_2-7B_256.json
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
"predict_batch_size": 1,
"iterations": 500,
"n_embd": 2560,
"datasets": [["openwebtext-documents", 25, "documents_random", 1.0]],
"datasets": [["pile", null, null, null]],
"model_path": "gs://neo-models/GPT3_2-7B",
"n_ctx": 2048,
"n_layer": 32,
Expand All @@ -33,6 +33,7 @@
"layout": "embd:y,batch:x",
"activation_function": "gelu",
"recompute_grad": true,
"gradient_clipping": 1.0
"gradient_clipping": 1.0,
"precision": "bfloat16"
}

6 changes: 4 additions & 2 deletions configs/gpt3_6-7B_256.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
"predict_batch_size": 1,
"iterations": 500,
"n_embd": 4096,
"datasets": [["openwebtext-documents", 25, "documents_random", 1.0]],
"datasets": [["pile", null, null, null]],
"model_path": "gs://neo-models/GPT3_6-7B",
"n_ctx": 2048,
"n_layer": 32,
Expand All @@ -31,6 +31,8 @@
"layout": "embd:y,batch:x",
"activation_function": "gelu",
"recompute_grad": true,
"gradient_clipping": 1.0
"gradient_clipping": 1.0,
"precision": "bfloat16"

}

5 changes: 3 additions & 2 deletions configs/gpt3_PAR_small_256.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
"predict_batch_size": 1,
"iterations": 1000,
"n_embd": 768,
"datasets": [["openwebtext-documents", 25, "documents_random", 1.0]],
"datasets": [["pile", null, null, null]],
"model_path": "gs://neo-models/GPT3_PAR_SMALL",
"n_ctx": 2048,
"n_layer": 19,
Expand All @@ -31,6 +31,7 @@
"layout": "batch:x,heads:y,vocab:y,intermediate_expanded:y",
"activation_function": "gelu",
"recompute_grad": false,
"gradient_clipping": 1.0
"gradient_clipping": 1.0,
"precision": "bfloat16"
}

2 changes: 1 addition & 1 deletion configs/gpt3_XL_256_Pile.json → configs/gpt3_XL_256.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
"predict_batch_size": 1,
"iterations": 500,
"n_embd": 2048,
"datasets": [["pile", 25, "documents_random", 1.0]],
"datasets": [["pile", null, null, null]],
"model_path": "gs://neo-models/GPT3_XL_Pile",
"n_ctx": 2048,
"n_layer": 24,
Expand Down
5 changes: 3 additions & 2 deletions configs/gpt3_large_256.json
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
"predict_batch_size": 1,
"iterations": 2500,
"n_embd": 1536,
"datasets": [["openwebtext-documents", 25, "documents_random", 1.0]],
"datasets": [["pile", null, null, null]],
"model_path": "gs://neo-models/GPT3_LARGE",
"n_ctx": 2048,
"n_layer": 24,
Expand All @@ -34,6 +34,7 @@
"activation_function": "gelu",
"recompute_grad": true,
"gradient_clipping": 1.0,
"tokens_per_mb_per_replica": 2048
"tokens_per_mb_per_replica": 2048,
"precision": "bfloat16"
}

6 changes: 4 additions & 2 deletions configs/gpt3_medium_256.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
"predict_batch_size": 1,
"iterations": 2500,
"n_embd": 1024,
"datasets": [["openwebtext-documents", 25, "documents_random", 1.0]],
"datasets": [["pile", null, null, null]],
"model_path": "gs://neo-models/GPT3_MEDIUM",
"n_ctx": 2048,
"n_layer": 24,
Expand All @@ -31,6 +31,8 @@
"layout": "batch:x,heads:y,vocab:y",
"activation_function": "gelu",
"recompute_grad": false,
"gradient_clipping": 1.0
"gradient_clipping": 1.0,
"precision": "bfloat16"

}

5 changes: 3 additions & 2 deletions configs/gpt3_small_256.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
"predict_batch_size": 1,
"iterations": 2500,
"n_embd": 768,
"datasets": [["openwebtext-documents", 25, "documents_random", 1.0]],
"datasets": [["openwebtext-documents", null, "documents_random", 1.0]],
"model_path": "gs://neo-models/GPT3_SMALL",
"n_ctx": 2048,
"n_layer": 12,
Expand All @@ -31,6 +31,7 @@
"layout": "batch:x,heads:y,vocab:y,intermediate_expanded:y",
"activation_function": "gelu",
"recompute_grad": false,
"gradient_clipping": 1.0
"gradient_clipping": 1.0,
"precision": "bfloat16"
}

Loading