Skip to content
This repository was archived by the owner on Feb 25, 2022. It is now read-only.

Add args for sampling & clean up configs #180

Draft
wants to merge 4 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,14 @@ Once you have a trained model, or you've downloaded one of our pre-trained model
python3 main.py --predict --prompt <example_prompt.txt> --tpu <tpu_name> --model <config_name>
```

(Optional) extra arguments for sampling:

- `--temperature` : Temperature for temperature sampling. Float between 0 and 1.
- `--top-k` : An optional integer - if not -1, only sample from the top k logits.
- `--max_steps` : An optional integer, the max number of steps to decode when sampling.
- `--sampling-stop-token` : An optional integer. Stop sampling when this token is produced. Defaults to EOS token if none is provided.
- `--remove-prompt` : Boolean. Whether to remove the prompt from the sampling output. Defaults to False.
- `--sample-save-path` : An optional string. Path to save the samples to. If None is provided, defaults to `predictions_{current_step}.txt`.
or, if using GPUs:

```bash
Expand Down
41 changes: 41 additions & 0 deletions configs/GPT3_2-7B_pretrained.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
{
"n_head" : 20,
"n_vocab" : 50257,
"embed_dropout" : 0,
"lr" : 0.00016,
"lr_decay" : "cosine",
"warmup_steps" : 3000,
"beta1" : 0.9,
"beta2" : 0.95,
"epsilon" : 1e-08,
"ada_epsilon1" : "1e-30",
"ada_epsilon2" : 0.001,
"opt_name" : "adam",
"weight_decay" : 0,
"train_batch_size" : 512,
"attn_dropout" : 0,
"train_steps" : 400000,
"lr_decay_end" : 300000,
"eval_steps" : 10,
"predict_steps" : 0,
"res_dropout" : 0,
"eval_batch_size" : 128,
"predict_batch_size" : 1,
"iterations" : 500,
"n_embd" : 2560,
"datasets" : [["pile", null, null, null]],
"model_path" : "gs://neo-d/models/GPT3_2-7B",
"n_ctx" : 2048,
"n_layer" : 32,
"scale_by_depth" : true,
"scale_by_in" : false,
"attention_types" : [[["global", "local"], 16]],
"mesh_shape" : "x:64,y:4",
"layout" : "batch:x,embd:y",
"activation_function" : "gelu",
"recompute_grad" : true,
"gradient_clipping" : 1.0,
"tokens_per_mb_per_replica" : 4096,
"padding_id" : 50257,
"eos_id" : 50256
}
40 changes: 40 additions & 0 deletions configs/GPT3_XL_pretrained.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
{
"n_head" : 16,
"n_vocab" : 50257,
"embed_dropout" : 0,
"lr" : 0.0002,
"lr_decay" : "cosine",
"warmup_steps" : 3000,
"beta1" : 0.9,
"beta2" : 0.95,
"epsilon" : 1e-08,
"opt_name" : "adam",
"weight_decay" : 0,
"train_batch_size" : 512,
"attn_dropout" : 0,
"train_steps" : 400000,
"lr_decay_end" : 300000,
"eval_steps" : 10,
"predict_steps" : 0,
"res_dropout" : 0,
"eval_batch_size" : 128,
"predict_batch_size" : 128,
"iterations" : 500,
"n_embd" : 2048,
"datasets" : [["pile", null, null, null]],
"model_path" : "gs://neo-d/models/GPT3_XL_Pile",
"n_ctx" : 2048,
"n_layer" : 24,
"scale_by_depth" : true,
"scale_by_in" : false,
"attention_types" : [[["global", "local"], 12]],
"mesh_shape" : "x:128,y:2",
"layout" : "batch:x,memory_length:y,embd:y",
"activation_function" : "gelu",
"recompute_grad" : true,
"gradient_clipping" : 1.0,
"tokens_per_mb_per_replica" : 4096,
"precision" : "bfloat16",
"padding_id" : 50257,
"eos_id" : 50256
}
5 changes: 3 additions & 2 deletions configs/gpt2_small.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
"predict_batch_size": 8,
"iterations": 2500,
"n_embd": 768,
"datasets": ["openwebtext2_new_inputs"],
"datasets": [["pile", null, null, null]],
"model_path": "gs://neo-models/GPT2_SMALL",
"n_ctx": 1024,
"n_layer": 12,
Expand All @@ -32,5 +32,6 @@
"mesh_shape": "all:64",
"layout": "batch:all",
"recompute_grad": false,
"gradient_clipping": 1.0
"gradient_clipping": 1.0,
"precision": "bfloat16"
}
2 changes: 1 addition & 1 deletion configs/gpt3_13B_256.json
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
"predict_batch_size": 1,
"iterations": 500,
"n_embd": 5120,
"datasets": [["openwebtext-documents", 25, "documents_random", 1.0]],
"datasets": [["pile", null, null, null]],
"model_path": "gs://neo-models/GPT3_13B",
"n_ctx": 2048,
"n_layer": 40,
Expand Down
38 changes: 0 additions & 38 deletions configs/gpt3_13B_256_Pile.json

This file was deleted.

5 changes: 3 additions & 2 deletions configs/gpt3_2-7B_256.json
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
"predict_batch_size": 1,
"iterations": 500,
"n_embd": 2560,
"datasets": [["openwebtext-documents", 25, "documents_random", 1.0]],
"datasets": [["pile", null, null, null]],
"model_path": "gs://neo-models/GPT3_2-7B",
"n_ctx": 2048,
"n_layer": 32,
Expand All @@ -33,6 +33,7 @@
"layout": "embd:y,batch:x",
"activation_function": "gelu",
"recompute_grad": true,
"gradient_clipping": 1.0
"gradient_clipping": 1.0,
"precision": "bfloat16"
}

6 changes: 4 additions & 2 deletions configs/gpt3_6-7B_256.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
"predict_batch_size": 1,
"iterations": 500,
"n_embd": 4096,
"datasets": [["openwebtext-documents", 25, "documents_random", 1.0]],
"datasets": [["pile", null, null, null]],
"model_path": "gs://neo-models/GPT3_6-7B",
"n_ctx": 2048,
"n_layer": 32,
Expand All @@ -31,6 +31,8 @@
"layout": "embd:y,batch:x",
"activation_function": "gelu",
"recompute_grad": true,
"gradient_clipping": 1.0
"gradient_clipping": 1.0,
"precision": "bfloat16"

}

5 changes: 3 additions & 2 deletions configs/gpt3_PAR_small_256.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
"predict_batch_size": 1,
"iterations": 1000,
"n_embd": 768,
"datasets": [["openwebtext-documents", 25, "documents_random", 1.0]],
"datasets": [["pile", null, null, null]],
"model_path": "gs://neo-models/GPT3_PAR_SMALL",
"n_ctx": 2048,
"n_layer": 19,
Expand All @@ -31,6 +31,7 @@
"layout": "batch:x,heads:y,vocab:y,intermediate_expanded:y",
"activation_function": "gelu",
"recompute_grad": false,
"gradient_clipping": 1.0
"gradient_clipping": 1.0,
"precision": "bfloat16"
}

2 changes: 1 addition & 1 deletion configs/gpt3_XL_256_Pile.json → configs/gpt3_XL_256.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
"predict_batch_size": 1,
"iterations": 500,
"n_embd": 2048,
"datasets": [["pile", 25, "documents_random", 1.0]],
"datasets": [["pile", null, null, null]],
"model_path": "gs://neo-models/GPT3_XL_Pile",
"n_ctx": 2048,
"n_layer": 24,
Expand Down
5 changes: 3 additions & 2 deletions configs/gpt3_large_256.json
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
"predict_batch_size": 1,
"iterations": 2500,
"n_embd": 1536,
"datasets": [["openwebtext-documents", 25, "documents_random", 1.0]],
"datasets": [["pile", null, null, null]],
"model_path": "gs://neo-models/GPT3_LARGE",
"n_ctx": 2048,
"n_layer": 24,
Expand All @@ -34,6 +34,7 @@
"activation_function": "gelu",
"recompute_grad": true,
"gradient_clipping": 1.0,
"tokens_per_mb_per_replica": 2048
"tokens_per_mb_per_replica": 2048,
"precision": "bfloat16"
}

6 changes: 4 additions & 2 deletions configs/gpt3_medium_256.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
"predict_batch_size": 1,
"iterations": 2500,
"n_embd": 1024,
"datasets": [["openwebtext-documents", 25, "documents_random", 1.0]],
"datasets": [["pile", null, null, null]],
"model_path": "gs://neo-models/GPT3_MEDIUM",
"n_ctx": 2048,
"n_layer": 24,
Expand All @@ -31,6 +31,8 @@
"layout": "batch:x,heads:y,vocab:y",
"activation_function": "gelu",
"recompute_grad": false,
"gradient_clipping": 1.0
"gradient_clipping": 1.0,
"precision": "bfloat16"

}

5 changes: 3 additions & 2 deletions configs/gpt3_small_256.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
"predict_batch_size": 1,
"iterations": 2500,
"n_embd": 768,
"datasets": [["openwebtext-documents", 25, "documents_random", 1.0]],
"datasets": [["openwebtext-documents", null, "documents_random", 1.0]],
"model_path": "gs://neo-models/GPT3_SMALL",
"n_ctx": 2048,
"n_layer": 12,
Expand All @@ -31,6 +31,7 @@
"layout": "batch:x,heads:y,vocab:y,intermediate_expanded:y",
"activation_function": "gelu",
"recompute_grad": false,
"gradient_clipping": 1.0
"gradient_clipping": 1.0,
"precision": "bfloat16"
}

Loading