Commit 9610391

Simplify and relax dependencies (Take 2) (#818)
* Remove unused einops dependency
* Make Weights & Biases monitoring optional
  - Move `wandb` to `./requirements/requirements-wandb.txt`
  - Clean up `./deepy.py`
  - Make GPT-NeoX not explode if `wandb` is not installed.
  - Tell the user when `wandb` is not importable and explain how to fix it.
  - Remove the implicit dependence on `shortuuid`.
  - Ensure that `wandb` is installed in the Dockerfile.
* Relax many dependencies
* Remove usage of uuid.uuid4()
* Update Dockerfile: add flash attention install
* Update logging.py to pass when wandb is unimportable

---------

Co-authored-by: Quentin Anthony <[email protected]>
1 parent: e897c23 · commit: 9610391

9 files changed: +67 -51 lines
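The pattern this commit applies throughout is a guarded import plus a runtime check, so GPT-NeoX keeps running when `wandb` is absent instead of crashing at import time. A minimal, self-contained sketch of that pattern (the `log_step` helper and its arguments are illustrative, not code from the repository):

# Sketch of the optional-dependency pattern used in the diffs below.
# The real guards live in megatron/logging.py, megatron/utils.py and
# megatron/neox_arguments/arguments.py; `log_step` is hypothetical.
try:
    import wandb
except ModuleNotFoundError:
    wandb = None  # monitoring silently degrades to a no-op


def log_step(metrics: dict, use_wandb: bool = True) -> None:
    # Only touch wandb when the user asked for it and the import succeeded.
    if not use_wandb or wandb is None:
        return
    wandb.log(metrics)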

Dockerfile

+3
@@ -88,10 +88,13 @@ RUN mkdir -p /home/mchorse/.ssh /job && \
 #### Python packages
 RUN pip install torch==1.8.1+cu111 -f https://download.pytorch.org/whl/torch_stable.html && pip cache purge
 COPY requirements/requirements.txt .
+COPY requirements/requirements-wandb.txt .
 COPY requirements/requirements-onebitadam.txt .
 COPY requirements/requirements-sparseattention.txt .
 RUN pip install -r requirements.txt && pip install -r requirements-onebitadam.txt && \
     pip install -r requirements-sparseattention.txt && \
+    pip install -r requirements-flashattention.txt && \
+    pip install -r requirements-wandb.txt && \
     pip install protobuf==3.20.* && \
     pip cache purge

deepy.py

+16 -13

@@ -16,23 +16,26 @@
 import logging
 import os

-import deepspeed
-from deepspeed.launcher.runner import main
+import deepspeed.launcher.runner

-logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))

-from megatron.neox_arguments import NeoXArgs
-from megatron.utils import get_wandb_api_key
+def main():
+    logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))

+    from megatron.neox_arguments import NeoXArgs
+    from megatron.utils import get_wandb_api_key

-neox_args = NeoXArgs.consume_deepy_args()
-deepspeed_main_args = neox_args.get_deepspeed_main_args()
+    neox_args = NeoXArgs.consume_deepy_args()
+    deepspeed_main_args = neox_args.get_deepspeed_main_args()
+
+    # Extract wandb API key and inject into worker environments
+    wandb_token = get_wandb_api_key(neox_args=neox_args)
+    if wandb_token is not None:
+        deepspeed.launcher.runner.EXPORT_ENVS.append("WANDB_API_KEY")
+        os.environ["WANDB_API_KEY"] = wandb_token
+
+    deepspeed.launcher.runner.main(deepspeed_main_args)

-# Extract wandb API key and inject into worker environments
-wandb_token = get_wandb_api_key(neox_args=neox_args)
-if wandb_token is not None:
-    deepspeed.launcher.runner.EXPORT_ENVS.append("WANDB_API_KEY")
-    os.environ["WANDB_API_KEY"] = wandb_token

 if __name__ == "__main__":
-    main(deepspeed_main_args)
+    main()

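Two notes on the `deepy.py` rewrite: moving the top-level launcher code into a `main()` function means importing the module no longer has side effects, and appending `"WANDB_API_KEY"` to `deepspeed.launcher.runner.EXPORT_ENVS` asks DeepSpeed's launcher to export that variable to every worker's environment, where the `wandb` client reads it automatically. (The description of `EXPORT_ENVS` reflects the DeepSpeed launcher behaviour assumed here; details may vary across DeepSpeed versions.)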
megatron/logging.py

+6 -1

@@ -14,7 +14,12 @@

 import sys
 import torch
-import wandb
+
+try:
+    import wandb
+except ModuleNotFoundError:
+    pass
+
 from megatron import mpu, print_rank_0
 from megatron.utils import report_memory

megatron/neox_arguments/arguments.py

+17 -13

@@ -18,7 +18,6 @@
 import yaml
 import json
 import logging
-import shortuuid
 import copy
 import torch
 import argparse

@@ -278,13 +277,13 @@ def consume_deepy_args(cls):
             "--wandb_group",
             type=str,
             default=None,
-            help='Weights and Biases group name - used to group together "runs".',
+            help='Weights & Biases group name - used to group together "runs".',
         )
         group.add_argument(
             "--wandb_team",
             type=str,
             default=None,
-            help="Team name for Weights and Biases.",
+            help="Weights & Biases team name.",
         )

         group = parser.add_argument_group(title="Eval args")

@@ -372,11 +371,22 @@ def consume_deepy_args(cls):
             paths_to_yml_files=conf_files, overwrite_values=overwrite_values
         )

-        if neox_args.wandb_group is not None:
-            # concat the wandb group name with a uid to make sure it's unique
-            import wandb
+        if neox_args.use_wandb:
+            try:
+                import wandb
+
+                # Check if the W&B group name is configured
+                if neox_args.wandb_group is None:
+                    # Set a randomized string as group name if no group name is provided
+                    neox_args.wandb_group = wandb.sdk.lib.runid.generate_id()
+                else:
+                    # Concatenate the W&B group name with a randomized string to ensure uniqueness.
+                    neox_args.wandb_group += "_" + wandb.sdk.lib.runid.generate_id()
+            except ModuleNotFoundError as e:
+                if e.name == "wandb":
+                    e.msg += "\nWeights & Biases monitoring was requested but `wandb` was not found. Install `wandb` to use Weights & Biases, or set the `use_wandb` configuration option to a boolean false to disable Weights & Biases logging."
+                raise e

-            neox_args.wandb_group += "_" + wandb.util.generate_id()
         neox_args.print()

         return neox_args

@@ -736,12 +746,6 @@ def calculate_derived(self):
         Derives additional configuration values necessary for training from the current config
         """

-        # wandb
-        # sets a unique wandb group
-        if self.wandb_group is None:
-            # if none is defined a uuid is set for the run
-            self.wandb_group = shortuuid.uuid()
-
         # number of gpus
         # Get number of GPUs param or hostfile to determine train_batch_size
         global_num_gpus = getattr(self, "global_num_gpus", None)

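With this change the W&B group name is only touched when `use_wandb` is enabled, and it always ends in a fresh random run id rather than the old `shortuuid` value. A rough illustration of what the new branch produces (the ids shown are invented; `generate_id()` returns a different short random string on every call):

# Hypothetical illustration of the group-naming logic added to consume_deepy_args.
import wandb

# No --wandb_group configured: the group becomes just a random id.
print(wandb.sdk.lib.runid.generate_id())          # e.g. "1x2y3z4a"

# --wandb_group my-experiment: a random suffix keeps each launch's group unique.
group = "my-experiment" + "_" + wandb.sdk.lib.runid.generate_id()
print(group)                                      # e.g. "my-experiment_5b6c7d8e"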
megatron/utils.py

+7 -4

@@ -24,16 +24,19 @@
 from typing import Dict, List

 import requests
-import wandb
-from wandb import UsageError
+
+try:
+    import wandb
+except ModuleNotFoundError:
+    pass

 import torch

 from deepspeed.launcher.runner import fetch_hostfile, parse_inclusion_exclusion

 from megatron import print_rank_0
 from megatron import mpu
-from deepspeed import PipelineEngine, DeepSpeedEngine
+
 from collections import deque

@@ -167,7 +170,7 @@ def init_wandb(neox_args):
             force=False,
             entity=neox_args.wandb_team,
         )
-    except UsageError as e:
+    except wandb.UsageError as e:
         neox_args.update_value("use_wandb", False)
         print(e)
         print(

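A related detail in `megatron/utils.py`: with the import now wrapped in try/except, `from wandb import UsageError` is no longer possible, so the handler in `init_wandb` refers to the exception through the module as `wandb.UsageError`. That reference is only reached when W&B is actually in use, and the argument parsing above now fails early with an explanatory message if `use_wandb` is requested while `wandb` is missing.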
requirements/requirements-dev.txt

+6 -7

@@ -1,8 +1,7 @@
-autopep8==1.5.6
-clang-format==13.0.1
-pre-commit~=2.17.0
-pytest==6.2.3
-pytest-cov==2.11.1
-pytest-forked==1.3.0
+autopep8>=1.5.6
+clang-format>=13.0.1
+pre-commit>=2.17.0
+pytest>=6.2.3
+pytest-cov>=2.11.1
+pytest-forked>=1.3.0
 pytest-xdist
-transformers~=4.16.2

requirements/requirements-onebitadam.txt

+1 -1

@@ -1 +1 @@
-cupy-cuda111==8.6.0
+cupy-cuda111>=8.6.0

requirements/requirements-wandb.txt

+1

@@ -0,0 +1 @@
+wandb>=0.10.28

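Weights & Biases support is now opt-in at install time: users who want monitoring install it explicitly, for example with `pip install -r requirements/requirements-wandb.txt`, and the updated Dockerfile above installs it by default.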
requirements/requirements.txt

+10 -12

@@ -1,16 +1,14 @@
-einops==0.3.0
-ftfy==6.0.1
-git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836
-huggingface_hub==0.11.0
-lm_eval==0.3.0
-mpi4py==3.0.3
-numpy==1.22.0
-pybind11==2.6.2
 deepspeed
+ftfy>=6.0.1
+git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836
+huggingface_hub>=0.11.0
+lm_eval>=0.3.0
+mpi4py>=3.0.3
+numpy>=1.22.0
+pybind11>=2.6.2
 regex
 sentencepiece
 six
-tiktoken==0.1.2
-tokenizers==0.12.1
-transformers~=4.24.0
-wandb==0.10.28
+tiktoken>=0.1.2
+tokenizers>=0.12.1
+transformers>=4.24.0
