Skip to content

Commit

Permalink
Update
Browse files Browse the repository at this point in the history
[ghstack-poisoned]
  • Loading branch information
wconstab committed May 31, 2024
2 parents 44ea24c + f186b2f commit fa10c10
Show file tree
Hide file tree
Showing 10 changed files with 102 additions and 101 deletions.
2 changes: 1 addition & 1 deletion .ci/docker/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
torch >= 2.2.0.dev
torch >= 2.3.0
datasets >= 2.19.0
tomli >= 1.1.0 ; python_version < "3.11"
tensorboard
Expand Down
Original file line number Diff line number Diff line change
@@ -1,14 +1,21 @@
name: 4 GPU Unit Test
name: 4 GPU Integration Test

on:
push:
branches: [ main ]
pull_request:
schedule:
# Runs hourly
- cron: '0 * * * *'

concurrency:
group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
cancel-in-progress: true

defaults:
run:
shell: bash -l -eo pipefail {0}

jobs:
build-test:
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
Expand Down
40 changes: 0 additions & 40 deletions .github/workflows/integration_test_periodic.yaml

This file was deleted.

5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,8 @@ out
wandb

torchtitan/datasets/**/*.model

# temp files
*.log
error.json
_remote_module_non_scriptable.py
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
[![GPU Integration Test](https://github.com/pytorch/torchtitan/actions/workflows/unit_test_4gpu.yaml/badge.svg?branch=main)](https://github.com/pytorch/torchtitan/actions/workflows/unit_test_4gpu.yaml)
[![4 GPU Integration Test](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_4gpu.yaml/badge.svg?branch=main)](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_4gpu.yaml)

# torchtitan

Expand Down
75 changes: 46 additions & 29 deletions test_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,15 @@ class OverrideDefinitions:

override_args: Sequence[Sequence[str]] = tuple(tuple(" "))
test_descr: str = "default"
test_name: str = "default"
requires_seed_checkpoint: bool = False
ngpu: int = 4

def __repr__(self):
return self.test_descr


def build_test_list(args):
def build_test_list():
"""
key is the config file name and value is a list of OverrideDefinitions
that is used to generate variations of integration tests based on the
Expand All @@ -48,149 +49,159 @@ def build_test_list(args):
[
[
"--checkpoint.enable_checkpoint",
f"--job.dump_folder {args.output_dir}/pp_1f1b/",
"--experimental.pipeline_parallel_degree 2",
"--experimental.pipeline_parallel_split_points layers.1",
"--experimental.pipeline_parallel_schedule 1f1b",
"--training.data_parallel_degree 1",
],
],
"PP 1D test 1f1b",
"pp_1f1b",
requires_seed_checkpoint=True,
ngpu=2,
),
OverrideDefinitions(
[
[
"--checkpoint.enable_checkpoint",
f"--job.dump_folder {args.output_dir}/pp_gpipe/",
"--experimental.pipeline_parallel_degree 2",
"--experimental.pipeline_parallel_split_points layers.1",
"--experimental.pipeline_parallel_schedule gpipe",
"--training.data_parallel_degree 1",
],
],
"PP 1D test gpipe",
"pp_gpipe",
requires_seed_checkpoint=True,
ngpu=2,
),
OverrideDefinitions(
[
[
"--checkpoint.enable_checkpoint",
f"--job.dump_folder {args.output_dir}/pp_dp_1f1b/",
"--experimental.pipeline_parallel_degree 2",
"--experimental.pipeline_parallel_split_points layers.1",
"--experimental.pipeline_parallel_schedule 1f1b",
"--training.data_parallel_degree 2",
],
],
"PP+DP 1f1b 2D test",
"pp_dp_1f1b",
requires_seed_checkpoint=True,
),
OverrideDefinitions(
[
[
"--checkpoint.enable_checkpoint",
f"--job.dump_folder {args.output_dir}/pp_dp_gpipe/",
"--experimental.pipeline_parallel_degree 2",
"--experimental.pipeline_parallel_split_points layers.1",
"--experimental.pipeline_parallel_schedule gpipe",
"--training.data_parallel_degree 2",
],
],
"PP+DP gpipe 2D test",
"pp_dp_gpipe",
requires_seed_checkpoint=True,
),
OverrideDefinitions(
[
[
"--checkpoint.enable_checkpoint",
f"--job.dump_folder {args.output_dir}/pp_tp/",
"--experimental.pipeline_parallel_degree 2",
"--experimental.pipeline_parallel_split_points layers.1",
"--training.tensor_parallel_degree 2",
"--model.norm_type rmsnorm", # fused_rmsnorm not yet compatible with TP
],
],
"PP+TP 2D test",
"pp_tp",
requires_seed_checkpoint=True,
),
OverrideDefinitions(
[
[
f"--job.dump_folder {args.output_dir}/default/",
"--checkpoint.enable_checkpoint",
"--experimental.pipeline_parallel_degree 2",
"--experimental.pipeline_parallel_split_points layers.1",
"--model.norm_type rmsnorm", # fused_rmsnorm not yet compatible with tracer
],
],
"Default",
"PP tracer frontend test",
"pp_tracer",
requires_seed_checkpoint=True,
),
OverrideDefinitions(
[
[],
],
"default",
"default",
),
OverrideDefinitions(
[
[
"--training.compile --model.norm_type=rmsnorm",
f"--job.dump_folder {args.output_dir}/1d_compile/",
],
],
"1D compile",
"1d_compile",
),
OverrideDefinitions(
[
[
"--training.compile --training.tensor_parallel_degree 2 --model.norm_type=rmsnorm",
f"--job.dump_folder {args.output_dir}/2d_compile/",
],
],
"2D compile",
"2d_compile",
),
OverrideDefinitions(
[
[
"--training.tensor_parallel_degree 2 --model.norm_type=rmsnorm",
f"--job.dump_folder {args.output_dir}/eager_2d/",
],
],
"Eager mode 2DParallel",
"eager_2d",
),
OverrideDefinitions(
[
[
"--checkpoint.enable_checkpoint",
f"--job.dump_folder {args.output_dir}/full_checkpoint/",
],
[
"--checkpoint.enable_checkpoint",
f"--job.dump_folder {args.output_dir}/full_checkpoint/",
"--training.steps 20",
],
],
"Checkpoint Integration Test - Save Load Full Checkpoint",
"full_checkpoint",
),
OverrideDefinitions(
[
[
"--checkpoint.enable_checkpoint",
f"--job.dump_folder {args.output_dir}/model_weights_only_fp32/",
"--checkpoint.model_weights_only",
],
],
"Checkpoint Integration Test - Save Model Weights Only fp32",
"model_weights_only_fp32",
),
OverrideDefinitions(
[
[
"--checkpoint.enable_checkpoint",
f"--job.dump_folder {args.output_dir}/model_weights_only_bf16/",
"--checkpoint.model_weights_only",
"--checkpoint.export_dtype bfloat16",
],
],
"Checkpoint Integration Test - Save Model Weights Only bf16",
"model_weights_only_bf16",
),
OverrideDefinitions(
[
[
"--checkpoint.enable_checkpoint",
f"--job.dump_folder {args.output_dir}/pp_dp_tp/",
"--experimental.pipeline_parallel_degree 2",
"--experimental.pipeline_parallel_split_points layers.1",
"--training.data_parallel_degree 2",
Expand All @@ -199,6 +210,7 @@ def build_test_list(args):
],
],
"PP+DP+TP 3D test",
"pp_dp_tp",
requires_seed_checkpoint=True,
ngpu=8,
),
Expand All @@ -216,26 +228,22 @@ def _run_cmd(cmd):
)


def run_test(test_flavor: OverrideDefinitions, full_path: str):
def run_test(test_flavor: OverrideDefinitions, full_path: str, output_dir: str):
# run_test supports sequence of tests.
for override_arg in test_flavor.override_args:
test_name = test_flavor.test_name
dump_folder_arg = f"--job.dump_folder {output_dir}/{test_name}"

all_ranks = ",".join(map(str, range(test_flavor.ngpu)))
cmd = f"CONFIG_FILE={full_path} NGPU={test_flavor.ngpu} LOG_RANK={all_ranks} ./run_llama_train.sh"
cmd += " " + dump_folder_arg
if override_arg:
cmd += " " + " ".join(override_arg)
logger.info(
f"=====Integration test, flavor : {test_flavor.test_descr}, command : {cmd}====="
)

if test_flavor.requires_seed_checkpoint:
dump_folder_arg = None
for arg in override_arg:
if "--job.dump_folder" in arg:
dump_folder_arg = arg
assert (
dump_folder_arg is not None
), "Can't use seed checkpoint if folder is not specified"
logger.info("Creating seed checkpoint")
result = _run_cmd(
f"CONFIG_FILE={full_path} ./create_seed_checkpoint.sh {dump_folder_arg}"
Expand All @@ -251,7 +259,7 @@ def run_test(test_flavor: OverrideDefinitions, full_path: str):


def run_tests(args):
integration_tests_flavors = build_test_list(args)
integration_tests_flavors = build_test_list()
for config_file in os.listdir(args.config_dir):
if config_file.endswith(".toml"):
full_path = os.path.join(args.config_dir, config_file)
Expand All @@ -262,16 +270,25 @@ def run_tests(args):
)
if is_integration_test:
for test_flavor in integration_tests_flavors[config_file]:
if (args.ngpu == 8 and test_flavor.ngpu == 8) or (
args.ngpu == 4 and test_flavor.ngpu <= 4
):
run_test(test_flavor, full_path)
if args.test == "all" or test_flavor.test_name == args.test:
if args.ngpu < test_flavor.ngpu:
logger.info(
f"Skipping test {test_flavor.test_name} that requires {test_flavor.ngpu} gpus,"
f" because --ngpu arg is {args.ngpu}"
)
else:
run_test(test_flavor, full_path, args.output_dir)


def main():
parser = argparse.ArgumentParser()
parser.add_argument("output_dir")
parser.add_argument("--config_dir", default="./train_configs")
parser.add_argument(
"--test",
default="all",
help="test to run, acceptable values: `test_name` in `build_test_list` (default: all)",
)
parser.add_argument("--ngpu", default=4, type=int)
args = parser.parse_args()

Expand Down
6 changes: 6 additions & 0 deletions torchtitan/config_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,12 @@ def __init__(self):
default="tb",
help="Folder to dump TensorBoard states",
)
self.parser.add_argument(
"--metrics.rank_0_only",
default=True,
action="store_true",
help="Whether to save TensorBoard metrics only for rank 0 or for all ranks",
)

# model configs
self.parser.add_argument(
Expand Down
15 changes: 10 additions & 5 deletions torchtitan/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,16 +113,21 @@ def close(self):

def build_metric_logger(config: JobConfig, tag: Optional[str] = None):
dump_dir = config.job.dump_folder
save_tb_folder = config.metrics.save_tb_folder
# since we don't have run id yet, use current minute as identifier
tb_config = config.metrics
save_tb_folder = tb_config.save_tb_folder
# since we don't have run id, use current minute as the identifier
datetime_str = datetime.now().strftime("%Y%m%d-%H%M")
log_dir = os.path.join(dump_dir, save_tb_folder, datetime_str)

enable_tb = config.metrics.enable_tensorboard
enable_tb = tb_config.enable_tensorboard
if enable_tb:
logger.info(
f"Metrics logging active. Tensorboard logs will be saved at {log_dir}"
)
if tb_config.rank_0_only:
enable_tb = torch.distributed.get_rank() == 0
else:
rank_str = f"rank_{torch.distributed.get_rank()}"
log_dir = os.path.join(log_dir, rank_str)

rank_str = f"rank_{torch.distributed.get_rank()}"
return MetricLogger(os.path.join(log_dir, rank_str), tag, enable_tb)
return MetricLogger(log_dir, tag, enable_tb)
Loading

0 comments on commit fa10c10

Please sign in to comment.