Skip to content

Commit 9f5f7fd

Browse files
authored
gpu github runner (#19)
1 parent f6aaa58 commit 9f5f7fd

File tree

10 files changed

+114
-12
lines changed

10 files changed

+114
-12
lines changed

.github/workflows/ci.yaml

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,3 +49,45 @@ jobs:
4949
if: always()
5050
with:
5151
options: "--check --diff --color --verbose --line-length=110"
52+
53+
run-unittest:
  # Runs the Primus unit tests on the `run-ut` runner after the code-lint job.
  needs: code-lint
  runs-on: run-ut
  steps:
    - run: echo "🎉 Begin Primus Unit Test."
    - name: Change owner
      # Done before checkout: files left behind by earlier runs may be owned by
      # root, which would prevent the checkout step from cleaning the workspace.
      run: |
        echo "change the owner of all primus files, as some files are generated by root"
        echo "GITHUB_WORKSPACE: ${GITHUB_WORKSPACE}"
        sudo chown -R "$(id -u):$(id -g)" "${GITHUB_WORKSPACE}"
    - name: Checkout
      uses: actions/checkout@v4
      with:
        submodules: 'recursive'
    - name: Print dir
      run: |
        echo "GITHUB_WORKSPACE: ${GITHUB_WORKSPACE}"
        echo "current dir: $(pwd)"
    - name: Set ut log path
      # Context values are handed to the script through environment variables
      # instead of being interpolated into the shell source, so a crafted ref
      # or tag name cannot inject commands.
      env:
        EVENT_NAME: ${{ github.event_name }}
        REF: ${{ github.ref }}
        PR_NUMBER: ${{ github.event.pull_request.number }}
      run: |
        if [[ "$EVENT_NAME" == "pull_request" ]]; then
          echo "UT_LOG_PATH=$GITHUB_WORKSPACE/ut_out/pr-$PR_NUMBER" >> "$GITHUB_ENV"
        elif [[ "$EVENT_NAME" == "push" && "$REF" == "refs/heads/main" ]]; then
          echo "UT_LOG_PATH=$GITHUB_WORKSPACE/ut_out/latest" >> "$GITHUB_ENV"
        elif [[ "$EVENT_NAME" == "release" ]]; then
          # refs/tags/v1.0.0 -> v1.0.0
          TAG="${REF#refs/tags/}"
          echo "UT_LOG_PATH=$GITHUB_WORKSPACE/ut_out/$TAG" >> "$GITHUB_ENV"
        else
          echo "UT_LOG_PATH=$GITHUB_WORKSPACE/ut_out/others" >> "$GITHUB_ENV"
        fi
    - name: Run unit tests
      # UT_LOG_PATH was exported via GITHUB_ENV by the previous step, so it is
      # available here as a regular environment variable.
      run: |
        echo "set UT_LOG_PATH as: ${UT_LOG_PATH}"
        if [ -d "${UT_LOG_PATH}" ]; then
          echo "UT_LOG_PATH dir exists. Deleting..."
          rm -rf "${UT_LOG_PATH}"
        fi
        docker exec -e UT_LOG_PATH="${UT_LOG_PATH}" \
          primus_github_runner bash -c \
          "cd \"$GITHUB_WORKSPACE\" && python ./tests/run_unit_tests.py"

.gitmodules

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
[submodule "third_party/Megatron-LM-20250324"]
2+
path = third_party/Megatron-LM-20250324
3+
url = https://github.com/NVIDIA/Megatron-LM.git

README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,10 @@ Use the following command to clone the repo:
3131
- [ ] Set Megatron-LM as a submodule repo
3232
```bash
3333
mkdir workspace && cd workspace
34-
git clone [email protected]:AMD-AIG-AIMA/Primus.git
35-
git clone [email protected]:NVIDIA/Megatron-LM.git
36-
# version 20250324
37-
cd Megatron-LM && git checkout d61821b7174bac690afbad9134bcb4983521052f
34+
# If you are cloning the repository for the first time:
35+
git clone --recurse-submodules [email protected]:AMD-AIG-AIMA/Primus.git
36+
# If you've already cloned Primus without submodules, run the following command:
37+
git submodule update --init --recursive
3838
```
3939

4040
## Setup Primus

examples/megatron/run_pretrain.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,9 @@ export MODEL_CONFIG=${MODEL_CONFIG:-deepseek_v2_lite}
1212
# framework path
1313
PRIMUS_PATH=$(realpath "$(dirname "$0")/../..")
1414
export PRIMUS_PATH
15-
export MEGATRON_PATH=${MEGATRON_PATH:-${PRIMUS_PATH}/../Megatron-LM}
16-
[[ ! -d "${MEGATRON_PATH}" ]] && {
17-
echo "Error: MEGATRON_PATH (${MEGATRON_PATH}) does not exist"
15+
export MEGATRON_PATH=${MEGATRON_PATH:-${PRIMUS_PATH}/third_party/Megatron-LM-20250324}
16+
[[ ! -d "${MEGATRON_PATH}" || -z "$(ls -A "${MEGATRON_PATH}")" ]] && {
17+
echo "Error: MEGATRON_PATH (${MEGATRON_PATH}) does not exist or is empty"
1818
exit 1
1919
}
2020

tests/configs/test_config.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,9 @@ def parse_config(self, cli_args: argparse.Namespace):
2828
exp_config = self.config_parser.parse(cli_args)
2929
return exp_config
3030

31-
def test_exp_sft(self):
31+
def test_exp_configs(self):
3232
exps = [
33-
"examples/deepseek_v3/exp_pretrain.yaml",
33+
"examples/megatron/exp_pretrain.yaml",
3434
]
3535

3636
for exp in exps:

tests/run_unit_tests.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
def get_all_unit_tests():
1818
global DISTRIBUTED_UNIT_TESTS
1919

20-
cur_dir = "./"
20+
cur_dir = "./tests"
2121
unit_tests = {}
2222

2323
for root, dirs, files in os.walk(cur_dir):
@@ -37,7 +37,7 @@ def launch_unit_test(ut_path, nproc_per_node):
3737
global UNIT_TEST_PASS
3838

3939
if nproc_per_node == 1:
40-
cmd = f"pytest {ut_path}"
40+
cmd = f"pytest {ut_path} -s"
4141
else:
4242
cmd = f"torchrun --nnodes 1 --nproc-per-node {nproc_per_node} {ut_path}"
4343

tests/trainer/__init__.py

Whitespace-only changes.
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
###############################################################################
2+
# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
3+
#
4+
# See LICENSE for license information.
5+
#################################################################################
6+
7+
8+
import os
9+
import subprocess
10+
import sys
11+
import time
12+
import unittest
13+
14+
from primus.core.utils import logger
15+
from tests.utils import PrimusUT
16+
17+
18+
class TestMegatronTrainer(PrimusUT):
    """Smoke test that launches the Megatron pretrain example script."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def setUp(self):
        pass

    def tearDown(self):
        pass

    def test_pretrain(self):
        """Run ``examples/megatron/run_pretrain.sh`` and fail on a non-zero exit.

        The script path is relative, so the test must be started from the
        repository root. On failure the captured stderr is stored in the
        ``SCRIPT_ERROR`` environment variable and included in the failure
        message.
        """
        shell_entry = "examples/megatron/run_pretrain.sh"
        # Flip to True to stream the script's output live instead of capturing it.
        do_print_at_runtime = False
        run_stdout = subprocess.PIPE if not do_print_at_runtime else sys.stdout
        run_stderr = subprocess.PIPE if not do_print_at_runtime else sys.stderr

        logger.info(f"Begin run {shell_entry}...")
        start = time.time()
        try:
            result = subprocess.run(
                ["bash", f"{shell_entry}"],
                check=True,
                stdout=run_stdout,
                stderr=run_stderr,
                text=True,
            )
        except subprocess.CalledProcessError as e:
            # e.stderr is None when stderr was streamed rather than captured;
            # fall back to an empty string so the real failure is reported
            # instead of an AttributeError.
            script_error = (e.stderr or "").strip()
            os.environ["SCRIPT_ERROR"] = script_error
            # self.fail() raises AssertionError like the former `assert False`,
            # but is not stripped when Python runs with -O.
            self.fail(f"Shell script failed: {script_error}")

        logger.info(f"End run {shell_entry}, time={time.time()-start:.3f} s")
        if not do_print_at_runtime:
            ut_log_path = os.environ.get("UT_LOG_PATH", "ut_out")
            logger.info(f"Training log path: {ut_log_path}/logs/UT-{self.__class__.__name__}")

        logger.debug(f"Standard Output:\n {result.stdout}")
        logger.debug(f"Standard Error:\n {result.stderr}")
53+
54+
55+
# Allow running this test file directly with the unittest runner.
if __name__ == "__main__":
56+
unittest.main(buffer=False)

tests/utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ def __init__(self, *args, **kwargs):
1717

1818
@classmethod
1919
def setUpClass(cls):
20-
ut_log_path = os.environ.get("UT_LOG_PATH", "output/ut")
20+
ut_log_path = os.environ.get("UT_LOG_PATH", "ut_out")
2121
logger_cfg = logger.LoggerConfig(
2222
exp_root_path=ut_log_path,
2323
work_group="develop",

third_party/Megatron-LM-20250324

Submodule Megatron-LM-20250324 added at d61821b

0 commit comments

Comments
 (0)