Skip to content

Commit b6a88bd

Browse files
authored
Merge branch 'dev' into dev
2 parents 9b3f18b + 46d1f47 commit b6a88bd

File tree

344 files changed

+32330
-8995
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

344 files changed

+32330
-8995
lines changed

.github/oncall_schedule.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
[]
2+

.github/scripts/oncall_manager.py

Lines changed: 279 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,279 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import os
16+
import sys
17+
import json
18+
import requests
19+
import argparse
20+
from datetime import datetime, timedelta, timezone
21+
22+
# Constants
23+
# Base URL of the GitHub REST API; the request URLs in this script are built on top of it.
GITHUB_API_URL = "https://api.github.com"
24+
# Path of the JSON schedule file; it is opened as given, so it resolves against the working directory.
SCHEDULE_FILE = ".github/oncall_schedule.json"
25+
# Slug of the team whose members are cycled through when schedule entries are appended.
ROTATION_TEAM_SLUG = "mcore-oncall-rotation"
26+
# Slug of the team that is updated to contain only the current oncall user.
ACTIVE_ONCALL_TEAM_SLUG = "megatron-oncall"
27+
# Number of entries the schedule is filled up to (each entry starts 7 days after the previous one).
TARGET_WEEKS = 12
28+
29+
def get_headers():
    """Build the HTTP headers used for authenticated GitHub API calls.

    The token comes from the GH_TOKEN environment variable, with GITHUB_TOKEN
    as the fallback when GH_TOKEN is unset or empty. When neither provides a
    token, an error is printed and the process exits with status 1.

    Returns:
        dict with the "Authorization" and "Accept" headers.
    """
    # `or` treats an unset and an empty GH_TOKEN the same way, so both fall
    # through to GITHUB_TOKEN.
    token = os.environ.get("GH_TOKEN") or os.environ.get("GITHUB_TOKEN")
    if token:
        return {
            "Authorization": f"token {token}",
            "Accept": "application/vnd.github.v3+json",
        }

    print("Error: GH_TOKEN or GITHUB_TOKEN not set")
    sys.exit(1)
43+
44+
def get_repo_info():
    """Returns (owner, repo) from GITHUB_REPOSITORY env var.

    The variable must have the form "owner/repo". When it is unset, empty, or
    lacks a non-empty owner or repo part, an error is printed and the process
    exits with status 1.

    Returns:
        Tuple (owner, repo) of strings.
    """
    repo_env = os.environ.get("GITHUB_REPOSITORY")
    if not repo_env:
        print("Error: GITHUB_REPOSITORY environment variable not set")
        sys.exit(1)

    parts = repo_env.split("/")
    # Without this check a value with no "/" would fail with a bare IndexError.
    if len(parts) < 2 or not parts[0] or not parts[1]:
        print(f"Error: GITHUB_REPOSITORY must look like 'owner/repo', got {repo_env!r}")
        sys.exit(1)
    return parts[0], parts[1]
52+
53+
def get_team_members(org, team_slug):
    """Fetches members of the GitHub team.

    Pages through the team's member list 100 entries at a time and stops at
    the first page that is not full.

    Args:
        org: Organization that owns the team.
        team_slug: Slug of the team to list.

    Returns:
        List of member login names, in the order the API returned them.

    Exits with status 1 when a request cannot be completed or the API answers
    with a status other than 200.
    """
    url = f"{GITHUB_API_URL}/orgs/{org}/teams/{team_slug}/members"
    headers = get_headers()
    per_page = 100

    members = []
    page = 1
    while True:
        try:
            # The timeout keeps a stalled connection from hanging the job forever.
            resp = requests.get(
                url,
                headers=headers,
                params={"per_page": per_page, "page": page},
                timeout=30,
            )
        except requests.RequestException as err:
            print(f"Error fetching team members: {err}")
            sys.exit(1)

        if resp.status_code != 200:
            print(f"Error fetching team members: {resp.status_code} {resp.text}")
            sys.exit(1)

        data = resp.json()
        members.extend(m['login'] for m in data)

        # A short (or empty) page means there is nothing left to fetch.
        if len(data) < per_page:
            break
        page += 1

    return members
76+
77+
def load_schedule():
    """Load the oncall schedule from SCHEDULE_FILE.

    Plain-string entries are normalized to {"user": <string>, "date":
    "YYYY-MM-DD"}; the placeholder date is deliberately unparseable. All other
    entries are passed through unchanged.

    Returns:
        List of schedule entries. An empty list is returned when the file is
        missing, is not valid JSON, or does not contain a JSON list; the
        latter two cases print a warning so the problem is not silent.
    """
    try:
        with open(SCHEDULE_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        return []
    except json.JSONDecodeError as err:
        print(f"Warning: {SCHEDULE_FILE} is not valid JSON ({err}); treating schedule as empty.")
        return []

    if not isinstance(data, list):
        # Iterating a dict or scalar here would yield bogus entries or crash.
        print(f"Warning: {SCHEDULE_FILE} does not contain a JSON list; treating schedule as empty.")
        return []

    # Normalize to list of dicts if it's a list of strings
    return [
        {"user": item, "date": "YYYY-MM-DD"} if isinstance(item, str) else item
        for item in data
    ]
93+
94+
def save_schedule(schedule):
    """Write `schedule` to SCHEDULE_FILE as 4-space-indented JSON.

    The file is overwritten and ends with a single newline.
    """
    with open(SCHEDULE_FILE, 'w') as out:
        out.write(json.dumps(schedule, indent=4))
        out.write('\n')  # trailing newline
98+
99+
def update_active_oncall_team(org, new_oncall):
    """Updates the active oncall team to contain only the new oncall user.

    The new oncall is added first (when not already a member) and only then
    are the other members removed. If the add fails, nothing is removed, so
    the team is never left empty.

    Args:
        org: Organization that owns ACTIVE_ONCALL_TEAM_SLUG.
        new_oncall: Login of the user who should become the only member.
    """
    headers = get_headers()
    memberships_url = f"{GITHUB_API_URL}/orgs/{org}/teams/{ACTIVE_ONCALL_TEAM_SLUG}/memberships"

    # 1. Get current members of the active team
    current_members = get_team_members(org, ACTIVE_ONCALL_TEAM_SLUG)

    # 2. Add the new oncall if not present
    if new_oncall not in current_members:
        resp = requests.put(f"{memberships_url}/{new_oncall}", headers=headers, timeout=30)
        if resp.status_code == 200:
            print(f"Added {new_oncall} to {ACTIVE_ONCALL_TEAM_SLUG}")
        else:
            print(f"Failed to add {new_oncall} to {ACTIVE_ONCALL_TEAM_SLUG}: {resp.status_code} {resp.text}")
            # Removing the previous members now would leave nobody on call.
            print(f"Keeping existing members of {ACTIVE_ONCALL_TEAM_SLUG} because the new oncall could not be added.")
            return

    # 3. Remove everyone else
    for member in current_members:
        if member == new_oncall:
            continue
        resp = requests.delete(f"{memberships_url}/{member}", headers=headers, timeout=30)
        if resp.status_code == 204:
            print(f"Removed {member} from {ACTIVE_ONCALL_TEAM_SLUG}")
        else:
            print(f"Failed to remove {member} from {ACTIVE_ONCALL_TEAM_SLUG}: {resp.status_code} {resp.text}")
122+
123+
def rotate_schedule(repo_owner, dry_run=False):
    """Drop the finished shift, refill the schedule, and sync the active oncall team.

    Steps:
      1. If the first entry's shift (its 'date' plus 7 days) is over as of
         today in UTC, remove that entry. A first entry whose date is missing
         or cannot be parsed is removed as well.
      2. Top the schedule back up with ensure_schedule_filled().
      3. Make the first entry's user the only member of the active oncall
         team, then save the schedule.

    Args:
        repo_owner: GitHub organization that owns the teams.
        dry_run: When True, the team and the schedule file are left untouched
            and the resulting schedule is printed instead. The rotation team
            is still read in order to compute that schedule.
    """
    schedule = load_schedule()
    print(f"Current schedule length: {len(schedule)}")

    # 1. Rotate (Remove past week)
    if schedule:
        first_entry = schedule[0]
        try:
            # The date is taken to be the *start* of the oncall shift (Wednesday).
            start_date = datetime.strptime(first_entry['date'], "%Y-%m-%d").date()
        except (KeyError, TypeError, ValueError):
            # Missing, non-string or malformed date: rotate anyway.
            removed = schedule.pop(0)
            print(f"Rotated out (invalid date): {removed}")
        else:
            # The shift ends 7 days after it started.
            end_date = start_date + timedelta(days=7)
            today = datetime.now(timezone.utc).date()

            # If today is >= end_date, the shift is over.
            # (e.g. Started last Wed, ends today Wed. If today is Wed, we rotate)
            if today >= end_date:
                removed = schedule.pop(0)
                print(f"Rotated out: {removed} (Ended {end_date})")
            else:
                print(f"First entry {first_entry} has not ended yet (Ends {end_date}). Not removing.")
    else:
        print("Schedule empty, nothing to rotate.")

    # 2. Replenish
    ensure_schedule_filled(schedule, repo_owner)

    # 3. Update active oncall team
    if schedule:
        current_oncall = schedule[0]['user']
        print(f"New active oncall: {current_oncall}")
        if not dry_run:
            update_active_oncall_team(repo_owner, current_oncall)
        else:
            print(f"Dry run: Would update {ACTIVE_ONCALL_TEAM_SLUG} to contain only {current_oncall}")

    if not dry_run:
        save_schedule(schedule)
        print("Schedule updated and saved.")
    else:
        print("Dry run: Schedule not saved.")
        print(json.dumps(schedule, indent=4))
172+
173+
def get_last_wednesday(today=None):
    """Return the most recent Wednesday on or before `today`.

    Args:
        today: A datetime.date to measure from. Defaults to the current date
            in UTC, which is what callers passing no argument get.

    Returns:
        A datetime.date that falls on a Wednesday; `today` itself when
        `today` is a Wednesday.
    """
    if today is None:
        today = datetime.now(timezone.utc).date()
    # Monday=0, Wednesday=2; the modulo maps Mon/Tue back into the previous week.
    offset = (today.weekday() - 2) % 7
    return today - timedelta(days=offset)
178+
179+
def ensure_schedule_filled(schedule, repo_owner):
    """Appends users to schedule until it reaches TARGET_WEEKS.

    `schedule` is modified in place. Each new entry starts 7 days after the
    previous one and is given to the member that follows the previous entry's
    user in the alphabetically sorted rotation team, wrapping around at the
    end. Nothing is appended when the rotation team has no members.

    Args:
        schedule: List of {"user": ..., "date": "YYYY-MM-DD"} entries to extend.
        repo_owner: GitHub organization that owns ROTATION_TEAM_SLUG.
    """
    members = get_team_members(repo_owner, ROTATION_TEAM_SLUG)
    if not members:
        print(f"Warning: No team members found in {ROTATION_TEAM_SLUG}.")
        return

    members.sort()  # Deterministic order

    while len(schedule) < TARGET_WEEKS:
        if not schedule:
            # Empty list: start at the most recent Wednesday with the first
            # member alphabetically.
            next_date = get_last_wednesday()
            next_user = members[0]
        else:
            last_entry = schedule[-1]
            last_user = last_entry['user']

            # Parse last date and add 7 days
            try:
                last_date = datetime.strptime(last_entry['date'], "%Y-%m-%d").date()
                next_date = last_date + timedelta(days=7)
            except (KeyError, TypeError, ValueError):
                # Date missing, not a string, or a placeholder: derive the
                # date from the entry's position instead.
                next_date = get_last_wednesday() + timedelta(days=7 * len(schedule))

            try:
                # Continue the rotation right after the last scheduled user.
                next_user = members[(members.index(last_user) + 1) % len(members)]
            except ValueError:
                # Last user not in team, just pick first member
                next_user = members[0]

        new_entry = {"user": next_user, "date": next_date.strftime("%Y-%m-%d")}
        schedule.append(new_entry)
        print(f"Appended: {new_entry}")
223+
224+
def assign_reviewer(pr_number):
    """Assigns the current oncall as the reviewer for the PR.

    The current oncall is the 'user' of the first schedule entry. The review
    is requested in the repository named by GITHUB_REPOSITORY.

    Args:
        pr_number: Number of the pull request to request a review on.

    Exits with status 1 when the schedule is empty, the request cannot be
    completed, or GitHub answers with a status other than 200/201.
    """
    schedule = load_schedule()
    if not schedule:
        print("Error: Schedule is empty. Cannot assign reviewer.")
        sys.exit(1)

    current_entry = schedule[0]
    current_oncall = current_entry['user']
    # The date is only informational here, so a missing key must not abort the assignment.
    print(f"Current oncall: {current_oncall} (Since {current_entry.get('date', 'unknown')})")

    owner, repo = get_repo_info()
    url = f"{GITHUB_API_URL}/repos/{owner}/{repo}/pulls/{pr_number}/requested_reviewers"

    # We can assign the user directly
    data = {"reviewers": [current_oncall]}
    try:
        # The timeout keeps a stalled connection from hanging the job forever.
        resp = requests.post(url, headers=get_headers(), json=data, timeout=30)
    except requests.RequestException as err:
        print(f"Failed to request review: {err}")
        sys.exit(1)

    if resp.status_code in (200, 201):
        print(f"Successfully requested review from {current_oncall}")
    else:
        print(f"Failed to request review: {resp.status_code} {resp.text}")
        sys.exit(1)
247+
248+
def main():
    """Command-line entry point: parse the sub-command and run it.

    Sub-commands:
      rotate [--dry-run]  rotate the schedule via rotate_schedule()
      fill                fill and save the schedule without rotating
      assign --pr N       request a review from the current oncall on PR N
    """
    parser = argparse.ArgumentParser(description="Manage Oncall Schedule")
    commands = parser.add_subparsers(dest="command", required=True)

    # rotate
    rotate_cmd = commands.add_parser("rotate", help="Rotate the schedule (remove first, append new)")
    rotate_cmd.add_argument("--dry-run", action="store_true", help="Do not save changes")

    # fill (just fill up to 12 without rotating - useful for init)
    commands.add_parser("fill", help="Fill the schedule to 12 weeks without rotating")

    # assign
    assign_cmd = commands.add_parser("assign", help="Assign current oncall to PR")
    assign_cmd.add_argument("--pr", type=int, required=True, help="PR number")

    args = parser.parse_args()

    # Resolved for every sub-command, so a missing GITHUB_REPOSITORY fails early.
    owner, _ = get_repo_info()

    if args.command == "assign":
        assign_reviewer(args.pr)
        return

    if args.command == "rotate":
        rotate_schedule(owner, dry_run=args.dry_run)
        return

    if args.command == "fill":
        schedule = load_schedule()
        ensure_schedule_filled(schedule, owner)
        save_schedule(schedule)
        print("Schedule filled and saved.")
276+
277+
# Run the command-line interface only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
278+
main()
279+

.github/workflows/_build_test_publish_wheel.yml

Lines changed: 29 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,21 @@
11
on:
22
workflow_call:
3+
inputs:
4+
ref:
5+
required: false
6+
description: Ref (SHA or branch) to release
7+
type: string
8+
default: ${{ github.sha }}
9+
dry-run:
10+
required: false
11+
description: Upload to PyPI Test instance
12+
type: boolean
13+
default: true
14+
no-publish:
15+
required: false
16+
description: Do not publish the wheel
17+
type: boolean
18+
default: true
319
secrets:
420
TWINE_USERNAME:
521
required: true
@@ -26,17 +42,18 @@ jobs:
2642
PACKAGE: ${{ matrix.PACKAGE }}
2743
IMAGE: ${{ matrix.IMAGE }}
2844
PLATFORM: ${{ matrix.PLATFORM }}
45+
PUBLISH_DRYRUN: ${{ inputs.dry-run }}
2946
steps:
3047
- name: Checkout repository
3148
uses: actions/checkout@v4
49+
with:
50+
ref: ${{ inputs.ref }}
3251

3352
- name: Build wheel
3453
id: build-wheel
3554
run: |
3655
set -x
3756
38-
PUBLISH_DRYRUN=yes
39-
4057
if [ "$PACKAGE" = "megatron-core" ]; then
4158
ROOTDIR="megatron/core"
4259
BUILD_DIR="."
@@ -48,7 +65,7 @@ jobs:
4865
exit 1
4966
fi
5067
51-
if [ "$PUBLISH_DRYRUN" = "yes" ]; then
68+
if [ "$PUBLISH_DRYRUN" = "true" ]; then
5269
PRE_RELEASE=$(sed -n "s/.*PRE_RELEASE = '\(.*\)'/\1/p" $ROOTDIR/package_info.py)
5370
sed -i "/^PRE_RELEASE/c\PRE_RELEASE = '${PRE_RELEASE}.dev$((RANDOM % 900000 + 100000))'" $ROOTDIR/package_info.py
5471
fi
@@ -123,26 +140,31 @@ jobs:
123140
- name: Upload wheels
124141
uses: actions/upload-artifact@v4
125142
with:
126-
name: wheels-${{ matrix.PACKAGE }}-${{ matrix.PLATFORM }}
143+
name: wheels-${{ matrix.PACKAGE }}-${{ matrix.PLATFORM }}-${{ inputs.dry-run && 'dry-run' || 'release' }}
127144
path: dist/
128145

129146
publish-wheels:
130147
needs: [build-and-test-wheels]
131148
runs-on: ubuntu-latest
132-
if: github.ref == 'refs/heads/main'
149+
if: inputs.no-publish == false
133150
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) && 'main' || 'public' }}
134151
strategy:
135152
fail-fast: false
136153
matrix:
137154
include:
138-
- PACKAGE: megatron_core
139-
- PACKAGE: megatron_fsdp
155+
- PACKAGE: megatron-core
156+
PLATFORM: arm64
157+
- PACKAGE: megatron-core
158+
PLATFORM: amd64
159+
- PACKAGE: megatron-fsdp
160+
IMAGE: quay.io/pypa/manylinux_2_28_x86_64
140161
env:
141162
PACKAGE: ${{ matrix.PACKAGE }}
142163
steps:
143164
- name: Download wheels
144165
uses: actions/download-artifact@v4
145166
with:
167+
name: wheels-${{ matrix.PACKAGE }}-${{ matrix.PLATFORM }}-${{ inputs.dry-run && 'dry-run' || 'release' }}
146168
path: dist/
147169
merge-multiple: true
148170

0 commit comments

Comments
 (0)