
Commit 45c164b

Merge pull request #3410 from OpenNeuroOrg/feat/health-check
feat(scripts): Add a check-github-sync script
2 parents 5f21c97 + 98f2870, commit 45c164b

File tree

.github/workflows/health-check.yml
scripts/check-github-sync

2 files changed: +162 additions, -0 deletions

.github/workflows/health-check.yml

Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
name: Check public datasets

on:
  push:
    branches: ['feat/health-check']
  schedule:
    # 8am PST / 9am PDT Mondays
    - cron: '0 16 * * 1'
  workflow_dispatch:

defaults:
  run:
    shell: bash

env:
  # Disable progress bar
  TTY_COMPATIBLE: 0
  FORCE_COLOR: true

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

permissions:
  contents: read

jobs:
  run:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Install the latest version of uv
        uses: astral-sh/setup-uv@v5
      - run: scripts/check-github-sync
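Because the workflow also declares a workflow_dispatch trigger, the check can be started on demand rather than waiting for the weekly cron run. A minimal sketch, assuming the GitHub CLI (gh) is installed and authenticated against the repository:

# Dispatch the health-check workflow manually; --ref can point at another
# branch such as feat/health-check while the change is still under review.
gh workflow run health-check.yml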

scripts/check-github-sync

Lines changed: 127 additions & 0 deletions
@@ -0,0 +1,127 @@
#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.12"
# dependencies = [
#     "gql[httpx]",
#     "rich",
#     "stamina",
#     "structlog",
# ]
# ///

import subprocess
from functools import cache
from operator import itemgetter
from typing import Iterator

import httpx
import stamina
import structlog
from gql import Client, gql as gql_query
from gql.transport.httpx import HTTPXTransport
from rich.progress import Progress, TextColumn, BarColumn, MofNCompleteColumn

logger = structlog.get_logger()

ENDPOINT = "https://openneuro.org/crn/graphql"
QUERY = gql_query("""
query DatasetsWithLatestSnapshots($count: Int, $after: String) {
  datasets(
    first: $count,
    after: $after,
    orderBy: { created: ascending }
    filterBy: { public: true }
  ) {
    edges {
      node {
        id
        latestSnapshot {
          tag
          created
          hexsha
        }
      }
    }
    pageInfo {
      hasNextPage
      endCursor
      count
    }
  }
}
""")


@cache
def get_client(url: str) -> Client:
    return Client(transport=HTTPXTransport(url=url))


@stamina.retry(on=httpx.HTTPError)
def get_page(url: str, count: int, after: str | None) -> dict:
    return get_client(url).execute(
        QUERY, variable_values={"count": count, "after": after}
    )


def get_dataset_count(url: str) -> int:
    response = get_page(url, 0, None)
    return response["datasets"]["pageInfo"]["count"]


def dataset_iterator(url: str) -> Iterator[tuple[str, str, str, str]]:
    page_info = {"hasNextPage": True, "endCursor": None}

    while page_info["hasNextPage"]:
        result = get_page(url, 100, page_info["endCursor"])

        edges, page_info = itemgetter("edges", "pageInfo")(result["datasets"])

        for edge in edges:
            dataset_id, latest_snapshot = itemgetter("id", "latestSnapshot")(
                edge["node"]
            )
            yield (dataset_id, *itemgetter("tag", "created", "hexsha")(latest_snapshot))


def check_remote(dataset_id: str, tag: str, hexsha: str) -> bool | None:
    log = logger.bind(dataset=dataset_id, tag=tag)
    repo = f"https://github.com/OpenNeuroDatasets/{dataset_id}.git"
    result = subprocess.run(
        ["git", "ls-remote", "--exit-code", repo, tag],
        capture_output=True,
    )
    if result.returncode:
        if "Repository not found" in result.stderr.decode():
            log.error("Missing repository")
            return None
        log.error("Missing latest tag")
        return False

    shasum, ref = result.stdout.decode("utf-8").strip().split()

    if shasum != hexsha:
        log.warning(f"mismatch: {shasum[:7]}({ref[10:]}) != {hexsha[:7]}")
        return False

    return ref == f"refs/tags/{tag}"


if __name__ == "__main__":
    count = get_dataset_count(ENDPOINT)

    retcode = 0

    with Progress(
        TextColumn("[progress.description]{task.description} {task.fields[dataset]:8s}"),
        BarColumn(),
        MofNCompleteColumn(),
    ) as progress:
        task = progress.add_task("Checking", total=count, dataset="...")

        for dataset_id, tag, created, hexsha in dataset_iterator(ENDPOINT):
            progress.update(task, advance=1, dataset=dataset_id)

            retcode |= not check_remote(dataset_id, tag, hexsha)

    raise SystemExit(retcode)
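The uv shebang means the script can also be run directly on a workstation: uv reads the inline metadata block and resolves gql[httpx], rich, stamina, and structlog into an ephemeral environment before execution. A minimal local sketch, assuming uv and git are installed and on PATH:

# Run the sync check against the production GraphQL endpoint.
uv run scripts/check-github-sync

# Exit status is 0 when every public dataset's latest snapshot tag and commit
# match the GitHub mirror, and 1 if any repository or tag is missing or stale.
echo $?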
