"""GitHub Organization Activity Tracker

This module tracks and reports the last activity of members across GitHub organizations.
It implements disk-based caching to minimize API requests and respect rate limits.
"""

import os
import asyncio
import aiohttp
from rich import print
from datetime import datetime, timezone, timedelta
import humanize
from itertools import count
import diskcache
import pathlib
from typing import Optional, List, Dict
import argparse

default_orgs = [
    "binder-examples",
    "binderhub-ci-repos",
    "ipython",
    "jupyter",
    "jupyter-attic",
    "jupyter-book",
    "jupyter-governance",
    "jupyter-incubator",
    "jupyter-resources",
    "jupyter-server",
    "jupyter-standard",
    "jupyter-standards",
    "jupyter-widgets",
    "jupyter-xeus",
    "jupytercon",
    "jupyterhub",
    "jupyterlab",
    "voila-dashboards",
    "voila-gallery",
    "pickleshare",
]

token = os.getenv("GH_TOKEN")
if not token:
    print("[red]Error: GH_TOKEN environment variable not set[/red]")
    exit(1)

headers = {
    "Authorization": f"token {token}",
    "Accept": "application/vnd.github.v3+json",
}


class DateTimeCache(diskcache.Cache):
    """Custom cache class that handles datetime serialization."""

    def __setitem__(self, key, value):
        """Override to serialize datetime objects."""
        if isinstance(value, datetime):
            value = {"__datetime__": value.isoformat()}
        super().__setitem__(key, value)

    def __getitem__(self, key):
        """Override to deserialize datetime objects."""
        value = super().__getitem__(key)
        if isinstance(value, dict) and "__datetime__" in value:
            return datetime.fromisoformat(value["__datetime__"])
        assert not isinstance(value, dict), value
        return value

    def get(self, key, default=None, retry=False):
        """Override to handle datetime deserialization in the get method."""
        try:
            value = super().get(key, default=default, retry=retry)
            if isinstance(value, dict) and "__datetime__" in value:
                return datetime.fromisoformat(value["__datetime__"])
            return value
        except KeyError:
            return default


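# Illustrative round trip for DateTimeCache (not executed; "demo_cache" is a
# hypothetical directory name):
#
#   c = DateTimeCache("demo_cache")
#   c["seen"] = datetime(2024, 1, 1, tzinfo=timezone.utc)  # stored as {"__datetime__": "..."}
#   assert isinstance(c["seen"], datetime)                  # read back as a datetime
#   assert c.get("missing") is None                         # absent keys fall back to default
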
# Configure DiskCache in the current directory
CACHE_DIR = "github_cache"
cache = DateTimeCache(CACHE_DIR)


async def get_org_members(
    session: aiohttp.ClientSession, org: str, debug: bool
) -> List[Dict]:
    """Fetch all members of a GitHub organization with caching.

    Parameters
    ----------
    session : aiohttp.ClientSession
        The HTTP session to use for requests
    org : str
        The name of the GitHub organization
    debug : bool
        If True, print cache hit information

    Returns
    -------
    List[Dict]
        A list of dictionaries containing member information.
        Each dictionary contains at least:
        - 'login': str, the username
        - 'id': int, the user ID
        - 'type': str, usually 'User'

    Notes
    -----
    Results are cached on disk until the cache is cleared (see --clear-cache),
    so repeated runs do not hit the API again for the same organization.
    Pagination is handled automatically (100 items per page).
    """
    cache_key = f"org_members_{org}"

    # Try to get from cache with retry
    cached_data = cache.get(cache_key, retry=True)
    if cached_data is not None:
        if debug:
            print(f"[cyan]Cache hit for {org} members[/cyan]")
        return cached_data

    print(f"[yellow]Cache miss for {org} members - fetching from API[/yellow]")
    members = []

    try:
        for page in count(1):
            url = f"https://api.github.com/orgs/{org}/members?page={page}&per_page=100"
            async with session.get(url, headers=headers) as response:
                if response.status != 200:
                    print(
                        f"[red]Error fetching members for {org}: {response.status}[/red]"
                    )
                    break

                page_members = await response.json()
                if not page_members:
                    break

                members.extend(page_members)

        # Cache the results
        cache[cache_key] = members  # Using __setitem__ instead of set()
        print(f"[green]Cached {len(members)} members for {org}[/green]")
        return members

    except Exception as e:
        print(f"[red]Error fetching members for {org}: {str(e)}[/red]")
        return []


async def get_user_activity(
    session: aiohttp.ClientSession, username: str, debug: bool
) -> Optional[datetime]:
    """Fetch the last public activity date for a GitHub user.

    Returns None if the user has no recent public events. Note that the
    public events endpoint only exposes recent activity (roughly the last
    90 days), so long-inactive users will report no activity at all.
    """
    cache_key = f"user_activity_{username}"

    # Try to get from cache. A sentinel default lets us tell a cached None
    # (user with no public events) apart from a genuine cache miss.
    _missing = object()
    cached_data = cache.get(cache_key, default=_missing)
    if cached_data is not _missing:
        if debug:
            print(f"[cyan]Cache hit for {username} activity[/cyan]")
        assert cached_data is None or isinstance(cached_data, datetime), cached_data
        return cached_data
    if debug:
        print(
            f"[yellow]Cache miss for {username} activity - fetching from API[/yellow]"
        )

    try:
        if debug:
            print(f"[blue]Getting activity for {username}[/blue]")
        url = f"https://api.github.com/users/{username}/events/public"
        async with session.get(url, headers=headers) as response:
            if response.status == 200:
                if debug:
                    print(f"Got activity for {username}")
                events = await response.json()
                if events:
                    last_activity = datetime.fromisoformat(
                        events[0]["created_at"].replace("Z", "+00:00")
                    )
                    # Cache the result (using __setitem__ so the datetime
                    # is serialized by DateTimeCache)
                    cache[cache_key] = last_activity
                    if debug:
                        print(f"[green]Cached activity for {username}[/green]")
                    return last_activity
                else:
                    if debug:
                        print(f"[yellow]No activity found for {username}[/yellow]")
                    cache[cache_key] = None  # Using __setitem__ instead of set()
            else:
                print(
                    f"[red]Error fetching activity for {username}: {response.status}[/red]"
                )
    except Exception as e:
        print(f"[red]Error fetching activity for {username}: {str(e)}[/red]")

    return None


def get_cache_size() -> str:
    """Get the current cache size in a human-readable format."""
    try:
        cache_path = pathlib.Path(CACHE_DIR)
        if cache_path.exists():
            total_size = sum(
                f.stat().st_size for f in cache_path.rglob("*") if f.is_file()
            )
            return f"{total_size / 1024 / 1024:.1f} MB"
    except Exception:
        pass
    return "unknown size"


def clear_cache() -> None:
    """Clear the disk cache."""
    try:
        cache.clear()
        print("[green]Cache cleared successfully[/green]")
    except Exception as e:
        print(f"[red]Error clearing cache: {str(e)}[/red]")


async def main(orgs, debug: bool, timelimit_days: int):
    """Main execution function."""
    # Show cache status
    print(f"[blue]Cache directory: {CACHE_DIR} (size: {get_cache_size()})[/blue]")
    print(f"[blue]Cache contains {len(cache)} items[/blue]")

    async with aiohttp.ClientSession() as session:
        # Check rate limit
        async with session.get(
            "https://api.github.com/rate_limit", headers=headers
        ) as response:
            if response.status == 200:
                rate_data = await response.json()
                remaining = rate_data["resources"]["core"]["remaining"]
                reset_time = datetime.fromtimestamp(
                    rate_data["resources"]["core"]["reset"]
                )
                reset_in = humanize.naturaltime(reset_time)
                print(f"Rate limit remaining: {remaining}")
                print(f"Rate limit resets {reset_in}")
                if remaining < 100:
                    print(
                        f"[yellow]Warning: Low rate limit ({remaining} remaining)[/yellow]"
                    )
                    if remaining < 10:
                        print("[red]Aborting due to very low rate limit[/red]")
                        return

        # Get all members from all orgs
        all_members = {}
        for org in orgs:
            members = await get_org_members(session, org, debug)
            for member in members:
                if member["login"] not in all_members:
                    all_members[member["login"]] = []
                all_members[member["login"]].append(org)

        # Get activity for each user
        tasks = []
        for username in all_members:
            task = get_user_activity(session, username, debug)
            tasks.append((username, task))

        results = await asyncio.gather(*(task for _, task in tasks))

        # Print results sorted by last activity
        user_activities = []
        for (username, _), last_activity in zip(tasks, results):
            if last_activity is not None:
                assert isinstance(last_activity, datetime), last_activity
            user_activities.append((username, last_activity, all_members[username]))
        for org in orgs:
            print(f"[bold]{org}[/bold]")
            n_active = 0
            n_inactive = 0
            for username, last_activity, user_orgs in sorted(
                user_activities,
                key=lambda x: (x[1], x[0])
                if x[1] is not None
                else (datetime.fromtimestamp(0, tz=timezone.utc), x[0]),
                reverse=True,
            ):
                if org not in user_orgs:
                    continue
                if last_activity is not None and last_activity > (
                    datetime.now(timezone.utc) - timedelta(days=timelimit_days)
                ):
                    n_active += 1
                    continue
                n_inactive += 1
                last_activity_ago = (
                    humanize.naturaltime(
                        datetime.now(last_activity.tzinfo) - last_activity
                    )
                    if last_activity
                    else "[red]never[/red]"
                )
                orgs_str = ", ".join(user_orgs)
                print(
                    f"  {username:<20}: Last activity {last_activity_ago} ({orgs_str})"
                )
            print(
                f"  Found [red]{n_inactive} inactive[/red] and "
                f"[green]{n_active} active[/green] users in {org} "
                f"(active = last activity within the last {timelimit_days} days)."
            )


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="GitHub Organization Activity Tracker"
    )
    parser.add_argument(
        "--clear-cache", action="store_true", help="Clear the cache before running"
    )
    parser.add_argument("--debug", action="store_true", help="Show debug information")
    parser.add_argument(
        "--timelimit-days",
        type=int,
        default=365,
        help="Maximum number of days since last activity before an account is marked as inactive. (default: 365)",
    )
    parser.add_argument(
        "--orgs",
        nargs="+",
        default=default_orgs,
        help="GitHub organizations to track (default: all)",
    )
    args = parser.parse_args()

    if args.clear_cache:
        clear_cache()

    asyncio.run(main(args.orgs, args.debug, args.timelimit_days))