From ca546dd59eb8eb3ce6de1f2af7bec8da59511ed5 Mon Sep 17 00:00:00 2001 From: M Bussonnier Date: Thu, 5 Dec 2024 17:23:02 +0100 Subject: [PATCH 01/13] Last activity --- tools/last_user_activity.py | 141 ++++++++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100644 tools/last_user_activity.py diff --git a/tools/last_user_activity.py b/tools/last_user_activity.py new file mode 100644 index 0000000..ac8714f --- /dev/null +++ b/tools/last_user_activity.py @@ -0,0 +1,141 @@ +""" +This tools find all users in multiple organizations and print their last activity date. +""" + +import os +import asyncio +import aiohttp +from rich import print +from datetime import datetime +import humanize +from itertools import count + +orgs = [ + "binder-examples", + "binderhub-ci-repos", + "ipython", + "jupyter", + "jupyter-book", + "jupyter-governance", + "jupyter-incubator", + "jupyter-server", + "jupyter-standards", + "jupyter-widgets", + "jupyterhub", + "jupyterlab", + "jupyter-xeus", + "jupytercon", + "voila-dashboards", + "voila-gallery", +] + +token = os.getenv("GH_TOKEN") +if not token: + print("[red]Error: GH_TOKEN environment variable not set[/red]") + exit(1) + +headers = { + "Authorization": f"token {token}", + "Accept": "application/vnd.github.v3+json", +} + +async def get_org_members(session: aiohttp.ClientSession, org: str) -> list[dict]: + """Get all members for an organization + + Parameters + ---------- + session: aiohttp.ClientSession + The aiohttp client session + org: str + The organization name + + Returns + ------- + list[dict]: The list of members + """ + members = [] + + for page in count(1): + url = f"https://api.github.com/orgs/{org}/members?page={page}&per_page=100" + async with session.get(url, headers=headers) as response: + if response.status != 200: + print(f"[red]Error fetching members for {org}: {response.status}[/red]") + break + + page_members = await response.json() + if not page_members: + break + + members.extend(page_members) + + return members + +async def get_user_activity(session: aiohttp.ClientSession, username: str) -> datetime: + """Get the last activity date for a user + + Parameters + ---------- + session: aiohttp.ClientSession + The aiohttp client session + username: str + The GitHub username + + Returns + ------- + datetime: The last activity date + """ + url = f"https://api.github.com/users/{username}/events/public" + async with session.get(url, headers=headers) as response: + if response.status == 200: + events = await response.json() + if events: + return datetime.fromisoformat(events[0]["created_at"].replace('Z', '+00:00')) + return None + +async def main(): + async with aiohttp.ClientSession() as session: + # Check rate limit + async with session.get("https://api.github.com/rate_limit", headers=headers) as response: + if response.status == 200: + rate_data = await response.json() + remaining = rate_data["resources"]["core"]["remaining"] + reset_time = datetime.fromtimestamp(rate_data["resources"]["core"]["reset"]) + reset_in = humanize.naturaltime(reset_time) + print(f"Rate limit remaining: {remaining}") + print(f"Rate limit resets {reset_in}") + if remaining < 100: + print(f"[yellow]Warning: Low rate limit ({remaining} remaining)[/yellow]") + if remaining < 10: + print("[red]Aborting due to very low rate limit[/red]") + return + + # Get all members from all orgs + all_members = {} + for org in orgs: + members = await get_org_members(session, org) + for member in members: + if member["login"] not in all_members: + all_members[member["login"]] = [] + all_members[member["login"]].append(org) + + # Get activity for each user + tasks = [] + for username in all_members: + task = get_user_activity(session, username) + tasks.append((username, task)) + + results = await asyncio.gather(*(task for _, task in tasks)) + + # Print results sorted by last activity + user_activities = [] + for (username, _), last_activity in zip(tasks, results): + if last_activity: + user_activities.append((username, last_activity, all_members[username])) + + for username, last_activity, user_orgs in sorted(user_activities, key=lambda x: x[1], reverse=True): + last_activity_ago = humanize.naturaltime(datetime.now(last_activity.tzinfo) - last_activity) + orgs_str = ", ".join(user_orgs) + print(f"{username:<20}: Last activity {last_activity_ago} in orgs: {orgs_str}") + +if __name__ == "__main__": + asyncio.run(main()) From ae92aa82aa1fa6c5162fe6e8e622d95a6f9805d4 Mon Sep 17 00:00:00 2001 From: M Bussonnier Date: Thu, 5 Dec 2024 17:38:03 +0100 Subject: [PATCH 02/13] cache --- tools/last_user_activity.py | 61 +++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 30 deletions(-) diff --git a/tools/last_user_activity.py b/tools/last_user_activity.py index ac8714f..2467030 100644 --- a/tools/last_user_activity.py +++ b/tools/last_user_activity.py @@ -9,6 +9,11 @@ from datetime import datetime import humanize from itertools import count +import aiosqlite +import diskcache +import json +import pathlib +from typing import Optional orgs = [ "binder-examples", @@ -39,22 +44,17 @@ "Accept": "application/vnd.github.v3+json", } +# Configure DiskCache in the current directory +CACHE_DIR = "github_cache" +cache = diskcache.Cache(CACHE_DIR) + async def get_org_members(session: aiohttp.ClientSession, org: str) -> list[dict]: - """Get all members for an organization - - Parameters - ---------- - session: aiohttp.ClientSession - The aiohttp client session - org: str - The organization name - - Returns - ------- - list[dict]: The list of members - """ + """Get all members for an organization with persistent caching""" + cache_key = f"org_members_{org}" + if cache_key in cache: + return cache[cache_key] + members = [] - for page in count(1): url = f"https://api.github.com/orgs/{org}/members?page={page}&per_page=100" async with session.get(url, headers=headers) as response: @@ -67,32 +67,33 @@ async def get_org_members(session: aiohttp.ClientSession, org: str) -> list[dict break members.extend(page_members) - + + cache.set(cache_key, members, expire=3600 * 24) # Cache for 24 hours return members -async def get_user_activity(session: aiohttp.ClientSession, username: str) -> datetime: - """Get the last activity date for a user - - Parameters - ---------- - session: aiohttp.ClientSession - The aiohttp client session - username: str - The GitHub username - - Returns - ------- - datetime: The last activity date - """ +async def get_user_activity(session: aiohttp.ClientSession, username: str) -> Optional[datetime]: + """Get the last activity date for a user with persistent caching""" + cache_key = f"user_activity_{username}" + if cache_key in cache: + return cache[cache_key] + url = f"https://api.github.com/users/{username}/events/public" async with session.get(url, headers=headers) as response: if response.status == 200: events = await response.json() if events: - return datetime.fromisoformat(events[0]["created_at"].replace('Z', '+00:00')) + last_activity = datetime.fromisoformat(events[0]["created_at"].replace('Z', '+00:00')) + cache.set(cache_key, last_activity, expire=3600 * 24) # Cache for 24 hours + return last_activity return None async def main(): + # Add cache info at start + if pathlib.Path(CACHE_DIR).exists(): + print(f"[blue]Using cache directory: {CACHE_DIR}[/blue]") + else: + print("[yellow]Creating new cache directory[/yellow]") + async with aiohttp.ClientSession() as session: # Check rate limit async with session.get("https://api.github.com/rate_limit", headers=headers) as response: From c1a0b0d8c9457a28a9129c802d8437b023046474 Mon Sep 17 00:00:00 2001 From: M Bussonnier Date: Thu, 5 Dec 2024 17:40:24 +0100 Subject: [PATCH 03/13] docs --- tools/last_user_activity.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/last_user_activity.py b/tools/last_user_activity.py index 2467030..5e3b583 100644 --- a/tools/last_user_activity.py +++ b/tools/last_user_activity.py @@ -17,7 +17,7 @@ orgs = [ "binder-examples", - "binderhub-ci-repos", + "binderhub-ci-repos", "ipython", "jupyter", "jupyter-book", From 7f48bd470db84c40bdf1fb6af2a1e107b164219e Mon Sep 17 00:00:00 2001 From: M Bussonnier Date: Thu, 5 Dec 2024 17:41:49 +0100 Subject: [PATCH 04/13] clear cache --- tools/last_user_activity.py | 99 +++++++++++++++++++++++++++++++++---- 1 file changed, 89 insertions(+), 10 deletions(-) diff --git a/tools/last_user_activity.py b/tools/last_user_activity.py index 5e3b583..9b47074 100644 --- a/tools/last_user_activity.py +++ b/tools/last_user_activity.py @@ -1,5 +1,7 @@ -""" -This tools find all users in multiple organizations and print their last activity date. +"""GitHub Organization Activity Tracker + +This module tracks and reports the last activity of members across GitHub organizations. +It implements disk-based caching to minimize API requests and respect rate limits. """ import os @@ -13,7 +15,8 @@ import diskcache import json import pathlib -from typing import Optional +from typing import Optional, List, Dict +import argparse orgs = [ "binder-examples", @@ -48,8 +51,30 @@ CACHE_DIR = "github_cache" cache = diskcache.Cache(CACHE_DIR) -async def get_org_members(session: aiohttp.ClientSession, org: str) -> list[dict]: - """Get all members for an organization with persistent caching""" +async def get_org_members(session: aiohttp.ClientSession, org: str) -> List[Dict]: + """Fetch all members of a GitHub organization with caching. + + Parameters + ---------- + session : aiohttp.ClientSession + The HTTP session to use for requests + org : str + The name of the GitHub organization + + Returns + ------- + List[Dict] + A list of dictionaries containing member information. + Each dictionary contains at least: + - 'login': str, the username + - 'id': int, the user ID + - 'type': str, usually 'User' + + Notes + ----- + Results are cached for 24 hours to minimize API requests. + Pagination is handled automatically (100 items per page). + """ cache_key = f"org_members_{org}" if cache_key in cache: return cache[cache_key] @@ -68,11 +93,30 @@ async def get_org_members(session: aiohttp.ClientSession, org: str) -> list[dict members.extend(page_members) - cache.set(cache_key, members, expire=3600 * 24) # Cache for 24 hours + cache.set(cache_key, members, expire=3600 * 24) return members async def get_user_activity(session: aiohttp.ClientSession, username: str) -> Optional[datetime]: - """Get the last activity date for a user with persistent caching""" + """Fetch the last public activity date for a GitHub user. + + Parameters + ---------- + session : aiohttp.ClientSession + The HTTP session to use for requests + username : str + The GitHub username to check + + Returns + ------- + Optional[datetime] + The datetime of the user's last public activity, + or None if no activity was found or an error occurred + + Notes + ----- + Results are cached for 24 hours to minimize API requests. + Only public events are considered for activity tracking. + """ cache_key = f"user_activity_{username}" if cache_key in cache: return cache[cache_key] @@ -83,14 +127,42 @@ async def get_user_activity(session: aiohttp.ClientSession, username: str) -> Op events = await response.json() if events: last_activity = datetime.fromisoformat(events[0]["created_at"].replace('Z', '+00:00')) - cache.set(cache_key, last_activity, expire=3600 * 24) # Cache for 24 hours + cache.set(cache_key, last_activity, expire=3600 * 24) return last_activity return None +def clear_cache() -> None: + """Clear the disk cache. + + Removes all cached data, forcing fresh API requests on next run. + + Notes + ----- + This is useful when you want to ensure you're getting the latest data + or if the cache becomes corrupted. + """ + if pathlib.Path(CACHE_DIR).exists(): + cache.clear() + print("[green]Cache cleared successfully[/green]") + else: + print("[yellow]No cache directory found[/yellow]") + async def main(): + """Main execution function. + + Fetches and displays the last activity for all members across specified organizations. + Uses disk caching to minimize API requests and handles GitHub API rate limits. + + Notes + ----- + The results are displayed organization by organization, with members sorted + by their last activity date (most recent first). + """ # Add cache info at start - if pathlib.Path(CACHE_DIR).exists(): - print(f"[blue]Using cache directory: {CACHE_DIR}[/blue]") + cache_path = pathlib.Path(CACHE_DIR) + if cache_path.exists(): + cache_size = sum(f.stat().st_size for f in cache_path.rglob('*') if f.is_file()) + print(f"[blue]Using cache directory: {CACHE_DIR} ({cache_size / 1024 / 1024:.1f} MB)[/blue]") else: print("[yellow]Creating new cache directory[/yellow]") @@ -139,4 +211,11 @@ async def main(): print(f"{username:<20}: Last activity {last_activity_ago} in orgs: {orgs_str}") if __name__ == "__main__": + parser = argparse.ArgumentParser(description="GitHub Organization Activity Tracker") + parser.add_argument('--clear-cache', action='store_true', help='Clear the cache before running') + args = parser.parse_args() + + if args.clear_cache: + clear_cache() + asyncio.run(main()) From 54a7dda5d623de9bc4849276364a613ab6242425 Mon Sep 17 00:00:00 2001 From: M Bussonnier Date: Thu, 5 Dec 2024 17:49:36 +0100 Subject: [PATCH 05/13] setting cache does not work --- tools/last_user_activity.py | 186 ++++++++++++++++++------------------ 1 file changed, 95 insertions(+), 91 deletions(-) diff --git a/tools/last_user_activity.py b/tools/last_user_activity.py index 9b47074..588d8dc 100644 --- a/tools/last_user_activity.py +++ b/tools/last_user_activity.py @@ -19,22 +19,22 @@ import argparse orgs = [ - "binder-examples", - "binderhub-ci-repos", + # "binder-examples", + # "binderhub-ci-repos", "ipython", - "jupyter", - "jupyter-book", - "jupyter-governance", - "jupyter-incubator", - "jupyter-server", - "jupyter-standards", - "jupyter-widgets", - "jupyterhub", - "jupyterlab", - "jupyter-xeus", - "jupytercon", - "voila-dashboards", - "voila-gallery", + # "jupyter", + # "jupyter-book", + # "jupyter-governance", + # "jupyter-incubator", + # "jupyter-server", + # "jupyter-standards", + # "jupyter-widgets", + # "jupyterhub", + # "jupyterlab", + # "jupyter-xeus", + # "jupytercon", + # "voila-dashboards", + # "voila-gallery", ] token = os.getenv("GH_TOKEN") @@ -76,95 +76,98 @@ async def get_org_members(session: aiohttp.ClientSession, org: str) -> List[Dict Pagination is handled automatically (100 items per page). """ cache_key = f"org_members_{org}" - if cache_key in cache: - return cache[cache_key] + + # Try to get from cache + cached_data = cache.get(cache_key) + if cached_data is not None: + print(f"[cyan]Cache hit for {org} members[/cyan]") + return cached_data + print(f"[yellow]Cache miss for {org} members - fetching from API[/yellow]") members = [] - for page in count(1): - url = f"https://api.github.com/orgs/{org}/members?page={page}&per_page=100" - async with session.get(url, headers=headers) as response: - if response.status != 200: - print(f"[red]Error fetching members for {org}: {response.status}[/red]") - break - - page_members = await response.json() - if not page_members: - break - - members.extend(page_members) - cache.set(cache_key, members, expire=3600 * 24) - return members + try: + for page in count(1): + url = f"https://api.github.com/orgs/{org}/members?page={page}&per_page=100" + async with session.get(url, headers=headers) as response: + if response.status != 200: + print(f"[red]Error fetching members for {org}: {response.status}[/red]") + break + + page_members = await response.json() + if not page_members: + break + + members.extend(page_members) + + # Cache the results + cache.set(cache_key, members, expire=3600 * 24) # 24 hours + print(f"[green]Cached {len(members)} members for {org}[/green]") + return members + + except Exception as e: + print(f"[red]Error fetching members for {org}: {str(e)}[/red]") + return [] async def get_user_activity(session: aiohttp.ClientSession, username: str) -> Optional[datetime]: - """Fetch the last public activity date for a GitHub user. - - Parameters - ---------- - session : aiohttp.ClientSession - The HTTP session to use for requests - username : str - The GitHub username to check - - Returns - ------- - Optional[datetime] - The datetime of the user's last public activity, - or None if no activity was found or an error occurred - - Notes - ----- - Results are cached for 24 hours to minimize API requests. - Only public events are considered for activity tracking. - """ + """Fetch the last public activity date for a GitHub user.""" cache_key = f"user_activity_{username}" - if cache_key in cache: - return cache[cache_key] - - url = f"https://api.github.com/users/{username}/events/public" - async with session.get(url, headers=headers) as response: - if response.status == 200: - events = await response.json() - if events: - last_activity = datetime.fromisoformat(events[0]["created_at"].replace('Z', '+00:00')) - cache.set(cache_key, last_activity, expire=3600 * 24) - return last_activity - return None + + # Try to get from cache + cached_data = cache.get(cache_key) + if cached_data is not None: + print(f"[cyan]Cache hit for {username} activity[/cyan]") + return cached_data -def clear_cache() -> None: - """Clear the disk cache. + print(f"[yellow]Cache miss for {username} activity - fetching from API[/yellow]") - Removes all cached data, forcing fresh API requests on next run. + try: + print(f"Getting activity for {username}") + url = f"https://api.github.com/users/{username}/events/public" + async with session.get(url, headers=headers) as response: + if response.status == 200: + print(f"Got activity for {username}") + events = await response.json() + if events: + last_activity = datetime.fromisoformat(events[0]["created_at"].replace('Z', '+00:00')) + # Cache the results + cache.set(cache_key, last_activity, expire=3600 * 24) # 24 hours + print(f"[green]Cached activity for {username}[/green]") + return last_activity + else: + print(f"[yellow]No activity found for {username}[/yellow]") + cache.set(cache_key, None, expire=3600 * 24) + else: + print(f"[red]Error fetching activity for {username}: {response.status}[/red]") + except Exception as e: + print(f"[red]Error fetching activity for {username}: {str(e)}[/red]") - Notes - ----- - This is useful when you want to ensure you're getting the latest data - or if the cache becomes corrupted. - """ - if pathlib.Path(CACHE_DIR).exists(): + return None + +def get_cache_size() -> str: + """Get the current cache size in a human-readable format.""" + try: + cache_path = pathlib.Path(CACHE_DIR) + if cache_path.exists(): + total_size = sum(f.stat().st_size for f in cache_path.rglob('*') if f.is_file()) + return f"{total_size / 1024 / 1024:.1f} MB" + except Exception: + pass + return "unknown size" + +def clear_cache() -> None: + """Clear the disk cache.""" + try: cache.clear() print("[green]Cache cleared successfully[/green]") - else: - print("[yellow]No cache directory found[/yellow]") + except Exception as e: + print(f"[red]Error clearing cache: {str(e)}[/red]") async def main(): - """Main execution function. - - Fetches and displays the last activity for all members across specified organizations. - Uses disk caching to minimize API requests and handles GitHub API rate limits. - - Notes - ----- - The results are displayed organization by organization, with members sorted - by their last activity date (most recent first). - """ - # Add cache info at start - cache_path = pathlib.Path(CACHE_DIR) - if cache_path.exists(): - cache_size = sum(f.stat().st_size for f in cache_path.rglob('*') if f.is_file()) - print(f"[blue]Using cache directory: {CACHE_DIR} ({cache_size / 1024 / 1024:.1f} MB)[/blue]") - else: - print("[yellow]Creating new cache directory[/yellow]") + """Main execution function.""" + # Show cache status + print(f"[blue]Cache directory: {CACHE_DIR} (size: {get_cache_size()})[/blue]") + print(f"[blue]Cache contains {len(cache)} items[/blue]") async with aiohttp.ClientSession() as session: # Check rate limit @@ -213,6 +216,7 @@ async def main(): if __name__ == "__main__": parser = argparse.ArgumentParser(description="GitHub Organization Activity Tracker") parser.add_argument('--clear-cache', action='store_true', help='Clear the cache before running') + parser.add_argument('--debug', action='store_true', help='Show debug information') args = parser.parse_args() if args.clear_cache: From 6a5cafd4895302d5c98a610358b5f09dfd27ea9e Mon Sep 17 00:00:00 2001 From: M Bussonnier Date: Thu, 5 Dec 2024 17:57:14 +0100 Subject: [PATCH 06/13] working proto --- tools/last_user_activity.py | 72 ++++++++++++++++++++++++------------- 1 file changed, 47 insertions(+), 25 deletions(-) diff --git a/tools/last_user_activity.py b/tools/last_user_activity.py index 588d8dc..1273e8e 100644 --- a/tools/last_user_activity.py +++ b/tools/last_user_activity.py @@ -19,22 +19,22 @@ import argparse orgs = [ - # "binder-examples", - # "binderhub-ci-repos", + "binder-examples", + "binderhub-ci-repos", "ipython", - # "jupyter", - # "jupyter-book", - # "jupyter-governance", - # "jupyter-incubator", - # "jupyter-server", - # "jupyter-standards", - # "jupyter-widgets", - # "jupyterhub", - # "jupyterlab", - # "jupyter-xeus", - # "jupytercon", - # "voila-dashboards", - # "voila-gallery", + "jupyter", + "jupyter-book", + "jupyter-governance", + "jupyter-incubator", + "jupyter-server", + "jupyter-standards", + "jupyter-widgets", + "jupyterhub", + "jupyterlab", + "jupyter-xeus", + "jupytercon", + "voila-dashboards", + "voila-gallery", ] token = os.getenv("GH_TOKEN") @@ -47,9 +47,32 @@ "Accept": "application/vnd.github.v3+json", } +class DateTimeCache(diskcache.Cache): + """Custom cache class that handles datetime serialization.""" + + def __setitem__(self, key, value): + """Override to serialize datetime objects.""" + if isinstance(value, datetime): + value = {'__datetime__': value.isoformat()} + super().__setitem__(key, value) + + def __getitem__(self, key): + """Override to deserialize datetime objects.""" + value = super().__getitem__(key) + if isinstance(value, dict) and '__datetime__' in value: + return datetime.fromisoformat(value['__datetime__']) + return value + + def get(self, key, default=None, retry=False): + """Override to handle datetime deserialization in get method with retry.""" + try: + return super().get(key, default=default, retry=retry) + except KeyError: + return default + # Configure DiskCache in the current directory CACHE_DIR = "github_cache" -cache = diskcache.Cache(CACHE_DIR) +cache = DateTimeCache(CACHE_DIR) async def get_org_members(session: aiohttp.ClientSession, org: str) -> List[Dict]: """Fetch all members of a GitHub organization with caching. @@ -77,8 +100,8 @@ async def get_org_members(session: aiohttp.ClientSession, org: str) -> List[Dict """ cache_key = f"org_members_{org}" - # Try to get from cache - cached_data = cache.get(cache_key) + # Try to get from cache with retry + cached_data = cache.get(cache_key, retry=True) if cached_data is not None: print(f"[cyan]Cache hit for {org} members[/cyan]") return cached_data @@ -101,7 +124,7 @@ async def get_org_members(session: aiohttp.ClientSession, org: str) -> List[Dict members.extend(page_members) # Cache the results - cache.set(cache_key, members, expire=3600 * 24) # 24 hours + cache[cache_key] = members # Using __setitem__ instead of set() print(f"[green]Cached {len(members)} members for {org}[/green]") return members @@ -131,12 +154,12 @@ async def get_user_activity(session: aiohttp.ClientSession, username: str) -> Op if events: last_activity = datetime.fromisoformat(events[0]["created_at"].replace('Z', '+00:00')) # Cache the results - cache.set(cache_key, last_activity, expire=3600 * 24) # 24 hours + cache[cache_key] = last_activity # Using __setitem__ instead of set() print(f"[green]Cached activity for {username}[/green]") return last_activity else: print(f"[yellow]No activity found for {username}[/yellow]") - cache.set(cache_key, None, expire=3600 * 24) + cache[cache_key] = None # Using __setitem__ instead of set() else: print(f"[red]Error fetching activity for {username}: {response.status}[/red]") except Exception as e: @@ -205,11 +228,10 @@ async def main(): # Print results sorted by last activity user_activities = [] for (username, _), last_activity in zip(tasks, results): - if last_activity: - user_activities.append((username, last_activity, all_members[username])) + user_activities.append((username, last_activity, all_members[username])) - for username, last_activity, user_orgs in sorted(user_activities, key=lambda x: x[1], reverse=True): - last_activity_ago = humanize.naturaltime(datetime.now(last_activity.tzinfo) - last_activity) + for username, last_activity, user_orgs in sorted(user_activities, key=lambda x: x[1] if x[1] is not None else datetime.fromtimestamp(0).astimezone(datetime.now().tzinfo), reverse=True): + last_activity_ago = humanize.naturaltime(datetime.now(last_activity.tzinfo) - last_activity) if last_activity else "[red]never[/red]" orgs_str = ", ".join(user_orgs) print(f"{username:<20}: Last activity {last_activity_ago} in orgs: {orgs_str}") From 8d23c4eac3e7665c3c39c3cdce1e6b4d5a1ec779 Mon Sep 17 00:00:00 2001 From: M Bussonnier Date: Tue, 7 Jan 2025 18:07:58 +0100 Subject: [PATCH 07/13] stuff --- tools/last_user_activity.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/last_user_activity.py b/tools/last_user_activity.py index 1273e8e..9caedc9 100644 --- a/tools/last_user_activity.py +++ b/tools/last_user_activity.py @@ -230,7 +230,7 @@ async def main(): for (username, _), last_activity in zip(tasks, results): user_activities.append((username, last_activity, all_members[username])) - for username, last_activity, user_orgs in sorted(user_activities, key=lambda x: x[1] if x[1] is not None else datetime.fromtimestamp(0).astimezone(datetime.now().tzinfo), reverse=True): + for username, last_activity, user_orgs in sorted(user_activities, key=lambda x: x[1] if x[1] is not None else datetime.fromtimestamp(0), reverse=True): last_activity_ago = humanize.naturaltime(datetime.now(last_activity.tzinfo) - last_activity) if last_activity else "[red]never[/red]" orgs_str = ", ".join(user_orgs) print(f"{username:<20}: Last activity {last_activity_ago} in orgs: {orgs_str}") From 33421988f1d3b9cb745e5b1cff158dae70511263 Mon Sep 17 00:00:00 2001 From: M Bussonnier Date: Tue, 14 Jan 2025 17:30:37 +0100 Subject: [PATCH 08/13] reformat --- tools/last_user_activity.py | 98 +++++++++++++++++++++++++------------ 1 file changed, 68 insertions(+), 30 deletions(-) diff --git a/tools/last_user_activity.py b/tools/last_user_activity.py index 9caedc9..3699dee 100644 --- a/tools/last_user_activity.py +++ b/tools/last_user_activity.py @@ -47,22 +47,23 @@ "Accept": "application/vnd.github.v3+json", } + class DateTimeCache(diskcache.Cache): """Custom cache class that handles datetime serialization.""" - + def __setitem__(self, key, value): """Override to serialize datetime objects.""" if isinstance(value, datetime): - value = {'__datetime__': value.isoformat()} + value = {"__datetime__": value.isoformat()} super().__setitem__(key, value) - + def __getitem__(self, key): """Override to deserialize datetime objects.""" value = super().__getitem__(key) - if isinstance(value, dict) and '__datetime__' in value: - return datetime.fromisoformat(value['__datetime__']) + if isinstance(value, dict) and "__datetime__" in value: + return datetime.fromisoformat(value["__datetime__"]) return value - + def get(self, key, default=None, retry=False): """Override to handle datetime deserialization in get method with retry.""" try: @@ -70,10 +71,12 @@ def get(self, key, default=None, retry=False): except KeyError: return default + # Configure DiskCache in the current directory CACHE_DIR = "github_cache" cache = DateTimeCache(CACHE_DIR) + async def get_org_members(session: aiohttp.ClientSession, org: str) -> List[Dict]: """Fetch all members of a GitHub organization with caching. @@ -99,7 +102,7 @@ async def get_org_members(session: aiohttp.ClientSession, org: str) -> List[Dict Pagination is handled automatically (100 items per page). """ cache_key = f"org_members_{org}" - + # Try to get from cache with retry cached_data = cache.get(cache_key, retry=True) if cached_data is not None: @@ -108,34 +111,39 @@ async def get_org_members(session: aiohttp.ClientSession, org: str) -> List[Dict print(f"[yellow]Cache miss for {org} members - fetching from API[/yellow]") members = [] - + try: for page in count(1): url = f"https://api.github.com/orgs/{org}/members?page={page}&per_page=100" async with session.get(url, headers=headers) as response: if response.status != 200: - print(f"[red]Error fetching members for {org}: {response.status}[/red]") + print( + f"[red]Error fetching members for {org}: {response.status}[/red]" + ) break - + page_members = await response.json() if not page_members: break - + members.extend(page_members) - + # Cache the results cache[cache_key] = members # Using __setitem__ instead of set() print(f"[green]Cached {len(members)} members for {org}[/green]") return members - + except Exception as e: print(f"[red]Error fetching members for {org}: {str(e)}[/red]") return [] -async def get_user_activity(session: aiohttp.ClientSession, username: str) -> Optional[datetime]: + +async def get_user_activity( + session: aiohttp.ClientSession, username: str +) -> Optional[datetime]: """Fetch the last public activity date for a GitHub user.""" cache_key = f"user_activity_{username}" - + # Try to get from cache cached_data = cache.get(cache_key) if cached_data is not None: @@ -143,7 +151,7 @@ async def get_user_activity(session: aiohttp.ClientSession, username: str) -> Op return cached_data print(f"[yellow]Cache miss for {username} activity - fetching from API[/yellow]") - + try: print(f"Getting activity for {username}") url = f"https://api.github.com/users/{username}/events/public" @@ -152,32 +160,42 @@ async def get_user_activity(session: aiohttp.ClientSession, username: str) -> Op print(f"Got activity for {username}") events = await response.json() if events: - last_activity = datetime.fromisoformat(events[0]["created_at"].replace('Z', '+00:00')) + last_activity = datetime.fromisoformat( + events[0]["created_at"].replace("Z", "+00:00") + ) # Cache the results - cache[cache_key] = last_activity # Using __setitem__ instead of set() + cache[cache_key] = ( + last_activity # Using __setitem__ instead of set() + ) print(f"[green]Cached activity for {username}[/green]") return last_activity else: print(f"[yellow]No activity found for {username}[/yellow]") cache[cache_key] = None # Using __setitem__ instead of set() else: - print(f"[red]Error fetching activity for {username}: {response.status}[/red]") + print( + f"[red]Error fetching activity for {username}: {response.status}[/red]" + ) except Exception as e: print(f"[red]Error fetching activity for {username}: {str(e)}[/red]") - + return None + def get_cache_size() -> str: """Get the current cache size in a human-readable format.""" try: cache_path = pathlib.Path(CACHE_DIR) if cache_path.exists(): - total_size = sum(f.stat().st_size for f in cache_path.rglob('*') if f.is_file()) + total_size = sum( + f.stat().st_size for f in cache_path.rglob("*") if f.is_file() + ) return f"{total_size / 1024 / 1024:.1f} MB" except Exception: pass return "unknown size" + def clear_cache() -> None: """Clear the disk cache.""" try: @@ -186,6 +204,7 @@ def clear_cache() -> None: except Exception as e: print(f"[red]Error clearing cache: {str(e)}[/red]") + async def main(): """Main execution function.""" # Show cache status @@ -194,16 +213,22 @@ async def main(): async with aiohttp.ClientSession() as session: # Check rate limit - async with session.get("https://api.github.com/rate_limit", headers=headers) as response: + async with session.get( + "https://api.github.com/rate_limit", headers=headers + ) as response: if response.status == 200: rate_data = await response.json() remaining = rate_data["resources"]["core"]["remaining"] - reset_time = datetime.fromtimestamp(rate_data["resources"]["core"]["reset"]) + reset_time = datetime.fromtimestamp( + rate_data["resources"]["core"]["reset"] + ) reset_in = humanize.naturaltime(reset_time) print(f"Rate limit remaining: {remaining}") print(f"Rate limit resets {reset_in}") if remaining < 100: - print(f"[yellow]Warning: Low rate limit ({remaining} remaining)[/yellow]") + print( + f"[yellow]Warning: Low rate limit ({remaining} remaining)[/yellow]" + ) if remaining < 10: print("[red]Aborting due to very low rate limit[/red]") return @@ -230,18 +255,31 @@ async def main(): for (username, _), last_activity in zip(tasks, results): user_activities.append((username, last_activity, all_members[username])) - for username, last_activity, user_orgs in sorted(user_activities, key=lambda x: x[1] if x[1] is not None else datetime.fromtimestamp(0), reverse=True): - last_activity_ago = humanize.naturaltime(datetime.now(last_activity.tzinfo) - last_activity) if last_activity else "[red]never[/red]" + for username, last_activity, user_orgs in sorted( + user_activities, + key=lambda x: x[1] if x[1] is not None else datetime.fromtimestamp(0), + reverse=True, + ): + last_activity_ago = ( + humanize.naturaltime(datetime.now(last_activity.tzinfo) - last_activity) + if last_activity + else "[red]never[/red]" + ) orgs_str = ", ".join(user_orgs) - print(f"{username:<20}: Last activity {last_activity_ago} in orgs: {orgs_str}") + print( + f"{username:<20}: Last activity {last_activity_ago} in orgs: {orgs_str}" + ) + if __name__ == "__main__": parser = argparse.ArgumentParser(description="GitHub Organization Activity Tracker") - parser.add_argument('--clear-cache', action='store_true', help='Clear the cache before running') - parser.add_argument('--debug', action='store_true', help='Show debug information') + parser.add_argument( + "--clear-cache", action="store_true", help="Clear the cache before running" + ) + parser.add_argument("--debug", action="store_true", help="Show debug information") args = parser.parse_args() if args.clear_cache: clear_cache() - + asyncio.run(main()) From ab4a21dbbfc3808bc8f86835eef9736cd9b514a9 Mon Sep 17 00:00:00 2001 From: M Bussonnier Date: Tue, 14 Jan 2025 18:15:49 +0100 Subject: [PATCH 09/13] working --- tools/last_user_activity.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tools/last_user_activity.py b/tools/last_user_activity.py index 3699dee..38702b4 100644 --- a/tools/last_user_activity.py +++ b/tools/last_user_activity.py @@ -8,7 +8,7 @@ import asyncio import aiohttp from rich import print -from datetime import datetime +from datetime import datetime, timezone import humanize from itertools import count import aiosqlite @@ -253,7 +253,15 @@ async def main(): # Print results sorted by last activity user_activities = [] for (username, _), last_activity in zip(tasks, results): - user_activities.append((username, last_activity, all_members[username])) + user_activities.append( + ( + username, + datetime.fromisoformat(last_activity["__datetime__"]) + if last_activity is not None + else datetime.fromtimestamp(0).replace(tzinfo=timezone.utc), + all_members[username], + ) + ) for username, last_activity, user_orgs in sorted( user_activities, From b32861f114cee390a685aa3cf89dc3f40ace5cc3 Mon Sep 17 00:00:00 2001 From: M Bussonnier Date: Tue, 14 Jan 2025 18:22:36 +0100 Subject: [PATCH 10/13] fix script --- tools/last_user_activity.py | 43 ++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/tools/last_user_activity.py b/tools/last_user_activity.py index 38702b4..645497f 100644 --- a/tools/last_user_activity.py +++ b/tools/last_user_activity.py @@ -11,9 +11,7 @@ from datetime import datetime, timezone import humanize from itertools import count -import aiosqlite import diskcache -import json import pathlib from typing import Optional, List, Dict import argparse @@ -77,7 +75,9 @@ def get(self, key, default=None, retry=False): cache = DateTimeCache(CACHE_DIR) -async def get_org_members(session: aiohttp.ClientSession, org: str) -> List[Dict]: +async def get_org_members( + session: aiohttp.ClientSession, org: str, debug: bool +) -> List[Dict]: """Fetch all members of a GitHub organization with caching. Parameters @@ -106,7 +106,8 @@ async def get_org_members(session: aiohttp.ClientSession, org: str) -> List[Dict # Try to get from cache with retry cached_data = cache.get(cache_key, retry=True) if cached_data is not None: - print(f"[cyan]Cache hit for {org} members[/cyan]") + if debug: + print(f"[cyan]Cache hit for {org} members[/cyan]") return cached_data print(f"[yellow]Cache miss for {org} members - fetching from API[/yellow]") @@ -139,7 +140,7 @@ async def get_org_members(session: aiohttp.ClientSession, org: str) -> List[Dict async def get_user_activity( - session: aiohttp.ClientSession, username: str + session: aiohttp.ClientSession, username: str, debug: bool ) -> Optional[datetime]: """Fetch the last public activity date for a GitHub user.""" cache_key = f"user_activity_{username}" @@ -147,17 +148,22 @@ async def get_user_activity( # Try to get from cache cached_data = cache.get(cache_key) if cached_data is not None: - print(f"[cyan]Cache hit for {username} activity[/cyan]") + if debug: + print(f"[cyan]Cache hit for {username} activity[/cyan]") return cached_data - - print(f"[yellow]Cache miss for {username} activity - fetching from API[/yellow]") + if debug: + print( + f"[yellow]Cache miss for {username} activity - fetching from API[/yellow]" + ) try: - print(f"Getting activity for {username}") + if debug: + print(f"[blue]Getting activity for {username}[/blue]") url = f"https://api.github.com/users/{username}/events/public" async with session.get(url, headers=headers) as response: if response.status == 200: - print(f"Got activity for {username}") + if debug: + print(f"Got activity for {username}") events = await response.json() if events: last_activity = datetime.fromisoformat( @@ -170,7 +176,8 @@ async def get_user_activity( print(f"[green]Cached activity for {username}[/green]") return last_activity else: - print(f"[yellow]No activity found for {username}[/yellow]") + if debug: + print(f"[yellow]No activity found for {username}[/yellow]") cache[cache_key] = None # Using __setitem__ instead of set() else: print( @@ -205,7 +212,7 @@ def clear_cache() -> None: print(f"[red]Error clearing cache: {str(e)}[/red]") -async def main(): +async def main(debug: bool): """Main execution function.""" # Show cache status print(f"[blue]Cache directory: {CACHE_DIR} (size: {get_cache_size()})[/blue]") @@ -236,7 +243,7 @@ async def main(): # Get all members from all orgs all_members = {} for org in orgs: - members = await get_org_members(session, org) + members = await get_org_members(session, org, debug) for member in members: if member["login"] not in all_members: all_members[member["login"]] = [] @@ -245,7 +252,7 @@ async def main(): # Get activity for each user tasks = [] for username in all_members: - task = get_user_activity(session, username) + task = get_user_activity(session, username, debug) tasks.append((username, task)) results = await asyncio.gather(*(task for _, task in tasks)) @@ -258,14 +265,16 @@ async def main(): username, datetime.fromisoformat(last_activity["__datetime__"]) if last_activity is not None - else datetime.fromtimestamp(0).replace(tzinfo=timezone.utc), + else None, all_members[username], ) ) for username, last_activity, user_orgs in sorted( user_activities, - key=lambda x: x[1] if x[1] is not None else datetime.fromtimestamp(0), + key=lambda x: (x[1], x[0]) + if x[1] is not None + else (datetime.fromtimestamp(0).replace(tzinfo=timezone.utc), x[0]), reverse=True, ): last_activity_ago = ( @@ -290,4 +299,4 @@ async def main(): if args.clear_cache: clear_cache() - asyncio.run(main()) + asyncio.run(main(args.debug)) From a2dcd1c80a74df89b88b0c4f377724c7d606c113 Mon Sep 17 00:00:00 2001 From: M Bussonnier Date: Tue, 14 Jan 2025 18:41:02 +0100 Subject: [PATCH 11/13] cleanup --- tools/last_user_activity.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/tools/last_user_activity.py b/tools/last_user_activity.py index 645497f..c7eed7d 100644 --- a/tools/last_user_activity.py +++ b/tools/last_user_activity.py @@ -60,12 +60,17 @@ def __getitem__(self, key): value = super().__getitem__(key) if isinstance(value, dict) and "__datetime__" in value: return datetime.fromisoformat(value["__datetime__"]) + assert not isinstance(value, dict), value return value def get(self, key, default=None, retry=False): """Override to handle datetime deserialization in get method with retry.""" try: - return super().get(key, default=default, retry=retry) + value = super().get(key, default=default, retry=retry) + if isinstance(value, dict) and "__datetime__" in value: + return datetime.fromisoformat(value["__datetime__"]) + return value + except KeyError: return default @@ -150,6 +155,7 @@ async def get_user_activity( if cached_data is not None: if debug: print(f"[cyan]Cache hit for {username} activity[/cyan]") + assert isinstance(cached_data, datetime), cached_data return cached_data if debug: print( @@ -170,10 +176,13 @@ async def get_user_activity( events[0]["created_at"].replace("Z", "+00:00") ) # Cache the results + assert isinstance(last_activity, datetime) cache[cache_key] = ( last_activity # Using __setitem__ instead of set() ) - print(f"[green]Cached activity for {username}[/green]") + if debug: + print(f"[green]Cached activity for {username}[/green]") + assert isinstance(last_activity, datetime) return last_activity else: if debug: @@ -260,12 +269,12 @@ async def main(debug: bool): # Print results sorted by last activity user_activities = [] for (username, _), last_activity in zip(tasks, results): + if last_activity is not None: + assert isinstance(last_activity, datetime), last_activity user_activities.append( ( username, - datetime.fromisoformat(last_activity["__datetime__"]) - if last_activity is not None - else None, + last_activity if last_activity is not None else None, all_members[username], ) ) From 9e2406eb2bc1819b96dc071e942f4cdfdc35ca7d Mon Sep 17 00:00:00 2001 From: M Bussonnier Date: Wed, 15 Jan 2025 11:36:50 +0100 Subject: [PATCH 12/13] per org --- tools/last_user_activity.py | 75 +++++++++++++++++++++++++------------ 1 file changed, 52 insertions(+), 23 deletions(-) diff --git a/tools/last_user_activity.py b/tools/last_user_activity.py index c7eed7d..4df4f62 100644 --- a/tools/last_user_activity.py +++ b/tools/last_user_activity.py @@ -8,7 +8,7 @@ import asyncio import aiohttp from rich import print -from datetime import datetime, timezone +from datetime import datetime, timezone, timedelta import humanize from itertools import count import diskcache @@ -16,23 +16,27 @@ from typing import Optional, List, Dict import argparse -orgs = [ +default_orgs = [ "binder-examples", "binderhub-ci-repos", "ipython", "jupyter", + "jupyter-attic", "jupyter-book", "jupyter-governance", "jupyter-incubator", + "jupyter-resources", "jupyter-server", + "jupyter-standard", "jupyter-standards", "jupyter-widgets", - "jupyterhub", - "jupyterlab", "jupyter-xeus", "jupytercon", + "jupyterhub", + "jupyterlab", "voila-dashboards", "voila-gallery", + "pickleshare", ] token = os.getenv("GH_TOKEN") @@ -221,7 +225,7 @@ def clear_cache() -> None: print(f"[red]Error clearing cache: {str(e)}[/red]") -async def main(debug: bool): +async def main(orgs, debug: bool, timelimit_days: int): """Main execution function.""" # Show cache status print(f"[blue]Cache directory: {CACHE_DIR} (size: {get_cache_size()})[/blue]") @@ -278,23 +282,33 @@ async def main(debug: bool): all_members[username], ) ) - - for username, last_activity, user_orgs in sorted( - user_activities, - key=lambda x: (x[1], x[0]) - if x[1] is not None - else (datetime.fromtimestamp(0).replace(tzinfo=timezone.utc), x[0]), - reverse=True, - ): - last_activity_ago = ( - humanize.naturaltime(datetime.now(last_activity.tzinfo) - last_activity) - if last_activity - else "[red]never[/red]" - ) - orgs_str = ", ".join(user_orgs) - print( - f"{username:<20}: Last activity {last_activity_ago} in orgs: {orgs_str}" - ) + for org in orgs: + print(f"[bold]{org}[/bold]") + n_active = 0 + n_inactive = 0 + for username, last_activity, user_orgs in sorted( + user_activities, + key=lambda x: (x[1], x[0]) + if x[1] is not None + else (datetime.fromtimestamp(0).replace(tzinfo=timezone.utc), x[0]), + reverse=True, + ): + if org not in user_orgs: + continue + if last_activity is not None and last_activity > (datetime.now().replace(tzinfo=timezone.utc) - timedelta(days=timelimit_days)): + n_active += 1 + continue + n_inactive += 1 + last_activity_ago = ( + humanize.naturaltime(datetime.now(last_activity.tzinfo) - last_activity) + if last_activity + else "[red]never[/red]" + ) + orgs_str = ", ".join(user_orgs) + print( + f" {username:<20}: Last activity {last_activity_ago}" + ) + print(f" Found [red]{n_inactive} inactive[/red] and [green]{n_active} active[/green] users in {org} with last activity more recent than {timelimit_days} days.") if __name__ == "__main__": @@ -303,9 +317,24 @@ async def main(debug: bool): "--clear-cache", action="store_true", help="Clear the cache before running" ) parser.add_argument("--debug", action="store_true", help="Show debug information") + + + parser.add_argument( + "--timelimit-days", + type=int, + default=0, + help="Time limit in days for the last activity (default: 30)", + ) + parser.add_argument( + "--orgs", + nargs="+", + default=default_orgs, + help="GitHub organizations to track (default: all)", + ) args = parser.parse_args() + if args.clear_cache: clear_cache() - asyncio.run(main(args.debug)) + asyncio.run(main(args.orgs, args.debug, args.timelimit_days)) From f1733afb1ef9dba51384b9f83f328cf3c4f5c5f0 Mon Sep 17 00:00:00 2001 From: M Bussonnier Date: Tue, 21 Jan 2025 17:42:46 +0100 Subject: [PATCH 13/13] Update tools/last_user_activity.py Co-authored-by: David L. Qiu --- tools/last_user_activity.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/last_user_activity.py b/tools/last_user_activity.py index 4df4f62..d3dda82 100644 --- a/tools/last_user_activity.py +++ b/tools/last_user_activity.py @@ -322,8 +322,8 @@ async def main(orgs, debug: bool, timelimit_days: int): parser.add_argument( "--timelimit-days", type=int, - default=0, - help="Time limit in days for the last activity (default: 30)", + default=365, + help="Maximum number of days since last activity before an account is marked as inactive. (default: 365)", ) parser.add_argument( "--orgs",