Skip to content

Commit 1aee41c

Browse files
authored
Merge pull request #83 from Carreau/lat-acct
Add script to report inactive users in GitHub orgs
2 parents 84199b6 + f1733af commit 1aee41c

File tree

1 file changed

+340
-0
lines changed

1 file changed

+340
-0
lines changed

tools/last_user_activity.py

+340
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,340 @@
1+
"""GitHub Organization Activity Tracker

This module tracks and reports the last activity of members across GitHub organizations.
It implements disk-based caching to minimize API requests and respect rate limits.
"""
6+
7+
import os
8+
import asyncio
9+
import aiohttp
10+
from rich import print
11+
from datetime import datetime, timezone, timedelta
12+
import humanize
13+
from itertools import count
14+
import diskcache
15+
import pathlib
16+
from typing import Optional, List, Dict
17+
import argparse
18+
19+
# Organizations scanned when --orgs is not given on the command line.
default_orgs = [
    "binder-examples",
    "binderhub-ci-repos",
    "ipython",
    "jupyter",
    "jupyter-attic",
    "jupyter-book",
    "jupyter-governance",
    "jupyter-incubator",
    "jupyter-resources",
    "jupyter-server",
    "jupyter-standard",
    "jupyter-standards",
    "jupyter-widgets",
    "jupyter-xeus",
    "jupytercon",
    "jupyterhub",
    "jupyterlab",
    "voila-dashboards",
    "voila-gallery",
    "pickleshare",
]
41+
42+
# The GitHub token is read once at import time; every request below reuses it.
token = os.getenv("GH_TOKEN")
if not token:
    print("[red]Error: GH_TOKEN environment variable not set[/red]")
    # SystemExit is a builtin exception; the bare `exit()` helper is injected by
    # the `site` module and is not guaranteed to exist in every interpreter setup.
    raise SystemExit(1)

# Headers sent with every GitHub API request made by this module.
headers = {
    "Authorization": f"token {token}",
    "Accept": "application/vnd.github.v3+json",
}
51+
52+
53+
class DateTimeCache(diskcache.Cache):
    """Disk cache that round-trips ``datetime`` values through ISO-8601 strings.

    A ``datetime`` written with ``cache[key] = value`` is stored as the tagged
    dict ``{"__datetime__": value.isoformat()}``; ``cache[key]`` and
    ``cache.get(key)`` turn that tagged dict back into a ``datetime``.
    Every other value is stored and returned unchanged.
    """

    @staticmethod
    def _decode(value):
        """Return a ``datetime`` for a tagged dict, or ``value`` unchanged."""
        if isinstance(value, dict) and "__datetime__" in value:
            return datetime.fromisoformat(value["__datetime__"])
        return value

    def __setitem__(self, key, value):
        """Store ``value`` under ``key``, tagging ``datetime`` objects first."""
        if isinstance(value, datetime):
            value = {"__datetime__": value.isoformat()}
        super().__setitem__(key, value)

    def __getitem__(self, key):
        """Return the value stored under ``key``, decoding tagged datetimes.

        Raises whatever the parent ``__getitem__`` raises for a missing key.
        """
        value = self._decode(super().__getitem__(key))
        # A dict that is still a dict after decoding carries no datetime tag;
        # this module never stores such a value, so treat it as a logic error.
        assert not isinstance(value, dict), value
        return value

    def get(self, key, default=None, retry=False):
        """Return the value stored under ``key`` or ``default``, decoding datetimes.

        Parameters
        ----------
        key
            Cache key to look up.
        default
            Value returned when the key is absent.
        retry : bool
            Forwarded to the parent ``get``.
        """
        try:
            return self._decode(super().get(key, default=default, retry=retry))
        except KeyError:
            # Defensive: fall back to the default if the lookup reports a missing key.
            return default
80+
81+
82+
# Configure DiskCache in the current directory: the relative path below is
# resolved against the working directory the script is started from.
CACHE_DIR = "github_cache"
cache = DateTimeCache(CACHE_DIR)
85+
86+
87+
async def get_org_members(
    session: aiohttp.ClientSession, org: str, debug: bool
) -> List[Dict]:
    """Fetch all members of a GitHub organization with caching.

    Parameters
    ----------
    session : aiohttp.ClientSession
        The HTTP session to use for requests
    org : str
        The name of the GitHub organization
    debug : bool
        When true, also report cache hits.

    Returns
    -------
    List[Dict]
        A list of dictionaries containing member information.
        Each dictionary contains at least:
        - 'login': str, the username
        - 'id': int, the user ID
        - 'type': str, usually 'User'

        If a page request returns a non-200 status, the members collected so
        far are returned. If an exception is raised, an empty list is returned.

    Notes
    -----
    Results are cached for 24 hours to minimize API requests.
    Only complete listings are cached; a failed fetch is never written to the
    cache, so the next run retries it.
    Pagination is handled automatically (100 items per page).
    """
    cache_key = f"org_members_{org}"
    cache_ttl_seconds = 24 * 60 * 60

    # Try to get from cache with retry
    cached_data = cache.get(cache_key, retry=True)
    if cached_data is not None:
        if debug:
            print(f"[cyan]Cache hit for {org} members[/cyan]")
        return cached_data

    print(f"[yellow]Cache miss for {org} members - fetching from API[/yellow]")
    members = []

    try:
        for page in count(1):
            url = f"https://api.github.com/orgs/{org}/members?page={page}&per_page=100"
            async with session.get(url, headers=headers) as response:
                if response.status != 200:
                    print(
                        f"[red]Error fetching members for {org}: {response.status}[/red]"
                    )
                    # Return without caching: a partial or empty listing would
                    # otherwise be served as a cache hit on later runs.
                    return members

                page_members = await response.json()
                if not page_members:
                    # An empty page marks the end of the listing.
                    break

                members.extend(page_members)

        # Cache the complete listing with the documented 24-hour lifetime.
        cache.set(cache_key, members, expire=cache_ttl_seconds, retry=True)
        print(f"[green]Cached {len(members)} members for {org}[/green]")
        return members

    except Exception as e:
        print(f"[red]Error fetching members for {org}: {str(e)}[/red]")
        return []
149+
150+
151+
async def get_user_activity(
    session: aiohttp.ClientSession, username: str, debug: bool
) -> Optional[datetime]:
    """Fetch the last public activity date for a GitHub user.

    Parameters
    ----------
    session : aiohttp.ClientSession
        The HTTP session to use for requests
    username : str
        The GitHub login to look up
    debug : bool
        When true, print cache and request progress messages.

    Returns
    -------
    Optional[datetime]
        The ``created_at`` timestamp of the first event returned by the
        user's public events endpoint, or ``None`` when the endpoint returns
        no events, answers with a non-200 status, or the request raises.

    Notes
    -----
    Both outcomes of a successful request are cached: the timestamp, or
    ``None`` for "no events". Entries are written without an expiry.
    Failed requests are not cached.
    """
    cache_key = f"user_activity_{username}"

    # A private sentinel separates "key absent" from a cached ``None``, which
    # records that the user had no public events when last checked.
    missing = object()
    cached_data = cache.get(cache_key, default=missing)
    if cached_data is None or isinstance(cached_data, datetime):
        if debug:
            print(f"[cyan]Cache hit for {username} activity[/cyan]")
        return cached_data
    # Anything else (absent key or an unexpected stored type) is treated as a miss.
    if debug:
        print(
            f"[yellow]Cache miss for {username} activity - fetching from API[/yellow]"
        )

    try:
        if debug:
            print(f"[blue]Getting activity for {username}[/blue]")
        url = f"https://api.github.com/users/{username}/events/public"
        async with session.get(url, headers=headers) as response:
            if response.status != 200:
                print(
                    f"[red]Error fetching activity for {username}: {response.status}[/red]"
                )
                return None

            if debug:
                print(f"Got activity for {username}")
            events = await response.json()
            if not events:
                if debug:
                    print(f"[yellow]No activity found for {username}[/yellow]")
                cache[cache_key] = None
                return None

            # fromisoformat needs an explicit offset, so map the "Z" suffix to +00:00.
            last_activity = datetime.fromisoformat(
                events[0]["created_at"].replace("Z", "+00:00")
            )
            cache[cache_key] = last_activity
            if debug:
                print(f"[green]Cached activity for {username}[/green]")
            return last_activity
    except Exception as e:
        print(f"[red]Error fetching activity for {username}: {str(e)}[/red]")

    return None
203+
204+
205+
def get_cache_size(cache_dir: Optional[str] = None) -> str:
    """Get the size of a cache directory in a human-readable format.

    Parameters
    ----------
    cache_dir : Optional[str]
        Directory to measure. Defaults to the module-level ``CACHE_DIR``.

    Returns
    -------
    str
        The summed size of all regular files under the directory, formatted
        as ``"<megabytes> MB"`` with one decimal place, or ``"unknown size"``
        when the directory does not exist or cannot be read.
    """
    cache_path = pathlib.Path(CACHE_DIR if cache_dir is None else cache_dir)
    try:
        if cache_path.exists():
            total_size = sum(
                f.stat().st_size for f in cache_path.rglob("*") if f.is_file()
            )
            return f"{total_size / 1024 / 1024:.1f} MB"
    except OSError:
        # Best effort: the size is informational only, so an unreadable or
        # concurrently changing directory falls through to "unknown size".
        pass
    return "unknown size"
217+
218+
219+
def clear_cache() -> None:
    """Clear the disk cache.

    Prints a success message, or an error message if clearing raises; the
    exception is not propagated.
    """
    try:
        cache.clear()
        print("[green]Cache cleared successfully[/green]")
    except Exception as e:
        print(f"[red]Error clearing cache: {str(e)}[/red]")
226+
227+
228+
async def main(orgs, debug: bool, timelimit_days: int):
    """Main execution function.

    Parameters
    ----------
    orgs
        Iterable of GitHub organization names to report on.
    debug : bool
        Forwarded to the fetch helpers to enable their progress messages.
    timelimit_days : int
        Members whose last activity is older than this many days (or who have
        no recorded activity) are listed as inactive.

    Notes
    -----
    Prints the cache status and the API rate limit, aborts when fewer than 10
    core requests remain, then prints one section per organization listing
    its inactive members followed by active/inactive counts.
    """
    # Show cache status
    print(f"[blue]Cache directory: {CACHE_DIR} (size: {get_cache_size()})[/blue]")
    print(f"[blue]Cache contains {len(cache)} items[/blue]")

    async with aiohttp.ClientSession() as session:
        # Check rate limit
        async with session.get(
            "https://api.github.com/rate_limit", headers=headers
        ) as response:
            if response.status == 200:
                rate_data = await response.json()
                core_limits = rate_data["resources"]["core"]
                remaining = core_limits["remaining"]
                # Naive local time on purpose: naturaltime is given a naive
                # datetime here, matching the naive "now" it compares against.
                reset_time = datetime.fromtimestamp(core_limits["reset"])
                reset_in = humanize.naturaltime(reset_time)
                print(f"Rate limit remaining: {remaining}")
                print(f"Rate limit resets {reset_in}")
                if remaining < 100:
                    print(
                        f"[yellow]Warning: Low rate limit ({remaining} remaining)[/yellow]"
                    )
                if remaining < 10:
                    print("[red]Aborting due to very low rate limit[/red]")
                    return

        # Get all members from all orgs: login -> list of orgs they belong to
        all_members = {}
        for org in orgs:
            members = await get_org_members(session, org, debug)
            for member in members:
                all_members.setdefault(member["login"], []).append(org)

        # Get activity for each user; gather returns results in input order
        usernames = list(all_members)
        results = await asyncio.gather(
            *(get_user_activity(session, username, debug) for username in usernames)
        )

    user_activities = [
        (username, last_activity, all_members[username])
        for username, last_activity in zip(usernames, results)
    ]

    # Sort once, most recent first; users without activity sort as the epoch.
    epoch = datetime.fromtimestamp(0, tz=timezone.utc)
    ranked_activities = sorted(
        user_activities,
        key=lambda item: (item[1] if item[1] is not None else epoch, item[0]),
        reverse=True,
    )

    # Use an aware "now" in UTC; stamping local wall-clock time with a UTC
    # tzinfo would shift the cutoff by the machine's UTC offset.
    cutoff = datetime.now(timezone.utc) - timedelta(days=timelimit_days)

    for org in orgs:
        print(f"[bold]{org}[/bold]")
        n_active = 0
        n_inactive = 0
        for username, last_activity, user_orgs in ranked_activities:
            if org not in user_orgs:
                continue
            if last_activity is not None and last_activity > cutoff:
                n_active += 1
                continue
            n_inactive += 1
            last_activity_ago = (
                humanize.naturaltime(datetime.now(last_activity.tzinfo) - last_activity)
                if last_activity
                else "[red]never[/red]"
            )
            print(
                f" {username:<20}: Last activity {last_activity_ago}"
            )
        print(f" Found [red]{n_inactive} inactive[/red] and [green]{n_active} active[/green] users in {org} with last activity more recent than {timelimit_days} days.")
312+
313+
314+
if __name__ == "__main__":
    # Command-line entry point: parse options, optionally wipe the cache, then
    # run the asynchronous report.
    parser = argparse.ArgumentParser(description="GitHub Organization Activity Tracker")
    parser.add_argument(
        "--clear-cache", action="store_true", help="Clear the cache before running"
    )
    parser.add_argument("--debug", action="store_true", help="Show debug information")

    parser.add_argument(
        "--timelimit-days",
        type=int,
        default=365,
        help="Maximum number of days since last activity before an account is marked as inactive. (default: 365)",
    )
    parser.add_argument(
        "--orgs",
        nargs="+",
        default=default_orgs,
        help="GitHub organizations to track (default: all)",
    )
    args = parser.parse_args()

    if args.clear_cache:
        clear_cache()

    asyncio.run(main(args.orgs, args.debug, args.timelimit_days))

0 commit comments

Comments
 (0)