Skip to content

Commit ed12eb6

Browse files
committed
feat: [AB#17452] add script to find and report duplicate industries based on task sets
1 parent 3e9bb4d commit ed12eb6

File tree

1 file changed

+117
-0
lines changed

1 file changed

+117
-0
lines changed
Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
#!/usr/bin/env python3
2+
3+
import json
4+
import sys
5+
from collections import defaultdict
6+
from pathlib import Path
7+
8+
# Location of the industry definition JSON files, resolved relative to this
# script (assumes the script lives one directory below the repo root, as a
# sibling of src/ — TODO confirm if the script is moved).
INDUSTRIES_DIR = Path(__file__).parent.parent / "src" / "roadmaps" / "industries"
9+
10+
11+
def load_industry(file_path):
    """Read one industry JSON file and collect its task identifiers.

    Both the "task" and "licenseTask" fields of every roadmap step are
    gathered; empty or missing values are skipped.

    Args:
        file_path: Path to the industry JSON file.

    Returns:
        A tuple of (name, id, frozenset of task IDs).
    """
    with open(file_path, encoding="utf-8") as f:
        data = json.load(f)

    # Pull every non-empty task/licenseTask value out of the roadmap steps.
    collected = {
        value
        for step in data.get("roadmapSteps", [])
        for value in (step.get("task", ""), step.get("licenseTask", ""))
        if value
    }

    # Fall back to the file stem / the id when fields are absent.
    industry_id = data.get("id", file_path.stem)
    return data.get("name", industry_id), industry_id, frozenset(collected)
35+
36+
37+
def load_all_industries(directory):
    """Load all industry JSON files from a directory.

    Files that cannot be read or parsed are skipped with a warning on
    stderr, so one bad file does not abort the whole report.

    Args:
        directory: Path to the industries directory.

    Returns:
        A list of (name, id, frozenset of task IDs) tuples, ordered by
        file name.
    """
    industries = []
    for file_path in sorted(directory.glob("*.json")):
        try:
            industries.append(load_industry(file_path))
        # OSError / UnicodeDecodeError cover unreadable or mis-encoded
        # files, which the previous clause let escape and crash the run.
        # KeyError is retained for backward compatibility even though
        # load_industry only uses .get() and should not raise it.
        except (json.JSONDecodeError, OSError, UnicodeDecodeError, KeyError) as e:
            print(f"Warning: skipping {file_path.name}: {e}", file=sys.stderr)
    return industries
53+
54+
55+
def find_duplicate_groups(industries):
    """Group industries that share an identical set of task IDs.

    Args:
        industries: List of (name, id, task_set) tuples.

    Returns:
        A list of (task_set, [(name, id), ...]) for groups with 2+ members,
        sorted by group size descending.
    """
    by_task_set = {}
    for name, industry_id, task_set in industries:
        by_task_set.setdefault(task_set, []).append((name, industry_id))

    # Only task sets shared by at least two industries are duplicates.
    duplicates = [entry for entry in by_task_set.items() if len(entry[1]) > 1]
    return sorted(duplicates, key=lambda entry: len(entry[1]), reverse=True)
76+
77+
78+
def print_report(groups, total):
    """Print a human-readable summary of duplicate industry groups.

    Args:
        groups: List of (task_set, members) tuples from find_duplicate_groups.
        total: Total number of industry files loaded.
    """
    if not groups:
        print("No duplicate industry groups found.")
        return

    involved = sum(len(members) for _, members in groups)
    for line in (
        f"\nFound {len(groups)} group(s) of industries with identical task sets",
        f"({involved} industries involved out of {total} total)",
        "=" * 70,
    ):
        print(line)

    for index, (task_set, members) in enumerate(groups, start=1):
        print(f"\nGroup {index} ({len(members)} industries, {len(task_set)} tasks):")
        print("  Industries:")
        # Sort by display name only; ties keep their original order.
        for name, industry_id in sorted(members, key=lambda m: m[0]):
            print(f"    - {name} ({industry_id})")
        print("  Shared tasks:")
        for task_id in sorted(task_set):
            print(f"    - {task_id}")
102+
103+
104+
def main():
    """Locate the industries directory, analyze it, and report duplicates."""
    if not INDUSTRIES_DIR.is_dir():
        print(f"Error: directory not found: {INDUSTRIES_DIR}", file=sys.stderr)
        sys.exit(1)

    loaded = load_all_industries(INDUSTRIES_DIR)
    count = len(loaded)
    print(f"Loaded {count} industry files from {INDUSTRIES_DIR.name}/")
    print_report(find_duplicate_groups(loaded), count)
114+
115+
116+
# Script entry point: run only when executed directly, not when imported.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)