Skip to content

Commit ecbeb04

Browse files
committed
Create create_dataset_helper.py
1 parent 4c00dbc commit ecbeb04

File tree

1 file changed

+48
-0
lines changed

1 file changed

+48
-0
lines changed

tests/create_dataset_helper.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
import pandas as pd
2+
import requests
3+
4+
5+
def fetch_repositories(topic, max_repos=100):
    """Search GitHub for repositories matching *topic* and return their URLs.

    Pages through the GitHub search API (30 results per page, most-starred
    first) until *max_repos* repositories are collected or the API runs out
    of results.

    Args:
        topic: Free-text search query for the GitHub repository search.
        max_repos: Upper bound on the number of URLs returned.

    Returns:
        A list of at most *max_repos* repository ``html_url`` strings;
        may be shorter (or empty) if fewer results exist or a request fails.
    """
    # Pass the query through `params` so requests URL-encodes it; the
    # original f-string URL left spaces/special characters in *topic*
    # unencoded.
    url = 'https://api.github.com/search/repositories'
    base_params = {'q': topic, 'sort': 'stars', 'order': 'desc'}
    repositories = []
    page = 1

    while len(repositories) < max_repos:
        response = requests.get(
            url,
            params={**base_params, 'page': page, 'per_page': 30},
            timeout=30,  # don't hang forever on a stalled connection
        )
        if response.status_code != 200:
            print(f"Error fetching data: {response.status_code}")
            break

        data = response.json()
        items = data.get('items', [])
        repositories.extend(items)

        # Stop once the API returns no further results.
        if not items:
            break

        page += 1

    return [repo['html_url'] for repo in repositories[:max_repos]]
25+
26+
27+
def save_to_excel(urls, filename='repositories.xlsx'):
    """Write *urls* as a single 'Repository URL' column to an Excel file."""
    frame = pd.DataFrame({'Repository URL': list(urls)})
    frame.to_excel(filename, index=False)
30+
31+
32+
def main(topic="machine learning", max_repos=100):
    """Fetch repository URLs for *topic* and save them to an Excel file.

    Args:
        topic: GitHub search query to fetch repositories for.
        max_repos: Maximum number of repository URLs to collect.
    """
    print(f"Fetching repositories for topic: {topic}")
    urls = fetch_repositories(topic, max_repos)

    if urls:
        save_to_excel(urls)
        print(f"Saved {len(urls)} repository URLs to 'repositories.xlsx'")
    else:
        print("No repositories found.")
45+
46+
47+
# Run the fetch/save pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)