# issue-classification/scrape.py at master · Audace/issue-classification · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import os
import csv
import time
import openpyxl
from github import Github

# Absolute directory containing this script (realpath resolves symlinks);
# search_issues() uses it to locate issues.xlsx next to the script.
dir_path = os.path.dirname(os.path.realpath(__file__))
# Shared PyGithub client used by search_repos() and search_issues().
# NOTE(review): "username"/"access-token" look like placeholders -- presumably
# they must be replaced with real credentials before running; confirm.
g = Github("username","access-token")

def search_repos():
    """Search GitHub for Python repositories and record them in repos.csv.

    Runs the query 'python+language:python' (sorted by stars, descending)
    through the module-level client ``g``, writes one row per result to
    'repos.csv' in the current working directory (the file is overwritten),
    and returns the same rows.

    Returns:
        A list of dicts, one per repository, keyed by: id, url, name, desc,
        lang, open_issues_count, stars, watching, forked, owner_un, owner_id.
        ``desc`` is the description reduced to ASCII text; it is an empty
        string when the repository has no description.
    """
    fieldnames = ['id', 'url', 'name', 'desc', 'lang', 'open_issues_count',
                  'stars', 'watching', 'forked', 'owner_un', 'owner_id']
    search_results = g.search_repositories('python+language:python', sort='stars', order='desc')
    repos = []
    with open('repos.csv', 'w') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for repo in search_results:
            # GitHub reports a missing description as None; treat it as empty
            # instead of crashing on None.encode(...).
            description = repo.description or ''
            repo_data = {
                "id": repo.id,
                "url": repo.url,
                "name": repo.full_name,
                # Drop non-ASCII characters, then decode back to text so the
                # CSV cell holds the description itself rather than the repr
                # of a bytes object.
                "desc": description.encode('ascii', 'ignore').decode('ascii'),
                "lang": repo.language,
                "open_issues_count": repo.open_issues_count,
                "stars": repo.stargazers_count,
                "watching": repo.watchers_count,
                "forked": repo.forks_count,
                "owner_un": repo.owner.login,
                "owner_id": repo.owner.id,
            }
            writer.writerow(repo_data)
            repos.append(repo_data)
    return repos


def search_issues(repos=None):
    """Store closed, labelled issues of every repo in repos.csv into issues.xlsx.

    Reads repository rows from 'repos.csv' (current working directory), and
    for each one searches GitHub for 'repo:<name> state:closed type:issue'
    through the module-level client ``g``. Issues without labels are skipped.
    Every kept issue becomes one row of 'Sheet1' in the existing workbook
    ``<dir_path>/issues.xlsx``, with columns:

        issue id, title, body, issue number, comment count, repo id,
        followed by one column per label name.

    Title, body and label names are reduced to ASCII text. Rows are written
    starting at row 1, overwriting whatever those cells already contain.
    The workbook is saved after each repository, including when an error
    interrupts that repository, so partial progress is kept.

    Args:
        repos: Accepted for interface compatibility but currently ignored;
            the repositories are always read from 'repos.csv'.
    """
    # Imported here so the block does not depend on a new top-of-file import;
    # openpyxl itself is already a dependency of this module.
    from openpyxl.utils.exceptions import IllegalCharacterError

    workbook_path = os.path.join(dir_path, 'issues.xlsx')
    with open('repos.csv') as csvfile:
        workbook = openpyxl.load_workbook(workbook_path)
        sheet = workbook['Sheet1']
        # Number of issues written so far; also the row index of the most
        # recently written issue.
        issues_stored = 0

        for repo in csv.DictReader(csvfile):
            query = 'repo:' + repo['name'] + ' state:closed type:issue'
            try:
                for issue in g.search_issues(query):
                    # rate_limiting[0] is the number of requests remaining.
                    # Use <= so the throttle still triggers if the counter
                    # jumps past exactly 5 between two checks.
                    if g.rate_limiting[0] <= 5:
                        time.sleep(1)

                    labels = [label.name.encode('ascii', 'ignore').decode('ascii')
                              for label in issue.labels]
                    if not labels:
                        continue

                    title = ''
                    body = ''
                    if issue.title is not None:
                        title = issue.title.encode('ascii', 'ignore').decode('ascii')
                    if issue.body is not None:
                        body = issue.body.encode('ascii', 'ignore').decode('ascii')

                    issue_data = [issue.id, title, body, issue.number,
                                  issue.comments, repo["id"]] + labels
                    issues_stored += 1
                    for col_index, value in enumerate(issue_data, start=1):
                        try:
                            sheet.cell(row=issues_stored, column=col_index, value=value)
                        except IllegalCharacterError:
                            # Best effort: text containing control characters
                            # that Excel cannot store leaves its cell empty
                            # rather than aborting the whole scrape.
                            continue
            finally:
                # One save per repository instead of one per issue: rewriting
                # the whole workbook for every row is quadratic in row count.
                workbook.save(workbook_path)