gsoc-scraper/scraper_v1.py at main · tashi21/gsoc-scraper · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
"""
Scrapes the GSoC archive of organizations for the years 2016 to 2021 using Selenium WebDriver.
Collects each organization's name, GSoC link, logo URL, and short description for each year.
"""
import json
import math

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def get_organizations(orgs: list, driver: WebDriver) -> None:
    """
    Append the organization elements of the currently loaded page to ``orgs``.

    Waits (with a 20 second timeout) for elements matching the organization
    grid-item selector to be present, then extends ``orgs`` in place with
    every match. Nothing is returned.
    """
    # CSS locator for one organization card inside the results grid
    card_locator = (
        By.CSS_SELECTOR,
        ".grid__row.no-gap.ng-star-inserted div.grid__row__item",
    )

    wait = WebDriverWait(driver, 20)
    cards = wait.until(EC.presence_of_all_elements_located(card_locator))
    orgs.extend(cards)


def organize(data: dict, orgs: list, year: str, start: int = 0) -> None:
    """
    Extract the details of each organization element and store them by year.

    Elements of ``orgs`` before index ``start`` are skipped. For every
    remaining element the link, name, image url and short description are
    read, and a ``{name: {...}}`` entry is appended to ``data[year]``.
    """
    for card in orgs[start:]:
        # link: href of the first anchor inside the card
        anchor = card.find_element(by=By.TAG_NAME, value="a")
        link = anchor.get_attribute("href")

        # name: text of the element carrying the "name" class
        name = card.find_element(by=By.CLASS_NAME, value="name").text

        # image url: src of the first img inside the card
        image = card.find_element(by=By.TAG_NAME, value="img")
        img_url = image.get_attribute("src")

        # description: text of the "short-description" element
        blurb = card.find_element(
            by=By.CLASS_NAME,
            value="short-description",
        )
        short_description = blurb.text

        # one single-key dictionary per organization, keyed by its name
        details = {
            "link": link,
            "img_url": img_url,
            "short_description": short_description,
        }
        data[year].append({name: details})


def main() -> None:
    """
    Scrape the GSoC organization archive for 2016-2021 into ``data.json``.

    Starts a Chrome WebDriver, visits the archive page of every year from
    2016 to 2021, pages through the organization list and collects the
    details of each organization via ``get_organizations`` and ``organize``.
    The result is written to ``data.json`` as UTF-8 encoded JSON.

    The browser is always shut down, even when scraping or writing fails.
    """
    # the page count and the per-page offset below both assume this many
    # organizations are shown per page
    page_size = 50

    # one list per year from 2009 to 2021; only 2016-2021 are filled below,
    # the earlier years stay empty in the output
    data = {str(y): [] for y in range(2009, 2022)}

    driver = webdriver.Chrome(service=Service("chromedriver.exe"))

    # everything that uses the driver sits inside the try so that an error
    # while scraping does not leave the browser process running
    try:
        driver.maximize_window()

        for year in range(2016, 2022):
            # fresh list of organization elements for each year
            orgs = []

            # open the list of organizations for the current year
            driver.get(
                f"https://summerofcode.withgoogle.com/archive/{year}/organizations")

            # total number of organizations: last token of the paginator
            # range label
            num_orgs = int(WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((
                    By.CSS_SELECTOR,
                    "div.mat-paginator-range-label"
                ))
            ).text.split()[-1])

            # collect the first page
            get_organizations(orgs, driver)
            organize(data, orgs, str(year))

            # one click of the next-page button per remaining page
            for i in range(math.ceil(num_orgs / page_size) - 1):
                button = driver.find_element(
                    by=By.CSS_SELECTOR,
                    value="button[aria-label='Next page']")

                # click through javascript rather than a native click
                driver.execute_script("arguments[0].click();", button)

                # NOTE(review): nothing here waits for the previous page's
                # elements to be replaced before collecting again -- confirm
                # the page swap cannot race with this lookup.
                get_organizations(orgs, driver)

                # skip the elements that earlier pages already contributed
                organize(data, orgs, str(year), start=page_size * (i + 1))

        # explicit UTF-8: ensure_ascii=False writes non-ASCII characters
        # verbatim, which can fail under a platform default encoding
        with open("data.json", "w", encoding="utf-8") as f:
            json.dump(data, f, indent=4, ensure_ascii=False)

    finally:
        driver.quit()


# run the scraper only when executed as a script, not when imported
if __name__ == "__main__":
    main()