create_source_data_lists.py
# Creates the source-data directory with lists of links to the Google Books
# ngram source data. Run from the repository root directory
# 'google-books-ngram-frequency'.
import os
import re

import httplib2
import wget
from bs4 import BeautifulSoup, SoupStrainer
# Map from language name to the language code used in the Google Books urls.
langcode = {"english": "eng", "english-us": "eng-us", "english-gb": "eng-gb",
            "english-fiction": "eng-fiction", "chinese_simplified": "chi_sim",
            "french": "fre", "german": "ger", "hebrew": "heb",
            "italian": "ita", "russian": "rus", "spanish": "spa"}
# Largest ngram order to list (1-grams through 5-grams).
nmax = 5
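# For example (illustrative url, assembled exactly as in the function below),
# the export page listing all English 1-gram files is:
#   http://storage.googleapis.com/books/ngrams/books/20200217/eng/eng-1-ngrams_exports.html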
def create_source_data_lists(langcode):
    """Create the directories "source-data/data_googlebooks-*" and fill them
    with files holding the urls of each gz file. For 1-grams, the totalcounts
    file is also downloaded. Only needs to be run if Google changes the urls.
    """
    for n in range(1, nmax + 1):
        for langc in langcode.values():
            outdir = f"source-data/data_googlebooks-{langc}-20200217/"
            if not os.path.exists(outdir):
                os.makedirs(outdir)
            # Fetch the export page listing all gz files for this language
            # and ngram order. httplib2 returns (response, content); only
            # the content is needed here.
            http = httplib2.Http()
            response, content = http.request(
                "http://storage.googleapis.com/books/ngrams/books/20200217/"
                + f"{langc}/{langc}-{n}-ngrams_exports.html")
            urls = []
            for link in BeautifulSoup(content, "html.parser",
                                      parse_only=SoupStrainer('a')):
                if link.has_attr('href'):
                    if re.match(r".*gz$", link['href']):
                        urls.append(link['href'])
                    elif n == 1 and re.match(r".*totalcounts-1$",
                                             link['href']):
                        # The totalcounts file is small, so download it
                        # directly instead of just recording its url.
                        filename = outdir + "totalcounts_1.txt"
                        if os.path.exists(filename):
                            os.remove(filename)
                        wget.download(link['href'], filename)
            with open(outdir + f"filelinklist_{n}grams.txt", 'w') as f:
                for url in urls:
                    f.write(f"{url}\n")


if __name__ == '__main__':
    create_source_data_lists(langcode)
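
# --- Usage sketch (not part of the original script) ---
# Run from the repository root:
#   python create_source_data_lists.py
# Each source-data/data_googlebooks-<langcode>-20200217/ directory then holds
# one filelinklist_{n}grams.txt per n, with one gz url per line. A minimal,
# hypothetical way to download the listed English 1-gram files afterwards:
#
#   import wget
#   with open("source-data/data_googlebooks-eng-20200217/"
#             "filelinklist_1grams.txt") as f:
#       for url in f:
#           wget.download(url.strip())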