wikipedia-ga-fa-extraction/scripts/wikipedia_ga_fa_data_extraction.py at main · UCREL/wikipedia-ga-fa-extraction · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
import os
from collections import defaultdict
import json
from pathlib import Path
from typing import Annotated, Any
from itertools import product
from collections import Counter

import yaml
import typer
import mariadb
from dotenv import load_dotenv


def load_yaml(file_path: Path) -> dict[str, Any]:
    with file_path.open("r", encoding="utf-8") as f:
        return yaml.safe_load(f)

def get_language_meta_data(file_path: Path, language_code: str) -> dict[str, Any]:
    for language in load_yaml(file_path)["languages"]:
        if language["wikipedia_code"] == language_code:
            assert isinstance(language, dict)
            return language
    raise ValueError(f"Language {language_code} not found language meta data file {file_path}")

def create_index_if_not_exists(cursor, database, table, index_name, columns):
    """
    Creates an index only if it doesn't already exist.

    columns: list of column names e.g. ["customer_id"] or ["status", "country"]
    """
    cursor.execute("""
        SELECT COUNT(*)
        FROM information_schema.STATISTICS
        WHERE table_schema = ?
          AND table_name = ?
          AND index_name = ?
    """, (database, table, index_name))

    row = cursor.fetchone()
    if row is not None and row[0] > 0:
        print(f"Index '{index_name}' already exists on '{table}', skipping.")
        return

    columns_str = ", ".join(columns)
    cursor.execute(f"CREATE INDEX {index_name} ON {table} ({columns_str})")
    print(f"Index '{index_name}' created on '{table}' ({columns_str}).")


def main(env_file: Annotated[Path, typer.Argument(help="Path to a .env file that contains the MariaDB connection details. This should contain two environment variables")],
         port: Annotated[int, typer.Argument(help="Port that MariaDB is running on for the given language")],
         output_file: Annotated[Path, typer.Argument(help="File to write the output to. This file will be in JSONL format whereby in will contain per line `page_id`, `page_title`, `ga`, and `fa` keys.")],
         language_file: Annotated[Path, typer.Argument(help="YAML file that contains language code meta data, specifically the language has to have either `fa_article_names` or `ga_article_names` keys.")],
         language: Annotated[str, typer.Argument(help="Wikipedia language code of the data in the MariaDB database.")],
         exploring: Annotated[bool, typer.Option("-e", "--exploring", help="Whether we are exploring the data.")] = False
         ):
    """
    For the given language, extract the data from the MariaDB database and write it to the output file.

    The data extracted will be in JSONL format whereby each line will contain:

    {
        "page_id": int,
        "page_title": str,
        "ga": bool,
        "fa": bool
    }
    whereby `ga` will be True if the Wikipedia page is a Good Article, and `fa` will be True if the Wikipedia page is a Featured Article.

    The `page_title` will be truncated to 255 characters
    (this is stated within Wikipedia's documentation: https://www.mediawiki.org/wiki/Manual:Page_table)
    and the `page_id` will be the Wikipedia page ID.
    """

    language_meta_data = get_language_meta_data(language_file, language)
    fa_article_names = []
    if "fa_article_names" in language_meta_data:
        fa_article_names = list(product(["featured"], language_meta_data["fa_article_names"]))
    ga_article_names = []
    if "ga_article_names" in language_meta_data:
        ga_article_names = list(product(["good"], language_meta_data["ga_article_names"]))
    all_article_names = fa_article_names + ga_article_names
    if all_article_names == []:
        raise ValueError(f"The language meta data file: {language_file} does not "
                         "contain either a `fa_article_names` or `ga_article_names` "
                         f"key for the given language: {language}")

    assert load_dotenv(env_file)
    database = os.getenv("MARIADB_DATABASE")
    connection_kwargs = {
        "user": "root",
        "password": os.getenv("MARIADB_ROOT_PASSWORD"),
        "host": "127.0.0.1",
        "port": port,
        "database": database,
        #"binary": True
        #"character_encoding": 'utf8mb4' should be the default
    }
    article_counts = defaultdict(int)
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with output_file.open("w", encoding="utf-8") as fp:
        with mariadb.connect(**connection_kwargs) as conn:
            with conn.cursor() as cur:
                cur.execute("SHOW TABLES")
                table_values = defaultdict(list)
                for value in cur:
                    table_name = value[0]
                    print(f"value: {table_name}")
                    with conn.cursor() as internal_cur:
                        internal_cur.execute(f"DESCRIBE {value[0]}")
                        for internal_value in internal_cur:
                            print(f"internal_value: {internal_value}")
                            table_values[table_name].append(internal_value[0])
                print(table_values)

                create_index_if_not_exists(
                    cur,
                    database=database,
                    table="linktarget",
                    index_name="idx_linktarget_title",
                    columns=["lt_title"]
                )
                create_index_if_not_exists(
                    cur,
                    database=database,
                    table="linktarget",
                    index_name="idx_lt_namespace",
                    columns=["lt_namespace"]
                )
                create_index_if_not_exists(
                    cur,
                    database=database,
                    table="categorylinks",
                    index_name="idx_target_id",
                    columns=["cl_target_id"]
                )
                create_index_if_not_exists(
                    cur,
                    database=database,
                    table="categorylinks",
                    index_name="idx_cl_from",
                    columns=["cl_from"]
                )
                create_index_if_not_exists(
                    cur,
                    database=database,
                    table="page",
                    index_name="idx_page_namespace",
                    columns=["page_namespace"]
                )

                ga_article = False
                fa_article = False
                for article_type, article_name in all_article_names:
                    if article_type == "good":
                        ga_article = True
                        fa_article = False
                    elif article_type == "featured":
                        ga_article = False
                        fa_article = True

                    sql_statement = ("SELECT p.page_id, p.page_title, p.page_namespace, lt.lt_namespace, lt.lt_id FROM page p "
                                    "INNER JOIN categorylinks cl ON p.page_id = cl.cl_from "
                                    "INNER JOIN linktarget lt ON cl.cl_target_id = lt.lt_id "
                                    "WHERE lt.lt_title = ? AND lt.lt_namespace = ? AND p.page_namespace = ?")
                    if exploring:
                        sql_statement = ("SELECT lt.lt_title FROM linktarget lt "
                                         "INNER JOIN categorylinks cl ON cl.cl_target_id = lt.lt_id "
                                         "INNER JOIN page p ON cl.cl_from = p.page_id "
                                         "WHERE lt.lt_title LIKE ? AND lt.lt_namespace = ? AND p.page_namespace = ?")
                        cur.execute(sql_statement, (f"%{article_name}%", 14, 0))
                        print(f"Category names that are similar to {article_name}:")
                        wikipedia_article_names_count = Counter()
                        for value in cur:
                            wikipedia_article_names_count.update(([value[0].decode("utf-8")]))
                        for article_name_count, count in wikipedia_article_names_count.most_common(20):
                            print(f"* {article_name_count}: {count}")

                        sql_statement = ("SELECT COUNT(*) FROM page p "
                                         "INNER JOIN categorylinks cl ON p.page_id = cl.cl_from "
                                         "INNER JOIN linktarget lt ON cl.cl_target_id = lt.lt_id "
                                         "WHERE lt.lt_title LIKE ? AND lt.lt_namespace = ? AND p.page_namespace = ?")
                        cur.execute(sql_statement, (f"%{article_name}%", 14, 0))
                        for value in cur:
                            print("Number of Wikipedia pages found for Wikipedia "
                                  f"article like {article_name} and "
                                  f"for article type {article_type} (type): {value}")
                        continue

                    cur.execute(sql_statement, (article_name, 14, 0))
                    for value in cur:
                        page_id, page_title, _, _, _ = value
                        page_title_string = page_title.decode("utf-8").replace("_", " ")
                        page_data = {
                            "page_id": page_id,
                            "page_title": page_title_string,
                            "ga": ga_article,
                            "fa": fa_article
                        }
                        fp.write(json.dumps(page_data) + "\n")
                        article_counts[article_type] += 1

    print(f"Good articles count: {article_counts['good']}")
    print(f"Featured articles count: {article_counts['featured']}")


if __name__ == "__main__":
    typer.run(main)