-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathwikiFilter.py
More file actions
executable file
·104 lines (95 loc) · 3.91 KB
/
wikiFilter.py
File metadata and controls
executable file
·104 lines (95 loc) · 3.91 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#!/usr/bin/python3
import os
import bz2
import argparse
def split_xml(filename, splitsize, dir, tags, template, keywords):
    """Filter a bz2-compressed MediaWiki XML dump and split the matches into chunks.

    Everything up to and including the line that contains ``</siteinfo>`` is
    treated as the dump header and copied to the start of every chunk. The
    remaining lines are grouped into pages (from a line containing ``<page``
    to a line containing ``</page>``). A page is kept when any of its lines
    contains the closing form ``</name>`` of one of ``tags``, one of
    ``keywords``, or (when ``template`` is true) ``<ns>10</ns>`` or
    ``<ns>828</ns>``. Kept pages are written to ``chunk-<n>.xml.bz2`` files
    inside ``dir`` (numbered from 1), each terminated with ``</mediawiki>``.

    Lines are matched and copied as raw bytes, so the input is never decoded;
    the search patterns are encoded as UTF-8. The running total of kept pages
    is printed to stdout as progress output.

    NOTE(review): MediaWiki dumps usually escape markup inside page text
    (``&lt;/math&gt;``); confirm that matching the literal ``</name>`` form is
    what the input data requires.

    Args:
        filename: Path of the bz2-compressed XML dump to read.
        splitsize: Number of kept pages per chunk. A page is counted once,
            however many of its lines match. Values below 1 behave like 1.
        dir: Output directory; created (including parents) if missing.
        tags: Comma-separated tag names; empty entries are ignored.
        template: When true, also keep pages in namespace 10 or 828.
        keywords: Comma-separated keywords, or an empty string / None for
            none; empty entries are ignored.

    Raises:
        OSError: If the input cannot be read or a chunk cannot be written.
    """
    os.makedirs(dir, exist_ok=True)

    def chunkname(index):
        return os.path.join(dir, "chunk-" + str(index) + ".xml.bz2")

    # Build every search pattern once, as bytes, so the per-line test is a
    # plain substring check on the undecoded line. Empty entries are dropped:
    # an empty keyword would otherwise match every line of every page.
    patterns = [('</' + tag.strip() + '>').encode('utf-8')
                for tag in (tags or '').split(',') if tag.strip()]
    patterns += [word.encode('utf-8')
                 for word in (keywords or '').split(',') if word]
    if template:
        patterns += [b'<ns>10</ns>', b'<ns>828</ns>']

    footer = b"</mediawiki>"
    pagecount = 0   # kept pages written to the current chunk
    filecount = 1   # number of the current chunk file
    total = 0       # kept pages written overall (progress output only)

    # The input is opened before the first chunk is created so that a missing
    # or unreadable dump does not leave a stray chunk file behind.
    with bz2.BZ2File(filename) as bzfile:
        header_lines = []
        for line in bzfile:
            header_lines.append(line)
            if b'</siteinfo>' in line:
                break
        header = b''.join(header_lines)

        chunkfile = bz2.BZ2File(chunkname(filecount), 'w')
        try:
            chunkfile.write(header)
            page = []
            matched = False
            for line in bzfile:
                if b'<page' in line:
                    # A new page starts: discard whatever was buffered.
                    page = []
                    matched = False
                page.append(line)
                if not matched and any(p in line for p in patterns):
                    matched = True
                if b'</page>' not in line:
                    continue

                if matched:
                    chunkfile.write(b''.join(page))
                    pagecount += 1
                    total += 1
                    print(total)
                    if pagecount >= splitsize:
                        # Current chunk is full: finish it and start the next.
                        chunkfile.write(footer)
                        chunkfile.close()
                        pagecount = 0
                        filecount += 1
                        chunkfile = bz2.BZ2File(chunkname(filecount), 'w')
                        chunkfile.write(header)
                page = []
                matched = False

            # Finish the last chunk. Failures here are reported rather than
            # raised, as all pages have already been handed to the writer.
            try:
                if pagecount > 0:
                    chunkfile.write(footer)
                    chunkfile.close()
                else:
                    # The last chunk holds nothing but the header; drop it.
                    chunkfile.close()
                    os.remove(chunkname(filecount))
            except OSError as e:
                print('Error:', e)
        finally:
            # No-op when already closed; releases the handle on early errors.
            chunkfile.close()
if __name__ == '__main__':
    # Script entry point: read the command-line options and run the filter.
    cli = argparse.ArgumentParser(
        description='Extract wikipages that contain the math tag',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    # One (flags, settings) pair per option, registered in display order.
    option_table = (
        (('-f', '--filename'), {
            'help': 'The bz2-file to be split and filtered',
            'default': 'enwiki-latest-pages-articles.xml.bz2',
            'dest': 'file',
        }),
        (('-s', '--splitsize'), {
            'help': 'The number of pages contained in each split',
            'default': 1000000,
            'type': int,
            'dest': 'size',
        }),
        (('-d', '--outputdir'), {
            'help': 'The directory name where the files go',
            'default': 'wout',
            'type': str,
            'dest': 'dir',
        }),
        (('-t', '--tagname'), {
            'help': 'Comma separated list of the tag names to search for',
            'default': 'math,ce,chem',
            'type': str,
            'dest': 'tags',
        }),
        (('-k', '--keyword'), {
            'help': 'Comma separated list of the keywords to search for',
            'default': '',
            'type': str,
            'dest': 'keywords',
        }),
        (("-v", "--verbosity"), {
            'action': "count",
            'default': 0,
        }),
        (('-T', '--template'), {
            'help': 'Include all templates',
            'action': "store_true",
            'dest': 'template',
        }),
    )
    for flags, settings in option_table:
        cli.add_argument(*flags, **settings)
    options = cli.parse_args()
    split_xml(
        options.file,
        options.size,
        options.dir,
        options.tags,
        options.template,
        options.keywords,
    )