Skip to content
This repository was archived by the owner on Mar 8, 2023. It is now read-only.

Commit e66cae3

Browse files
committed
fix #85
1 parent e258f98 commit e66cae3

File tree

2 files changed

+21
-5
lines changed

2 files changed

+21
-5
lines changed

MITADS/merge_txt.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
i=0
33
parsing=''
44
loop=0
5-
for f in ./corpus_api.py ./eulogos_chat_importer.py ./ananas_exporter.py ./tg_ita_exporter.py ./ted_importer.py ./gutenberg_exporter.py ./wikiquote_exporter.py ./wikisource_importer.py ./opensubtitles_exporter.py
5+
for f in ./wikisource_importer.py ./opensubtitles_exporter.py ./corpus_api.py ./eulogos_chat_importer.py ./ananas_exporter.py ./tg_ita_exporter.py ./ted_importer.py ./gutenberg_exporter.py ./wikiquote_exporter.py
66
do
77
echo "========="
88
echo $f

MITADS/wikisource_importer.py

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,12 @@
22
from utils import sanitize, line_rules, download
33
from urllib import parse
44
import time
5+
import os
56

67
OUTFILE = "output/wikisource.txt"
8+
PARSING = './parsing/wikisource/'
9+
if not os.path.isdir(PARSING):
10+
os.mkdir(PARSING)
711
DISCARD_FILE = 'output/discarded/wikisource.json'
812
DOWNLOAD_LINK = 'https://wsexport.wmflabs.org/tool/book.php?lang=it&format=txt&page='
913

@@ -33,8 +37,17 @@ def process_line(line, out_file):
3337

3438

3539
def process_book(book, out_file):
40+
book_file = PARSING + book.replace('/','-') + '.txt'
3641
book = parse.quote(book) # need to html encode book title to avoid non ascii chars
37-
raw_text = download_me.download_page(DOWNLOAD_LINK + book)
42+
if not os.path.isfile(book_file):
43+
print(" Downloading in progress")
44+
time.sleep(5) # to avoid being banned from wikipedia servers for excess in requests
45+
raw_text = download_me.download_page(DOWNLOAD_LINK + book)
46+
result = open(book_file, "w", encoding='utf-8')
47+
result.write(raw_text)
48+
else:
49+
print(" Already downloaded in " + book_file)
50+
raw_text = open(book_file, 'r').read()
3851
raw_text = clean_me.maybe_normalize(raw_text)
3952
raw_text = clean_me.prepare_splitlines(raw_text).splitlines()
4053
tot_lines = 0
@@ -52,9 +65,12 @@ def main():
5265

5366
tot_lines = 0
5467
for count, book in enumerate(books):
55-
time.sleep(5) # to avoid being banned from wikipedia servers for excess in requests
56-
print(" Processing book : {}\n {} of {}".format(book, count, len(books)))
57-
tot_lines += process_book(book, result)
68+
print(" Processing book: {}\n {} of {}".format(book, count, len(books)))
69+
try:
70+
tot_lines += process_book(book, result)
71+
except:
72+
# if fails try again
73+
tot_lines += process_book(book, result)
5874

5975
result.close()
6076

0 commit comments

Comments
 (0)