22from utils import sanitize , line_rules , download
33from urllib import parse
44import time
5+ import os
56
# Output / cache locations for the Italian Wikisource export scraper.
OUTFILE = "output/wikisource.txt"
PARSING = './parsing/wikisource/'
# Per-book download cache directory. makedirs (vs. mkdir) also creates the
# missing './parsing' parent, and exist_ok=True removes the isdir/mkdir race.
os.makedirs(PARSING, exist_ok=True)
DISCARD_FILE = 'output/discarded/wikisource.json'
# Wikisource export tool; the percent-encoded book title is appended to this.
DOWNLOAD_LINK = 'https://wsexport.wmflabs.org/tool/book.php?lang=it&format=txt&page='
@@ -33,8 +37,17 @@ def process_line(line, out_file):
3337
3438
def process_book(book, out_file):
    """Download (or reuse a cached copy of) one Wikisource book and clean it.

    book: page title of the book on it.wikisource.
    out_file: open handle the cleaned lines are written to (used further
    down in this function, past the visible portion of this diff).
    """
    # Cache filename: slashes in a page title would otherwise create subdirs.
    book_file = PARSING + book.replace('/', '-') + '.txt'
    # Percent-encode the title so the request URL contains only ASCII chars.
    book = parse.quote(book)
    if not os.path.isfile(book_file):
        print(" Downloading in progress")
        # Throttle so we are not banned from the Wikisource export servers
        # for an excess of requests.
        time.sleep(5)
        raw_text = download_me.download_page(DOWNLOAD_LINK + book)
        # 'with' guarantees the cache file is flushed and closed
        # (the original leaked the open handle).
        with open(book_file, "w", encoding='utf-8') as cache:
            cache.write(raw_text)
    else:
        print(" Already downloaded in " + book_file)
        # Read back with the same encoding we wrote; the original relied on
        # the platform default, which breaks on non-UTF-8 locales.
        with open(book_file, 'r', encoding='utf-8') as cache:
            raw_text = cache.read()
    raw_text = clean_me.maybe_normalize(raw_text)
    raw_text = clean_me.prepare_splitlines(raw_text).splitlines()
    tot_lines = 0
@@ -52,9 +65,12 @@ def main():
5265
5366 tot_lines = 0
5467 for count , book in enumerate (books ):
55- time .sleep (5 ) # to avoid being banned from wikipedia servers for excess in requests
56- print (" Processing book : {}\n {} of {}" .format (book , count , len (books )))
57- tot_lines += process_book (book , result )
68+ print (" Processing book: {}\n {} of {}" .format (book , count , len (books )))
69+ try :
70+ tot_lines += process_book (book , result )
71+ except :
72+ # if fails try again
73+ tot_lines += process_book (book , result )
5874
5975 result .close ()
6076
0 commit comments