Commit 0cfde9e

Merge pull request #394 from nsapa/nico_fix_1
Nico's fixes (dumping wiki.dystify.com/CI fixes)
2 parents 9b1996d + 5986467 commit 0cfde9e

File tree: 2 files changed (+70 -29 lines)

dumpgenerator.py (+54 -13)
@@ -299,16 +299,22 @@ def getPageTitlesScraper(config={}, session=None):
         else:
             pass  # perhaps no subpages

-        # 3 is the current deep of English Wikipedia for Special:Allpages
-        deep = 3
+        # Should be enough subpages on Special:Allpages
+        deep = 50
         c = 0
+        oldfr = ''
         checked_suballpages = []
         rawacum = raw
         while r_suballpages and re.search(r_suballpages, raw) and c < deep:
             # load sub-Allpages
             m = re.compile(r_suballpages).finditer(raw)
             for i in m:
                 fr = i.group('from')
+                currfr = fr
+
+                if oldfr == currfr:
+                    # We are looping, exit the loop
+                    pass

                 if r_suballpages == r_suballpages1:
                     to = i.group('to')
@@ -329,19 +335,23 @@ def getPageTitlesScraper(config={}, session=None):
                     url = '%s?title=Special:Allpages&from=%s&namespace=%s' % (
                         config['index'], name, namespace)

+
+
                 if name not in checked_suballpages:
                     # to avoid reload dupe subpages links
                     checked_suballpages.append(name)
                     delay(config=config, session=session)
-                    r2 = session.get(url=url, timeout=10)
-                    raw2 = r2.text
-                    raw2 = cleanHTML(raw2)
-                    rawacum += raw2  # merge it after removed junk
-                    print ' Reading', name, len(raw2), 'bytes', \
-                        len(re.findall(r_suballpages, raw2)), 'subpages', \
-                        len(re.findall(r_title, raw2)), 'pages'
+                    r = session.get(url=url, timeout=10)
+                    #print 'Fetching URL: ', url
+                    raw = r.text
+                    raw = cleanHTML(raw)
+                    rawacum += raw  # merge it after removed junk
+                    print ' Reading', name, len(raw), 'bytes', \
+                        len(re.findall(r_suballpages, raw)), 'subpages', \
+                        len(re.findall(r_title, raw)), 'pages'

                 delay(config=config, session=session)
+                oldfr = currfr
             c += 1

         c = 0
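
Taken together, these two getPageTitlesScraper hunks raise the Special:Allpages recursion cap from 3 to 50 and remember the previous 'from' marker (oldfr/currfr) so the scraper can tell when a wiki keeps serving the same subpage listing; note that the merged check only marks the repeat (its body is pass), and the loop still ends via the depth counter. A minimal sketch of the same guard that breaks out explicitly, where collect_allpages() and fetch_subpage() are hypothetical names and the checked_suballpages dedup and progress printing are left out:

import re

def collect_allpages(first_raw, r_suballpages, fetch_subpage, deep=50):
    # Accumulate Special:Allpages listings until no subpage links remain,
    # the depth cap is reached, or the 'from' marker repeats (a loop).
    raw, rawacum, oldfr = first_raw, first_raw, ''
    c = 0
    while re.search(r_suballpages, raw) and c < deep:
        for m in re.finditer(r_suballpages, raw):
            currfr = m.group('from')
            if currfr == oldfr:
                return rawacum              # same marker again: we are looping
            raw = fetch_subpage(currfr)     # fetch the next sub-Allpages listing
            rawacum += raw                  # merge it after removing junk
            oldfr = currfr
        c += 1
    return rawacum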
@@ -497,8 +507,9 @@ def getUserAgent():
     """ Return a cool user-agent to hide Python user-agent """
     useragents = [
         # firefox
-        'Mozilla/5.0 (X11; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0',
-        'Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0',
+        #'Mozilla/5.0 (X11; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0',
+        #'Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0',
+        'Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/20100101 Firefox/78.0'
     ]
     return useragents[0]

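Since getUserAgent() still ends with return useragents[0], commenting out the two X11 entries means the new Firefox 78 string is now the only user agent the dumper ever sends.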
@@ -574,6 +585,9 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
         except requests.exceptions.ConnectionError as e:
             print ' Connection error: %s'%(str(e[0]))
             xml = ''
+        except requests.exceptions.ReadTimeout as e:
+            print ' Read timeout: %s'%(str(e[0]))
+            xml = ''
         c += 1

     return xml
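
requests raises ReadTimeout rather than ConnectionError when a server accepts the connection but never answers within the timeout, so before this hunk a stalled wiki aborted the run with an unhandled exception; now it is retried like any other network hiccup. A rough sketch of that retry shape, with fetch_xml_chunk() as a hypothetical stand-in for getXMLPageCore's request loop:

import requests

def fetch_xml_chunk(session, url, params, retries=5, timeout=10):
    # Retry on connection errors and read timeouts alike, returning ''
    # (as getXMLPageCore does) when every attempt fails.
    for attempt in range(retries):
        try:
            return session.post(url, data=params, timeout=timeout).text
        except (requests.exceptions.ConnectionError,
                requests.exceptions.ReadTimeout) as e:
            print('    Attempt %d failed: %s' % (attempt + 1, e))
    return ''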
@@ -1471,7 +1485,29 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None):
             print 'Filename is too long, truncating. Now it is:', filename2
         filename3 = u'%s/%s' % (imagepath, filename2)
         imagefile = open(filename3, 'wb')
-        r = requests.get(url=url)
+
+        r = session.head(url=url, allow_redirects=True)
+        original_url_redirected = len(r.history) > 0
+
+        if original_url_redirected:
+            #print 'Site is redirecting us to: ', r.url
+            original_url = url
+            url = r.url
+
+        r = session.get(url=url, allow_redirects=False)
+
+        # Try to fix a broken HTTP to HTTPS redirect
+        if r.status_code == 404 and original_url_redirected:
+            if original_url.split("://")[0] == "http" and url.split("://")[0] == "https":
+                url = 'https://' + original_url.split("://")[1]
+                #print 'Maybe a broken http to https redirect, trying ', url
+                r = session.get(url=url, allow_redirects=False)
+
+        if r.status_code == 404:
+            logerror(
+                config=config,
+                text=u'File %s at URL %s is missing' % (filename2,url))
+
         imagefile.write(r.content)
         imagefile.close()
         # saving description if any
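
Instead of a bare requests.get() on the recorded image URL, the dumper now resolves any redirect with a HEAD request first and, if the redirect target 404s, retries the original path over https, which works around wikis whose http-to-https redirect rewrites the path incorrectly. A condensed sketch of that logic, with download_image() as a hypothetical wrapper (the real hunk also writes r.content to the open file and logs a final 404 via logerror):

import requests

def download_image(session, url):
    # Resolve the redirect chain first without downloading the body.
    head = session.head(url, allow_redirects=True)
    redirected = len(head.history) > 0
    final_url = head.url if redirected else url

    r = session.get(final_url, allow_redirects=False)

    # Broken HTTP -> HTTPS redirect: retry the original path over https.
    if (r.status_code == 404 and redirected
            and url.startswith('http://') and final_url.startswith('https://')):
        r = session.get('https://' + url.split('://', 1)[1],
                        allow_redirects=False)
    return r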
@@ -1494,9 +1530,14 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None):

         f = open('%s/%s.desc' % (imagepath, filename2), 'w')
         # <text xml:space="preserve" bytes="36">Banner featuring SG1, SGA, SGU teams</text>
-        if not re.search(r'</mediawiki>', xmlfiledesc):
+        if not re.search(r'</page>', xmlfiledesc):
             # failure when retrieving desc? then save it as empty .desc
             xmlfiledesc = ''
+
+        # Fixup the XML
+        if xmlfiledesc is not '' and not re.search(r'</mediawiki>', xmlfiledesc):
+            xmlfiledesc += '</mediawiki>'
+
         f.write(xmlfiledesc.encode('utf-8'))
         f.close()
         delay(config=config, session=session)
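
The description is now treated as valid only when the export contains a closing </page> tag, and an export that lacks the closing </mediawiki> root gets one appended, so the saved .desc file stays well-formed XML. (The identity test xmlfiledesc is not '' happens to work through string interning; xmlfiledesc != '' is the conventional spelling.) The same fixup as a hypothetical standalone helper:

import re

def fix_desc_xml(xmlfiledesc):
    # No </page> means the description fetch failed: save an empty .desc.
    if not re.search(r'</page>', xmlfiledesc):
        return ''
    # Close a truncated <mediawiki> export so the file parses.
    if not re.search(r'</mediawiki>', xmlfiledesc):
        xmlfiledesc += '</mediawiki>'
    return xmlfiledesc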

testing/test_dumpgenerator.py (+16 -16)
@@ -62,7 +62,7 @@ def test_getImages(self):
         tests = [
             # Alone wikis
             #['http://wiki.annotation.jp/index.php', 'http://wiki.annotation.jp/api.php', u'かずさアノテーション - ソーシャル・ゲノム・アノテーション.jpg'],
-            ['https://www.archiveteam.org/index.php', 'https://www.archiveteam.org/api.php', u'Archive-is 2013-07-02 17-05-40.png'],
+            ['https://archiveteam.org/index.php', 'https://archiveteam.org/api.php', u'Archive-is 2013-07-02 17-05-40.png'],
             #['http://skilledtests.com/wiki/index.php', 'http://skilledtests.com/wiki/api.php', u'Benham\'s disc (animated).gif'],

             # Editthis wikifarm
@@ -146,7 +146,7 @@ def test_getPageTitles(self):
         print '\n', '#'*73, '\n', 'test_getPageTitles', '\n', '#'*73
         tests = [
             # Alone wikis
-            ['https://www.archiveteam.org/index.php', 'https://www.archiveteam.org/api.php', u'April Fools\' Day'],
+            ['https://archiveteam.org/index.php', 'https://archiveteam.org/api.php', u'April Fools\' Day'],
             #['http://skilledtests.com/wiki/index.php', 'http://skilledtests.com/wiki/api.php', u'Conway\'s Game of Life'],

             # Test old allpages API behaviour
@@ -206,7 +206,7 @@ def test_getWikiEngine(self):
         tests = [
             ['https://www.dokuwiki.org', 'DokuWiki'],
             #['http://wiki.openwrt.org', 'DokuWiki'],
-            ['http://skilledtests.com/wiki/', 'MediaWiki'],
+            #['http://skilledtests.com/wiki/', 'MediaWiki'],
             #['http://moinmo.in', 'MoinMoin'],
             ['https://wiki.debian.org', 'MoinMoin'],
             ['http://twiki.org/cgi-bin/view/', 'TWiki'],
@@ -219,42 +219,42 @@ def test_getWikiEngine(self):
             ['http://www.wasteflake.com/', 'TikiWiki'],
             ['http://foswiki.org/', 'FosWiki'],
             ['http://www.w3c.br/Home/WebHome', 'FosWiki'],
-            ['http://mojomojo.org/', 'MojoMojo'],
-            ['http://wiki.catalystframework.org/wiki/', 'MojoMojo'],
-            ['https://www.ictu.nl/archief/wiki.noiv.nl/xwiki/bin/view/Main', 'XWiki'],
+            #['http://mojomojo.org/', 'MojoMojo'],
+            #['http://wiki.catalystframework.org/wiki/', 'MojoMojo'],
+            #['https://www.ictu.nl/archief/wiki.noiv.nl/xwiki/bin/view/Main', 'XWiki'],
             #['https://web.archive.org/web/20080517021020id_/http://berlin.xwiki.com/xwiki/bin/view/Main/WebHome', 'XWiki'],
             ['http://www.xwiki.org/xwiki/bin/view/Main/WebHome', 'XWiki'],
             ['https://confluence.atlassian.com/', 'Confluence'],
             #['https://wiki.hybris.com/dashboard.action', 'Confluence'],
             ['https://confluence.sakaiproject.org/', 'Confluence'],
             #['http://demo.bananadance.org/', 'Banana Dance'],
             ['http://wagn.org/', 'Wagn'],
-            ['http://wiki.ace-mod.net/', 'Wagn'],
+            #['http://wiki.ace-mod.net/', 'Wagn'],
             #['https://success.mindtouch.com/', 'MindTouch'],
             #['https://jspwiki.apache.org/', 'JSPWiki'],
             ['http://www.ihear.com/FreeCLAS/', 'JSPWiki'],
             ['http://www.wikkawiki.org/HomePage', 'WikkaWiki'],
-            ['http://puppylinux.org/wikka/', 'WikkaWiki'],
-            ['http://cs.netsville.com/wiki/wikka.php', 'WikkaWiki'],
+            #['http://puppylinux.org/wikka/', 'WikkaWiki'],
+            ['https://www.cybersphere.net/', 'MediaWiki'],
             #['http://web.archive.org/web/20060717202033id_/http://www.comawiki.org/CoMa.php?CoMa=startseite', 'CoMaWiki'],
             ['http://bootbook.de/CoMa.php', 'CoMaWiki'],
             #['http://wikini.net/wakka.php', 'WikiNi'],
             ['http://wiki.raydium.org/wiki/', 'WikiNi'],
-            ['http://wiki.cs.cityu.edu.hk/CitiWiki/SourceCode', 'CitiWiki'],
-            ['http://wackowiki.sourceforge.net/test/', 'WackoWiki'],
+            #['http://wiki.cs.cityu.edu.hk/CitiWiki/SourceCode', 'CitiWiki'],
+            #['http://wackowiki.sourceforge.net/test/', 'WackoWiki'],
             ['http://www.sw4me.com/wiki/', 'WackoWiki'],
-            ['http://lslwiki.net/lslwiki/wakka.php', 'WakkaWiki'],
+            #['http://lslwiki.net/lslwiki/wakka.php', 'WakkaWiki'],
             ['http://kw.pm.org/wiki/index.cgi', 'Kwiki'],
             ['http://wiki.wubi.org/index.cgi', 'Kwiki'],
             #['http://perl.bristolbath.org/index.cgi', 'Kwiki'],
-            ['http://www.anwiki.com/', 'Anwiki'],
-            ['http://www.anw.fr/', 'Anwiki'],
+            #['http://www.anwiki.com/', 'Anwiki'],
+            #['http://www.anw.fr/', 'Anwiki'],
             ['http://www.aneuch.org/', 'Aneuch'],
             ['http://doc.myunixhost.com/', 'Aneuch'],
             ['http://www.bitweaver.org/wiki/index.php', 'bitweaver'],
             ['http://wiki.e-shell.org/Home', 'Zwiki'],
             ['http://leo.zwiki.org/', 'Zwiki'],
-            ['http://accessibility4all.wikispaces.com/', 'Wikispaces'],
+            #['http://accessibility4all.wikispaces.com/', 'Wikispaces'],
             ['http://darksouls.wikidot.com/', 'Wikidot'],
             ['http://www.wikifoundrycentral.com/', 'Wetpaint'],
             ['http://wiki.openid.net/', 'PBworks'],
@@ -273,7 +273,7 @@ def test_mwGetAPIAndIndex(self):
         print '\n', '#'*73, '\n', 'test_mwGetAPIAndIndex', '\n', '#'*73
         tests = [
             # Alone wikis
-            ['https://www.archiveteam.org', 'https://www.archiveteam.org/api.php', 'https://www.archiveteam.org/index.php'],
+            ['https://archiveteam.org', 'https://archiveteam.org/api.php', 'https://archiveteam.org/index.php'],
             #['http://skilledtests.com/wiki/', 'http://skilledtests.com/wiki/api.php', 'http://skilledtests.com/wiki/index.php'],

             # Editthis wikifarm
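
The test changes match the CI half of the merge title: the archiveteam.org fixtures drop the www. prefix to follow the site's current canonical host, and several engine-detection fixtures are commented out (with one WikkaWiki entry replaced by the MediaWiki wiki at cybersphere.net), presumably because those sites no longer resolve or no longer detect as the expected engine.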
