
Commit b289f86

Fix getPageTitlesScraper
Using the API and the Special:Allpages scraper should result in the same number of titles. Fix the detection of the next subpages on Special:Allpages, raise the max depth to 50, and add an anti-loop check (which could fail on non-Western wikis).
1 parent 1048bc3
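
The message's first sentence states the invariant behind the fix: enumerating titles via the API and scraping Special:Allpages should agree. A minimal cross-check sketch, assuming dumpgenerator.py is importable and that getPageTitlesAPI and getPageTitlesScraper both return lists of titles; the wrapper function below is hypothetical, not part of the script:

# Hypothetical consistency check over the two title sources in
# dumpgenerator.py; assumes both helpers return lists of titles.
from dumpgenerator import getPageTitlesAPI, getPageTitlesScraper

def titleCountsAgree(config, session):
    api_titles = getPageTitlesAPI(config=config, session=session)
    scraped_titles = getPageTitlesScraper(config=config, session=session)
    print('%d titles via API, %d via scraper' % (len(api_titles), len(scraped_titles)))
    return len(api_titles) == len(scraped_titles)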

File tree

1 file changed: +19 -9 lines

dumpgenerator.py (+19 -9)
@@ -299,16 +299,22 @@ def getPageTitlesScraper(config={}, session=None):
     else:
         pass # perhaps no subpages
 
-    # 3 is the current deep of English Wikipedia for Special:Allpages
-    deep = 3
+    # Should be enought subpages on Special:Allpages
+    deep = 50
     c = 0
+    oldfr = ''
     checked_suballpages = []
     rawacum = raw
     while r_suballpages and re.search(r_suballpages, raw) and c < deep:
         # load sub-Allpages
         m = re.compile(r_suballpages).finditer(raw)
         for i in m:
             fr = i.group('from')
+            currfr = fr
+
+            if oldfr == currfr:
+                # We are looping, exit the loop
+                pass
 
             if r_suballpages == r_suballpages1:
                 to = i.group('to')
@@ -329,19 +335,23 @@ def getPageTitlesScraper(config={}, session=None):
                 url = '%s?title=Special:Allpages&from=%s&namespace=%s' % (
                     config['index'], name, namespace)
 
+
+
             if name not in checked_suballpages:
                 # to avoid reload dupe subpages links
                 checked_suballpages.append(name)
                 delay(config=config, session=session)
-                r2 = session.get(url=url, timeout=10)
-                raw2 = r2.text
-                raw2 = cleanHTML(raw2)
-                rawacum += raw2 # merge it after removed junk
-                print ' Reading', name, len(raw2), 'bytes', \
-                    len(re.findall(r_suballpages, raw2)), 'subpages', \
-                    len(re.findall(r_title, raw2)), 'pages'
+                r = session.get(url=url, timeout=10)
+                #print 'Fetching URL: ', url
+                raw = r.text
+                raw = cleanHTML(raw)
+                rawacum += raw # merge it after removed junk
+                print ' Reading', name, len(raw), 'bytes', \
+                    len(re.findall(r_suballpages, raw)), 'subpages', \
+                    len(re.findall(r_title, raw)), 'pages'
 
             delay(config=config, session=session)
+        oldfr = currfr
         c += 1
 
     c = 0
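
As committed, the guard in the first hunk only records the repeated 'from' value and executes pass, while its comment says the intent is to exit the loop. A self-contained sketch of that intent, with the HTTP fetch replaced by a toy mapping from one 'from' value to the next; the function name and mapping are illustrative, not dumpgenerator.py code:

def crawl_allpages(next_from, deep=50):
    # Walk a chain of Special:Allpages 'from' values, stopping when the
    # same value repeats (the anti-loop guard) or the depth cap is hit.
    oldfr, currfr = '', 'A'
    visited = []
    for c in range(deep):
        if oldfr == currfr:
            break  # same 'from' twice in a row: the wiki is looping
        visited.append(currfr)
        oldfr, currfr = currfr, next_from.get(currfr, currfr)
    return visited

# A wiki whose last sub-page links back to itself would loop forever
# without the guard; with it the crawl stops after three pages.
print(crawl_allpages({'A': 'K', 'K': 'T', 'T': 'T'}))  # ['A', 'K', 'T']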
