
Commit b289f86

Fix getPageTitlesScraper
Using the API and the Special:Allpages scraper should result in the same number of titles. Fix the detection of the next subpages on Special:Allpages, raise the max depth to 50, and add an anti-loop check (which could fail on non-Western wikis).
1 parent 1048bc3
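
The message's first sentence states the invariant behind the fix: enumerating titles via the API and scraping Special:Allpages should agree. A minimal cross-check sketch, assuming dumpgenerator.py is importable and that getPageTitlesAPI and getPageTitlesScraper both return lists of titles; the wrapper function below is hypothetical, not part of the script:

# Hypothetical consistency check over the two title sources in
# dumpgenerator.py; assumes both helpers return lists of titles.
from dumpgenerator import getPageTitlesAPI, getPageTitlesScraper

def titleCountsAgree(config, session):
    api_titles = getPageTitlesAPI(config=config, session=session)
    scraped_titles = getPageTitlesScraper(config=config, session=session)
    print('%d titles via API, %d via scraper' % (len(api_titles), len(scraped_titles)))
    return len(api_titles) == len(scraped_titles)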

File tree

1 file changed: +19 -9 lines

dumpgenerator.py (+19 -9)
@@ -299,16 +299,22 @@ def getPageTitlesScraper(config={}, session=None):
     else:
         pass # perhaps no subpages
 
-    # 3 is the current deep of English Wikipedia for Special:Allpages
-    deep = 3
+    # Should be enought subpages on Special:Allpages
+    deep = 50
     c = 0
+    oldfr = ''
     checked_suballpages = []
     rawacum = raw
     while r_suballpages and re.search(r_suballpages, raw) and c < deep:
         # load sub-Allpages
         m = re.compile(r_suballpages).finditer(raw)
         for i in m:
             fr = i.group('from')
+            currfr = fr
+
+            if oldfr == currfr:
+                # We are looping, exit the loop
+                pass
 
             if r_suballpages == r_suballpages1:
                 to = i.group('to')
@@ -329,19 +335,23 @@ def getPageTitlesScraper(config={}, session=None):
                 url = '%s?title=Special:Allpages&from=%s&namespace=%s' % (
                     config['index'], name, namespace)
 
+
+
             if name not in checked_suballpages:
                 # to avoid reload dupe subpages links
                 checked_suballpages.append(name)
                 delay(config=config, session=session)
-                r2 = session.get(url=url, timeout=10)
-                raw2 = r2.text
-                raw2 = cleanHTML(raw2)
-                rawacum += raw2 # merge it after removed junk
-                print ' Reading', name, len(raw2), 'bytes', \
-                    len(re.findall(r_suballpages, raw2)), 'subpages', \
-                    len(re.findall(r_title, raw2)), 'pages'
+                r = session.get(url=url, timeout=10)
+                #print 'Fetching URL: ', url
+                raw = r.text
+                raw = cleanHTML(raw)
+                rawacum += raw # merge it after removed junk
+                print ' Reading', name, len(raw), 'bytes', \
+                    len(re.findall(r_suballpages, raw)), 'subpages', \
+                    len(re.findall(r_title, raw)), 'pages'
 
             delay(config=config, session=session)
+        oldfr = currfr
         c += 1
 
     c = 0
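
As committed, the guard in the first hunk only records the repeated 'from' value and executes pass, while its comment says the intent is to exit the loop. A self-contained sketch of that intent, with the HTTP fetch replaced by a toy mapping from one 'from' value to the next; the function name and mapping are illustrative, not dumpgenerator.py code:

def crawl_allpages(next_from, deep=50):
    # Walk a chain of Special:Allpages 'from' values, stopping when the
    # same value repeats (the anti-loop guard) or the depth cap is hit.
    oldfr, currfr = '', 'A'
    visited = []
    for c in range(deep):
        if oldfr == currfr:
            break  # same 'from' twice in a row: the wiki is looping
        visited.append(currfr)
        oldfr, currfr = currfr, next_from.get(currfr, currfr)
    return visited

# A wiki whose last sub-page links back to itself would loop forever
# without the guard; with it the crawl stops after three pages.
print(crawl_allpages({'A': 'K', 'K': 'T', 'T': 'T'}))  # ['A', 'K', 'T']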
