diff --git a/dumpgenerator.py b/dumpgenerator.py
index 3193fe2..536cd0e 100755
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -299,9 +299,10 @@ def getPageTitlesScraper(config={}, session=None):
         else:
             pass  # perhaps no subpages
 
-        # 3 is the current deep of English Wikipedia for Special:Allpages
-        deep = 3
+        # 50 levels of subpages should be enough for any Special:Allpages
+        deep = 50
         c = 0
+        oldfr = ''
         checked_suballpages = []
         rawacum = raw
         while r_suballpages and re.search(r_suballpages, raw) and c < deep:
@@ -309,6 +310,11 @@ def getPageTitlesScraper(config={}, session=None):
             m = re.compile(r_suballpages).finditer(raw)
             for i in m:
                 fr = i.group('from')
+                currfr = fr
+
+                if oldfr == currfr:
+                    # We are looping, exit the loop
+                    break
 
                 if r_suballpages == r_suballpages1:
                     to = i.group('to')
@@ -329,19 +335,23 @@ def getPageTitlesScraper(config={}, session=None):
                     url = '%s?title=Special:Allpages&from=%s&namespace=%s' % (
                         config['index'], name, namespace)
+
+
                 if name not in checked_suballpages:
                     # to avoid reload dupe subpages links
                     checked_suballpages.append(name)
                     delay(config=config, session=session)
-                    r2 = session.get(url=url, timeout=10)
-                    raw2 = r2.text
-                    raw2 = cleanHTML(raw2)
-                    rawacum += raw2  # merge it after removed junk
-                    print '    Reading', name, len(raw2), 'bytes', \
-                        len(re.findall(r_suballpages, raw2)), 'subpages', \
-                        len(re.findall(r_title, raw2)), 'pages'
+                    r = session.get(url=url, timeout=10)
+                    #print 'Fetching URL: ', url
+                    raw = r.text
+                    raw = cleanHTML(raw)
+                    rawacum += raw  # merge it after removing junk
+                    print '    Reading', name, len(raw), 'bytes', \
+                        len(re.findall(r_suballpages, raw)), 'subpages', \
+                        len(re.findall(r_title, raw)), 'pages'
 
                 delay(config=config, session=session)
+                oldfr = currfr
                 c += 1
 
         c = 0
 
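
The oldfr/currfr guard above only has to remember the previous 'from' marker, since checked_suballpages already de-duplicates by subpage name; its job is to catch a wiki that keeps serving the same sub-Allpages link forever. A standalone sketch of the idea, with illustrative names (follow_pagination, get_next_from) that are not part of dumpgenerator.py:

    def follow_pagination(get_next_from, first_from, max_depth=50):
        # Walk the chain of 'from' markers, refusing to revisit one we have
        # already seen and giving up after max_depth hops.
        seen = []
        fr = first_from
        while fr is not None and len(seen) < max_depth:
            if fr in seen:
                # The wiki handed back a marker we already processed:
                # we are looping, so stop instead of refetching forever.
                break
            seen.append(fr)
            fr = get_next_from(fr)
        return seen

    # e.g. a wiki whose pagination gets stuck on 'B':
    #   follow_pagination({'A': 'B', 'B': 'B'}.get, 'A')  ->  ['A', 'B']
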
@@ -497,8 +507,9 @@ def getUserAgent():
     """ Return a cool user-agent to hide Python user-agent """
     useragents = [
         # firefox
-        'Mozilla/5.0 (X11; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0',
-        'Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0',
+        #'Mozilla/5.0 (X11; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0',
+        #'Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0',
+        'Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/20100101 Firefox/78.0'
     ]
 
     return useragents[0]
@@ -574,6 +585,9 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
         except requests.exceptions.ConnectionError as e:
             print '    Connection error: %s'%(str(e[0]))
             xml = ''
+        except requests.exceptions.ReadTimeout as e:
+            print '    Read timeout: %s'%(str(e[0]))
+            xml = ''
         c += 1
 
     return xml
@@ -1471,7 +1485,29 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None):
                 print 'Filename is too long, truncating. Now it is:', filename2
             filename3 = u'%s/%s' % (imagepath, filename2)
             imagefile = open(filename3, 'wb')
-            r = requests.get(url=url)
+
+            r = session.head(url=url, allow_redirects=True)
+            original_url_redirected = len(r.history) > 0
+
+            if original_url_redirected:
+                #print 'Site is redirecting us to: ', r.url
+                original_url = url
+                url = r.url
+
+            r = session.get(url=url, allow_redirects=False)
+
+            # Try to fix a broken HTTP to HTTPS redirect
+            if r.status_code == 404 and original_url_redirected:
+                if original_url.split("://")[0] == "http" and url.split("://")[0] == "https":
+                    url = 'https://' + original_url.split("://")[1]
+                    #print 'Maybe a broken http to https redirect, trying ', url
+                    r = session.get(url=url, allow_redirects=False)
+
+            if r.status_code == 404:
+                logerror(
+                    config=config,
+                    text=u'File %s at URL %s is missing' % (filename2, url))
+
             imagefile.write(r.content)
             imagefile.close()
             # saving description if any
@@ -1494,9 +1530,14 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None):
                 f = open('%s/%s.desc' % (imagepath, filename2), 'w')
                 # <text xml:space="preserve" bytes="36">Banner featuring SG1, SGA, SGU teams</text>
-                if not re.search(r'</mediawiki>', xmlfiledesc):
+                if not re.search(r'<text xml:space="preserve"', xmlfiledesc):
                     # failure when retrieving desc? then save it as empty .desc
                     xmlfiledesc = ''
+
+                # Fixup the XML: add the missing </mediawiki> closing tag
+                if xmlfiledesc != '' and not re.search(r'</mediawiki>', xmlfiledesc):
+                    xmlfiledesc += '</mediawiki>'
+
                 f.write(xmlfiledesc.encode('utf-8'))
                 f.close()
                 delay(config=config, session=session)
 
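
The generateImageDump() change boils down to three steps: learn the final URL with a cheap HEAD request, fetch the body without following further redirects, and retry once over https:// when an http:// URL was redirected and then answered 404. A minimal standalone sketch of that flow, assuming a requests.Session; fetch_image is an illustrative name, not part of the patch:

    import requests

    def fetch_image(session, url):
        # Follow redirects once, cheaply, to learn the final URL.
        r = session.head(url=url, allow_redirects=True)
        original_url = url
        redirected = len(r.history) > 0
        if redirected:
            url = r.url

        # Fetch the body without following any further redirects.
        r = session.get(url=url, allow_redirects=False)

        # If an http:// URL was redirected and the target 404s, retry the
        # original path over https:// (a common broken-redirect pattern).
        if r.status_code == 404 and redirected:
            if original_url.startswith('http://') and url.startswith('https://'):
                url = 'https://' + original_url.split('://', 1)[1]
                r = session.get(url=url, allow_redirects=False)
        return r

    # Usage: r = fetch_image(requests.Session(), 'http://example.org/img.png')
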
diff --git a/testing/test_dumpgenerator.py b/testing/test_dumpgenerator.py
index cb23661..7dd2101 100644
--- a/testing/test_dumpgenerator.py
+++ b/testing/test_dumpgenerator.py
@@ -62,7 +62,7 @@ class TestDumpgenerator(unittest.TestCase):
         tests = [
             # Alone wikis
             #['http://wiki.annotation.jp/index.php', 'http://wiki.annotation.jp/api.php', u'かずさアノテーション - ソーシャル・ゲノム・アノテーション.jpg'],
-            ['https://www.archiveteam.org/index.php', 'https://www.archiveteam.org/api.php', u'Archive-is 2013-07-02 17-05-40.png'],
+            ['https://archiveteam.org/index.php', 'https://archiveteam.org/api.php', u'Archive-is 2013-07-02 17-05-40.png'],
             #['http://skilledtests.com/wiki/index.php', 'http://skilledtests.com/wiki/api.php', u'Benham\'s disc (animated).gif'],
 
             # Editthis wikifarm
@@ -146,7 +146,7 @@ class TestDumpgenerator(unittest.TestCase):
         print '\n', '#'*73, '\n', 'test_getPageTitles', '\n', '#'*73
         tests = [
             # Alone wikis
-            ['https://www.archiveteam.org/index.php', 'https://www.archiveteam.org/api.php', u'April Fools\' Day'],
+            ['https://archiveteam.org/index.php', 'https://archiveteam.org/api.php', u'April Fools\' Day'],
             #['http://skilledtests.com/wiki/index.php', 'http://skilledtests.com/wiki/api.php', u'Conway\'s Game of Life'],
 
             # Test old allpages API behaviour
@@ -206,7 +206,7 @@ class TestDumpgenerator(unittest.TestCase):
         tests = [
             ['https://www.dokuwiki.org', 'DokuWiki'],
             #['http://wiki.openwrt.org', 'DokuWiki'],
-            ['http://skilledtests.com/wiki/', 'MediaWiki'],
+            #['http://skilledtests.com/wiki/', 'MediaWiki'],
             #['http://moinmo.in', 'MoinMoin'],
             ['https://wiki.debian.org', 'MoinMoin'],
             ['http://twiki.org/cgi-bin/view/', 'TWiki'],
@@ -219,9 +219,9 @@ class TestDumpgenerator(unittest.TestCase):
             ['http://www.wasteflake.com/', 'TikiWiki'],
             ['http://foswiki.org/', 'FosWiki'],
             ['http://www.w3c.br/Home/WebHome', 'FosWiki'],
-            ['http://mojomojo.org/', 'MojoMojo'],
-            ['http://wiki.catalystframework.org/wiki/', 'MojoMojo'],
-            ['https://www.ictu.nl/archief/wiki.noiv.nl/xwiki/bin/view/Main', 'XWiki'],
+            #['http://mojomojo.org/', 'MojoMojo'],
+            #['http://wiki.catalystframework.org/wiki/', 'MojoMojo'],
+            #['https://www.ictu.nl/archief/wiki.noiv.nl/xwiki/bin/view/Main', 'XWiki'],
             #['https://web.archive.org/web/20080517021020id_/http://berlin.xwiki.com/xwiki/bin/view/Main/WebHome', 'XWiki'],
             ['http://www.xwiki.org/xwiki/bin/view/Main/WebHome', 'XWiki'],
             ['https://confluence.atlassian.com/', 'Confluence'],
@@ -229,32 +229,32 @@ class TestDumpgenerator(unittest.TestCase):
             ['https://confluence.sakaiproject.org/', 'Confluence'],
             #['http://demo.bananadance.org/', 'Banana Dance'],
             ['http://wagn.org/', 'Wagn'],
-            ['http://wiki.ace-mod.net/', 'Wagn'],
+            #['http://wiki.ace-mod.net/', 'Wagn'],
             #['https://success.mindtouch.com/', 'MindTouch'],
             #['https://jspwiki.apache.org/', 'JSPWiki'],
             ['http://www.ihear.com/FreeCLAS/', 'JSPWiki'],
             ['http://www.wikkawiki.org/HomePage', 'WikkaWiki'],
-            ['http://puppylinux.org/wikka/', 'WikkaWiki'],
-            ['http://cs.netsville.com/wiki/wikka.php', 'WikkaWiki'],
+            #['http://puppylinux.org/wikka/', 'WikkaWiki'],
+            ['https://www.cybersphere.net/', 'MediaWiki'],
             #['http://web.archive.org/web/20060717202033id_/http://www.comawiki.org/CoMa.php?CoMa=startseite', 'CoMaWiki'],
             ['http://bootbook.de/CoMa.php', 'CoMaWiki'],
             #['http://wikini.net/wakka.php', 'WikiNi'],
             ['http://wiki.raydium.org/wiki/', 'WikiNi'],
-            ['http://wiki.cs.cityu.edu.hk/CitiWiki/SourceCode', 'CitiWiki'],
-            ['http://wackowiki.sourceforge.net/test/', 'WackoWiki'],
+            #['http://wiki.cs.cityu.edu.hk/CitiWiki/SourceCode', 'CitiWiki'],
+            #['http://wackowiki.sourceforge.net/test/', 'WackoWiki'],
             ['http://www.sw4me.com/wiki/', 'WackoWiki'],
-            ['http://lslwiki.net/lslwiki/wakka.php', 'WakkaWiki'],
+            #['http://lslwiki.net/lslwiki/wakka.php', 'WakkaWiki'],
             ['http://kw.pm.org/wiki/index.cgi', 'Kwiki'],
             ['http://wiki.wubi.org/index.cgi', 'Kwiki'],
             #['http://perl.bristolbath.org/index.cgi', 'Kwiki'],
-            ['http://www.anwiki.com/', 'Anwiki'],
-            ['http://www.anw.fr/', 'Anwiki'],
+            #['http://www.anwiki.com/', 'Anwiki'],
+            #['http://www.anw.fr/', 'Anwiki'],
             ['http://www.aneuch.org/', 'Aneuch'],
             ['http://doc.myunixhost.com/', 'Aneuch'],
             ['http://www.bitweaver.org/wiki/index.php', 'bitweaver'],
             ['http://wiki.e-shell.org/Home', 'Zwiki'],
             ['http://leo.zwiki.org/', 'Zwiki'],
-            ['http://accessibility4all.wikispaces.com/', 'Wikispaces'],
+            #['http://accessibility4all.wikispaces.com/', 'Wikispaces'],
             ['http://darksouls.wikidot.com/', 'Wikidot'],
             ['http://www.wikifoundrycentral.com/', 'Wetpaint'],
             ['http://wiki.openid.net/', 'PBworks'],
@@ -273,7 +273,7 @@ class TestDumpgenerator(unittest.TestCase):
         print '\n', '#'*73, '\n', 'test_mwGetAPIAndIndex', '\n', '#'*73
         tests = [
             # Alone wikis
-            ['https://www.archiveteam.org', 'https://www.archiveteam.org/api.php', 'https://www.archiveteam.org/index.php'],
+            ['https://archiveteam.org', 'https://archiveteam.org/api.php', 'https://archiveteam.org/index.php'],
             #['http://skilledtests.com/wiki/', 'http://skilledtests.com/wiki/api.php', 'http://skilledtests.com/wiki/index.php'],
 
             # Editthis wikifarm