From 2c21eadf7c456b7ea1efe233d5eeaa45e29d3ee3 Mon Sep 17 00:00:00 2001
From: Federico Leva
Date: Mon, 10 Feb 2020 22:32:01 +0200
Subject: [PATCH] Wikia: make getXMLHeader() check more lenient

Otherwise we end up using Special:Export even though the export API
would work perfectly well with --xmlrevisions.

May also fix images on fandom.com:
https://github.com/WikiTeam/wikiteam/issues/330
---
 dumpgenerator.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/dumpgenerator.py b/dumpgenerator.py
index 924a02e..071a458 100755
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -423,10 +423,24 @@ def getXMLHeader(config={}, session=None):
             # Export and exportnowrap exist from MediaWiki 1.15, allpages from 1.18
             r = session.get(config['api'] + '?action=query&export=1&exportnowrap=1&list=allpages&aplimit=1', timeout=10)
             xml = r.text
+            # Otherwise try without exportnowrap, e.g. Wikia returns a blank page on 1.19
+            if not xml:
+                r = session.get(config['api'] + '?action=query&export=1&list=allpages&aplimit=1&format=json', timeout=10)
+                try:
+                    xml = r.json()['query']['export']['*']
+                except KeyError:
+                    xml = None
             if not xml:
                 # Do without a generator, use our usual trick of a random page title
                 r = session.get(config['api'] + '?action=query&export=1&exportnowrap=1&titles=' + randomtitle, timeout=10)
                 xml = r.text
+                # Again try without exportnowrap
+                if not xml:
+                    r = session.get(config['api'] + '?action=query&export=1&format=json&titles=' + randomtitle, timeout=10)
+                    try:
+                        xml = r.json()['query']['export']['*']
+                    except KeyError:
+                        xml = None
         except requests.exceptions.RetryError:
             pass
@@ -1302,7 +1316,7 @@ def getImageNamesAPI(config={}, session=None):
                     url = curateImageURL(config=config, url=url)
                     # encoding to ascii is needed to work around this horrible bug:
                     # http://bugs.python.org/issue8136
-                    if 'api' in config and '.wikia.com' in config['api']:
+                    if 'api' in config and ('.wikia.' in config['api'] or '.fandom.com' in config['api']):
                         #to avoid latest?cb=20120816112532 in filenames
                         filename = unicode(urllib.unquote((re.sub('_', ' ', url.split('/')[-3])).encode('ascii', 'ignore')), 'utf-8')
                     else:
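
A minimal standalone sketch (not part of the patch) of the fallback the
first hunk adds: try exportnowrap first and, if the wiki answers with a
blank page, retry with format=json and unwrap the export payload. The
function name and example API URL are placeholders, not code from
dumpgenerator.py.

    import requests

    def get_export_xml(api, titles='Main_Page', timeout=10):
        session = requests.Session()
        # exportnowrap returns the <mediawiki> XML document directly
        r = session.get(api + '?action=query&export=1&exportnowrap=1&titles=' + titles,
                        timeout=timeout)
        xml = r.text
        if not xml:
            # Some wikis (e.g. Wikia on MediaWiki 1.19) answer exportnowrap
            # with a blank page; fall back to JSON and unwrap the export key.
            r = session.get(api + '?action=query&export=1&format=json&titles=' + titles,
                            timeout=timeout)
            try:
                xml = r.json()['query']['export']['*']
            except (KeyError, ValueError):
                xml = None
        return xml

    print(get_export_xml('https://example.fandom.com/api.php'))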
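
Likewise, an illustration (again not part of the patch) of why the second
hunk keeps url.split('/')[-3] as the filename for Wikia/Fandom image URLs:
the last path component is the cache-busting "latest?cb=..." suffix the
comment mentions, so the real filename sits two segments earlier. The
sample URL is made up.

    import re
    try:
        from urllib import unquote          # Python 2, as in dumpgenerator.py
    except ImportError:
        from urllib.parse import unquote    # Python 3

    url = ('https://vignette.wikia.nocookie.net/somewiki/images/a/ab/'
           'Some_file.png/revision/latest?cb=20120816112532')

    # split('/') ends with [..., 'Some_file.png', 'revision', 'latest?cb=...'],
    # so [-3] is the real filename; underscores are turned back into spaces.
    filename = unquote(re.sub('_', ' ', url.split('/')[-3]))
    print(filename)  # -> "Some file.png"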