Issue 26: Local "Special" namespace, actually limit retries

* For some reason, in a previous commit I had noticed that maxretries
  was not respected in getXMLPageCore, but I didn't fix it. Done now.
* If the "Special" namespace alias doesn't work, fetch the local one.
pull/224/head
Federico Leva 9 years ago
parent c1a5e3e0ca
commit bdc7c9bf06

@ -52,6 +52,12 @@ class PageMissingError(Exception):
def __str__(self):
return "page '%s' not found" % self.title
class ExportAbortedError(Exception):
    """ Raised when an export request came back with nothing at all.

    The ``index`` value given at construction time is kept on the
    instance and interpolated into the human-readable message.
    """

    def __init__(self, index):
        self.index = index

    def __str__(self):
        # Wrap the value in a 1-tuple: with a bare right-hand operand, a
        # tuple-valued index would be unpacked by the % operator and either
        # raise TypeError or be formatted incorrectly while the error is
        # being reported.
        return "Export from '%s' did not return anything." % (self.index,)
def getVersion():
    """ Return the module-level ``__VERSION__`` value unchanged """
    version = __VERSION__
    return version
@ -396,13 +402,36 @@ def getXMLHeader(config={}, session=None):
try:
xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
except PageMissingError as pme:
# The <page> does not exist. Not a problem, if we get the <siteinfo>.
xml = pme.xml
# Issue 26: Account for missing "Special" namespace.
# Hope the canonical special name has not been removed.
# http://albens73.fr/wiki/api.php?action=query&meta=siteinfo&siprop=namespacealiases
except ExportAbortedError:
try:
if config['api']:
print "Trying the local name for the Special namespace instead"
r = session.post(
url=config['api'],
data={
'action': 'query',
'meta': 'siteinfo',
'siprop': 'namespaces',
'format': 'json'}
)
config['export'] = json.loads(r.text)['query']['namespaces']['-1']['*'] \
+ ':Export'
xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
except PageMissingError as pme:
xml = pme.xml
except ExportAbortedError:
pass
header = xml.split('</mediawiki>')[0]
if not re.match("<mediawiki", xml):
print 'XML export on this wiki is broken, quitting.'
sys.exit()
return header
return header, config
def getXMLFileDesc(config={}, title='', session=None):
@ -454,11 +483,11 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
print ' We have retried %d times' % (c)
print ' MediaWiki error for "%s", network error or whatever...' % (params['pages'])
# If it's not already what we tried: our last chance, preserve only the last revision...
# config['curonly'] means that the whole dump is configured to save nonly the last
# config['curonly'] means that the whole dump is configured to save only the last,
# params['curonly'] should mean that we've already tried this
# fallback, because it's set by the following if and passed to
# getXMLPageCore
if not config['curonly']:
if not config['curonly'] and not 'curonly' in params:
print ' Trying to save only the last revision for this page...'
params['curonly'] = 1
logerror(
@ -478,6 +507,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
config=config,
text='Error while retrieving the last revision of "%s". Skipping.' %
(params['pages']))
raise ExportAbortedError(config['index'])
return '' # empty xml
# FIXME HANDLE HTTP Errors HERE
try:
@ -485,6 +515,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
handleStatusCode(r)
xml = r.text
except requests.exceptions.ConnectionError as e:
raise ExportAbortedError(config['index'])
xml = ''
c += 1
@ -502,7 +533,10 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
title_ = title
title_ = re.sub(' ', '_', title_)
# do not convert & into %26, title_ = re.sub('&', '%26', title_)
params = {'title': 'Special:Export', 'pages': title_, 'action': 'submit'}
try:
params = {'title': config['export'], 'pages': title_, 'action': 'submit'}
except KeyError:
params = {'title': 'Special:Export', 'pages': title_, 'action': 'submit'}
if config['curonly']:
params['curonly'] = 1
params['limit'] = 1
@ -514,6 +548,8 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
params['templates'] = 1
xml = getXMLPageCore(params=params, config=config, session=session)
if xml == "":
raise ExportAbortedError(config['index'])
if not "</page>" in xml:
raise PageMissingError(params['title'], xml)
else:
@ -593,7 +629,7 @@ def generateXMLDump(config={}, titles=[], start='', session=None):
""" Generates a XML dump for a list of titles """
print 'Retrieving the XML for every page from "%s"' % (start and start or 'start')
header = getXMLHeader(config=config, session=session)
header, config = getXMLHeader(config=config, session=session)
footer = '</mediawiki>\n' # new line at the end
xmlfilename = '%s-%s-%s.xml' % (domain2prefix(config=config),
config['date'],

Loading…
Cancel
Save