Don't try to add revisions if the namespace has none

Traceback (most recent call last):
  File "dumpgenerator.py", line 2362, in <module>

  File "dumpgenerator.py", line 2354, in main
    resumePreviousDump(config=config, other=other)
  File "dumpgenerator.py", line 1921, in createNewDump
    getPageTitles(config=config, session=other['session'])
  File "dumpgenerator.py", line 755, in generateXMLDump
    for xml in getXMLRevisions(config=config, session=session):
  File "dumpgenerator.py", line 861, in getXMLRevisions
    revids.append(str(revision['revid']))
IndexError: list index out of range
pull/359/head
Federico Leva 4 years ago
parent 3760501f74
commit f10adb71af

@ -850,12 +850,14 @@ def getXMLRevisions(config={}, session=None, allpages=False):
print("Trying to list the revisions and to export them one by one")
# We only need the revision ID, all the rest will come from the raw export
arvparams['arvprop'] = 'ids'
arvrequest = site.api(**arvparams)
# Skip the namespace if it's empty
if len(arvrequest['query']['allrevisions']) < 1:
continue
# Repeat the arvrequest with new arvparams until done
while True:
# Reset revision IDs from the previous batch from arv
revids = []
# Get the new ones
arvrequest = site.api(**arvparams)
for page in arvrequest['query']['allrevisions']:
for revision in page['revisions']:
revids.append(str(revision['revid']))
@ -879,19 +881,22 @@ def getXMLRevisions(config={}, session=None, allpages=False):
yield makeXmlPageFromRaw(xml)
if 'continue' in arvrequest:
# Get the new ones
arvparams['arvcontinue'] = arvrequest['continue']['arvcontinue']
arvrequest = site.api(**arvparams)
else:
# End of continuation. We are done with this namespace.
break
except KeyError:
# TODO: check whether the KeyError was really for a missing arv API
print "Warning. Could not use allrevisions. Wiki too old?"
if config['curonly']:
# The raw XML export in the API gets a title and gives the latest revision
for title in readTitles(config):
# TODO: as we're doing one page and revision at a time,
# we might as well use xml format and exportnowrap=1
# TODO: as we're doing one page and revision at a time, we might
# as well use xml format and exportnowrap=1 to use the string of,
# XML as is, but need to check how well the library handles it.
exportparams = {
'action': 'query',
'titles': title,
@ -899,8 +904,13 @@ def getXMLRevisions(config={}, session=None, allpages=False):
}
exportrequest = site.api(**exportparams)
xml = exportrequest['query']['export']['*']
# Because we got the fancy XML from the JSON format, clean it:
yield makeXmlPageFromRaw(xml)
else:
# This is the closest to what we usually do with Special:Export:
# take one title at a time and try to get all revisions exported.
# The XML needs to be made manually because the export=1 option
# refuses to return an arbitrary number of revisions (see above).
for title in readTitles(config):
pparams = {
'action': 'query',
@ -908,7 +918,6 @@ def getXMLRevisions(config={}, session=None, allpages=False):
'prop': 'revisions',
'rvlimit': 'max',
'rvprop': 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content',
'rawcontinue': 'yes'
}
prequest = site.api(**pparams)
try:

Loading…
Cancel
Save