Finish xmlrevisions option for older wikis

* Actually proceed to the next page when there is no continuation.
* Provide the same output as with the usual per-page export.

Tested successfully on a MediaWiki 1.16 wiki.
pull/359/head
Federico Leva 4 years ago
parent 0f35d03929
commit 9ec6ce42d3

@@ -755,6 +755,7 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
for xml in getXMLRevisions(config=config, session=session):
numrevs = len(re.findall(r_timestamp, xml))
# Due to how generators work, it's expected this may be less than the total number of revisions of the page
# TODO: get the page title and reuse the usual format "X title, y edits"
print "%d more revisions exported" % numrevs
xml = cleanXML(xml=xml)
xmlfile.write(xml.encode('utf-8'))
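For context, this is roughly how that count is obtained; the exact pattern bound to r_timestamp is an assumption here (the real one is defined elsewhere in dumpgenerator.py), but any regex matching one <timestamp> per revision works the same way:

import re

# Hypothetical stand-in for the module-level r_timestamp used above.
r_timestamp = r'<timestamp>[^<]*</timestamp>'

def count_revisions(xml):
    # Every <revision> element in the export carries exactly one
    # <timestamp>, so counting matches estimates the revisions written.
    return len(re.findall(r_timestamp, xml))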
@@ -835,7 +836,18 @@ def getXMLRevisions(config={}, session=None, allpages=False):
arvparams['arvprop'] = 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content'
print("Trying to get wikitext from the allrevisions API and to build the XML")
while True:
arvrequest = site.api(**arvparams)
try:
arvrequest = site.api(**arvparams)
except requests.exceptions.ReadTimeout as err:
# Hopefully temporary, just wait a bit and continue with the same request.
# No point in putting a limit on the retries; we would have to abort everything anyway.
# TODO: reuse the retry logic of the checkAPI phase? Or force mwclient
# to use the retry adapter we use for our own requests session?
print("ERROR: {}".format(str(err)))
print("Sleeping for 20 seconds")
time.sleep(20)
continue
for page in arvrequest['query']['allrevisions']:
yield makeXmlFromPage(page)
if 'continue' in arvrequest:
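The TODO above asks whether this retry could be shared rather than repeated at every call site. A minimal sketch of one way to do that, assuming nothing beyond the site.api(**params) call already used in this function; the helper name and the fixed 20-second sleep are illustrative, not part of the commit:

import time
import requests

def api_with_retry(site, sleep=20, **params):
    # Retry the same request indefinitely on read timeouts; as the
    # comment above notes, capping the retries would just abort the dump.
    while True:
        try:
            return site.api(**params)
        except requests.exceptions.ReadTimeout as err:
            print("ERROR: {}".format(str(err)))
            print("Sleeping for {} seconds".format(sleep))
            time.sleep(sleep)

# Usage would then be: arvrequest = api_with_retry(site, **arvparams)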
@@ -851,6 +863,10 @@ def getXMLRevisions(config={}, session=None, allpages=False):
# We only need the revision ID, all the rest will come from the raw export
arvparams['arvprop'] = 'ids'
arvrequest = site.api(**arvparams)
exportparams = {
'action': 'query',
'export': '1',
}
# Skip the namespace if it's empty
if len(arvrequest['query']['allrevisions']) < 1:
continue
@@ -862,14 +878,11 @@ def getXMLRevisions(config={}, session=None, allpages=False):
for revision in page['revisions']:
revids.append(str(revision['revid']))
print "%d more revisions listed, until %s" % (len(revids), revids[-1])
# We can now get the XML for one revision at a time
# FIXME: we can actually get them in batches as we used to
# but we need to figure out the continuation and prevent the API
# from giving us only the latest revision of each page
exportparams = {
'action': 'query',
'export': '1',
}
for revid in revids:
exportparams['revids'] = revid
exportrequest = site.api(**exportparams)
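The FIXME above mentions fetching the revisions in batches again. Purely as a sketch of the mechanics (not what this commit does), the revids can be joined with '|' up to the usual API limit of 50 per request; the chunks helper is hypothetical:

def chunks(seq, size=50):
    # Yield successive slices of at most `size` items.
    for i in range(0, len(seq), size):
        yield seq[i:i + size]

for batch in chunks(revids):
    exportparams['revids'] = '|'.join(batch)
    exportrequest = site.api(**exportparams)
    # Caveat from the FIXME: when several revids belong to the same page,
    # the export may keep only the latest of them, so the continuation
    # would need to be sorted out before switching back to batches.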
@@ -883,7 +896,16 @@ def getXMLRevisions(config={}, session=None, allpages=False):
if 'continue' in arvrequest:
# Get the new ones
arvparams['arvcontinue'] = arvrequest['continue']['arvcontinue']
arvrequest = site.api(**arvparams)
try:
arvrequest = site.api(**arvparams)
except requests.exceptions.ReadTimeout as err:
# As above
print("ERROR: {}".format(str(err)))
print("Sleeping for 20 seconds")
time.sleep(20)
# But avoid rewriting the same revisions
arvrequest['query']['allrevisions'] = []
continue
else:
# End of continuation. We are done with this namespace.
break
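Stripped of the retry and error handling, the allrevisions continuation loop in this hunk follows the standard API pattern: keep passing the token back while the response carries a 'continue' object, and break out as soon as it does not, moving on to the next namespace. Roughly:

while True:
    arvrequest = site.api(**arvparams)
    # ... yield the pages in arvrequest['query']['allrevisions'] ...
    if 'continue' in arvrequest:
        # More revisions to list: carry the token into the next request.
        arvparams['arvcontinue'] = arvrequest['continue']['arvcontinue']
    else:
        # No continuation: this namespace is done.
        break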
@@ -894,7 +916,11 @@ def getXMLRevisions(config={}, session=None, allpages=False):
if config['curonly']:
# The raw XML export in the API gets a title and gives the latest revision.
# We could also use the allpages API as generator but let's be consistent.
print("Getting titles to export the latest revision for each")
c = 0
for title in readTitles(config):
# TODO: respect verbose flag, reuse output from getXMLPage
print(' {}'.format(title.strip()))
# TODO: as we're doing one page and revision at a time, we might
# as well use xml format and exportnowrap=1 and use the string of
# XML as-is, but we need to check how well the library handles it.
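About the exportnowrap=1 idea in that TODO: the API can return the export as bare XML instead of wrapping it inside the JSON result, which would skip the unwrapping done below. A sketch with a plain requests session rather than mwclient; apiurl and session are assumed names, not defined in this function:

r = session.get(apiurl, params={
    'action': 'query',
    'titles': title,
    'export': 1,
    'exportnowrap': 1,
}, timeout=120)
xml = r.text  # Already a <mediawiki> export document, no JSON to unwrap.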
@@ -905,6 +931,9 @@ def getXMLRevisions(config={}, session=None, allpages=False):
}
exportrequest = site.api(**exportparams)
xml = exportrequest['query']['export']['*']
c += 1
if c % 10 == 0:
print('Downloaded {} pages'.format(c))
# Because we got the fancy XML from the JSON format, clean it:
yield makeXmlPageFromRaw(xml)
else:
@@ -914,16 +943,23 @@ def getXMLRevisions(config={}, session=None, allpages=False):
# to be given the page titles; otherwise, the requests are similar.
# The XML needs to be made manually because the export=1 option
# refuses to return an arbitrary number of revisions (see above).
print("Getting titles to export all the revisions of each")
c = 0
for title in readTitles(config):
print(' {}'.format(title.strip()))
# Try to ask for everything. At least on MediaWiki 1.16, unknown props are discarded:
# "warnings":{"revisions":{"*":"Unrecognized values for parameter 'rvprop': userid, sha1, contentmodel"}}}
pparams = {
'action': 'query',
'titles': title,
'prop': 'revisions',
'rvlimit': 50,
'rvprop': 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content',
}
prequest = site.api(**pparams)
c += 1
if c % 10 == 0:
print('Downloaded {} pages'.format(c))
# The array is called "pages" even if there's only one.
# TODO: we could actually batch titles a bit here if desired. How many?
try:
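On the TODO about batching titles: the query module accepts up to 50 titles per request (500 with apihighlimits), joined with '|'. A hedged sketch, with title_batch as a hypothetical list of already-read titles; the caveat in the comment would need to be verified on the target wikis before adopting this:

pparams['titles'] = '|'.join(title_batch)  # title_batch: up to 50 titles
prequest = site.api(**pparams)
# Caveat: with more than one title, prop=revisions typically refuses
# rvlimit ("rvlimit may only be used with a single page"), so batching
# would only help when exporting the latest revision of each page.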
@@ -936,18 +972,21 @@ def getXMLRevisions(config={}, session=None, allpages=False):
for page in pages:
try:
xml = makeXmlFromPage(pages[page])
yield xml
except PageMissingError:
logerror(
config=config,
text=u'Error: empty revision from API. Could not export page: %s' % (title.decode('utf-8'))
)
continue
yield xml
# Get next batch of revisions if there's more.
if 'continue' in prequest:
print("Getting more revisions for page {}".format(title))
pparams['rvcontinue'] = prequest['rvcontinue']
prequest = site.api(**pparams)
else:
break
except mwclient.errors.MwClientError:
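One detail worth double-checking in this hunk: the code tests for a top-level 'continue' key but then reads prequest['rvcontinue'] directly. With the modern continuation format (MediaWiki 1.26+) the token is nested, so the lookup would presumably be:

if 'continue' in prequest:
    print("Getting more revisions for page {}".format(title))
    pparams['rvcontinue'] = prequest['continue']['rvcontinue']
    prequest = site.api(**pparams)

Very old releases report continuation under 'query-continue' instead, so this branch may simply never fire on the MediaWiki 1.16 wikis this path targets.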
@@ -958,30 +997,41 @@ def makeXmlFromPage(page):
""" Output an XML document as a string from a page as in the API JSON """
try:
p = E.page(
E.title(page['title']),
E.title(to_unicode(page['title'])),
E.ns(to_unicode(page['ns'])),
E.id(to_unicode(page['pageid'])),
)
for rev in page['revisions']:
# Older releases like MediaWiki 1.16 do not return all fields.
if 'userid' in rev:
userid = rev['userid']
else:
userid = 0
if 'size' in rev:
size = rev['size']
else:
size = 0
revision = E.revision(
E.id(to_unicode(rev['revid'])),
E.parentid(to_unicode(rev['parentid'])),
E.timestamp(rev['timestamp']),
E.contributor(
E.id(to_unicode(rev['userid'])),
E.id(to_unicode(userid)),
E.username(to_unicode(rev['user'])),
),
E.comment(rev['comment']),
E.text(rev['*'], space="preserve", bytes=to_unicode(rev['size'])),
E.text(rev['*'], space="preserve", bytes=to_unicode(size)),
)
if 'comment' in rev:
revision.append(E.comment(to_unicode(rev['comment'])))
if 'contentmodel' in rev:
revision.append(E.model(rev['contentmodel']))
# The sha1 may not have been backfilled on older wikis, or may be missing for other reasons (e.g. Wikia).
if 'sha1' in rev:
revision.append(E.sha1(rev['sha1']))
p.append(revision)
except KeyError:
raise PageMissingError(page['title'], '')
except KeyError as e:
print(e)
raise PageMissingError(page['title'], e)
return etree.tostring(p, pretty_print=True)
def readTitles(config={}, start=None):
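To make the expected input and output of makeXmlFromPage concrete, a small usage example; the values are invented, but the shape mirrors a prop=revisions response from an older wiki, where userid, size, sha1 and contentmodel are absent and the new fallbacks apply:

page = {
    'title': u'Main Page',
    'ns': 0,
    'pageid': 1,
    'revisions': [{
        'revid': 10,
        'parentid': 0,
        'timestamp': u'2010-01-01T00:00:00Z',
        'user': u'Admin',
        'comment': u'initial import',
        '*': u'Some wikitext.',
        # No 'userid', 'size', 'sha1' or 'contentmodel', as on MediaWiki
        # 1.16: the fallbacks above fill in userid=0 and bytes=0.
    }],
}
xml = makeXmlFromPage(page)
# xml is a pretty-printed <page> element with <title>, <ns>, <id> and
# one <revision> child, ready to be written into the dump.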
