Implement resume in --xmlrevisions (but not yet with list=allrevisions)

Tested with a partial dump over 100 MB:
https://tinyvillage.fandom.com/api.php
(grepped <title> to check that the previously downloaded pages were kept and
that the new ones continued from the expected title; did not validate the final XML).
pull/368/head
Federico Leva 4 years ago
parent a664b17a9c
commit 9ac1e6d0f1

@ -715,12 +715,16 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
lock = True
if config['xmlrevisions']:
print 'Retrieving the XML for every page from the beginning'
xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
xmlfile.write(header.encode('utf-8'))
if start:
print("WARNING: will try to start the download from title: {}".format(start))
xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
else:
print 'Retrieving the XML for every page from the beginning'
xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
xmlfile.write(header.encode('utf-8'))
try:
r_timestamp = r'<timestamp>([^<]+)</timestamp>'
for xml in getXMLRevisions(config=config, session=session):
for xml in getXMLRevisions(config=config, session=session, start=start):
numrevs = len(re.findall(r_timestamp, xml))
# Due to how generators work, it's expected this may be less
# TODO: get the page title and reuse the usual format "X title, y edits"
@ -776,7 +780,7 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
xmlfile.close()
print 'XML dump saved at...', xmlfilename
def getXMLRevisions(config={}, session=None, allpages=False):
def getXMLRevisions(config={}, session=None, allpages=False, start=None):
# FIXME: actually figure out the various strategies for each MediaWiki version
apiurl = urlparse(config['api'])
# FIXME: force the protocol we asked for! Or don't verify SSL if we asked HTTP?
@ -790,7 +794,7 @@ def getXMLRevisions(config={}, session=None, allpages=False):
try:
for namespace in namespaces:
print "Trying to export all revisions from namespace %s" % namespace
print("Trying to export all revisions from namespace %s" % namespace)
# arvgeneratexml exists but was deprecated in 1.26 (while arv is from 1.27?!)
arvparams = {
'action': 'query',
@ -910,7 +914,7 @@ def getXMLRevisions(config={}, session=None, allpages=False):
# We could also use the allpages API as generator but let's be consistent.
print("Getting titles to export the latest revision for each")
c = 0
for title in readTitles(config):
for title in readTitles(config, start=start):
# TODO: respect verbose flag, reuse output from getXMLPage
print(' {}'.format(title.strip()))
# TODO: as we're doing one page and revision at a time, we might
@ -944,7 +948,7 @@ def getXMLRevisions(config={}, session=None, allpages=False):
# refuses to return an arbitrary number of revisions (see above).
print("Getting titles to export all the revisions of each")
c = 0
for title in readTitles(config):
for title in readTitles(config, start=start):
print(' {}'.format(title.strip()))
# Try and ask everything. At least on MediaWiki 1.16, unknown props are discarded:
# "warnings":{"revisions":{"*":"Unrecognized values for parameter 'rvprop': userid, sha1, contentmodel"}}}

Loading…
Cancel
Save