Don't try to add revisions if the namespace has none

Traceback (most recent call last): File "dumpgenerator.py", line 2362, in <module> File "dumpgenerator.py", line 2354, in main resumePreviousDump(config=config, other=other) File "dumpgenerator.py", line 1921, in createNewDump getPageTitles(config=config, session=other['session']) File "dumpgenerator.py", line 755, in generateXMLDump for xml in getXMLRevisions(config=config, session=session): File "dumpgenerator.py", line 861, in getXMLRevisions revids.append(str(revision['revid'])) IndexError: list index out of range
4 years ago · f10adb71af
parent 3760501f74
commit f10adb71af
1 changed files with 15 additions and 6 deletions
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@ -850,12 +850,14 @@ def getXMLRevisions(config={}, session=None, allpages=False):
                print("Trying to list the revisions and to export them one by one")
                # We only need the revision ID, all the rest will come from the raw export
                arvparams['arvprop'] = 'ids'
+                arvrequest = site.api(**arvparams)
+                # Skip the namespace if it's empty
+                if len(arvrequest['query']['allrevisions']) < 1:
+                    continue
                # Repeat the arvrequest with new arvparams until done
                while True:
                    # Reset revision IDs from the previous batch from arv
                    revids = []
-                    # Get the new ones
-                    arvrequest = site.api(**arvparams)
                    for page in arvrequest['query']['allrevisions']:
                        for revision in page['revisions']:
                            revids.append(str(revision['revid']))
@ -879,19 +881,22 @@ def getXMLRevisions(config={}, session=None, allpages=False):
                        yield makeXmlPageFromRaw(xml)
                        
                    if 'continue' in arvrequest:
+                        # Get the new ones
                        arvparams['arvcontinue'] = arvrequest['continue']['arvcontinue']
+                        arvrequest = site.api(**arvparams)
                    else:
                        # End of continuation. We are done with this namespace.
                        break
                    
-
    except KeyError:
+        # TODO: check whether the KeyError was really for a missing arv API
        print "Warning. Could not use allrevisions. Wiki too old?"
        if config['curonly']:
            # The raw XML export in the API gets a title and gives the latest revision
            for title in readTitles(config):
-                # TODO: as we're doing one page and revision at a time,
-                # we might as well use xml format and exportnowrap=1
+                # TODO: as we're doing one page and revision at a time, we might
+                # as well use xml format and exportnowrap=1 to use the string of,
+                # XML as is, but need to check how well the library handles it.
                exportparams = {
                    'action': 'query',
                    'titles': title,
@ -899,8 +904,13 @@ def getXMLRevisions(config={}, session=None, allpages=False):
                }
                exportrequest = site.api(**exportparams)
                xml = exportrequest['query']['export']['*']
+                # Because we got the fancy XML from the JSON format, clean it:
                yield makeXmlPageFromRaw(xml)
        else:
+            # This is the closest to what we usually do with Special:Export:
+            # take one title at a time and try to get all revisions exported.
+            # The XML needs to be made manually because the export=1 option
+            # refuses to return an arbitrary number of revisions (see above).
            for title in readTitles(config):
                pparams = {
                    'action': 'query',
@ -908,7 +918,6 @@ def getXMLRevisions(config={}, session=None, allpages=False):
                    'prop': 'revisions',
                    'rvlimit': 'max',
                    'rvprop': 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content',
-                    'rawcontinue': 'yes'
                }
                prequest = site.api(**pparams)
                try: