diff --git a/dumpgenerator.py b/dumpgenerator.py index 1c99c24..678272b 100644 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -509,6 +509,12 @@ def getXMLPage(config={}, title='', verbose=True, session=None): xml = getXMLPageCore(params=params, config=config, session=session) if not xml: raise PageMissingError + else: + # strip these sha1s sums which keep showing up in the export and + # which are invalid for the XML schema (they only apply to + # revisions) + xml = re.sub(r'\n\s*\w+\s*\n', r'\n', xml) + xml = re.sub(r'\n\s*\s*\n', r'\n', xml) yield xml.split("")[0]