From eb8b44aef0169f35a8baa5af1c852ea7448bd336 Mon Sep 17 00:00:00 2001 From: Benjamin Mako Hill Date: Fri, 6 Feb 2015 18:50:25 -0800 Subject: [PATCH] strip <sha1> tags returned under <page> The Wikia API is exporting sha1 sums as part of the response for pages. These are invalid XML and are causing dump parsing code (e.g., MediaWiki-Utilities) to fail. Also, sha1 should belong to revisions, not pages, so it's not entirely clear to me what this is referring to. --- dumpgenerator.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/dumpgenerator.py b/dumpgenerator.py index 1c99c24..678272b 100644 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -509,6 +509,12 @@ def getXMLPage(config={}, title='', verbose=True, session=None): xml = getXMLPageCore(params=params, config=config, session=session) if not xml: raise PageMissingError + else: + # strip these sha1s sums which keep showing up in the export and + # which are invalid for the XML schema (they only apply to + # revisions) + xml = re.sub(r'\n\s*\w+\s*\n', r'\n', xml) + xml = re.sub(r'\n\s*\s*\n', r'\n', xml) yield xml.split("")[0]