Support Unicode usernames etc. in makeXmlFromPage()

Test case:

Titles saved at... 39fanficwikiacom-20180521-titles.txt
377 page titles loaded
http://39fanfic.wikia.com/api.php
Getting the XML header from the API
Retrieving the XML for every page from the beginning
30 namespaces found
Exporting revisions from namespace 0
Warning. Could not use allrevisions, wiki too old.
1 more revisions exported
Traceback (most recent call last):
  File "./dumpgenerator.py", line 2291, in <module>
    main()
  File "./dumpgenerator.py", line 2283, in main
    createNewDump(config=config, other=other)
  File "./dumpgenerator.py", line 1849, in createNewDump
    generateXMLDump(config=config, titles=titles, session=other['session'])
  File "./dumpgenerator.py", line 732, in generateXMLDump
    for xml in getXMLRevisions(config=config, session=session):
  File "./dumpgenerator.py", line 861, in getXMLRevisions
    yield makeXmlFromPage(pages[page])
  File "./dumpgenerator.py", line 880, in makeXmlFromPage
    E.username(str(rev['user'])),
UnicodeEncodeError: 'ascii' codec can't encode characters in position 1-3: ordinal not in range(128)
6 years ago · bbcafdf869
parent 3df2513e67
commit bbcafdf869
1 changed file with 7 additions and 7 deletions
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -20,7 +20,7 @@
 #     https://github.com/WikiTeam/wikiteam/wiki

 try:
-    from kitchen.text.converters import getwriter
+    from kitchen.text.converters import getwriter, to_unicode
 except ImportError:
    print "Please install the kitchen module."
 import cookielib
@@ -868,19 +868,19 @@ def makeXmlFromPage(page):
    """ Output an XML document as a string from a page as in the API JSON """
    p = E.page(
            E.title(page['title']),
-            E.ns(str(page['ns'])),
-            E.id(str(page['pageid'])),
+            E.ns(to_unicode(page['ns'])),
+            E.id(to_unicode(page['pageid'])),
       )
    for rev in page['revisions']:
        revision = E.revision(
-               E.id(str(rev['revid'])),
+               E.id(to_unicode(rev['revid'])),
               E.timestamp(rev['timestamp']),
               E.contributor(
-                    E.id(str(rev['userid'])),
-                    E.username(str(rev['user'])),
+                    E.id(to_unicode(rev['userid'])),
+                    E.username(to_unicode(rev['user'])),
               ),
               E.comment(rev['comment']),
-               E.text(rev['*'], space="preserve", bytes=str(rev['size'])),
+               E.text(rev['*'], space="preserve", bytes=to_unicode(rev['size'])),
               E.sha1(rev['sha1']),
        )
        if 'contentmodel' in rev: