Support Unicode usernames etc. in makeXmlFromPage()

Test case:

Titles saved at... 39fanficwikiacom-20180521-titles.txt
377 page titles loaded
http://39fanfic.wikia.com/api.php
Getting the XML header from the API
Retrieving the XML for every page from the beginning
30 namespaces found
Exporting revisions from namespace 0
Warning. Could not use allrevisions, wiki too old.
1 more revisions exported
Traceback (most recent call last):
  File "./dumpgenerator.py", line 2291, in <module>
    main()
  File "./dumpgenerator.py", line 2283, in main
    createNewDump(config=config, other=other)
  File "./dumpgenerator.py", line 1849, in createNewDump
    generateXMLDump(config=config, titles=titles, session=other['session'])
  File "./dumpgenerator.py", line 732, in generateXMLDump
    for xml in getXMLRevisions(config=config, session=session):
  File "./dumpgenerator.py", line 861, in getXMLRevisions
    yield makeXmlFromPage(pages[page])
  File "./dumpgenerator.py", line 880, in makeXmlFromPage
    E.username(str(rev['user'])),
UnicodeEncodeError: 'ascii' codec can't encode characters in position 1-3: ordinal not in range(128)
pull/319/head
Federico Leva 6 years ago
parent 3df2513e67
commit bbcafdf869

@@ -20,7 +20,7 @@
 # https://github.com/WikiTeam/wikiteam/wiki
 try:
-    from kitchen.text.converters import getwriter
+    from kitchen.text.converters import getwriter, to_unicode
 except ImportError:
     print "Please install the kitchen module."
 import cookielib
@@ -868,19 +868,19 @@ def makeXmlFromPage(page):
     """ Output an XML document as a string from a page as in the API JSON """
     p = E.page(
         E.title(page['title']),
-        E.ns(str(page['ns'])),
-        E.id(str(page['pageid'])),
+        E.ns(to_unicode(page['ns'])),
+        E.id(to_unicode(page['pageid'])),
     )
     for rev in page['revisions']:
         revision = E.revision(
-            E.id(str(rev['revid'])),
+            E.id(to_unicode(rev['revid'])),
             E.timestamp(rev['timestamp']),
             E.contributor(
-                E.id(str(rev['userid'])),
-                E.username(str(rev['user'])),
+                E.id(to_unicode(rev['userid'])),
+                E.username(to_unicode(rev['user'])),
             ),
             E.comment(rev['comment']),
-            E.text(rev['*'], space="preserve", bytes=str(rev['size'])),
+            E.text(rev['*'], space="preserve", bytes=to_unicode(rev['size'])),
             E.sha1(rev['sha1']),
         )
         if 'contentmodel' in rev:

Loading…
Cancel
Save