Fix scraper for sites with Unicode titles

10 years ago · 62be069026
parent 7a6ef18339
commit 62be069026
1 changed file with 4 additions and 3 deletions
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -267,9 +267,10 @@ def getPageTitlesScraper(config={}):
        c = 0
        m = re.compile(r_title).finditer(rawacum)
        for i in m:
-            if not i.group('title').startswith('Special:'):
-                if not i.group('title') in titles:
-                    titles.append(undoHTMLEntities(text=i.group('title')))
+            t = undoHTMLEntities(text=unicode(i.group('title'), 'utf-8'))
+            if not t.startswith('Special:'):
+                if not t in titles:
+                    titles.append(t)
                    c += 1
        print '    %d titles retrieved in the namespace %d' % (c, namespace)
    return titles