Fix scraper for sites with Unicode titles

pull/124/head
balr0g 10 years ago
parent 7a6ef18339
commit 62be069026

@@ -267,9 +267,10 @@ def getPageTitlesScraper(config={}):
c = 0
m = re.compile(r_title).finditer(rawacum)
for i in m:
-if not i.group('title').startswith('Special:'):
-if not i.group('title') in titles:
-titles.append(undoHTMLEntities(text=i.group('title')))
+t = undoHTMLEntities(text=unicode(i.group('title'), 'utf-8'))
+if not t.startswith('Special:'):
+if not t in titles:
+titles.append(t)
c += 1
print ' %d titles retrieved in the namespace %d' % (c, namespace)
return titles

Loading…
Cancel
Save