updated wikiadownloader.py to work with new dumps

Bitrot seems to have gotten the best of this script, and it sounds like it
hasn't been used in a while. This at least gets it working again by:

- finding both the .gz and the .7z dumps
- parsing the new date format in the HTML (sketched below)
- finding dumps in the correct place
- moving all chatter to stderr instead of stdout
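
For reference, here is a minimal sketch of what the new regex pulls out of the
Special:Statistics page; the HTML snippet and dump URL below are made up for
illustration, only the pattern itself comes from the diff:

import re

# the new pattern from the diff, split across string literals for readability
pattern = re.compile(
    r'(?i)<a href="(?P<urldump>http://[^<>]+pages_(?P<dump>current|full)\.xml\.'
    r'(?P<compression>gz|7z|bz2))">(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2}) '
    r'(?P<time>\d\d:\d\d:\d\d)')

# made-up sample of the statistics HTML (URL and wiki name are hypothetical)
sample_html = ('<a href="http://s3.amazonaws.com/wikia_xml_dumps/m/mu/'
               'muppet_pages_current.xml.7z">2015-03-01 12:34:56</a>')

for i in pattern.finditer(sample_html):
    print(i.group("urldump"))      # full dump URL
    print(i.group("dump"))         # current
    print(i.group("compression"))  # 7z
    # date assembled the same way the patched script does it
    print("%s-%s-%s" % (i.group("year"), i.group("month"), i.group("day")))  # 2015-03-01
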
Benjamin Mako Hill 9 years ago
parent eb8b44aef0
commit f4ec129bff

@@ -42,40 +42,41 @@ f = open('wikia.com', 'r')
 wikia = f.read().strip().split('\n')
 f.close()
-print len(wikia), 'wikis in Wikia'
+print >>sys.stderr, len(wikia), 'wikis in Wikia'
 start = '!'
 if len(sys.argv) > 1:
     start = sys.argv[1]
 for wiki in wikia:
     wiki = wiki.lower()
     prefix = wiki.split('http://')[1]
     if prefix < start:
         continue
-    print wiki
-    path = '%s/%s/%s' % (prefix[0], prefix[0:2], prefix)
+    print >>sys.stderr, "Starting:", wiki
     f = urllib.urlopen('%s/wiki/Special:Statistics' % (wiki))
     html = f.read()
     #print html
     f.close()
-    m = re.compile(r'(?i)<a href="(?P<urldump>http://[^<>]+pages_(?P<dump>current|full)\.xml\.gz)">(?P<hour>\d\d:\d\d), (?P<month>[a-z]+) (?P<day>\d+), (?P<year>\d+)</a>').finditer(html)
-    for i in m:
+    m = re.compile(r'(?i)<a href="(?P<urldump>http://[^<>]+pages_(?P<dump>current|full)\.xml\.(?P<compression>gz|7z|bz2))">(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2}) (?P<time>\d\d:\d\d:\d\d)')
+    for i in m.finditer(html):
         urldump = i.group("urldump")
         dump = i.group("dump")
+        date = "%s-%s-%s" % (i.group("year"), i.group("month"), i.group("day"))
+        compression = i.group("compression")
-        print 'Downloading', wiki
-        if not os.path.exists(path):
-            os.makedirs(path)
+        print >>sys.stderr, 'Downloading', wiki, dump.lower()
-        f = urllib.urlopen('%s/index.json' % ('/'.join(urldump.split('/')[:-1])))
-        json = f.read()
-        f.close()
-        #{"name":"pages_full.xml.gz","timestamp":1273755409,"mwtimestamp":"20100513125649"}
-        #{"name":"pages_current.xml.gz","timestamp":1270731925,"mwtimestamp":"20100408130525"}
-        date = re.findall(r'{"name":"pages_%s.xml.gz","timestamp":\d+,"mwtimestamp":"(\d{8})\d{6}"}' % (dump.lower()), json)[0]
-        print urldump, dump, date #, hour, month, day, year
         #-q, turn off verbose
-        os.system('wget -q -c "%s" -O %s/%s-%s-pages-meta-%s.gz' % (urldump, path, prefix, date, dump.lower() == 'current' and 'current' or 'history'))
+        os.system('wget -q -c "%s" -O %s-%s-pages-meta-%s.%s' % (urldump, prefix, date, dump.lower() == 'current' and 'current' or 'history', compression))
+    if not m.search(html):
+        print >>sys.stderr, 'Failed to download:', wiki
+        print >>sys.stderr, wiki
 fail_file.close()
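
And a quick sketch of the filenames the new wget call writes out; everything
below except the format string is a made-up example:

# values are hypothetical; the format string mirrors the new os.system() call
prefix = 'muppet.wikia.com'   # wiki prefix, i.e. wiki.split('http://')[1]
date = '2015-03-01'           # dump date parsed from the Special:Statistics page
dump = 'full'                 # 'current' or 'full'
compression = '7z'            # gz, 7z, or bz2

filename = '%s-%s-pages-meta-%s.%s' % (
    prefix, date, dump.lower() == 'current' and 'current' or 'history', compression)
print(filename)  # muppet.wikia.com-2015-03-01-pages-meta-history.7z

With the old naming, everything landed under the per-wiki path and always got a
.gz extension; now the file is written to the current directory and the
extension follows whatever compression the dump actually uses.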
