pull/319/head
emijrp 6 years ago
commit a82a98a40a

File diff suppressed because it is too large

@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-
# dumpgenerator.py A generator of dumps for wikis
# Copyright (C) 2011-2016 WikiTeam developers
# Copyright (C) 2011-2018 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
@ -49,8 +49,17 @@ try:
import wikitools
except ImportError:
print "Please install the wikitools 1.3+ module if you want to use --xmlrevisions."
try:
from lxml import etree
from lxml.builder import E
except ImportError:
print "Please install the lxml module if you want to use --xmlrevisions."
import time
import urllib
try:
from urlparse import urlparse, urlunparse
except ImportError:
from urllib.parse import urlparse, urlunparse
UTF8Writer = getwriter('utf8')
sys.stdout = UTF8Writer(sys.stdout)
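# Illustrative sketch (not part of the patch): the optional imports above only
# print a warning on failure; a hypothetical HAS_LXML flag shows how such a
# soft dependency could also gate the --xmlrevisions code path explicitly.
try:
    from lxml import etree
    from lxml.builder import E
    HAS_LXML = True
except ImportError:
    HAS_LXML = False
    print "Please install the lxml module if you want to use --xmlrevisions."
# later, e.g.: if config['xmlrevisions'] and not HAS_LXML: sys.exit(1)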
@ -155,7 +164,7 @@ def getNamespacesScraper(config={}, session=None):
namespacenames = {0: ''} # main is 0, no prefix
if namespaces:
r = session.post(
url=config['index'], data={'title': 'Special:Allpages'}, timeout=30)
url=config['index'], params={'title': 'Special:Allpages'}, timeout=30)
raw = r.text
delay(config=config, session=session)
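# Why data= becomes params= throughout this patch (sketch using requests
# directly; the wiki URL is hypothetical): data= sends the values in the POST
# body, while params= appends them to the URL as a query string, which is what
# index.php?title=Special:Allpages normally expects.
import requests
session = requests.Session()
# POST body: .../index.php with title hidden in the form data
session.post('https://wiki.example.org/index.php', data={'title': 'Special:Allpages'}, timeout=30)
# Query string: .../index.php?title=Special:Allpages
session.post('https://wiki.example.org/index.php', params={'title': 'Special:Allpages'}, timeout=30)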
@ -192,7 +201,7 @@ def getNamespacesAPI(config={}, session=None):
if namespaces:
r = session.post(
url=config['api'],
data={
params={
'action': 'query',
'meta': 'siteinfo',
'siprop': 'namespaces',
@ -277,7 +286,7 @@ def getPageTitlesAPI(config={}, session=None):
apfrom = jsontitles['continue']['apcontinue']
elif 'apfrom' in jsontitles['continue']:
apfrom = jsontitles['continue']['apfrom']
# print apfrom
# print jsontitles
allpages = jsontitles['query']['allpages']
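# Sketch of the continuation logic above with plain requests (API URL is
# hypothetical): newer MediaWiki returns 'apcontinue', older releases 'apfrom';
# merging the whole 'continue' object back into the request covers both.
import requests
api = 'https://wiki.example.org/api.php'
params = {'action': 'query', 'list': 'allpages', 'aplimit': 500,
          'format': 'json', 'continue': ''}
while True:
    jsontitles = requests.get(api, params=params, timeout=30).json()
    for page in jsontitles['query']['allpages']:
        print page['title']
    if 'continue' not in jsontitles:
        break
    params.update(jsontitles['continue'])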
@ -392,13 +401,11 @@ def getPageTitles(config={}, session=None):
titles = []
if 'api' in config and config['api']:
r = session.post(config['api'], {'action': 'query', 'list': 'allpages', 'format': 'json'}, timeout=30)
test = getJSON(r)
if ('warnings' in test and 'allpages' in test['warnings'] and '*' in test['warnings']['allpages']
and test['warnings']['allpages']['*'] == 'The "allpages" module has been disabled.'):
titles = getPageTitlesScraper(config=config, session=session)
else:
try:
titles = getPageTitlesAPI(config=config, session=session)
except:
print "Error: could not get page titles from the API"
titles = getPageTitlesScraper(config=config, session=session)
elif 'index' in config and config['index']:
titles = getPageTitlesScraper(config=config, session=session)
@ -418,7 +425,7 @@ def getPageTitles(config={}, session=None):
print '%d page titles loaded' % (c)
return titlesfilename
def getImageNames(config={}, session=None):
""" Get list of image names """
@ -442,17 +449,19 @@ def getXMLHeader(config={}, session=None):
# similar to: <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/"
# xmlns:x....
randomtitle = 'Main_Page' # previously AMF5LKE43MNFGHKSDMRTJ
print config['api']
if config['xmlrevisions'] and config['api'] and config['api'].endswith("api.php"):
xml = None
try:
r = session.get(config['api'] + '?action=query&revids=1&export&exportnowrap', timeout=10)
xml = r.text
print 'Getting the XML header from the API'
r = session.get(config['api'] + '?action=query&revids=1&export&format=json', timeout=10)
xml = r.json()['query']['export']['*']
if not xml:
r = session.get(config['api'] + '?action=query&revids=1&export&exportnowrap', timeout=10)
xml = r.text
except requests.exceptions.RetryError:
pass
if not xml:
r = session.get(config['api'] + '?action=query&revids=1&export&format=json', timeout=10)
xml = r.json()['query']['export']['*']
else:
try:
xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
@ -468,7 +477,7 @@ def getXMLHeader(config={}, session=None):
print "Trying the local name for the Special namespace instead"
r = session.post(
url=config['api'],
data={
params={
'action': 'query',
'meta': 'siteinfo',
'siprop': 'namespaces',
@ -485,9 +494,15 @@ def getXMLHeader(config={}, session=None):
header = xml.split('</mediawiki>')[0]
if not re.match(r"\s*<mediawiki", xml):
print 'XML export on this wiki is broken, quitting.'
logerror(u'XML export on this wiki is broken, quitting.')
sys.exit()
if config['xmlrevisions']:
# Try again the old way
print 'Export test via the API failed. Wiki too old? Trying without xmlrevisions.'
config['xmlrevisions'] = False
header, config = getXMLHeader(config=config, session=session)
else:
print 'XML export on this wiki is broken, quitting.'
logerror(u'XML export on this wiki is broken, quitting.')
sys.exit()
return header, config
@ -572,7 +587,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
return '' # empty xml
# FIXME HANDLE HTTP Errors HERE
try:
r = session.post(url=config['index'], data=params, headers=headers, timeout=10)
r = session.post(url=config['index'], params=params, headers=headers, timeout=10)
handleStatusCode(r)
xml = fixBOM(r)
except requests.exceptions.ConnectionError as e:
@ -768,44 +783,110 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
xmlfile.close()
print 'XML dump saved at...', xmlfilename
def getXMLRevisions(config={}, session=None):
def getXMLRevisions(config={}, session=None, allpages=False):
site = wikitools.wiki.Wiki(config['api'])
#if config['namespaces']:
# namespaces, namespacenames = getNamespacesAPI(config=config, session=session)
#else:
namespaces = ['*']
if not 'all' in config['namespaces']:
namespaces = config['namespaces']
else:
namespaces, namespacenames = getNamespacesAPI(config=config, session=session)
for namespace in namespaces:
print "Exporting revisions from namespace %s" % namespace
# TODO: 500 would be nicer, but need to find the wiki's limits
params = {
'action': 'query',
'list': 'allrevisions',
'arvlimit': 50,
'arvprop': 'ids',
try:
for namespace in namespaces:
print "Exporting revisions from namespace %s" % namespace
arvparams = {
'action': 'query',
'list': 'allrevisions',
'arvlimit': 500,
'arvnamespace': namespace
}
request = wikitools.api.APIRequest(site, params)
results = request.queryGen()
try:
for result in results:
revids = []
for page in result['query']['allrevisions']:
for revision in page['revisions']:
revids.append(str(revision['revid']))
if not config['curonly']:
# We have to build the XML manually...
# Skip flags, presumably needed to add <minor/> which is in the schema.
# Also missing: parentid and contentformat.
arvparams['arvprop'] = 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content'
arvrequest = wikitools.api.APIRequest(site, arvparams)
results = arvrequest.queryGen()
for result in results:
for page in result['query']['allrevisions']:
yield makeXmlFromPage(page)
else:
# Just cycle through revision IDs and use the XML as is
arvparams['arvprop'] = 'ids'
arvrequest = wikitools.api.APIRequest(site, arvparams)
arvresults = arvrequest.queryGen()
for result in arvresults:
revids = []
for page in result['query']['allrevisions']:
for revision in page['revisions']:
revids.append(str(revision['revid']))
print "%d more revisions listed, until %s" % (len(revids), revids[-1])
exportparams = {
'action': 'query',
'revids': '|'.join(revids),
'export': '1',
}
exportrequest = wikitools.api.APIRequest(site, exportparams)
exportresults = exportrequest.queryGen()
for exportresult in exportresults:
yield exportresult['query']['export']['*']
print "50 more revisions listed, until %s" % revids[-1]
except KeyError:
print "Warning. Could not use allrevisions, wiki too old."
if config['curonly']:
for title in readTitles(config):
exportparams = {
'action': 'query',
'titles': title,
'export': '1',
}
exportrequest = wikitools.api.APIRequest(site, exportparams)
exportresults = exportrequest.queryGen()
for exportresult in exportresults:
yield exportresult['query']['export']['*']
except wikitools.api.APIError:
print "This wikitools version seems not to work for us. Exiting."
sys.exit()
else:
for title in readTitles(config):
pparams = {
'action': 'query',
'titles': title,
'prop': 'revisions',
'rvlimit': 'max',
'rvprop': 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content'
}
prequest = wikitools.api.APIRequest(site, pparams)
results = prequest.queryGen()
for result in results:
pages = result['query']['pages']
for page in pages:
yield makeXmlFromPage(pages[page])
except wikitools.api.APIError:
print "This wikitools version seems not to work for us. Exiting."
sys.exit()
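# Equivalent sketch of the curonly allrevisions -> export loop above, using
# plain requests instead of wikitools (API URL is hypothetical; 50 ids per
# export request stays within the default revids limit).
import requests
api = 'https://wiki.example.org/api.php'
arvparams = {'action': 'query', 'list': 'allrevisions', 'arvlimit': 50,
             'arvnamespace': 0, 'arvprop': 'ids', 'format': 'json', 'continue': ''}
while True:
    batch = requests.get(api, params=arvparams, timeout=30).json()
    revids = [str(rev['revid'])
              for page in batch['query']['allrevisions']
              for rev in page['revisions']]
    export = requests.get(api, params={'action': 'query', 'revids': '|'.join(revids),
                                       'export': 1, 'format': 'json'}, timeout=30).json()
    print export['query']['export']['*']  # one <mediawiki> chunk per batch of ids
    if 'continue' not in batch:
        break
    arvparams.update(batch['continue'])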
def makeXmlFromPage(page):
""" Output an XML document as a string from a page as in the API JSON """
p = E.page(
E.title(page['title']),
E.ns(str(page['ns'])),
E.id(str(page['pageid'])),
)
for rev in page['revisions']:
revision = E.revision(
E.id(str(rev['revid'])),
E.timestamp(rev['timestamp']),
E.contributor(
E.id(str(rev['userid'])),
E.username(str(rev['user'])),
),
E.comment(rev['comment']),
E.text(rev['*'], space="preserve", bytes=str(rev['size'])),
E.sha1(rev['sha1']),
)
if 'contentmodel' in rev:
revision.append(E.model(rev['contentmodel']))
p.append(revision)
return etree.tostring(p, pretty_print=True)
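# What the E builder in makeXmlFromPage emits, shown on a toy page
# (all values below are made up):
from lxml import etree
from lxml.builder import E
p = E.page(E.title(u'Main Page'), E.ns('0'), E.id('1'))
p.append(E.revision(
    E.id('42'),
    E.timestamp('2018-04-01T00:00:00Z'),
    E.contributor(E.id('7'), E.username('Example')),
    E.comment('demo edit'),
    E.text('Hello, wiki!', space='preserve', bytes='12'),
    E.sha1('da39a3ee5e6b4b0d3255bfef95601890afd80709'),
))
print etree.tostring(p, pretty_print=True)
# -> <page><title>Main Page</title><ns>0</ns><id>1</id><revision>...</revision></page>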
def readTitles(config={}, start=None):
""" Read title list from a file, from the title "start" """
@ -942,7 +1023,7 @@ def getImageNamesScraper(config={}, session=None):
# http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
r = session.post(
url=config['index'],
data={
params={
'title': 'Special:Imagelist',
'limit': limit,
'offset': offset},
@ -1047,7 +1128,7 @@ def getImageNamesAPI(config={}, session=None):
'format': 'json',
'ailimit': 500}
# FIXME Handle HTTP Errors HERE
r = session.post(url=config['api'], data=params, timeout=30)
r = session.post(url=config['api'], params=params, timeout=30)
handleStatusCode(r)
jsonimages = getJSON(r)
delay(config=config, session=session)
@ -1105,7 +1186,7 @@ def getImageNamesAPI(config={}, session=None):
'iiprop': 'user|url',
'format': 'json'}
# FIXME Handle HTTP Errors HERE
r = session.post(url=config['api'], data=params, timeout=30)
r = session.post(url=config['api'], params=params, timeout=30)
handleStatusCode(r)
jsonimages = getJSON(r)
delay(config=config, session=session)
@ -1254,7 +1335,7 @@ def domain2prefix(config={}, session=None):
domain = config['index']
domain = domain.lower()
domain = re.sub(r'(https?://|www\.|/index\.php|/api\.php)', '', domain)
domain = re.sub(r'(https?://|www\.|/index\.php.*|/api\.php.*)', '', domain)
domain = re.sub(r'/', '_', domain)
domain = re.sub(r'\.', '', domain)
domain = re.sub(r'[^A-Za-z0-9]', '_', domain)
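# Effect of the broadened regex above (URL is hypothetical): query strings and
# anything after /api.php or /index.php no longer leak into the prefix.
import re
domain = 'https://wiki.example.org/w/api.php?action=query'.lower()
domain = re.sub(r'(https?://|www\.|/index\.php.*|/api\.php.*)', '', domain)
domain = re.sub(r'/', '_', domain)
domain = re.sub(r'\.', '', domain)
domain = re.sub(r'[^A-Za-z0-9]', '_', domain)
print domain  # wikiexampleorg_w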
@ -1384,10 +1465,9 @@ def getParameters(params=[]):
action='store_true',
help="generates a full history XML dump (--xml --curonly for current revisions only)")
groupDownload.add_argument('--curonly', action='store_true',
help='store only the current version of pages; incompatible with --xmlrevisions')
help='store only the current version of pages')
groupDownload.add_argument('--xmlrevisions', action='store_true',
help='download all revisions from an API generator. Ignores the \
namespace selection')
help='download all revisions from an API generator. MediaWiki 1.27+ only.')
groupDownload.add_argument(
'--images', action='store_true', help="generates an image dump")
groupDownload.add_argument(
@ -1530,15 +1610,20 @@ def getParameters(params=[]):
session=session):
print 'index.php is OK'
else:
index = '/'.join(index.split('/')[:-1])
try:
index = '/'.join(index.split('/')[:-1])
except AttributeError:
index = None
if index and checkIndex(
index=index,
cookies=args.cookies,
session=session):
print 'index.php is OK'
else:
print 'Error in index.php, please, provide a correct path to index.php'
sys.exit(1)
print 'Error in index.php.'
if not args.xmlrevisions:
print 'Please, provide a correct path to index.php or use --xmlrevisions. Terminating.'
sys.exit(1)
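# Illustrative invocations for the fallback suggested above (URL is
# hypothetical; --api/--index are assumed from the script's usual interface,
# the other flags are defined in this patch):
#   python dumpgenerator.py --api=https://wiki.example.org/w/api.php --xml --xmlrevisions
#   python dumpgenerator.py --index=https://wiki.example.org/w/index.php --xml --curonly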
# check user and pass (one requires both)
if (args.user and not args.password) or (args.password and not args.user):
@ -1628,10 +1713,14 @@ def checkAPI(api=None, session=None):
'format': 'json'},
timeout=30
)
if r.url == api:
if r.status_code == 200:
break
else:
api = r.url
elif r.status_code < 400:
p = r.url
api = urlunparse([p.scheme, p.netloc, p.path, '', '', ''])
elif r.status_code > 400:
print "MediaWiki API URL not found or giving error: HTTP %d" % r.status_code
return False
if "MediaWiki API is not enabled for this site." in r.text:
return False
try:
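# What the urlunparse() normalisation above does with a redirected API URL
# (URL is hypothetical): keep scheme, host and path, drop any query string.
from urlparse import urlparse, urlunparse
p = urlparse('https://wiki.example.org/w/api.php?action=query&format=json')
print urlunparse([p.scheme, p.netloc, p.path, '', '', ''])
# -> https://wiki.example.org/w/api.php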
@ -1693,7 +1782,11 @@ def getJSON(request):
"""Strip Unicode BOM"""
if request.text.startswith(u'\ufeff'):
request.encoding = 'utf-8-sig'
return request.json()
try:
return request.json()
except:
# Maybe an older API version which did not return correct JSON
return {}
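# Sketch of the BOM case handled above: a response body starting with U+FEFF
# parses cleanly once decoded as utf-8-sig.
body = u'\ufeff{"query": {}}'.encode('utf-8')
print body.decode('utf-8-sig')  # {"query": {}} with the BOM stripped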
def fixBOM(request):
@ -1785,7 +1878,7 @@ def resumePreviousDump(config={}, other={}):
if lasttitle == '':
lasttitle=lasttitles.next()
except:
pass # probably file does not exists
lasttitle = '' # probably file does not exists
if lasttitle == '--END--':
# titles list is complete
print 'Title list was completed in the previous session'
@ -1916,7 +2009,7 @@ def saveSpecialVersion(config={}, session=None):
else:
print 'Downloading Special:Version with extensions and other related info'
r = session.post(
url=config['index'], data={'title': 'Special:Version'}, timeout=10)
url=config['index'], params={'title': 'Special:Version'}, timeout=10)
raw = r.text
delay(config=config, session=session)
raw = removeIP(raw=raw)
@ -1931,14 +2024,13 @@ def saveIndexPHP(config={}, session=None):
print 'index.html exists, do not overwrite'
else:
print 'Downloading index.php (Main Page) as index.html'
r = session.post(url=config['index'], data={}, timeout=10)
r = session.post(url=config['index'], params={}, timeout=10)
raw = r.text
delay(config=config, session=session)
raw = removeIP(raw=raw)
with open('%s/index.html' % (config['path']), 'w') as outfile:
outfile.write(raw.encode('utf-8'))
def saveSiteInfo(config={}, session=None):
""" Save a file with site info """
@ -1951,7 +2043,7 @@ def saveSiteInfo(config={}, session=None):
# MediaWiki 1.13+
r = session.post(
url=config['api'],
data={
params={
'action': 'query',
'meta': 'siteinfo',
'siprop': 'general|namespaces|statistics|dbrepllag|interwikimap|namespacealiases|specialpagealiases|usergroups|extensions|skins|magicwords|fileextensions|rightsinfo',
@ -1962,7 +2054,7 @@ def saveSiteInfo(config={}, session=None):
if not 'query' in getJSON(r):
r = session.post(
url=config['api'],
data={
params={
'action': 'query',
'meta': 'siteinfo',
'siprop': 'general|namespaces|statistics|dbrepllag|interwikimap',
@ -1972,7 +2064,7 @@ def saveSiteInfo(config={}, session=None):
if not 'query' in getJSON(r):
r = session.post(
url=config['api'],
data={
params={
'action': 'query',
'meta': 'siteinfo',
'siprop': 'general|namespaces',
@ -1988,10 +2080,14 @@ def avoidWikimediaProjects(config={}, other={}):
""" Skip Wikimedia projects and redirect to the dumps website """
# notice about wikipedia dumps
url = ''
if config['api']:
url = url + config['api']
if config['index']:
url = url + config['index']
if re.findall(
r'(?i)(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews|wikidata|wikivoyage)\.org',
config['api'] +
config['index']):
url):
print 'PLEASE, DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!'
print 'Download the dumps from http://dumps.wikimedia.org'
if not other['force']:

@ -16,6 +16,7 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import getopt
import argparse
import os
import re
import subprocess
@ -30,89 +31,41 @@ from internetarchive import get_item
import dumpgenerator
# Configuration goes here
# You need a file named keys.txt with access and secret keys, in two different lines
accesskey = open('keys.txt', 'r').readlines()[0].strip()
secretkey = open('keys.txt', 'r').readlines()[1].strip()
# Use --admin if you are a wikiteam collection admin, or specify another collection:
collection = 'opensource'
# Nothing to change below
convertlang = {'ar': 'Arabic', 'de': 'German', 'en': 'English', 'es': 'Spanish', 'fr': 'French', 'it': 'Italian', 'ja': 'Japanese', 'nl': 'Dutch', 'pl': 'Polish', 'pt': 'Portuguese', 'ru': 'Russian'}
listfile = sys.argv[1]
uploadeddumps = []
try:
uploadeddumps = [l.split(';')[1] for l in open('uploader-%s.log' % (listfile), 'r').read().strip().splitlines() if len(l.split(';'))>1]
except:
pass
print '%d dumps uploaded previously' % (len(uploadeddumps))
def getParameters(params=[]):
if not params:
params = sys.argv[2:]
config = {
'prune-directories': False,
'prune-wikidump': False,
'collection': collection,
'update': False,
}
#console params
try:
opts, args = getopt.getopt(params, "", ["h", "help", "prune-directories", "prune-wikidump", "admin", "update"])
except getopt.GetoptError, err:
# print help information and exit:
print str(err) # will print something like "option -a not recognized"
usage()
sys.exit(2)
for o, a in opts:
if o in ("-h","--help"):
usage()
sys.exit()
elif o in ("--prune-directories"):
config['prune-directories'] = True
elif o in ("--prune-wikidump"):
config['prune-wikidump'] = True
elif o in ("--admin"):
config['collection'] = "wikiteam"
elif o in ("--update"):
config['update'] = True
return config
def usage():
""" """
print """uploader.py
This script takes the filename of a list of wikis as argument and uploads their dumps to archive.org.
The list must be a text file with the wiki's api.php URLs, one per line.
Dumps must be in the same directory and follow the -wikidump.7z/-history.xml.7z format
as produced by launcher.py (explained in https://github.com/WikiTeam/wikiteam/wiki/Tutorial#Publishing_the_dump ).
You need a file named keys.txt with access and secret keys, in two different lines https://archive.org/account/s3.php
You also need dumpgenerator.py in the same directory as this script.
Use --help to print this help."""
def log(wiki, dump, msg):
f = open('uploader-%s.log' % (listfile), 'a')
def log(wiki, dump, msg, config={}):
f = open('uploader-%s.log' % (config.listfile), 'a')
f.write('\n%s;%s;%s' % (wiki, dump, msg))
f.close()
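# The log written above uses one 'wiki;dump;status' entry per line, which is
# what main() later parses back into uploadeddumps (sample line is made up):
line = 'https://wiki.example.org/w/api.php;wikiexampleorg_w-20180416-history.xml.7z;ok'
wiki, dump, status = line.split(';')
print dump  # the second field is the dump file name checked against new uploads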
def upload(wikis, config={}):
def upload(wikis, config={}, uploadeddumps=[]):
headers = {'User-Agent': dumpgenerator.getUserAgent()}
dumpdir = config.wikidump_dir
filelist = os.listdir(dumpdir)
for wiki in wikis:
print "#"*73
print "# Uploading", wiki
print "#"*73
wiki = wiki.lower()
prefix = dumpgenerator.domain2prefix(config={'api': wiki})
configtemp = config
try:
prefix = dumpgenerator.domain2prefix(config={'api': wiki})
except KeyError:
print "ERROR: could not produce the prefix for %s" % wiki
config = configtemp
wikiname = prefix.split('-')[0]
dumps = []
for dirname, dirnames, filenames in os.walk('.'):
if dirname == '.':
for f in filenames:
if f.startswith('%s-' % (wikiname)) and (f.endswith('-wikidump.7z') or f.endswith('-history.xml.7z')):
dumps.append(f)
for f in filelist:
if f.startswith('%s-' % (wikiname)) and (f.endswith('-wikidump.7z') or f.endswith('-history.xml.7z')):
print "%s found" % f
dumps.append(f)
break
c = 0
@ -120,30 +73,33 @@ def upload(wikis, config={}):
wikidate = dump.split('-')[1]
item = get_item('wiki-' + wikiname)
if dump in uploadeddumps:
if config['prune-directories']:
if config.prune_directories:
rmline='rm -rf %s-%s-wikidump/' % (wikiname, wikidate)
# With -f the deletion might have happened before and we won't know
if not os.system(rmline):
print 'DELETED %s-%s-wikidump/' % (wikiname, wikidate)
if config['prune-wikidump'] and dump.endswith('wikidump.7z'):
if config.prune_wikidump and dump.endswith('wikidump.7z'):
# Simplistic quick&dirty check for the presence of this file in the item
stdout, stderr = subprocess.Popen(["md5sum", dump], stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
print "Checking content in previously uploaded files"
stdout, stderr = subprocess.Popen(["md5sum", dumpdir + '/' + dump], stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
dumphash = re.sub(' +.+\n?', '', stdout)
if dumphash in map(lambda x: x['md5'], item.files):
log(wiki, dump, 'verified')
rmline='rm -rf %s' % dump
log(wiki, dump, 'verified', config)
rmline='rm -rf %s' % dumpdir + '/' + dump
if not os.system(rmline):
print 'DELETED ' + dump
print 'DELETED ' + dumpdir + '/' + dump
print '%s was uploaded before, skipping...' % (dump)
continue
else:
print 'ERROR: The online item misses ' + dump
log(wiki, dump, 'missing')
log(wiki, dump, 'missing', config)
# We'll exit this if and go upload the dump
else:
print '%s was uploaded before, skipping...' % (dump)
continue
else:
print '%s was not uploaded before' % dump
time.sleep(0.1)
wikidate_text = wikidate[0:4]+'-'+wikidate[4:6]+'-'+wikidate[6:8]
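# The md5sum subprocess check above, re-sketched with hashlib so no external
# binary is needed; dumpdir, dump and item are the surrounding variables.
import hashlib
def local_md5(path, chunk=1024 * 1024):
    h = hashlib.md5()
    with open(path, 'rb') as f:
        for block in iter(lambda: f.read(chunk), b''):
            h.update(block)
    return h.hexdigest()
# already_uploaded = local_md5(dumpdir + '/' + dump) in [f['md5'] for f in item.files]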
@ -155,7 +111,7 @@ def upload(wikis, config={}):
# Logo path
logourl = ''
if ismissingitem or config['update']:
if ismissingitem or config.update:
#get metadata from api.php
#first sitename and base url
params = {'action': 'query', 'meta': 'siteinfo', 'format': 'xml'}
@ -163,7 +119,7 @@ def upload(wikis, config={}):
req = urllib2.Request(url=wiki, data=data, headers=headers)
xml = ''
try:
f = urllib2.urlopen(req)
f = urllib2.urlopen(req, timeout=10)
xml = f.read()
f.close()
except:
@ -198,7 +154,7 @@ def upload(wikis, config={}):
req = urllib2.Request(url=wiki, data=data, headers=headers)
xml = ''
try:
f = urllib2.urlopen(req)
f = urllib2.urlopen(req, timeout=10)
xml = f.read()
f.close()
except:
@ -214,7 +170,7 @@ def upload(wikis, config={}):
raw = ''
try:
f = urllib.urlopen(baseurl)
f = urllib2.urlopen(baseurl, timeout=10)
raw = f.read()
f.close()
except:
@ -238,7 +194,6 @@ def upload(wikis, config={}):
logourl = re.findall(ur'p-logo["\'][^>]*>\s*<a [^>]*background-image:\s*(?:url\()?([^;)"]+)', raw)[0]
except:
pass
print logourl
#retrieve some info from the wiki
wikititle = "Wiki - %s" % (sitename) # Wiki - ECGpedia
@ -264,7 +219,7 @@ def upload(wikis, config={}):
# Item metadata
md = {
'mediatype': 'web',
'collection': config['collection'],
'collection': config.collection,
'title': wikititle,
'description': wikidesc,
'language': lang,
@ -277,25 +232,54 @@ def upload(wikis, config={}):
#Upload files and update metadata
try:
item.upload(dump, metadata=md, access_key=accesskey, secret_key=secretkey, verbose=True)
item.upload(dumpdir + '/' + dump, metadata=md, access_key=accesskey, secret_key=secretkey, verbose=True, queue_derive=False)
item.modify_metadata(md) # update
print 'You can find it in https://archive.org/details/wiki-%s' % (wikiname)
uploadeddumps.append(dump)
log(wiki, dump, 'ok', config)
if logourl:
logo = StringIO.StringIO(urllib.urlopen(urlparse.urljoin(wiki, logourl)).read())
logo = StringIO.StringIO(urllib2.urlopen(urlparse.urljoin(wiki, logourl), timeout=10).read())
logoextension = logourl.split('.')[-1] if logourl.split('.') else 'unknown'
logo.name = 'wiki-' + wikiname + '_logo.' + logoextension
item.upload(logo, access_key=accesskey, secret_key=secretkey, verbose=True)
uploadeddumps.append(dump)
log(wiki, dump, 'ok')
except:
print wiki, dump, 'error when uploading?'
except Exception as e:
print wiki, dump, 'Error when uploading?'
print e.message
c += 1
def main(params=[]):
config = getParameters(params=params)
parser = argparse.ArgumentParser("""uploader.py
This script takes the filename of a list of wikis as argument and uploads their dumps to archive.org.
The list must be a text file with the wiki's api.php URLs, one per line.
Dumps must be in the same directory and follow the -wikidump.7z/-history.xml.7z format
as produced by launcher.py (explained in https://github.com/WikiTeam/wikiteam/wiki/Tutorial#Publishing_the_dump ).
You need a file named keys.txt with access and secret keys, in two different lines
You also need dumpgenerator.py in the same directory as this script.
Use --help to print this help.""")
parser.add_argument('-pd', '--prune_directories', action='store_true')
parser.add_argument('-pw', '--prune_wikidump', action='store_true')
parser.add_argument('-a', '--admin', action='store_true')
parser.add_argument('-c', '--collection', default='opensource')
parser.add_argument('-wd', '--wikidump_dir', default='.')
parser.add_argument('-u', '--update', action='store_true')
parser.add_argument('listfile')
config = parser.parse_args()
if config.admin:
config.collection = 'wikiteam'
uploadeddumps = []
listfile = config.listfile
try:
uploadeddumps = [l.split(';')[1] for l in open('uploader-%s.log' % (listfile), 'r').read().strip().splitlines() if len(l.split(';'))>1]
except:
pass
print '%d dumps uploaded previously' % (len(uploadeddumps))
wikis = open(listfile, 'r').read().strip().splitlines()
upload(wikis, config)
upload(wikis, config, uploadeddumps)
if __name__ == "__main__":
main()
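# Example invocation of the argparse interface defined above (paths and list
# file are hypothetical):
#   python uploader.py --admin --prune_wikidump --wikidump_dir /data/dumps mywikis.txt
# --admin switches the collection to 'wikiteam'; dumps are searched in
# /data/dumps and results are appended to uploader-mywikis.txt.log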
