@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 # dumpgenerator.py A generator of dumps for wikis
-# Copyright (C) 2011-2016 WikiTeam developers
+# Copyright (C) 2011-2018 WikiTeam developers
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
@@ -49,8 +49,17 @@ try:
+try:
+    import wikitools
+except ImportError:
+    print "Please install the wikitools 1.3+ module if you want to use --xmlrevisions."
+try:
+    from lxml import etree
+    from lxml.builder import E
+except ImportError:
+    print "Please install the lxml module if you want to use --xmlrevisions."
 import time
 import urllib
 try:
     from urlparse import urlparse, urlunparse
 except ImportError:
     from urllib.parse import urlparse, urlunparse
 UTF8Writer = getwriter('utf8')
 sys.stdout = UTF8Writer(sys.stdout)
@@ -155,7 +164,7 @@ def getNamespacesScraper(config={}, session=None):
     namespacenames = {0: ''}  # main is 0, no prefix
     if namespaces:
         r = session.post(
-            url=config['index'], data={'title': 'Special:Allpages'}, timeout=30)
+            url=config['index'], params={'title': 'Special:Allpages'}, timeout=30)
         raw = r.text
         delay(config=config, session=session)
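
Note on the data= to params= changes throughout this patch: with the requests library, data= sends the key/value pairs in the POST body, while params= encodes them into the URL query string. Keeping MediaWiki's arguments on the query string means index.php and api.php still see them even if the server redirects the request (requests turns a redirected POST into a GET and drops the body). A minimal sketch of the difference, against a hypothetical wiki URL:

    import requests

    session = requests.Session()
    index = 'https://wiki.example.org/index.php'  # hypothetical wiki

    # data= puts the pairs in the POST body; the URL carries no query string.
    r1 = session.post(url=index, data={'title': 'Special:Allpages'}, timeout=30)
    # params= appends them to the URL instead.
    r2 = session.post(url=index, params={'title': 'Special:Allpages'}, timeout=30)
    print(r1.url)  # https://wiki.example.org/index.php
    print(r2.url)  # https://wiki.example.org/index.php?title=Special%3AAllpages
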
@@ -192,7 +201,7 @@ def getNamespacesAPI(config={}, session=None):
     if namespaces:
         r = session.post(
             url=config['api'],
-            data={
+            params={
                 'action': 'query',
                 'meta': 'siteinfo',
                 'siprop': 'namespaces',
@@ -277,7 +286,7 @@ def getPageTitlesAPI(config={}, session=None):
                 apfrom = jsontitles['continue']['apcontinue']
             elif 'apfrom' in jsontitles['continue']:
                 apfrom = jsontitles['continue']['apfrom']

         # print apfrom
         # print jsontitles
         allpages = jsontitles['query']['allpages']
@@ -392,13 +401,11 @@ def getPageTitles(config={}, session=None):
     titles = []
     if 'api' in config and config['api']:
+        r = session.post(config['api'], {'action': 'query', 'list': 'allpages', 'format': 'json'}, timeout=30)
+        test = getJSON(r)
+        if ('warnings' in test and 'allpages' in test['warnings'] and '*' in test['warnings']['allpages']
+                and test['warnings']['allpages']['*'] == 'The "allpages" module has been disabled.'):
+            titles = getPageTitlesScraper(config=config, session=session)
+        else:
-        try:
             titles = getPageTitlesAPI(config=config, session=session)
-        except:
-            print "Error: could not get page titles from the API"
-            titles = getPageTitlesScraper(config=config, session=session)
     elif 'index' in config and config['index']:
         titles = getPageTitlesScraper(config=config, session=session)
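
The probe above sends a throwaway allpages query and inspects the warnings block to decide between the API and the Special:Allpages scraper. On a wiki that has disabled the module, getJSON(r) returns a structure shaped like this sketch (the nesting is exactly what the condition walks; the message text is what it compares against):

    # Hypothetical probe result on a wiki with allpages disabled:
    test = {
        'warnings': {
            'allpages': {
                '*': 'The "allpages" module has been disabled.'
            }
        }
    }
    assert ('warnings' in test and 'allpages' in test['warnings']
            and '*' in test['warnings']['allpages'])
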
@@ -418,7 +425,7 @@ def getPageTitles(config={}, session=None):
     print '%d page titles loaded' % (c)
     return titlesfilename


 def getImageNames(config={}, session=None):
     """ Get list of image names """
@@ -442,17 +449,19 @@ def getXMLHeader(config={}, session=None):
     # similar to: <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/"
     # xmlns:x....
     randomtitle = 'Main_Page'  # previously AMF5LKE43MNFGHKSDMRTJ
+    print config['api']
     if config['xmlrevisions'] and config['api'] and config['api'].endswith("api.php"):
         xml = None
         try:
-            r = session.get(config['api'] + '?action=query&revids=1&export&exportnowrap', timeout=10)
-            xml = r.text
+            print 'Getting the XML header from the API'
+            r = session.get(config['api'] + '?action=query&revids=1&export&format=json', timeout=10)
+            xml = r.json()['query']['export']['*']
+            if not xml:
+                r = session.get(config['api'] + '?action=query&revids=1&export&exportnowrap', timeout=10)
+                xml = r.text
         except requests.exceptions.RetryError:
             pass
-
-        if not xml:
-            r = session.get(config['api'] + '?action=query&revids=1&export&format=json', timeout=10)
-            xml = r.json()['query']['export']['*']
     else:
         try:
             xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
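
Background for the reordering above: action=query&export has two response shapes. With exportnowrap the HTTP body is the bare <mediawiki> XML; with format=json the same XML arrives as a string under ['query']['export']['*']. The new code prefers the JSON form and keeps the raw form as a fallback for when the wrapped export comes back empty. A sketch of both calls, with a hypothetical API URL:

    import requests

    session = requests.Session()
    api = 'https://wiki.example.org/api.php'  # hypothetical

    r = session.get(api + '?action=query&revids=1&export&format=json', timeout=10)
    xml = r.json()['query']['export']['*']  # export XML as a JSON string
    if not xml:
        r = session.get(api + '?action=query&revids=1&export&exportnowrap', timeout=10)
        xml = r.text  # bare XML document
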
@@ -468,7 +477,7 @@ def getXMLHeader(config={}, session=None):
             print "Trying the local name for the Special namespace instead"
             r = session.post(
                 url=config['api'],
-                data={
+                params={
                     'action': 'query',
                     'meta': 'siteinfo',
                     'siprop': 'namespaces',
@@ -485,9 +494,15 @@ def getXMLHeader(config={}, session=None):
     header = xml.split('</mediawiki>')[0]
     if not re.match(r"\s*<mediawiki", xml):
-        print 'XML export on this wiki is broken, quitting.'
-        logerror(u'XML export on this wiki is broken, quitting.')
-        sys.exit()
+        if config['xmlrevisions']:
+            # Try again the old way
+            print 'Export test via the API failed. Wiki too old? Trying without xmlrevisions.'
+            config['xmlrevisions'] = False
+            header, config = getXMLHeader(config=config, session=session)
+        else:
+            print 'XML export on this wiki is broken, quitting.'
+            logerror(u'XML export on this wiki is broken, quitting.')
+            sys.exit()
     return header, config
@@ -572,7 +587,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
         return ''  # empty xml
     # FIXME HANDLE HTTP Errors HERE
     try:
-        r = session.post(url=config['index'], data=params, headers=headers, timeout=10)
+        r = session.post(url=config['index'], params=params, headers=headers, timeout=10)
         handleStatusCode(r)
         xml = fixBOM(r)
     except requests.exceptions.ConnectionError as e:
@@ -768,44 +783,110 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
     xmlfile.close()
     print 'XML dump saved at...', xmlfilename


-def getXMLRevisions(config={}, session=None):
+def getXMLRevisions(config={}, session=None, allpages=False):
     site = wikitools.wiki.Wiki(config['api'])
-    #if config['namespaces']:
-    #    namespaces, namespacenames = getNamespacesAPI(config=config, session=session)
-    #else:
-    namespaces = ['*']
+    if not 'all' in config['namespaces']:
+        namespaces = config['namespaces']
+    else:
+        namespaces, namespacenames = getNamespacesAPI(config=config, session=session)

-    for namespace in namespaces:
-        print "Exporting revisions from namespace %s" % namespace
-        # TODO: 500 would be nicer, but need to find the wiki's limits
-        params = {
-            'action': 'query',
-            'list': 'allrevisions',
-            'arvlimit': 50,
-            'arvprop': 'ids',
+    try:
+        for namespace in namespaces:
+            print "Exporting revisions from namespace %s" % namespace
+            arvparams = {
+                'action': 'query',
+                'list': 'allrevisions',
+                'arvlimit': 500,
+                'arvnamespace': namespace
             }
-        request = wikitools.api.APIRequest(site, params)
-        results = request.queryGen()
-        try:
-            for result in results:
-                revids = []
-                for page in result['query']['allrevisions']:
-                    for revision in page['revisions']:
-                        revids.append(str(revision['revid']))
+            if not config['curonly']:
+                # We have to build the XML manually...
+                # Skip flags, presumably needed to add <minor/> which is in the schema.
+                # Also missing: parentid and contentformat.
+                arvparams['arvprop'] = 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content'
+                arvrequest = wikitools.api.APIRequest(site, arvparams)
+                results = arvrequest.queryGen()
+                for result in results:
+                    for page in result['query']['allrevisions']:
+                        yield makeXmlFromPage(page)
+            else:
+                # Just cycle through revision IDs and use the XML as is
+                arvparams['arvprop'] = 'ids'
+                arvrequest = wikitools.api.APIRequest(site, arvparams)
+                arvresults = arvrequest.queryGen()
+                for result in arvresults:
+                    revids = []
+                    for page in result['query']['allrevisions']:
+                        for revision in page['revisions']:
+                            revids.append(str(revision['revid']))
+                    print "%d more revisions listed, until %s" % (len(revids), revids[-1])

                     exportparams = {
                         'action': 'query',
                         'revids': '|'.join(revids),
                         'export': '1',
                     }
                     exportrequest = wikitools.api.APIRequest(site, exportparams)
                     exportresults = exportrequest.queryGen()
                     for exportresult in exportresults:
                         yield exportresult['query']['export']['*']

-                print "50 more revisions listed, until %s" % revids[-1]
-        except wikitools.api.APIError:
-            print "This wikitools version seems not to work for us. Exiting."
-            sys.exit()
+    except KeyError:
+        print "Warning. Could not use allrevisions, wiki too old."
+        if config['curonly']:
+            for title in readTitles(config):
+                exportparams = {
+                    'action': 'query',
+                    'revids': '|'.join(revids),
+                    'titles': title,
+                    'export': '1',
+                }
+                exportrequest = wikitools.api.APIRequest(site, exportparams)
+                exportresults = exportrequest.queryGen()
+                for exportresult in exportresults:
+                    yield exportresult['query']['export']['*']
+        else:
+            for title in readTitles(config):
+                pparams = {
+                    'action': 'query',
+                    'titles': title,
+                    'prop': 'revisions',
+                    'rvlimit': 'max',
+                    'rvprop': 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content'
+                }
+                prequest = wikitools.api.APIRequest(site, pparams)
+                results = prequest.queryGen()
+                for result in results:
+                    pages = result['query']['pages']
+                    for page in pages:
+                        yield makeXmlFromPage(pages[page])
+    except wikitools.api.APIError:
+        print "This wikitools version seems not to work for us. Exiting."
+        sys.exit()
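
list=allrevisions (MediaWiki 1.27+) enumerates every revision in a namespace without needing a title list first; on older wikis the key is simply missing from the result, which is what the except KeyError above catches before falling back to per-title queries. The same generator can be written without wikitools; a minimal sketch with requests and standard API continuation (hypothetical URL, namespace 0 only):

    import requests

    session = requests.Session()
    api = 'https://wiki.example.org/api.php'  # hypothetical
    arvparams = {'action': 'query', 'list': 'allrevisions', 'arvlimit': 500,
                 'arvprop': 'ids', 'arvnamespace': '0', 'format': 'json'}
    while True:
        result = session.get(api, params=arvparams, timeout=30).json()
        for page in result['query']['allrevisions']:  # KeyError before MediaWiki 1.27
            for revision in page['revisions']:
                print(revision['revid'])
        if 'continue' not in result:
            break
        arvparams.update(result['continue'])  # carries arvcontinue into the next request

The full-history branch hands each page dict to makeXmlFromPage below instead of collecting revision IDs.
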

+def makeXmlFromPage(page):
+    """ Output an XML document as a string from a page as in the API JSON """
+    p = E.page(
+        E.title(page['title']),
+        E.ns(str(page['ns'])),
+        E.id(str(page['pageid'])),
+    )
+    for rev in page['revisions']:
+        revision = E.revision(
+            E.id(str(rev['revid'])),
+            E.timestamp(rev['timestamp']),
+            E.contributor(
+                E.id(str(rev['userid'])),
+                E.username(str(rev['user'])),
+            ),
+            E.comment(rev['comment']),
+            E.text(rev['*'], space="preserve", bytes=str(rev['size'])),
+            E.sha1(rev['sha1']),
+        )
+        if 'contentmodel' in rev:
+            revision.append(E.model(rev['contentmodel']))
+        p.append(revision)
+    return etree.tostring(p, pretty_print=True)


 def readTitles(config={}, start=None):
     """ Read title list from a file, from the title "start" """
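
makeXmlFromPage rebuilds export-style XML from the JSON API using lxml's E builder: attribute access on E creates an element factory, positional arguments become children, and keyword arguments become XML attributes. A small standalone illustration of that convention:

    from lxml import etree
    from lxml.builder import E

    # Keyword arguments turn into attributes, the first argument into text:
    text = E.text('some wikitext', space='preserve', bytes='13')
    print(etree.tostring(text))
    # <text space="preserve" bytes="13">some wikitext</text>

Given a page dict shaped like the allrevisions JSON above, makeXmlFromPage(page) returns the serialized <page> element as a string.
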
@@ -942,7 +1023,7 @@ def getImageNamesScraper(config={}, session=None):
         # http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
         r = session.post(
             url=config['index'],
-            data={
+            params={
                 'title': 'Special:Imagelist',
                 'limit': limit,
                 'offset': offset},
@@ -1047,7 +1128,7 @@ def getImageNamesAPI(config={}, session=None):
             'format': 'json',
             'ailimit': 500}
         # FIXME Handle HTTP Errors HERE
-        r = session.post(url=config['api'], data=params, timeout=30)
+        r = session.post(url=config['api'], params=params, timeout=30)
         handleStatusCode(r)
         jsonimages = getJSON(r)
         delay(config=config, session=session)
@@ -1105,7 +1186,7 @@ def getImageNamesAPI(config={}, session=None):
             'iiprop': 'user|url',
             'format': 'json'}
         # FIXME Handle HTTP Errors HERE
-        r = session.post(url=config['api'], data=params, timeout=30)
+        r = session.post(url=config['api'], params=params, timeout=30)
         handleStatusCode(r)
         jsonimages = getJSON(r)
         delay(config=config, session=session)
@@ -1254,7 +1335,7 @@ def domain2prefix(config={}, session=None):
         domain = config['index']

     domain = domain.lower()
-    domain = re.sub(r'(https?://|www\.|/index\.php|/api\.php)', '', domain)
+    domain = re.sub(r'(https?://|www\.|/index\.php.*|/api\.php.*)', '', domain)
     domain = re.sub(r'/', '_', domain)
     domain = re.sub(r'\.', '', domain)
     domain = re.sub(r'[^A-Za-z0-9]', '_', domain)
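
The appended .* matters for URLs that carry a query string: previously everything after index.php or api.php leaked into the directory prefix. A quick before/after on a hypothetical URL:

    import re

    domain = 'https://wiki.example.org/w/index.php?title=Special:Export'
    old = re.sub(r'(https?://|www\.|/index\.php|/api\.php)', '', domain)
    new = re.sub(r'(https?://|www\.|/index\.php.*|/api\.php.*)', '', domain)
    print(old)  # wiki.example.org/w?title=Special:Export
    print(new)  # wiki.example.org/w
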
@@ -1384,10 +1465,9 @@ def getParameters(params=[]):
         action='store_true',
         help="generates a full history XML dump (--xml --curonly for current revisions only)")
     groupDownload.add_argument('--curonly', action='store_true',
-                               help='store only the current version of pages; incompatible with --xmlrevisions')
+                               help='store only the current version of pages')
     groupDownload.add_argument('--xmlrevisions', action='store_true',
-                               help='download all revisions from an API generator. Ignores the \
-                               namespace selection')
+                               help='download all revisions from an API generator. MediaWiki 1.27+ only.')
     groupDownload.add_argument(
         '--images', action='store_true', help="generates an image dump")
     groupDownload.add_argument(
@@ -1530,15 +1610,20 @@ def getParameters(params=[]):
                 session=session):
             print 'index.php is OK'
         else:
-            index = '/'.join(index.split('/')[:-1])
-            if checkIndex(
+            try:
+                index = '/'.join(index.split('/')[:-1])
+            except AttributeError:
+                index = None
+            if index and checkIndex(
                     index=index,
                     cookies=args.cookies,
                     session=session):
                 print 'index.php is OK'
             else:
-                print 'Error in index.php, please, provide a correct path to index.php'
-                sys.exit(1)
+                print 'Error in index.php.'
+                if not args.xmlrevisions:
+                    print 'Please, provide a correct path to index.php or use --xmlrevisions. Terminating.'
+                    sys.exit(1)

     # check user and pass (one requires both)
     if (args.user and not args.password) or (args.password and not args.user):
@@ -1628,10 +1713,14 @@ def checkAPI(api=None, session=None):
                 'format': 'json'},
             timeout=30
         )
-        if r.url == api:
+        if r.status_code == 200:
             break
-        else:
-            api = r.url
+        elif r.status_code < 400:
+            p = urlparse(r.url)
+            api = urlunparse([p.scheme, p.netloc, p.path, '', '', ''])
+        elif r.status_code > 400:
+            print "MediaWiki API URL not found or giving error: HTTP %d" % r.status_code
+            return False
     if "MediaWiki API is not enabled for this site." in r.text:
         return False
     try:
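
When the API answers with something other than a plain 200 but still below 400, the new code canonicalizes the landing URL by dropping its query string and fragment before retrying, using the urlparse/urlunparse pair imported at the top of this patch. The normalization in isolation:

    from urlparse import urlparse, urlunparse  # Python 2; urllib.parse on Python 3

    p = urlparse('https://wiki.example.org/w/api.php?action=query&format=json')
    print(urlunparse([p.scheme, p.netloc, p.path, '', '', '']))
    # https://wiki.example.org/w/api.php
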
@@ -1693,7 +1782,11 @@ def getJSON(request):
     """Strip Unicode BOM"""
     if request.text.startswith(u'\ufeff'):
         request.encoding = 'utf-8-sig'
-    return request.json()
+    try:
+        return request.json()
+    except:
+        # Maybe an older API version which did not return correct JSON
+        return {}


 def fixBOM(request):
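
Context for the BOM handling kept above: some wikis serve JSON with a leading byte-order mark, which the JSON parser rejects; re-declaring the response encoding as utf-8-sig makes requests strip the BOM while decoding. The new blanket except then absorbs ancient APIs that return something other than JSON entirely. The BOM effect in isolation:

    import json

    body = b'\xef\xbb\xbf{"query": {}}'  # JSON preceded by a UTF-8 BOM
    # json.loads(body.decode('utf-8')) raises ValueError: the BOM is not valid JSON
    print(json.loads(body.decode('utf-8-sig')))  # {'query': {}} -- BOM stripped
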
@@ -1785,7 +1878,7 @@ def resumePreviousDump(config={}, other={}):
            if lasttitle == '':
                lasttitle = lasttitles.next()
        except:
-           pass  # probably file does not exists
+           lasttitle = ''  # probably file does not exists
        if lasttitle == '--END--':
            # titles list is complete
            print 'Title list was completed in the previous session'
@@ -1916,7 +2009,7 @@ def saveSpecialVersion(config={}, session=None):
     else:
         print 'Downloading Special:Version with extensions and other related info'
         r = session.post(
-            url=config['index'], data={'title': 'Special:Version'}, timeout=10)
+            url=config['index'], params={'title': 'Special:Version'}, timeout=10)
         raw = r.text
         delay(config=config, session=session)
         raw = removeIP(raw=raw)
@@ -1931,14 +2024,13 @@ def saveIndexPHP(config={}, session=None):
         print 'index.html exists, do not overwrite'
     else:
         print 'Downloading index.php (Main Page) as index.html'
-        r = session.post(url=config['index'], data={}, timeout=10)
+        r = session.post(url=config['index'], params={}, timeout=10)
         raw = r.text
         delay(config=config, session=session)
         raw = removeIP(raw=raw)
         with open('%s/index.html' % (config['path']), 'w') as outfile:
             outfile.write(raw.encode('utf-8'))


 def saveSiteInfo(config={}, session=None):
     """ Save a file with site info """
@@ -1951,7 +2043,7 @@ def saveSiteInfo(config={}, session=None):
         # MediaWiki 1.13+
         r = session.post(
             url=config['api'],
-            data={
+            params={
                 'action': 'query',
                 'meta': 'siteinfo',
                 'siprop': 'general|namespaces|statistics|dbrepllag|interwikimap|namespacealiases|specialpagealiases|usergroups|extensions|skins|magicwords|fileextensions|rightsinfo',
@@ -1962,7 +2054,7 @@ def saveSiteInfo(config={}, session=None):
         if not 'query' in getJSON(r):
             r = session.post(
                 url=config['api'],
-                data={
+                params={
                     'action': 'query',
                     'meta': 'siteinfo',
                     'siprop': 'general|namespaces|statistics|dbrepllag|interwikimap',
@@ -1972,7 +2064,7 @@ def saveSiteInfo(config={}, session=None):
         if not 'query' in getJSON(r):
             r = session.post(
                 url=config['api'],
-                data={
+                params={
                     'action': 'query',
                     'meta': 'siteinfo',
                     'siprop': 'general|namespaces',
@@ -1988,10 +2080,14 @@ def avoidWikimediaProjects(config={}, other={}):
     """ Skip Wikimedia projects and redirect to the dumps website """

     # notice about wikipedia dumps
+    url = ''
+    if config['api']:
+        url = url + config['api']
+    if config['index']:
+        url = url + config['index']
     if re.findall(
             r'(?i)(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews|wikidata|wikivoyage)\.org',
-            config['api'] +
-            config['index']):
+            url):
         print 'PLEASE, DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!'
         print 'Download the dumps from http://dumps.wikimedia.org'
        if not other['force']:
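
The url accumulator above guards the Wikimedia check against a missing entry point: the old config['api'] + config['index'] concatenation assumed both values are strings and, assuming either can be unset when only the other was supplied, raised a TypeError before the check could run:

    # Hypothetical config with only index.php known:
    config = {'api': None, 'index': 'https://wiki.example.org/index.php'}
    # config['api'] + config['index']  # TypeError on the old code path
    url = ''
    if config['api']:
        url = url + config['api']
    if config['index']:
        url = url + config['index']
    print(url)  # https://wiki.example.org/index.php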