Removed all traces of urllib except for encode/decode; more bugs fixed.

pull/139/head
balr0g 10 years ago
parent c8e11a949b
commit 9aa3c4a0e1
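The diff below replaces the per-request headers={'User-Agent': getUserAgent()} arguments and the urllib download path with a single requests.Session built in getParameters. A minimal sketch of that pattern (the URL, User-Agent string and credentials are placeholders, not values from this commit):

import requests

session = requests.Session()
session.headers = {'User-Agent': 'dumpgenerator-sketch/0.1'}  # the script builds this with getUserAgent()
session.auth = ('someuser', 'somepassword')                   # only set when --user/--pass are given

# every later call inherits the User-Agent and auth automatically
r = session.post(url='http://wiki.example.org/api.php',
                 data={'action': 'query', 'meta': 'siteinfo', 'format': 'json'})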

@@ -22,7 +22,12 @@
import cookielib
import cPickle
import datetime
import argparse
import sys
try:
import argparse
except ImportError:
print "Please install the argparse module."
sys.exit(1)
import json
try:
from hashlib import md5
@@ -30,9 +35,12 @@ except ImportError: # Python 2.4 compatibility
from md5 import new as md5
import os
import re
import requests
try:
import requests
except ImportError:
print "Please install or update the Requests module."
sys.exit(1)
import subprocess
import sys
import time
import urllib
@@ -73,13 +81,45 @@ def cleanHTML(raw=''):
sys.exit()
return raw
def handleStatusCode(response):
statuscode = response.status_code
if statuscode >= 200 and statuscode < 300:
return
print "HTTP Error %d." % statuscode
if statuscode >= 300 and statuscode < 400:
print "Redirect should happen automatically: please report this as a bug."
print response.url
elif statuscode == 400:
print "Bad Request: The wiki may be malfunctioning."
print "Please try again later."
print response.url
sys.exit(1)
elif statuscode == 401 or statuscode == 403:
print "Authentication required."
print "Please use --userpass."
print response.url
elif statuscode == 404:
print "Not found. Is Special:Export enabled for this wiki?"
print response.url
sys.exit(1)
elif statuscode == 429 or (statuscode >= 500 and statuscode < 600):
print "Server error, max retries exceeded."
print "Please resume the dump later."
print response.url
sys.exit(1)
def getNamespacesScraper(config={}, session=None):
""" Hackishly gets the list of namespaces names and ids from the dropdown in the HTML of Special:AllPages """
""" Function called if no API is available """
namespaces = config['namespaces']
namespacenames = {0:''} # main is 0, no prefix
if namespaces:
r = session.post(url=config['index'], data={'title': 'Special:Allpages', }, headers={'User-Agent': getUserAgent()})
r = session.post(url=config['index'], data={'title': 'Special:Allpages'})
raw = r.text
delay(config=config, session=session)
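handleStatusCode above is new in this commit; the hunks that follow call it immediately after each request. A small sketch of that call pattern, assuming handleStatusCode is in scope (fetchSiteinfo is a hypothetical helper, not part of the script):

import requests

def fetchSiteinfo(api, session=None):
    # post, then let handleStatusCode print the error and exit on fatal codes
    session = session or requests.Session()
    r = session.post(url=api, data={'action': 'query', 'meta': 'siteinfo', 'format': 'json'})
    handleStatusCode(r)
    return r.text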
@@ -109,7 +149,7 @@ def getNamespacesAPI(config={}, session=None):
namespaces = config['namespaces']
namespacenames = {0:''} # main is 0, no prefix
if namespaces:
r = session.post(url=config['api'], data={'action': 'query', 'meta': 'siteinfo', 'siprop': 'namespaces', 'format': 'json'}, headers={'User-Agent': getUserAgent()})
r = session.post(url=config['api'], data={'action': 'query', 'meta': 'siteinfo', 'siprop': 'namespaces', 'format': 'json'})
result = json.loads(r.text)
delay(config=config, session=session)
@@ -148,12 +188,12 @@ def getPageTitlesAPI(config={}, session=None):
c = 0
print ' Retrieving titles in the namespace %d' % (namespace)
headers = {'User-Agent': getUserAgent()}
apfrom = '!'
while apfrom:
sys.stderr.write('.') #progress
params = {'action': 'query', 'list': 'allpages', 'apnamespace': namespace, 'apfrom': apfrom.encode('utf-8'), 'format': 'json', 'aplimit': 500}
r = session.post(url=config['api'], data=params, headers=headers)
r = session.post(url=config['api'], data=params)
handleStatusCode(r)
#FIXME Handle HTTP errors here!
jsontitles = json.loads(r.text)
apfrom = ''
@@ -182,7 +222,7 @@ def getPageTitlesScraper(config={}, session=None):
for namespace in namespaces:
print ' Retrieving titles in the namespace', namespace
url = '%s?title=Special:Allpages&namespace=%s' % (config['index'], namespace)
r = session.get(url=url, headers={'User-Agent': getUserAgent()})
r = session.get(url=url)
raw = r.text
raw = cleanHTML(raw)
@@ -219,7 +259,7 @@ def getPageTitlesScraper(config={}, session=None):
if not name in checked_suballpages:
checked_suballpages.append(name) #to avoid reload dupe subpages links
delay(config=config, session=session)
r2 = session.get(url=url, headers={'User-Agent': getUserAgent()})
r2 = session.get(url=url)
raw2 = r2.text
raw2 = cleanHTML(raw2)
rawacum += raw2 #merge it after removed junk
@@ -327,6 +367,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
return '' # empty xml
#FIXME HANDLE HTTP Errors HERE
r = session.post(url=config['index'], data=params, headers=headers)
handleStatusCode(r)
xml = r.text
c += 1
@@ -343,8 +384,7 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
title_ = title
title_ = re.sub(' ', '_', title_)
#do not convert & into %26, title_ = re.sub('&', '%26', title_)
headers = {'User-Agent': getUserAgent()}
params = {'title': 'Special:Export', 'pages': title_, 'action': 'submit', }
params = {'title': 'Special:Export', 'pages': title_, 'action': 'submit'}
if config['curonly']:
params['curonly'] = 1
params['limit'] = 1
@@ -354,7 +394,7 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
if config.has_key('templates') and config['templates']: #in other case, do not set params['templates']
params['templates'] = 1
xml = getXMLPageCore(headers=headers, params=params, config=config, session=session)
xml = getXMLPageCore(params=params, config=config, session=session)
#if complete history, check if this page history has > limit edits, if so, retrieve all using offset if available
#else, warning about Special:Export truncating large page histories
@@ -362,7 +402,7 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
if not config['curonly'] and re.search(r_timestamp, xml): # search for timestamps in xml to avoid analysing empty pages like Special:Allpages and the random one
while not truncated and params['offset']: #next chunk
params['offset'] = re.findall(r_timestamp, xml)[-1] #get the last timestamp from the acum XML
xml2 = getXMLPageCore(headers=headers, params=params, config=config, session=session)
xml2 = getXMLPageCore(params=params, config=config, session=session)
if re.findall(r_timestamp, xml2): #are there more edits in this next XML chunk or no <page></page>?
if re.findall(r_timestamp, xml2)[-1] == params['offset']:
@@ -498,7 +538,7 @@ def getImageFilenamesURL(config={}, session=None):
retries = 5
while offset:
#5000 overload some servers, but it is needed for sites like this with no next links http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
r = session.post(url=config['index'], data={'title': 'Special:Imagelist', 'limit': limit, 'offset': offset, }, headers={'User-Agent': getUserAgent()})
r = session.post(url=config['index'], data={'title': 'Special:Imagelist', 'limit': limit, 'offset': offset})
raw = r.text
delay(config=config, session=session)
if re.search(ur'(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)', raw): # delicate wiki
@@ -573,14 +613,14 @@ def getImageFilenamesURLAPI(config={}, session=None):
""" Retrieve file list: filename, url, uploader """
print 'Retrieving image filenames'
headers = {'User-Agent': getUserAgent()}
aifrom = '!'
images = []
while aifrom:
sys.stderr.write('.') #progress
params = {'action': 'query', 'list': 'allimages', 'aiprop': 'url|user', 'aifrom': aifrom, 'format': 'json', 'ailimit': 500}
#FIXME Handle HTTP Errors HERE
r = session.post(url=config['api'], data=params, headers=headers)
r = session.post(url=config['api'], data=params)
handleStatusCode(r)
jsonimages = json.loads(r.text)
delay(config=config, session=session)
aifrom = ''
@@ -651,18 +691,11 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None):
# split last . (extension) and then merge
filename2 = truncateFilename(other=other, filename=filename2)
print 'Filename is too long, truncating. Now it is:', filename2
# We need to set the user agent for urlretrieve but we can't do it in its call
# so we just override the class here; all I know about this method comes from
# http://docs.python.org/2/library/urllib.html#urllib._urlopener ,
# http://docs.python.org/2/tutorial/classes.html#class-definition-syntax .
# TODO: Override the user agent for all functions in a more sensible place.
class URLopenerUserAgent(urllib.FancyURLopener):
version = "%s" % getUserAgent()
urllib._urlopener = URLopenerUserAgent()
filename3 = u'%s/%s' % (imagepath, filename2)
urllib.urlretrieve(url=url, filename=filename3.encode('utf-8'))
# TODO: data=urllib.urlencode({}) removed image; request fails on wikipedia and POST neither works?
imagefile = open(filename3, 'wb')
r = requests.get(url=url)
imagefile.write(r.content)
imagefile.close()
#saving description if any
xmlfiledesc = getXMLFileDesc(config=config, title=u'Image:%s' % (filename), session=session) # use Image: for backwards compatibility
f = open('%s/%s.desc' % (imagepath, filename2), 'w')
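The hunk above swaps urllib.urlretrieve for a module-level requests.get, which does not go through the shared session. A hedged sketch of the same write-to-file step routed through the session instead, so the User-Agent and HTTP auth configured in getParameters would still apply (downloadImage is a hypothetical helper; handleStatusCode is assumed to be in scope):

def downloadImage(session, url, path):
    # fetch with the shared session and reuse the status check added in this commit
    r = session.get(url=url)
    handleStatusCode(r)
    imagefile = open(path, 'wb')
    imagefile.write(r.content)
    imagefile.close()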
@@ -787,8 +820,9 @@ def getParameters(params=[]):
parser.add_argument('-v', '--version', action='version', version=getVersion())
parser.add_argument('--cookies', metavar="cookies.txt", help="path to a cookies.txt file")
parser.add_argument('--delay', metavar=5, default=0, help="adds a delay (in seconds)")
parser.add_argument('--retries', metavar=5, default=5, help="Maximum number of retries")
parser.add_argument('--get-wiki-engine', action='store_true', help="returns the wiki engine")
groupWikiOrAPIOrIndex = parser.add_mutually_exclusive_group(required=True)
groupWikiOrAPIOrIndex.add_argument('wiki', default='', nargs='?', help="URL to wiki")
groupWikiOrAPIOrIndex.add_argument('--api', help="URL to api.php")
@@ -806,6 +840,9 @@ def getParameters(params=[]):
parser.add_argument('--namespaces', metavar="1,2,3", help='comma-separated value of namespaces to include (all by default)')
parser.add_argument('--exnamespaces', metavar="1,2,3", help='comma-separated value of namespaces to exclude')
parser.add_argument('--user', help='Username if authentication is required.')
parser.add_argument('--pass', dest='password', help='Password if authentication is required.')
args = parser.parse_args()
#print args
@@ -827,6 +864,12 @@ def getParameters(params=[]):
print 'ERROR: URL to index.php must start with http:// or https://\n'
parser.print_usage()
sys.exit(1)
# check user and pass (one requires both)
if (args.user and not args.password) or (args.password and not args.user):
print 'Both --user and --pass are required for authentication.'
parser.print_usage()
sys.exit(1)
namespaces = ['all']
exnamespaces = []
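A usage note on the check above: authentication is all-or-nothing, so a hypothetical invocation such as python dumpgenerator.py --api=http://wiki.example.org/w/api.php --user=someuser --pass=somepassword (script name and URL illustrative) is accepted, while passing only one of --user/--pass prints the message and exits before the session is built.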
@@ -877,6 +920,9 @@ def getParameters(params=[]):
session = requests.Session()
session.cookies = cj
session.headers = {'User-Agent': getUserAgent()}
if args.user and args.password:
session.auth = (args.user, args.password)
#session.mount(args.api.split('/api.php')[0], HTTPAdapter(max_retries=max_ret))
config = {
'curonly': args.curonly,
@@ -924,7 +970,7 @@ def getParameters(params=[]):
def checkAPI(api, config={}, session=None):
""" Checking API availability """
global cj
r = session.post(url=api, data={'action': 'query', 'meta': 'siteinfo', 'format': 'json'}, headers={'User-Agent': getUserAgent()})
r = session.post(url=api, data={'action': 'query', 'meta': 'siteinfo', 'format': 'json'})
resultText = r.text
print 'Checking api.php...', api
if "MediaWiki API is not enabled for this site." in resultText:
@@ -937,7 +983,7 @@ def checkAPI(api, config={}, session=None):
def checkIndexphp(indexphp, config={}, session=None):
""" Checking index.php availability """
r = session.post(url=indexphp, data={'title': 'Special:Version'}, headers={'User-Agent': getUserAgent()})
r = session.post(url=indexphp, data={'title': 'Special:Version'})
raw = r.text
delay(config=config, session=session)
print 'Checking index.php...', indexphp
@@ -1021,7 +1067,7 @@ def resumePreviousDump(config={}, other={}):
#load titles
lasttitle = ''
try:
f = open('%s/%s-%s-titles.txt' % (config['path'], domain2prefix(config=config, session=session), config['date']), 'r')
f = open('%s/%s-%s-titles.txt' % (config['path'], domain2prefix(config=config, session=other['session']), config['date']), 'r')
raw = unicode(f.read(), 'utf-8')
titles = raw.split('\n')
lasttitle = titles[-1]
@@ -1042,7 +1088,7 @@ def resumePreviousDump(config={}, other={}):
xmliscomplete = False
lastxmltitle = ''
try:
f = open('%s/%s-%s-%s.xml' % (config['path'], domain2prefix(config=config, session=session), config['date'], config['curonly'] and 'current' or 'history'), 'r')
f = open('%s/%s-%s-%s.xml' % (config['path'], domain2prefix(config=config, session=other['session']), config['date'], config['curonly'] and 'current' or 'history'), 'r')
for l in f:
if re.findall('</mediawiki>', l):
#xml dump is complete
@@ -1062,17 +1108,17 @@ def resumePreviousDump(config={}, other={}):
elif lastxmltitle:
#resuming...
print 'Resuming XML dump from "%s"' % (lastxmltitle)
generateXMLDump(config=config, titles=titles, start=lastxmltitle)
generateXMLDump(config=config, titles=titles, start=lastxmltitle, session=other['session'])
else:
#corrupt? only has XML header?
print 'XML is corrupt? Regenerating...'
generateXMLDump(config=config, titles=titles)
generateXMLDump(config=config, titles=titles, session=other['session'])
if config['images']:
#load images
lastimage = ''
try:
f = open('%s/%s-%s-images.txt' % (config['path'], domain2prefix(config=config, session=session), config['date']), 'r')
f = open('%s/%s-%s-images.txt' % (config['path'], domain2prefix(config=config), config['date']), 'r')
raw = unicode(f.read(), 'utf-8').strip()
lines = raw.split('\n')
for l in lines:
@@ -1088,9 +1134,9 @@ def resumePreviousDump(config={}, other={}):
print 'Image list is incomplete. Reloading...'
#do not resume, reload, to avoid inconsistences, deleted images or so
if config['api']:
images=getImageFilenamesURLAPI(config=config, session=session)
images=getImageFilenamesURLAPI(config=config, session=other['session'])
else:
images = getImageFilenamesURL(config=config, session=session)
images = getImageFilenamesURL(config=config, session=other['session'])
saveImageFilenamesURL(config=config, images=images)
#checking images directory
listdir = []
@@ -1118,7 +1164,7 @@ def resumePreviousDump(config={}, other={}):
#image dump is complete
print 'Image dump was completed in the previous session'
else:
generateImageDump(config=config, other=other, images=images, start=lastfilename2) # we resume from previous image, which may be corrupted (or missing .desc) by the previous session ctrl-c or abort
generateImageDump(config=config, other=other, images=images, start=lastfilename2, session=other['session']) # we resume from previous image, which may be corrupted (or missing .desc) by the previous session ctrl-c or abort
if config['logs']:
#fix
@@ -1131,7 +1177,7 @@ def saveSpecialVersion(config={}, session=None):
print 'Special:Version.html exists, do not overwrite'
else:
print 'Downloading Special:Version with extensions and other related info'
r = session.post(url=config['index'], data={'title': 'Special:Version', }, headers={'User-Agent': getUserAgent()})
r = session.post(url=config['index'], data={'title': 'Special:Version'})
raw = r.text
delay(config=config, session=session)
raw = removeIP(raw=raw)
@@ -1146,7 +1192,7 @@ def saveIndexPHP(config={}, session=None):
print 'index.html exists, do not overwrite'
else:
print 'Downloading index.php (Main Page) as index.html'
r = session.post(url=config['index'], data={}, headers={'User-Agent': getUserAgent()})
r = session.post(url=config['index'], data={})
raw = r.text
delay(config=config, session=session)
raw = removeIP(raw=raw)
@@ -1162,7 +1208,7 @@ def saveSiteInfo(config={}, session=None):
print 'siteinfo.json exists, do not overwrite'
else:
print 'Downloading site info as siteinfo.json'
r = session.post(url=config['api'], data = {'action': 'query', 'meta': 'siteinfo', 'format': 'json'}, headers={'User-Agent': getUserAgent()})
r = session.post(url=config['api'], data = {'action': 'query', 'meta': 'siteinfo', 'format': 'json'})
result = json.loads(r.text)
delay(config=config, session=session)
f = open('%s/siteinfo.json' % (config['path']), 'w')
@@ -1205,7 +1251,6 @@ def main(params=[]):
""" Main function """
configfilename = 'config.txt'
session = requests.Session()
config, other = getParameters(params=params)
avoidWikimediaProjects(config=config, other=other)
