Removed all traces of urllib except for encode/decode; more bugs fixed.

pull/139/head
balr0g 10 years ago
parent c8e11a949b
commit 9aa3c4a0e1
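The diff below replaces the per-request headers={'User-Agent': getUserAgent()} arguments and the urllib download path with a single requests.Session built in getParameters. A minimal sketch of that pattern (the URL, User-Agent string and credentials are placeholders, not values from this commit):

import requests

session = requests.Session()
session.headers = {'User-Agent': 'dumpgenerator-sketch/0.1'}  # the script builds this with getUserAgent()
session.auth = ('someuser', 'somepassword')                   # only set when --user/--pass are given

# every later call inherits the User-Agent and auth automatically
r = session.post(url='http://wiki.example.org/api.php',
                 data={'action': 'query', 'meta': 'siteinfo', 'format': 'json'})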

@@ -22,7 +22,12 @@
import cookielib
import cPickle
import datetime
import argparse
import sys
try:
import argparse
except ImportError:
print "Please install the argparse module."
sys.exit(1)
import json
try:
from hashlib import md5
@@ -30,9 +35,12 @@ except ImportError: # Python 2.4 compatibility
from md5 import new as md5
import os
import re
import requests
try:
import requests
except ImportError:
print "Please install or update the Requests module."
sys.exit(1)
import subprocess
import sys
import time
import urllib
@@ -73,13 +81,45 @@ def cleanHTML(raw=''):
sys.exit()
return raw
def handleStatusCode(response):
statuscode = response.status_code
if statuscode >= 200 and statuscode < 300:
return
print "HTTP Error %d." % statuscode
if statuscode >= 300 and statuscode < 400:
print "Redirect should happen automatically: please report this as a bug."
print response.url
elif statuscode == 400:
print "Bad Request: The wiki may be malfunctioning."
print "Please try again later."
print response.url
sys.exit(1)
elif statuscode == 401 or statuscode == 403:
print "Authentication required."
print "Please use --userpass."
print response.url
elif statuscode == 404:
print "Not found. Is Special:Export enabled for this wiki?"
print response.url
sys.exit(1)
elif statuscode == 429 or (statuscode >= 500 and statuscode < 600):
print "Server error, max retries exceeded."
print "Please resume the dump later."
print response.url
sys.exit(1)
def getNamespacesScraper(config={}, session=None):
""" Hackishly gets the list of namespaces names and ids from the dropdown in the HTML of Special:AllPages """
""" Function called if no API is available """
namespaces = config['namespaces']
namespacenames = {0:''} # main is 0, no prefix
if namespaces:
r = session.post(url=config['index'], data={'title': 'Special:Allpages', }, headers={'User-Agent': getUserAgent()})
r = session.post(url=config['index'], data={'title': 'Special:Allpages'})
raw = r.text
delay(config=config, session=session)
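handleStatusCode above is new in this commit; the hunks that follow call it immediately after each request. A small sketch of that call pattern, assuming handleStatusCode is in scope (fetchSiteinfo is a hypothetical helper, not part of the script):

import requests

def fetchSiteinfo(api, session=None):
    # post, then let handleStatusCode print the error and exit on fatal codes
    session = session or requests.Session()
    r = session.post(url=api, data={'action': 'query', 'meta': 'siteinfo', 'format': 'json'})
    handleStatusCode(r)
    return r.text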
@@ -109,7 +149,7 @@ def getNamespacesAPI(config={}, session=None):
namespaces = config['namespaces']
namespacenames = {0:''} # main is 0, no prefix
if namespaces:
r = session.post(url=config['api'], data={'action': 'query', 'meta': 'siteinfo', 'siprop': 'namespaces', 'format': 'json'}, headers={'User-Agent': getUserAgent()})
r = session.post(url=config['api'], data={'action': 'query', 'meta': 'siteinfo', 'siprop': 'namespaces', 'format': 'json'})
result = json.loads(r.text)
delay(config=config, session=session)
@@ -148,12 +188,12 @@ def getPageTitlesAPI(config={}, session=None):
c = 0
print ' Retrieving titles in the namespace %d' % (namespace)
headers = {'User-Agent': getUserAgent()}
apfrom = '!'
while apfrom:
sys.stderr.write('.') #progress
params = {'action': 'query', 'list': 'allpages', 'apnamespace': namespace, 'apfrom': apfrom.encode('utf-8'), 'format': 'json', 'aplimit': 500}
r = session.post(url=config['api'], data=params, headers=headers)
r = session.post(url=config['api'], data=params)
handleStatusCode(r)
#FIXME Handle HTTP errors here!
jsontitles = json.loads(r.text)
apfrom = ''
@@ -182,7 +222,7 @@ def getPageTitlesScraper(config={}, session=None):
for namespace in namespaces:
print ' Retrieving titles in the namespace', namespace
url = '%s?title=Special:Allpages&namespace=%s' % (config['index'], namespace)
r = session.get(url=url, headers={'User-Agent': getUserAgent()})
r = session.get(url=url)
raw = r.text
raw = cleanHTML(raw)
@@ -219,7 +259,7 @@ def getPageTitlesScraper(config={}, session=None):
if not name in checked_suballpages:
checked_suballpages.append(name) #to avoid reload dupe subpages links
delay(config=config, session=session)
r2 = session.get(url=url, headers={'User-Agent': getUserAgent()})
r2 = session.get(url=url)
raw2 = r2.text
raw2 = cleanHTML(raw2)
rawacum += raw2 #merge it after removed junk
@@ -327,6 +367,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
return '' # empty xml
#FIXME HANDLE HTTP Errors HERE
r = session.post(url=config['index'], data=params, headers=headers)
handleStatusCode(r)
xml = r.text
c += 1
@@ -343,8 +384,7 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
title_ = title
title_ = re.sub(' ', '_', title_)
#do not convert & into %26, title_ = re.sub('&', '%26', title_)
headers = {'User-Agent': getUserAgent()}
params = {'title': 'Special:Export', 'pages': title_, 'action': 'submit', }
params = {'title': 'Special:Export', 'pages': title_, 'action': 'submit'}
if config['curonly']:
params['curonly'] = 1
params['limit'] = 1
@@ -354,7 +394,7 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
if config.has_key('templates') and config['templates']: #in other case, do not set params['templates']
params['templates'] = 1
xml = getXMLPageCore(headers=headers, params=params, config=config, session=session)
xml = getXMLPageCore(params=params, config=config, session=session)
#if complete history, check if this page history has > limit edits, if so, retrieve all using offset if available
#else, warning about Special:Export truncating large page histories
@@ -362,7 +402,7 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
if not config['curonly'] and re.search(r_timestamp, xml): # search for timestamps in xml to avoid analysing empty pages like Special:Allpages and the random one
while not truncated and params['offset']: #next chunk
params['offset'] = re.findall(r_timestamp, xml)[-1] #get the last timestamp from the acum XML
xml2 = getXMLPageCore(headers=headers, params=params, config=config, session=session)
xml2 = getXMLPageCore(params=params, config=config, session=session)
if re.findall(r_timestamp, xml2): #are there more edits in this next XML chunk or no <page></page>?
if re.findall(r_timestamp, xml2)[-1] == params['offset']:
@@ -498,7 +538,7 @@ def getImageFilenamesURL(config={}, session=None):
retries = 5
while offset:
#5000 overload some servers, but it is needed for sites like this with no next links http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
r = session.post(url=config['index'], data={'title': 'Special:Imagelist', 'limit': limit, 'offset': offset, }, headers={'User-Agent': getUserAgent()})
r = session.post(url=config['index'], data={'title': 'Special:Imagelist', 'limit': limit, 'offset': offset})
raw = r.text
delay(config=config, session=session)
if re.search(ur'(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)', raw): # delicate wiki
@@ -573,14 +613,14 @@ def getImageFilenamesURLAPI(config={}, session=None):
""" Retrieve file list: filename, url, uploader """
print 'Retrieving image filenames'
headers = {'User-Agent': getUserAgent()}
aifrom = '!'
images = []
while aifrom:
sys.stderr.write('.') #progress
params = {'action': 'query', 'list': 'allimages', 'aiprop': 'url|user', 'aifrom': aifrom, 'format': 'json', 'ailimit': 500}
#FIXME Handle HTTP Errors HERE
r = session.post(url=config['api'], data=params, headers=headers)
r = session.post(url=config['api'], data=params)
handleStatusCode(r)
jsonimages = json.loads(r.text)
delay(config=config, session=session)
aifrom = ''
@@ -651,18 +691,11 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None):
# split last . (extension) and then merge
filename2 = truncateFilename(other=other, filename=filename2)
print 'Filename is too long, truncating. Now it is:', filename2
# We need to set the user agent for urlretrieve but we can't do it in its call
# so we just override the class here; all I know about this method comes from
# http://docs.python.org/2/library/urllib.html#urllib._urlopener ,
# http://docs.python.org/2/tutorial/classes.html#class-definition-syntax .
# TODO: Override the user agent for all functions in a more sensible place.
class URLopenerUserAgent(urllib.FancyURLopener):
version = "%s" % getUserAgent()
urllib._urlopener = URLopenerUserAgent()
filename3 = u'%s/%s' % (imagepath, filename2)
urllib.urlretrieve(url=url, filename=filename3.encode('utf-8'))
# TODO: data=urllib.urlencode({}) removed image; request fails on wikipedia and POST neither works?
imagefile = open(filename3, 'wb')
r = requests.get(url=url)
imagefile.write(r.content)
imagefile.close()
#saving description if any
xmlfiledesc = getXMLFileDesc(config=config, title=u'Image:%s' % (filename), session=session) # use Image: for backwards compatibility
f = open('%s/%s.desc' % (imagepath, filename2), 'w')
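The hunk above swaps urllib.urlretrieve for a module-level requests.get, which does not go through the shared session. A hedged sketch of the same write-to-file step routed through the session instead, so the User-Agent and HTTP auth configured in getParameters would still apply (downloadImage is a hypothetical helper; handleStatusCode is assumed to be in scope):

def downloadImage(session, url, path):
    # fetch with the shared session and reuse the status check added in this commit
    r = session.get(url=url)
    handleStatusCode(r)
    imagefile = open(path, 'wb')
    imagefile.write(r.content)
    imagefile.close()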
@@ -787,8 +820,9 @@ def getParameters(params=[]):
parser.add_argument('-v', '--version', action='version', version=getVersion())
parser.add_argument('--cookies', metavar="cookies.txt", help="path to a cookies.txt file")
parser.add_argument('--delay', metavar=5, default=0, help="adds a delay (in seconds)")
parser.add_argument('--retries', metavar=5, default=5, help="Maximum number of retries")
parser.add_argument('--get-wiki-engine', action='store_true', help="returns the wiki engine")
groupWikiOrAPIOrIndex = parser.add_mutually_exclusive_group(required=True)
groupWikiOrAPIOrIndex.add_argument('wiki', default='', nargs='?', help="URL to wiki")
groupWikiOrAPIOrIndex.add_argument('--api', help="URL to api.php")
@@ -806,6 +840,9 @@ def getParameters(params=[]):
parser.add_argument('--namespaces', metavar="1,2,3", help='comma-separated value of namespaces to include (all by default)')
parser.add_argument('--exnamespaces', metavar="1,2,3", help='comma-separated value of namespaces to exclude')
parser.add_argument('--user', help='Username if authentication is required.')
parser.add_argument('--pass', dest='password', help='Password if authentication is required.')
args = parser.parse_args()
#print args
@@ -827,6 +864,12 @@ def getParameters(params=[]):
print 'ERROR: URL to index.php must start with http:// or https://\n'
parser.print_usage()
sys.exit(1)
# check user and pass (one requires both)
if (args.user and not args.password) or (args.password and not args.user):
print 'Both --user and --pass are required for authentication.'
parser.print_usage()
sys.exit(1)
namespaces = ['all']
exnamespaces = []
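A usage note on the check above: authentication is all-or-nothing, so a hypothetical invocation such as python dumpgenerator.py --api=http://wiki.example.org/w/api.php --user=someuser --pass=somepassword (script name and URL illustrative) is accepted, while passing only one of --user/--pass prints the message and exits before the session is built.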
@@ -877,6 +920,9 @@ def getParameters(params=[]):
session = requests.Session()
session.cookies = cj
session.headers = {'User-Agent': getUserAgent()}
if args.user and args.password:
session.auth = (args.user, args.password)
#session.mount(args.api.split('/api.php')[0], HTTPAdapter(max_retries=max_ret))
config = {
'curonly': args.curonly,
@@ -924,7 +970,7 @@ def getParameters(params=[]):
def checkAPI(api, config={}, session=None):
""" Checking API availability """
global cj
r = session.post(url=api, data={'action': 'query', 'meta': 'siteinfo', 'format': 'json'}, headers={'User-Agent': getUserAgent()})
r = session.post(url=api, data={'action': 'query', 'meta': 'siteinfo', 'format': 'json'})
resultText = r.text
print 'Checking api.php...', api
if "MediaWiki API is not enabled for this site." in resultText:
@@ -937,7 +983,7 @@ def checkAPI(api, config={}, session=None):
def checkIndexphp(indexphp, config={}, session=None):
""" Checking index.php availability """
r = session.post(url=indexphp, data={'title': 'Special:Version'}, headers={'User-Agent': getUserAgent()})
r = session.post(url=indexphp, data={'title': 'Special:Version'})
raw = r.text
delay(config=config, session=session)
print 'Checking index.php...', indexphp
@@ -1021,7 +1067,7 @@ def resumePreviousDump(config={}, other={}):
#load titles
lasttitle = ''
try:
f = open('%s/%s-%s-titles.txt' % (config['path'], domain2prefix(config=config, session=session), config['date']), 'r')
f = open('%s/%s-%s-titles.txt' % (config['path'], domain2prefix(config=config, session=other['session']), config['date']), 'r')
raw = unicode(f.read(), 'utf-8')
titles = raw.split('\n')
lasttitle = titles[-1]
@@ -1042,7 +1088,7 @@ def resumePreviousDump(config={}, other={}):
xmliscomplete = False
lastxmltitle = ''
try:
f = open('%s/%s-%s-%s.xml' % (config['path'], domain2prefix(config=config, session=session), config['date'], config['curonly'] and 'current' or 'history'), 'r')
f = open('%s/%s-%s-%s.xml' % (config['path'], domain2prefix(config=config, session=other['session']), config['date'], config['curonly'] and 'current' or 'history'), 'r')
for l in f:
if re.findall('</mediawiki>', l):
#xml dump is complete
@@ -1062,17 +1108,17 @@ def resumePreviousDump(config={}, other={}):
elif lastxmltitle:
#resuming...
print 'Resuming XML dump from "%s"' % (lastxmltitle)
generateXMLDump(config=config, titles=titles, start=lastxmltitle)
generateXMLDump(config=config, titles=titles, start=lastxmltitle, session=other['session'])
else:
#corrupt? only has XML header?
print 'XML is corrupt? Regenerating...'
generateXMLDump(config=config, titles=titles)
generateXMLDump(config=config, titles=titles, session=other['session'])
if config['images']:
#load images
lastimage = ''
try:
f = open('%s/%s-%s-images.txt' % (config['path'], domain2prefix(config=config, session=session), config['date']), 'r')
f = open('%s/%s-%s-images.txt' % (config['path'], domain2prefix(config=config), config['date']), 'r')
raw = unicode(f.read(), 'utf-8').strip()
lines = raw.split('\n')
for l in lines:
@@ -1088,9 +1134,9 @@ def resumePreviousDump(config={}, other={}):
print 'Image list is incomplete. Reloading...'
#do not resume, reload, to avoid inconsistences, deleted images or so
if config['api']:
images=getImageFilenamesURLAPI(config=config, session=session)
images=getImageFilenamesURLAPI(config=config, session=other['session'])
else:
images = getImageFilenamesURL(config=config, session=session)
images = getImageFilenamesURL(config=config, session=other['session'])
saveImageFilenamesURL(config=config, images=images)
#checking images directory
listdir = []
@@ -1118,7 +1164,7 @@ def resumePreviousDump(config={}, other={}):
#image dump is complete
print 'Image dump was completed in the previous session'
else:
generateImageDump(config=config, other=other, images=images, start=lastfilename2) # we resume from previous image, which may be corrupted (or missing .desc) by the previous session ctrl-c or abort
generateImageDump(config=config, other=other, images=images, start=lastfilename2, session=other['session']) # we resume from previous image, which may be corrupted (or missing .desc) by the previous session ctrl-c or abort
if config['logs']:
#fix
@@ -1131,7 +1177,7 @@ def saveSpecialVersion(config={}, session=None):
print 'Special:Version.html exists, do not overwrite'
else:
print 'Downloading Special:Version with extensions and other related info'
r = session.post(url=config['index'], data={'title': 'Special:Version', }, headers={'User-Agent': getUserAgent()})
r = session.post(url=config['index'], data={'title': 'Special:Version'})
raw = r.text
delay(config=config, session=session)
raw = removeIP(raw=raw)
@@ -1146,7 +1192,7 @@ def saveIndexPHP(config={}, session=None):
print 'index.html exists, do not overwrite'
else:
print 'Downloading index.php (Main Page) as index.html'
r = session.post(url=config['index'], data={}, headers={'User-Agent': getUserAgent()})
r = session.post(url=config['index'], data={})
raw = r.text
delay(config=config, session=session)
raw = removeIP(raw=raw)
@@ -1162,7 +1208,7 @@ def saveSiteInfo(config={}, session=None):
print 'siteinfo.json exists, do not overwrite'
else:
print 'Downloading site info as siteinfo.json'
r = session.post(url=config['api'], data = {'action': 'query', 'meta': 'siteinfo', 'format': 'json'}, headers={'User-Agent': getUserAgent()})
r = session.post(url=config['api'], data = {'action': 'query', 'meta': 'siteinfo', 'format': 'json'})
result = json.loads(r.text)
delay(config=config, session=session)
f = open('%s/siteinfo.json' % (config['path']), 'w')
@@ -1205,7 +1251,6 @@ def main(params=[]):
""" Main function """
configfilename = 'config.txt'
session = requests.Session()
config, other = getParameters(params=params)
avoidWikimediaProjects(config=config, other=other)
