Merge pull request #194 from mrshu/mrshu/dumpgenerator-pep8fied

dumpgenerator: AutoPEP8-fied
pull/197/head
nemobis 10 years ago
commit b3ef165529

@@ -52,7 +52,8 @@ def getVersion():
 def truncateFilename(other={}, filename=''):
     """ Truncate filenames when downloading images with large filenames """
-    return filename[:other['filenamelimit']] + md5(filename).hexdigest() + '.' + filename.split('.')[-1]
+    return filename[:other['filenamelimit']] + \
+        md5(filename).hexdigest() + '.' + filename.split('.')[-1]

 def delay(config={}, session=None):
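Note on the helper re-wrapped here: when an image filename exceeds the configured 'filenamelimit', it is cut at that limit and an MD5 digest of the full name plus the original extension is appended, so truncated names stay unique. A minimal self-contained sketch of the same idea (illustrative only; it calls hashlib directly rather than the script's own md5 import):

from hashlib import md5

def truncate_name(name, limit=100):
    # keep the first `limit` characters, append a digest of the full name, keep the extension
    digest = md5(name.encode('utf-8')).hexdigest()
    ext = name.rsplit('.', 1)[-1]
    return name[:limit] + digest + '.' + ext

print(truncate_name('a' * 300 + '.png'))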
@@ -79,9 +80,11 @@ def cleanHTML(raw=''):
     elif re.search('<!-- content -->', raw):
         raw = raw.split('<!-- content -->')[1].split('<!-- mw_content -->')[0]
     elif re.search('<article id="WikiaMainContent" class="WikiaMainContent">', raw):
-        raw = raw.split('<article id="WikiaMainContent" class="WikiaMainContent">')[1].split('</article>')[0]
+        raw = raw.split('<article id="WikiaMainContent" class="WikiaMainContent">')[
+            1].split('</article>')[0]
     elif re.search('<body class=', raw):
-        raw = raw.split('<body class=')[1].split('<div class="printfooter">')[0]
+        raw = raw.split('<body class=')[1].split(
+            '<div class="printfooter">')[0]
     else:
         print raw[:250]
         print 'This wiki doesn\'t use marks to split content'
@@ -164,8 +167,14 @@ def getNamespacesAPI(config={}, session=None):
     namespaces = config['namespaces']
     namespacenames = {0: ''}  # main is 0, no prefix
     if namespaces:
-        r = session.post(url=config['api'], data={
-            'action': 'query', 'meta': 'siteinfo', 'siprop': 'namespaces', 'format': 'json'})
+        r = session.post(
+            url=config['api'],
+            data={
+                'action': 'query',
+                'meta': 'siteinfo',
+                'siprop': 'namespaces',
+                'format': 'json'}
+        )
         result = json.loads(r.text)
         delay(config=config, session=session)
@@ -180,11 +189,13 @@ def getNamespacesAPI(config={}, session=None):
         # check if those namespaces really exist in this wiki
         namespaces2 = []
         for i in result['query']['namespaces'].keys():
-            if int(i) < 0:  # -1: Special, -2: Media, excluding
+            bi = i
+            i = int(i)
+            if i < 0:  # -1: Special, -2: Media, excluding
                 continue
-            if int(i) in namespaces:
-                namespaces2.append(int(i))
-                namespacenames[int(i)] = result['query']['namespaces'][i]['*']
+            if i in namespaces:
+                namespaces2.append(i)
+                namespacenames[i] = result['query']['namespaces'][bi]['*']
         namespaces = namespaces2
     else:
         namespaces = [0]
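The extra `bi = i` line exists because the siteinfo JSON keys namespaces by strings ('0', '6', '-1', ...) while the rest of the script works with integer IDs: the original string key is kept for indexing the response, the int for comparisons. A self-contained sketch of that shape (the sample response is made up):

result = {'query': {'namespaces': {
    '-1': {'*': 'Special'}, '0': {'*': ''}, '6': {'*': 'File'}}}}
wanted = [0, 6]
namespaces2, namespacenames = [], {0: ''}
for key in result['query']['namespaces'].keys():
    ns = int(key)
    if ns < 0:  # -1: Special and -2: Media are virtual namespaces
        continue
    if ns in wanted:
        namespaces2.append(ns)
        # JSON keys stay strings, so index the response with the original key
        namespacenames[ns] = result['query']['namespaces'][key]['*']
print(namespaces2)
print(namespacenames)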
@@ -209,16 +220,23 @@ def getPageTitlesAPI(config={}, session=None):
         apfrom = '!'
         while apfrom:
             sys.stderr.write('.')  # progress
-            params = {'action': 'query', 'list': 'allpages', 'apnamespace': namespace,
-                      'apfrom': apfrom.encode('utf-8'), 'format': 'json', 'aplimit': 500}
+            params = {
+                'action': 'query',
+                'list': 'allpages',
+                'apnamespace': namespace,
+                'apfrom': apfrom.encode('utf-8'),
+                'format': 'json',
+                'aplimit': 500}
             r = session.post(url=config['api'], data=params)
             handleStatusCode(r)
             # FIXME Handle HTTP errors here!
             jsontitles = json.loads(r.text)
             apfrom = ''
-            if 'query-continue' in jsontitles and 'allpages' in jsontitles['query-continue']:
+            if 'query-continue' in jsontitles and 'allpages' in jsontitles[
+                    'query-continue']:
                 if 'apcontinue' in jsontitles['query-continue']['allpages']:
-                    apfrom = jsontitles['query-continue']['allpages']['apcontinue']
+                    apfrom = jsontitles[
+                        'query-continue']['allpages']['apcontinue']
                 elif 'apfrom' in jsontitles['query-continue']['allpages']:
                     apfrom = jsontitles['query-continue']['allpages']['apfrom']
                 # print apfrom
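The loop above follows the pre-MediaWiki-1.21 continuation protocol: each allpages response may carry a query-continue block whose apcontinue (newer wikis) or apfrom (older wikis) value seeds the next request, and an empty value ends the loop. A compact sketch of that contract, assuming a requests.Session called session and the API URL in api (not code from this patch):

def iter_all_pages(session, api, namespace=0):
    apfrom = '!'
    while apfrom:
        r = session.post(api, data={
            'action': 'query', 'list': 'allpages', 'apnamespace': namespace,
            'apfrom': apfrom, 'aplimit': 500, 'format': 'json'})
        data = r.json()
        for page in data['query']['allpages']:
            yield page['title']
        cont = data.get('query-continue', {}).get('allpages', {})
        # newer wikis return 'apcontinue', older ones 'apfrom'; neither means we are done
        apfrom = cont.get('apcontinue') or cont.get('apfrom') or ''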
@@ -299,7 +317,9 @@ def getPageTitlesScraper(config={}, session=None):
                 raw2 = r2.text
                 raw2 = cleanHTML(raw2)
                 rawacum += raw2  # merge it after removed junk
-                print ' Reading', name, len(raw2), 'bytes', len(re.findall(r_suballpages, raw2)), 'subpages', len(re.findall(r_title, raw2)), 'pages'
+                print ' Reading', name, len(raw2), 'bytes', \
+                    len(re.findall(r_suballpages, raw2)), 'subpages', \
+                    len(re.findall(r_title, raw2)), 'pages'
                 delay(config=config, session=session)
         c += 1
@@ -338,8 +358,7 @@ def getPageTitles(config={}, session=None):
     # removing dupes (e.g. in CZ appears Widget:AddThis two times (main
     # namespace and widget namespace))
-    titles = list(set(titles))
-    titles.sort()
+    titles = sorted(set(titles))
     print '%d page titles loaded' % (len(titles))
     return titles
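Besides the re-wrap, this hunk folds two statements into one: sorted(set(titles)) builds the deduplicated, sorted list in a single expression instead of mutating the list in place, with the same result. For example:

titles = ['Widget:AddThis', 'Main Page', 'Widget:AddThis']
assert sorted(set(titles)) == ['Main Page', 'Widget:AddThis']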
@ -347,7 +366,7 @@ def getPageTitles(config={}, session=None):
def getImageNames(config={}, session=None): def getImageNames(config={}, session=None):
""" Get list of image names """ """ Get list of image names """
print 'Retrieving image filenames' print 'Retrieving image filenames'
images = [] images = []
if 'api' in config and config['api']: if 'api' in config and config['api']:
@@ -355,7 +374,7 @@ def getImageNames(config={}, session=None):
     elif 'index' in config and config['index']:
         images = getImageNamesScraper(config=config, session=session)
-    #images = list(set(images)) # it is a list of lists
+    # images = list(set(images)) # it is a list of lists
     images.sort()
     print '%d image names loaded' % (len(images))
@@ -380,7 +399,12 @@ def getXMLHeader(config={}, session=None):
 def getXMLFileDesc(config={}, title='', session=None):
     """ Get XML for image description page """
     config['curonly'] = 1  # tricky to get only the most recent desc
-    return getXMLPage(config=config, title=title, verbose=False, session=session)
+    return getXMLPage(
+        config=config,
+        title=title,
+        verbose=False,
+        session=session
+    )

 def getUserAgent():
@@ -433,20 +457,30 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
             if not config['curonly']:
                 print ' Trying to save only the last revision for this page...'
                 params['curonly'] = 1
-                logerror(config=config, text='Error while retrieving the full history of "%s". Trying to save only the last revision for this page' % (
-                    params['pages']))
-                return getXMLPageCore(headers=headers, params=params, config=config, session=session)
+                logerror(
+                    config=config,
+                    text='Error while retrieving the full history of "%s". Trying to save only the last revision for this page' %
+                    (params['pages'])
+                )
+                return getXMLPageCore(
+                    headers=headers,
+                    params=params,
+                    config=config,
+                    session=session
+                )
             else:
                 print ' Saving in the errors log, and skipping...'
-                logerror(config=config, text='Error while retrieving the last revision of "%s". Skipping.' % (
-                    params['pages']))
+                logerror(
+                    config=config,
+                    text='Error while retrieving the last revision of "%s". Skipping.' %
+                    (params['pages']))
                 return ''  # empty xml
         # FIXME HANDLE HTTP Errors HERE
         try:
             r = session.post(url=config['index'], data=params, headers=headers)
             handleStatusCode(r)
             xml = r.text
-        except requests.exceptions.ConnectionError, e:
+        except requests.exceptions.ConnectionError as e:
             xml = ''
         c += 1
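The last change in this hunk is a syntax modernization rather than a re-wrap: `except requests.exceptions.ConnectionError, e:` is Python-2-only spelling, while the `as e` form works on Python 2.6+ and Python 3. A two-line illustration:

try:
    raise ValueError('boom')
except ValueError as e:  # the 'except X, e' comma form was removed in Python 3
    print(e)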
@@ -543,7 +577,8 @@ def generateXMLDump(config={}, titles=[], start='', session=None):
     header = getXMLHeader(config=config, session=session)
     footer = '</mediawiki>\n'  # new line at the end
     xmlfilename = '%s-%s-%s.xml' % (domain2prefix(config=config),
-                                    config['date'], config['curonly'] and 'current' or 'history')
+                                    config['date'],
+                                    config['curonly'] and 'current' or 'history')
     xmlfile = ''
     lock = True
     if start:
@@ -569,7 +604,10 @@ def generateXMLDump(config={}, titles=[], start='', session=None):
         os.remove('%s/%s' % (config['path'], xmlfilename))
         # move correctly truncated dump to its real name
         os.rename(
-            '%s/%s2' % (config['path'], xmlfilename), '%s/%s' % (config['path'], xmlfilename))
+            '%s/%s2' %
+            (config['path'], xmlfilename), '%s/%s' %
+            (config['path'], xmlfilename)
+        )
     else:
         # requested complete xml dump
         lock = False
@@ -593,7 +631,10 @@ def generateXMLDump(config={}, titles=[], start='', session=None):
         xml = cleanXML(xml=xml)
         if not xml:
             logerror(
-                config=config, text=u'The page "%s" was missing in the wiki (probably deleted)' % (title))
+                config=config,
+                text=u'The page "%s" was missing in the wiki (probably deleted)' %
+                (title)
+            )
         # here, XML is a correct <page> </page> chunk or
         # an empty string due to a deleted page (logged in errors log) or
         # an empty string due to an error while retrieving the page from server
@@ -624,8 +665,18 @@ def saveImageNames(config={}, images=[], session=None):
     imagesfilename = '%s-%s-images.txt' % (
         domain2prefix(config=config), config['date'])
     imagesfile = open('%s/%s' % (config['path'], imagesfilename), 'w')
-    imagesfile.write(('\n'.join(['%s\t%s\t%s' % (
-        filename, url, uploader) for filename, url, uploader in images]).encode('utf-8')))
+    imagesfile.write(
+        ('\n'.join(
+            [
+                '%s\t%s\t%s' %
+                (filename,
+                 url,
+                 uploader) for filename,
+                url,
+                uploader in images]
+        ).encode('utf-8')
+        )
+    )
     imagesfile.write('\n--END--')
     imagesfile.close()
@@ -634,26 +685,31 @@ def saveImageNames(config={}, images=[], session=None):
 def curateImageURL(config={}, url=''):
     """ Returns an absolute URL for an image, adding the domain if missing """
     if 'index' in config and config['index']:
-        #remove from :// (http or https) until the first / after domain
-        domainalone = config['index'].split('://')[0] + '://' + config['index'].split('://')[1].split('/')[0]
+        # remove from :// (http or https) until the first / after domain
+        domainalone = config['index'].split(
+            '://')[0] + '://' + config['index'].split('://')[1].split('/')[0]
     elif 'api' in config and config['api']:
-        domainalone = config['api'].split('://')[0] + '://' + config['api'].split('://')[1].split('/')[0]
+        domainalone = config['api'].split(
+            '://')[0] + '://' + config['api'].split('://')[1].split('/')[0]
     else:
         print 'ERROR: no index nor API'
         sys.exit()
     if url.startswith('//'):  # Orain wikifarm returns URLs starting with //
         url = u'%s:%s' % (domainalone.split('://')[0], url)
-    elif url[0] == '/' or (not url.startswith('http://') and not url.startswith('https://')): #is it a relative URL?
-        if url[0] == '/': #slash is added later
+    # is it a relative URL?
+    elif url[0] == '/' or (not url.startswith('http://') and not url.startswith('https://')):
+        if url[0] == '/':  # slash is added later
             url = url[1:]
-        url = u'%s/%s' % (domainalone, url) # concat http(s) + domain + relative url
+        # concat http(s) + domain + relative url
+        url = u'%s/%s' % (domainalone, url)
     url = undoHTMLEntities(text=url)
-    #url = urllib.unquote(url) #do not use unquote with url, it break some urls with odd chars
+    # url = urllib.unquote(url) #do not use unquote with url, it break some
+    # urls with odd chars
     url = re.sub(' ', '_', url)
     return url
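curateImageURL handles three cases by hand: protocol-relative URLs (//host/...), site-relative paths, and already-absolute URLs. For comparison only, the standard library's urljoin covers the same cases; the explicit version in the script keeps the wiki-farm quirks visible and is what this patch preserves:

try:
    from urllib.parse import urljoin   # Python 3
except ImportError:
    from urlparse import urljoin       # Python 2

base = 'http://wiki.example.org/w/index.php'
for u in ('//cdn.example.org/a.png', '/images/a.png', 'http://other.example/a.png'):
    print(urljoin(base, u))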
@@ -670,12 +726,18 @@ def getImageNamesScraper(config={}, session=None):
         # 5000 overload some servers, but it is needed for sites like this with
         # no next links
         # http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
-        r = session.post(url=config['index'], data={
-            'title': 'Special:Imagelist', 'limit': limit, 'offset': offset})
+        r = session.post(
+            url=config['index'],
+            data={
+                'title': 'Special:Imagelist',
+                'limit': limit,
+                'offset': offset})
         raw = r.text
         delay(config=config, session=session)
         # delicate wiki
-        if re.search(ur'(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)', raw):
+        if re.search(
+                ur'(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)',
+                raw):
             if limit > 10:
                 print 'Error: listing %d images in a chunk is not possible, trying tiny chunks' % (limit)
                 limit = limit / 10
@@ -704,11 +766,12 @@ def getImageNamesScraper(config={}, session=None):
         # http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
         # (<a href="/en/Image:109_0923.JPG" title="Image:109 0923.JPG">desc</a>) <a href="/en/upload/c/cd/109_0923.JPG">109 0923.JPG</a> . . 885,713 bytes . . <a href="/en/User:Bfalconer" title="User:Bfalconer">Bfalconer</a> . . 18:44, 17 November 2005<br />
         r_images4 = r'(?im)<a href=[^>]+ title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+)">[^<]+</a>[^<]+<a[^>]+>(?P<uploader>[^<]+)</a>'
-        r_images5 = (r'(?im)<td class="TablePager_col_img_name">\s*<a href[^>]*?>(?P<filename>[^>]+)</a>\s*\(<a href="(?P<url>[^>]+)">[^<]*?</a>\s*\)\s*</td>\s*'
-                     '<td class="TablePager_col_thumb">[^\n\r]*?</td>\s*'
-                     '<td class="TablePager_col_img_size">[^<]*?</td>\s*'
-                     '<td class="TablePager_col_img_user_text">\s*(<a href="[^>]*?" title="[^>]*?">)?(?P<uploader>[^<]+?)(</a>)?\s*</td>')
+        r_images5 = (
+            r'(?im)<td class="TablePager_col_img_name">\s*<a href[^>]*?>(?P<filename>[^>]+)</a>\s*\(<a href="(?P<url>[^>]+)">[^<]*?</a>\s*\)\s*</td>\s*'
+            '<td class="TablePager_col_thumb">[^\n\r]*?</td>\s*'
+            '<td class="TablePager_col_img_size">[^<]*?</td>\s*'
+            '<td class="TablePager_col_img_user_text">\s*(<a href="[^>]*?" title="[^>]*?">)?(?P<uploader>[^<]+?)(</a>)?\s*</td>')
         # Select the regexp that returns more results
         regexps = [r_images1, r_images2, r_images3, r_images4, r_images5]
         count = 0
@ -720,7 +783,7 @@ def getImageNamesScraper(config={}, session=None):
regexp_best = i regexp_best = i
i += 1 i += 1
m = re.compile(regexps[regexp_best]).finditer(raw) m = re.compile(regexps[regexp_best]).finditer(raw)
# Iter the image results # Iter the image results
for i in m: for i in m:
url = i.group('url') url = i.group('url')
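The surrounding loop simply remembers the index of the regexp that matched the most image rows (regexp_best) and then iterates its matches. The same selection can be written with max(); a toy example, not code from the patch:

import re

raw = '<a href="A.png">A.png</a> <a href="B.png">B.png</a>'
regexps = [r'<img src="([^"]+)"', r'<a href="([^"]+)">']
best = max(regexps, key=lambda rx: len(re.findall(rx, raw)))
for m in re.finditer(best, raw):
    print(m.group(1))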
@ -761,61 +824,86 @@ def getImageNamesAPI(config={}, session=None):
images = [] images = []
while aifrom: while aifrom:
sys.stderr.write('.') # progress sys.stderr.write('.') # progress
params = {'action': 'query', 'list': 'allimages', 'aiprop': params = {
'url|user', 'aifrom': aifrom, 'format': 'json', 'ailimit': 500} 'action': 'query',
'list': 'allimages',
'aiprop': 'url|user',
'aifrom': aifrom,
'format': 'json',
'ailimit': 500}
# FIXME Handle HTTP Errors HERE # FIXME Handle HTTP Errors HERE
r = session.post(url=config['api'], data=params) r = session.post(url=config['api'], data=params)
handleStatusCode(r) handleStatusCode(r)
jsonimages = json.loads(r.text) jsonimages = json.loads(r.text)
delay(config=config, session=session) delay(config=config, session=session)
if 'query' in jsonimages: if 'query' in jsonimages:
aifrom = '' aifrom = ''
if jsonimages.has_key('query-continue') and jsonimages['query-continue'].has_key('allimages'): if 'query-continue' in jsonimages and 'allimages' in jsonimages[
if jsonimages['query-continue']['allimages'].has_key('aicontinue'): 'query-continue']:
aifrom = jsonimages['query-continue']['allimages']['aicontinue'] if 'aicontinue' in jsonimages['query-continue']['allimages']:
elif jsonimages['query-continue']['allimages'].has_key('aifrom'): aifrom = jsonimages[
aifrom = jsonimages['query-continue']['allimages']['aifrom'] 'query-continue']['allimages']['aicontinue']
#print aifrom elif 'aifrom' in jsonimages['query-continue']['allimages']:
aifrom = jsonimages[
'query-continue']['allimages']['aifrom']
# print aifrom
for image in jsonimages['query']['allimages']: for image in jsonimages['query']['allimages']:
url = image['url'] url = image['url']
url = curateImageURL(config=config, url=url) url = curateImageURL(config=config, url=url)
# encoding to ascii is needed to work around this horrible bug: http://bugs.python.org/issue8136 # encoding to ascii is needed to work around this horrible bug:
filename = unicode(urllib.unquote((re.sub('_', ' ', url.split('/')[-1])).encode('ascii','ignore')), 'utf-8') # http://bugs.python.org/issue8136
filename = unicode(urllib.unquote(
(re.sub('_', ' ', url.split('/')[-1])).encode('ascii', 'ignore')), 'utf-8')
uploader = re.sub('_', ' ', image['user']) uploader = re.sub('_', ' ', image['user'])
images.append([filename, url, uploader]) images.append([filename, url, uploader])
else: else:
oldAPI = True oldAPI = True
break break
if oldAPI: if oldAPI:
gapfrom = '!' gapfrom = '!'
images = [] images = []
while gapfrom: while gapfrom:
sys.stderr.write('.') #progress sys.stderr.write('.') # progress
# Some old APIs doesn't have allimages query # Some old APIs doesn't have allimages query
# In this case use allpages (in nm=6) as generator for imageinfo # In this case use allpages (in nm=6) as generator for imageinfo
# Example: http://minlingo.wiki-site.com/api.php?action=query&generator=allpages&gapnamespace=6 &gaplimit=500&prop=imageinfo&iiprop=user|url&gapfrom=! # Example:
params = {'action': 'query', 'generator': 'allpages', 'gapnamespace': 6, 'gaplimit': 500, 'gapfrom': gapfrom, 'prop': 'imageinfo', 'iiprop': 'user|url', 'format': 'json'} # http://minlingo.wiki-site.com/api.php?action=query&generator=allpages&gapnamespace=6
#FIXME Handle HTTP Errors HERE # &gaplimit=500&prop=imageinfo&iiprop=user|url&gapfrom=!
params = {
'action': 'query',
'generator': 'allpages',
'gapnamespace': 6,
'gaplimit': 500,
'gapfrom': gapfrom,
'prop': 'imageinfo',
'iiprop': 'user|url',
'format': 'json'}
# FIXME Handle HTTP Errors HERE
r = session.post(url=config['api'], data=params) r = session.post(url=config['api'], data=params)
handleStatusCode(r) handleStatusCode(r)
jsonimages = json.loads(r.text) jsonimages = json.loads(r.text)
delay(config=config, session=session) delay(config=config, session=session)
if 'query' in jsonimages: if 'query' in jsonimages:
gapfrom = '' gapfrom = ''
if jsonimages.has_key('query-continue') and jsonimages['query-continue'].has_key('allpages'): if 'query-continue' in jsonimages and 'allpages' in jsonimages[
if jsonimages['query-continue']['allpages'].has_key('gapfrom'): 'query-continue']:
gapfrom = jsonimages['query-continue']['allpages']['gapfrom'] if 'gapfrom' in jsonimages['query-continue']['allpages']:
#print gapfrom gapfrom = jsonimages[
#print jsonimages['query'] 'query-continue']['allpages']['gapfrom']
# print gapfrom
# print jsonimages['query']
for image, props in jsonimages['query']['pages'].items(): for image, props in jsonimages['query']['pages'].items():
url = props['imageinfo'][0]['url'] url = props['imageinfo'][0]['url']
url = curateImageURL(config=config, url=url) url = curateImageURL(config=config, url=url)
filename = re.sub('_', ' ', ':'.join(props['title'].split(':')[1:]))
tmp_filename = ':'.join(props['title'].split(':')[1:])
filename = re.sub('_', ' ', tmp_filename)
uploader = re.sub('_', ' ', props['imageinfo'][0]['user']) uploader = re.sub('_', ' ', props['imageinfo'][0]['user'])
images.append([filename, url, uploader]) images.append([filename, url, uploader])
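Besides the re-wrapping, the getImageNamesAPI hunk above replaces dict.has_key() with the in operator; has_key() was removed in Python 3, whereas in works on both major versions. The continuation handling reduces to this pattern (sample dict made up):

continuation = {'query-continue': {'allimages': {'aicontinue': 'B.png'}}}
if 'query-continue' in continuation and 'allimages' in continuation['query-continue']:
    aifrom = continuation['query-continue']['allimages'].get('aicontinue', '')
    print(aifrom)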
@@ -876,8 +964,11 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None):
         imagefile.write(r.content)
         imagefile.close()
         # saving description if any
-        xmlfiledesc = getXMLFileDesc(config=config, title=u'Image:%s' % (
-            filename), session=session)  # use Image: for backwards compatibility
+        xmlfiledesc = getXMLFileDesc(
+            config=config,
+            title=u'Image:%s' %
+            (filename),
+            session=session)  # use Image: for backwards compatibility
         f = open('%s/%s.desc' % (imagepath, filename2), 'w')
         # <text xml:space="preserve" bytes="36">Banner featuring SG1, SGA, SGU teams</text>
         if not re.search(r'</mediawiki>', xmlfiledesc):
@ -1001,73 +1092,103 @@ def getParameters(params=[]):
params = sys.argv params = sys.argv
parser = argparse.ArgumentParser(description='') parser = argparse.ArgumentParser(description='')
# General params # General params
parser.add_argument( parser.add_argument(
'-v', '--version', action='version', version=getVersion()) '-v', '--version', action='version', version=getVersion())
parser.add_argument( parser.add_argument(
'--cookies', metavar="cookies.txt", help="path to a cookies.txt file") '--cookies', metavar="cookies.txt", help="path to a cookies.txt file")
parser.add_argument( parser.add_argument(
'--delay', metavar=5, default=0, type=float, help="adds a delay (in seconds)") '--delay',
metavar=5,
default=0,
type=float,
help="adds a delay (in seconds)")
parser.add_argument( parser.add_argument(
'--retries', metavar=5, default=5, help="Maximum number of retries for ") '--retries',
metavar=5,
default=5,
help="Maximum number of retries for ")
parser.add_argument('--path', help='path to store wiki dump at') parser.add_argument('--path', help='path to store wiki dump at')
parser.add_argument('--resume', action='store_true', parser.add_argument(
help='resumes previous incomplete dump (requires --path)') '--resume',
action='store_true',
help='resumes previous incomplete dump (requires --path)')
parser.add_argument('--force', action='store_true', help='') parser.add_argument('--force', action='store_true', help='')
parser.add_argument( parser.add_argument(
'--user', help='Username if authentication is required.') '--user', help='Username if authentication is required.')
parser.add_argument( parser.add_argument(
'--pass', dest='password', help='Password if authentication is required.') '--pass',
dest='password',
help='Password if authentication is required.')
# URL params # URL params
groupWikiOrAPIOrIndex = parser.add_argument_group() groupWikiOrAPIOrIndex = parser.add_argument_group()
groupWikiOrAPIOrIndex.add_argument( groupWikiOrAPIOrIndex.add_argument(
'wiki', default='', nargs='?', help="URL to wiki (e.g. http://wiki.domain.org)") 'wiki',
groupWikiOrAPIOrIndex.add_argument('--api', help="URL to API (e.g. http://wiki.domain.org/w/api.php)") default='',
groupWikiOrAPIOrIndex.add_argument('--index', help="URL to index.php (e.g. http://wiki.domain.org/w/index.php)") nargs='?',
help="URL to wiki (e.g. http://wiki.domain.org)")
groupWikiOrAPIOrIndex.add_argument(
'--api',
help="URL to API (e.g. http://wiki.domain.org/w/api.php)")
groupWikiOrAPIOrIndex.add_argument(
'--index',
help="URL to index.php (e.g. http://wiki.domain.org/w/index.php)")
# Download params # Download params
groupDownload = parser.add_argument_group('Data to download', 'What info download from the wiki') groupDownload = parser.add_argument_group(
'Data to download',
'What info download from the wiki')
groupDownload.add_argument( groupDownload.add_argument(
'--xml', action='store_true', help="generates a full history XML dump (--xml --curonly for current revisions only)") '--xml',
action='store_true',
help="generates a full history XML dump (--xml --curonly for current revisions only)")
groupDownload.add_argument('--curonly', action='store_true', groupDownload.add_argument('--curonly', action='store_true',
help='store only the current version of pages') help='store only the current version of pages')
groupDownload.add_argument( groupDownload.add_argument(
'--images', action='store_true', help="generates an image dump") '--images', action='store_true', help="generates an image dump")
groupDownload.add_argument('--namespaces', metavar="1,2,3", groupDownload.add_argument(
help='comma-separated value of namespaces to include (all by default)') '--namespaces',
groupDownload.add_argument('--exnamespaces', metavar="1,2,3", metavar="1,2,3",
help='comma-separated value of namespaces to exclude') help='comma-separated value of namespaces to include (all by default)')
groupDownload.add_argument(
'--exnamespaces',
metavar="1,2,3",
help='comma-separated value of namespaces to exclude')
# Meta info params # Meta info params
groupMeta = parser.add_argument_group('Meta info', 'What meta info to retrieve from the wiki') groupMeta = parser.add_argument_group(
'Meta info',
'What meta info to retrieve from the wiki')
groupMeta.add_argument( groupMeta.add_argument(
'--get-wiki-engine', action='store_true', help="returns the wiki engine") '--get-wiki-engine',
action='store_true',
help="returns the wiki engine")
args = parser.parse_args() args = parser.parse_args()
# print args # print args
# Don't mix download params and meta info params # Don't mix download params and meta info params
if (args.xml or args.images) and \ if (args.xml or args.images) and \
(args.get_wiki_engine): (args.get_wiki_engine):
print 'ERROR: Don\'t mix download params and meta info params' print 'ERROR: Don\'t mix download params and meta info params'
parser.print_help() parser.print_help()
sys.exit(1) sys.exit(1)
# No download params and no meta info params? Exit # No download params and no meta info params? Exit
if (not args.xml and not args.images) and \ if (not args.xml and not args.images) and \
(not args.get_wiki_engine): (not args.get_wiki_engine):
print 'ERROR: Use at least one download param or meta info param' print 'ERROR: Use at least one download param or meta info param'
parser.print_help() parser.print_help()
sys.exit(1) sys.exit(1)
# Execute meta info params # Execute meta info params
if args.wiki: if args.wiki:
if args.get_wiki_engine: if args.get_wiki_engine:
print getWikiEngine(url=args.wiki) print getWikiEngine(url=args.wiki)
sys.exit() sys.exit()
# Create session # Create session
cj = cookielib.MozillaCookieJar() cj = cookielib.MozillaCookieJar()
if args.cookies: if args.cookies:
@ -1080,7 +1201,7 @@ def getParameters(params=[]):
if args.user and args.password: if args.user and args.password:
session.auth = (args.user, args.password) session.auth = (args.user, args.password)
# session.mount(args.api.split('/api.php')[0], HTTPAdapter(max_retries=max_ret)) # session.mount(args.api.split('/api.php')[0], HTTPAdapter(max_retries=max_ret))
# check URLs # check URLs
for url in [args.api, args.index, args.wiki]: for url in [args.api, args.index, args.wiki]:
if url and (not url.startswith('http://') and not url.startswith('https://')): if url and (not url.startswith('http://') and not url.startswith('https://')):
@ -1088,7 +1209,7 @@ def getParameters(params=[]):
print 'ERROR: URLs must start with http:// or https://\n' print 'ERROR: URLs must start with http:// or https://\n'
parser.print_help() parser.print_help()
sys.exit(1) sys.exit(1)
# Get API and index and verify # Get API and index and verify
api = args.api and args.api or '' api = args.api and args.api or ''
index = args.index and args.index or '' index = args.index and args.index or ''
@@ -1108,11 +1229,11 @@ def getParameters(params=[]):
         pass
     elif index == '':
         index = '/'.join(api.split('/')[:-1]) + '/index.php'
-    #print api
-    #print index
+    # print api
+    # print index
     index2 = None
     check = api and checkAPI(api=api, session=session)
     if check:
         index2 = check[1]
@ -1120,19 +1241,25 @@ def getParameters(params=[]):
else: else:
print 'Error in API, please, provide a correct path to API' print 'Error in API, please, provide a correct path to API'
sys.exit(1) sys.exit(1)
if index and checkIndex(index=index, cookies=args.cookies, session=session): if index and checkIndex(
index=index,
cookies=args.cookies,
session=session):
print 'index.php is OK' print 'index.php is OK'
else: else:
index = index2 index = index2
if index and index.startswith('//'): if index and index.startswith('//'):
index = args.wiki.split('//')[0] + index index = args.wiki.split('//')[0] + index
if index and checkIndex(index=index, cookies=args.cookies, session=session): if index and checkIndex(
print 'index.php is OK' index=index,
cookies=args.cookies,
session=session):
print 'index.php is OK'
else: else:
print 'Error in index.php, please, provide a correct path to index.php' print 'Error in index.php, please, provide a correct path to index.php'
sys.exit(1) sys.exit(1)
# check user and pass (one requires both) # check user and pass (one requires both)
if (args.user and not args.password) or (args.password and not args.user): if (args.user and not args.password) or (args.password and not args.user):
print 'ERROR: Both --user and --pass are required for authentication.' print 'ERROR: Both --user and --pass are required for authentication.'
@@ -1144,7 +1271,9 @@ def getParameters(params=[]):
     # Process namespace inclusions
     if args.namespaces:
         # fix, why - ? and... --namespaces= all with a space works?
-        if re.search(r'[^\d, \-]', args.namespaces) and args.namespaces.lower() != 'all':
+        if re.search(
+                r'[^\d, \-]',
+                args.namespaces) and args.namespaces.lower() != 'all':
             print "Invalid namespace values.\nValid format is integer(s) separated by commas"
             sys.exit()
         else:
@@ -1205,7 +1334,11 @@ def checkAPI(api=None, session=None):
     """ Checking API availability """
     global cj
     r = session.post(
-        url=api, data={'action': 'query', 'meta': 'siteinfo', 'format': 'json'})
+        url=api,
+        data={
+            'action': 'query',
+            'meta': 'siteinfo',
+            'format': 'json'})
     resultText = r.text
     print 'Checking API...', api
     if "MediaWiki API is not enabled for this site." in resultText:
@ -1213,12 +1346,17 @@ def checkAPI(api=None, session=None):
try: try:
result = json.loads(resultText) result = json.loads(resultText)
if 'query' in result: if 'query' in result:
if 'general' in result['query'] and 'script' in result['query']['general'] and 'server' in result['query']['general']: query = result['query']
return (True, result['query']['general']['server']+result['query']['general']['script']) general = result['query']['general']
else: if 'general' in query and 'script' in general and 'server' in general:
return (
True,
result['query']['general']['server'] +
result['query']['general']['script'])
else:
return (True, None) return (True, None)
except ValueError: except ValueError:
return False return False
return False return False
@@ -1228,10 +1366,14 @@ def checkIndex(index=None, cookies=None, session=None):
     raw = r.text
     print 'Checking index.php...', index
     # Workaround for issue 71
-    if re.search(r'(Special:Badtitle</a>|class="permissions-errors"|"wgCanonicalSpecialPageName":"Badtitle"|Login Required</h1>)', raw) and not cookies:
+    if re.search(
+            r'(Special:Badtitle</a>|class="permissions-errors"|"wgCanonicalSpecialPageName":"Badtitle"|Login Required</h1>)',
+            raw) and not cookies:
         print "ERROR: This wiki requires login and we are not authenticated"
         return False
-    if re.search(r'(This wiki is powered by|<h2 id="mw-version-license">|meta name="generator" content="MediaWiki)', raw):
+    if re.search(
+            r'(This wiki is powered by|<h2 id="mw-version-license">|meta name="generator" content="MediaWiki)',
+            raw):
         return True
     return False
@@ -1243,7 +1385,9 @@ def removeIP(raw=''):
     # http://www.juniper.net/techpubs/software/erx/erx50x/swconfig-routing-vol1/html/ipv6-config5.html
     # weird cases as :: are not included
     raw = re.sub(
-        r'(?i)[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}', '0:0:0:0:0:0:0:0', raw)
+        r'(?i)[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}',
+        '0:0:0:0:0:0:0:0',
+        raw)
     return raw
@ -1258,7 +1402,15 @@ def checkXMLIntegrity(config={}, titles=[], session=None):
checkpageclose = 0 checkpageclose = 0
checkrevisionopen = 0 checkrevisionopen = 0
checkrevisionclose = 0 checkrevisionclose = 0
for line in file('%s/%s-%s-%s.xml' % (config['path'], domain2prefix(config=config, session=session), config['date'], config['curonly'] and 'current' or 'history'), 'r').read().splitlines(): for line in file(
'%s/%s-%s-%s.xml' %
(config['path'],
domain2prefix(
config=config,
session=session),
config['date'],
config['curonly'] and 'current' or 'history'),
'r').read().splitlines():
if "<revision>" in line: if "<revision>" in line:
checkrevisionopen += 1 checkrevisionopen += 1
elif "</revision>" in line: elif "</revision>" in line:
@ -1292,11 +1444,18 @@ def createNewDump(config={}, other={}):
titles += getPageTitles(config=config, session=other['session']) titles += getPageTitles(config=config, session=other['session'])
saveTitles(config=config, titles=titles) saveTitles(config=config, titles=titles)
generateXMLDump(config=config, titles=titles, session=other['session']) generateXMLDump(config=config, titles=titles, session=other['session'])
checkXMLIntegrity(config=config, titles=titles, session=other['session']) checkXMLIntegrity(
config=config,
titles=titles,
session=other['session'])
if config['images']: if config['images']:
images += getImageNames(config=config, session=other['session']) images += getImageNames(config=config, session=other['session'])
saveImageNames(config=config, images=images, session=other['session']) saveImageNames(config=config, images=images, session=other['session'])
generateImageDump(config=config, other=other, images=images, session=other['session']) generateImageDump(
config=config,
other=other,
images=images,
session=other['session'])
if config['logs']: if config['logs']:
saveLogs(config=config, session=other['session']) saveLogs(config=config, session=other['session'])
@ -1332,8 +1491,15 @@ def resumePreviousDump(config={}, other={}):
xmliscomplete = False xmliscomplete = False
lastxmltitle = '' lastxmltitle = ''
try: try:
f = open('%s/%s-%s-%s.xml' % (config['path'], domain2prefix(config=config, session=other[ f = open(
'session']), config['date'], config['curonly'] and 'current' or 'history'), 'r') '%s/%s-%s-%s.xml' %
(config['path'],
domain2prefix(
config=config,
session=other['session']),
config['date'],
config['curonly'] and 'current' or 'history'),
'r')
for l in f: for l in f:
if re.findall('</mediawiki>', l): if re.findall('</mediawiki>', l):
# xml dump is complete # xml dump is complete
@ -1355,7 +1521,10 @@ def resumePreviousDump(config={}, other={}):
# resuming... # resuming...
print 'Resuming XML dump from "%s"' % (lastxmltitle) print 'Resuming XML dump from "%s"' % (lastxmltitle)
generateXMLDump( generateXMLDump(
config=config, titles=titles, start=lastxmltitle, session=other['session']) config=config,
titles=titles,
start=lastxmltitle,
session=other['session'])
else: else:
# corrupt? only has XML header? # corrupt? only has XML header?
print 'XML is corrupt? Regenerating...' print 'XML is corrupt? Regenerating...'
@ -1366,8 +1535,13 @@ def resumePreviousDump(config={}, other={}):
# load images # load images
lastimage = '' lastimage = ''
try: try:
f = open('%s/%s-%s-images.txt' % f = open(
(config['path'], domain2prefix(config=config), config['date']), 'r') '%s/%s-%s-images.txt' %
(config['path'],
domain2prefix(
config=config),
config['date']),
'r')
raw = unicode(f.read(), 'utf-8').strip() raw = unicode(f.read(), 'utf-8').strip()
lines = raw.split('\n') lines = raw.split('\n')
for l in lines: for l in lines:
@ -1415,7 +1589,11 @@ def resumePreviousDump(config={}, other={}):
# we resume from previous image, which may be corrupted (or missing # we resume from previous image, which may be corrupted (or missing
# .desc) by the previous session ctrl-c or abort # .desc) by the previous session ctrl-c or abort
generateImageDump( generateImageDump(
config=config, other=other, images=images, start=lastfilename2, session=other['session']) config=config,
other=other,
images=images,
start=lastfilename2,
session=other['session'])
if config['logs']: if config['logs']:
# fix # fix
@ -1461,25 +1639,34 @@ def saveSiteInfo(config={}, session=None):
print 'siteinfo.json exists, do not overwrite' print 'siteinfo.json exists, do not overwrite'
else: else:
print 'Downloading site info as siteinfo.json' print 'Downloading site info as siteinfo.json'
# MediaWiki 1.13+ # MediaWiki 1.13+
r = session.post(url=config['api'], data={ r = session.post(
'action': 'query', url=config['api'],
'meta': 'siteinfo', data={
'siprop': 'general|namespaces|statistics|dbrepllag|interwikimap|namespacealiases|specialpagealiases|usergroups|extensions|skins|magicwords|fileextensions|rightsinfo', 'action': 'query',
'sinumberingroup': 1, 'meta': 'siteinfo',
'format': 'json'}) 'siprop': 'general|namespaces|statistics|dbrepllag|interwikimap|namespacealiases|specialpagealiases|usergroups|extensions|skins|magicwords|fileextensions|rightsinfo',
'sinumberingroup': 1,
'format': 'json'})
# MediaWiki 1.11-1.12 # MediaWiki 1.11-1.12
if not 'query' in json.loads(r.text): if not 'query' in json.loads(r.text):
r = session.post(url=config['api'], data={ r = session.post(
'action': 'query', url=config['api'],
'meta': 'siteinfo', data={
'siprop': 'general|namespaces|statistics|dbrepllag|interwikimap', 'action': 'query',
'format': 'json'}) 'meta': 'siteinfo',
'siprop': 'general|namespaces|statistics|dbrepllag|interwikimap',
'format': 'json'})
# MediaWiki 1.8-1.10 # MediaWiki 1.8-1.10
if not 'query' in json.loads(r.text): if not 'query' in json.loads(r.text):
r = session.post(url=config['api'], data={ r = session.post(
'action': 'query', 'meta': 'siteinfo', 'siprop': 'general|namespaces', 'format': 'json'}) url=config['api'],
data={
'action': 'query',
'meta': 'siteinfo',
'siprop': 'general|namespaces',
'format': 'json'})
result = json.loads(r.text) result = json.loads(r.text)
delay(config=config, session=session) delay(config=config, session=session)
with open('%s/siteinfo.json' % (config['path']), 'w') as outfile: with open('%s/siteinfo.json' % (config['path']), 'w') as outfile:
@ -1490,7 +1677,10 @@ def avoidWikimediaProjects(config={}, other={}):
""" Skip Wikimedia projects and redirect to the dumps website """ """ Skip Wikimedia projects and redirect to the dumps website """
# notice about wikipedia dumps # notice about wikipedia dumps
if re.findall(r'(?i)(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews|wikidata|wikivoyage)\.org', config['api'] + config['index']): if re.findall(
r'(?i)(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews|wikidata|wikivoyage)\.org',
config['api'] +
config['index']):
print 'PLEASE, DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!' print 'PLEASE, DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!'
print 'Download the dumps from http://dumps.wikimedia.org' print 'Download the dumps from http://dumps.wikimedia.org'
if not other['force']: if not other['force']:
@@ -1509,7 +1699,9 @@ def getWikiEngine(url=''):
     result = r.text
     wikiengine = 'Unknown'
-    if re.search(ur'(?im)(<meta name="generator" content="DokuWiki)|dokuwiki__site', result):
+    if re.search(
+            ur'(?im)(<meta name="generator" content="DokuWiki)|dokuwiki__site',
+            result):
         wikiengine = 'DokuWiki'
     elif re.search(ur'(?im)(alt="Powered by MediaWiki"|<meta name="generator" content="MediaWiki)', result):
         wikiengine = 'MediaWiki'
@@ -1536,7 +1728,7 @@ def getWikiEngine(url=''):
     elif re.search(ur'(?im)(Wheeled by <a class="external-link" href="http://www\.wagn\.org">|<body id="wagn">)', result):
         wikiengine = 'Wagn'
     elif re.search(ur'(?im)(<meta name="generator" content="MindTouch)', result):
-        wikiengine = 'MindTouch' # formerly DekiWiki
+        wikiengine = 'MindTouch'  # formerly DekiWiki
     elif re.search(ur'(?im)(<div class="wikiversion">\s*(<p>)?JSPWiki|xmlns:jspwiki="http://www\.jspwiki\.org")', result):
         wikiengine = 'JSPWiki'
     elif re.search(ur'(?im)(Powered by:?\s*(<br ?/>)?\s*<a href="http://kwiki\.org">|\bKwikiNavigation\b)', result):
@@ -1551,7 +1743,7 @@ def getWikiEngine(url=''):
         wikiengine = 'Zwiki'
     # WakkaWiki forks
     elif re.search(ur'(?im)(<meta name="generator" content="WikkaWiki|<a class="ext" href="(http://wikka\.jsnx\.com/|http://wikkawiki\.org/)">)', result):
-        wikiengine = 'WikkaWiki' # formerly WikkaWakkaWiki
+        wikiengine = 'WikkaWiki'  # formerly WikkaWakkaWiki
     elif re.search(ur'(?im)(<meta name="generator" content="CoMa Wiki)', result):
         wikiengine = 'CoMaWiki'
     elif re.search(ur'(?im)(Fonctionne avec <a href="http://www\.wikini\.net)', result):
@@ -1561,7 +1753,8 @@ def getWikiEngine(url=''):
     elif re.search(ur'(?im)(Powered by <a href="http://wackowiki\.com/|title="WackoWiki")', result):
         wikiengine = 'WackoWiki'
     elif re.search(ur'(?im)(Powered by <a href="http://www\.wakkawiki\.com)', result):
-        # This may not work for heavily modded/themed installations, e.g. http://operawiki.info/
+        # This may not work for heavily modded/themed installations, e.g.
+        # http://operawiki.info/
         wikiengine = 'WakkaWiki'
     # Custom wikis used by wiki farms
     elif re.search(ur'(?im)(var wikispaces_page|<div class="WikispacesContent)', result):
@ -1573,36 +1766,42 @@ def getWikiEngine(url=''):
elif re.search(ur'(?im)(<div id="footer-pbwiki">|ws-nav-search|PBinfo *= *{)', result): elif re.search(ur'(?im)(<div id="footer-pbwiki">|ws-nav-search|PBinfo *= *{)', result):
# formerly PBwiki # formerly PBwiki
wikiengine = 'PBworks' wikiengine = 'PBworks'
#if wikiengine == 'Unknown': print result # if wikiengine == 'Unknown': print result
return wikiengine return wikiengine
def mwGetAPIAndIndex(url=''): def mwGetAPIAndIndex(url=''):
""" Returns the MediaWiki API and Index.php """ """ Returns the MediaWiki API and Index.php """
api = '' api = ''
index = '' index = ''
session = requests.Session() session = requests.Session()
session.headers = {'User-Agent': getUserAgent()} session.headers = {'User-Agent': getUserAgent()}
r = session.post(url=url) r = session.post(url=url)
result = r.text result = r.text
# API # API
m = re.findall(ur'(?im)<\s*link\s*rel="EditURI"\s*type="application/rsd\+xml"\s*href="([^>]+?)\?action=rsd"\s*/\s*>', result) m = re.findall(
ur'(?im)<\s*link\s*rel="EditURI"\s*type="application/rsd\+xml"\s*href="([^>]+?)\?action=rsd"\s*/\s*>',
result)
if m: if m:
api = m[0] api = m[0]
if api.startswith('//'): # gentoo wiki if api.startswith('//'): # gentoo wiki
api = url.split('//')[0] + api api = url.split('//')[0] + api
else: else:
pass # build API using index and check it pass # build API using index and check it
# Index.php # Index.php
m = re.findall(ur'<li id="ca-viewsource"[^>]*?>\s*(?:<span>)?\s*<a href="([^\?]+?)\?', result) m = re.findall(
ur'<li id="ca-viewsource"[^>]*?>\s*(?:<span>)?\s*<a href="([^\?]+?)\?',
result)
if m: if m:
index = m[0] index = m[0]
else: else:
m = re.findall(ur'<li id="ca-history"[^>]*?>\s*(?:<span>)?\s*<a href="([^\?]+?)\?', result) m = re.findall(
ur'<li id="ca-history"[^>]*?>\s*(?:<span>)?\s*<a href="([^\?]+?)\?',
result)
if m: if m:
index = m[0] index = m[0]
if index: if index:
@ -1610,13 +1809,19 @@ def mwGetAPIAndIndex(url=''):
index = '/'.join(api.split('/')[:-1]) + '/' + index.split('/')[-1] index = '/'.join(api.split('/')[:-1]) + '/' + index.split('/')[-1]
else: else:
if api: if api:
if len(re.findall(ur'/index\.php5\?', result)) > len(re.findall(ur'/index\.php\?', result)): if len(
re.findall(
ur'/index\.php5\?',
result)) > len(
re.findall(
ur'/index\.php\?',
result)):
index = '/'.join(api.split('/')[:-1]) + '/index.php5' index = '/'.join(api.split('/')[:-1]) + '/index.php5'
else: else:
index = '/'.join(api.split('/')[:-1]) + '/index.php' index = '/'.join(api.split('/')[:-1]) + '/index.php'
return api, index return api, index
def main(params=[]): def main(params=[]):
""" Main function """ """ Main function """
@@ -1637,8 +1842,11 @@ def main(params=[]):
         print '\nWarning!: "%s" path exists' % (config['path'])
         reply = ''
         while reply.lower() not in ['yes', 'y', 'no', 'n']:
-            reply = raw_input('There is a dump in "%s", probably incomplete.\nIf you choose resume, to avoid conflicts, the parameters you have chosen in the current session will be ignored\nand the parameters available in "%s/%s" will be loaded.\nDo you want to resume ([yes, y], [no, n])? ' % (
-                config['path'], config['path'], configfilename))
+            reply = raw_input(
+                'There is a dump in "%s", probably incomplete.\nIf you choose resume, to avoid conflicts, the parameters you have chosen in the current session will be ignored\nand the parameters available in "%s/%s" will be loaded.\nDo you want to resume ([yes, y], [no, n])? ' %
+                (config['path'],
+                 config['path'],
+                 configfilename))
         if reply.lower() in ['yes', 'y']:
             if not os.path.isfile('%s/%s' % (config['path'], configfilename)):
                 print 'No config file found. I can\'t resume. Aborting.'
