diff --git a/dumpgenerator.py b/dumpgenerator.py index cd6a58e..ef3ba66 100644 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -1,1291 +1,1292 @@ -#!/usr/bin/env python2 # -*- coding: utf-8 -*- -# dumpgenerator.py A generator of dumps for wikis -# Copyright (C) 2011-2014 WikiTeam developers +# Copyright (C) 2013 Hydriz Scholz +# Copyright (C) 2014 WikiTeam +# # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. -# +# # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . +# +# You should have received a copy of the GNU General Public License along +# with this program. If not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA, or visit +# + +####################################################################### +# dumpgenerator.py is a script to generate backups of MediaWiki wikis # +# To learn more, read the documentation: # +# http://code.google.com/p/wikiteam/wiki/NewTutorial # +####################################################################### + +# For developers: +# * All functions and classes are displayed in alphabetical order for easier accessibility. +# * Script exit codes reference: +# * 0 - Script ran well without problems +# * 1 - Script failed due to user's incorrect use +# * 2 - Script failed due to destination server issue +# * For testing purposes, add the --debug parameter and edit DumpGenerator.debug() accordingly. + +###### +# TODO LIST +# 0. Download index.html and Special:Version.html +# 1. Index.php support. +# 2. Special:Log pages support +# 3. GUI (Question and Answer if no parameters are given) +# 4. Resuming of dump +# 5. Place the images in various folders so as to avoid hitting the limit of number of files in a directory +# 6. Speed up the script. A run with --xml --images on test.wikidata.org came up with 9 min 23 sec on 2.0 and 3 min 58 sec on 1.0 + +# WHAT IS WORKING +# 1. XML dumping +# 2. Complete dumping using API (except for --logs) +# 3. Automatic updating +# 4. Dumping of XML based on a list of titles +# 5. Integrity check for XML dump -# To learn more, read the documentation: -# https://github.com/WikiTeam/wikiteam/wiki - -import cookielib -import cPickle import datetime -import sys -try: - import argparse -except ImportError: - print "Please install the argparse module." - sys.exit(1) +import getopt +import hashlib import json -try: - from hashlib import md5 -except ImportError: # Python 2.4 compatibility - from md5 import new as md5 import os import re -try: - import requests -except ImportError: - print "Please install or update the Requests module." - sys.exit(1) -import subprocess +import shutil +import sys import time import urllib - -__VERSION__ = '0.2.2' #major, minor, micro - -def getVersion(): - return(__VERSION__) - - -def truncateFilename(other={}, filename=''): - """ Truncate filenames when downloading images with large filenames """ - return filename[:other['filenamelimit']] + md5(filename).hexdigest() + '.' 
+ filename.split('.')[-1] - -def delay(config={}, session=None): - """ Add a delay if configured for that """ - if config['delay'] > 0: - print 'Sleeping... %d seconds...' % (config['delay']) - time.sleep(config['delay']) - -def cleanHTML(raw=''): - """ Extract only the real wiki content and remove rubbish """ - """ This function is ONLY used to retrieve page titles and file names when no API is available """ - """ DO NOT use this function to extract page content """ - #different "tags" used by different MediaWiki versions to mark where starts and ends content - if re.search('', raw): - raw = raw.split('')[1].split('')[0] - elif re.search('', raw): - raw = raw.split('')[1].split('')[0] - elif re.search('', raw): - raw = raw.split('')[1].split('')[0] - elif re.search('', raw): - raw = raw.split('')[1].split('')[0] - elif re.search('
', raw): - raw = raw.split('
')[1].split('
')[0] - else: - print raw[:250] - print 'This wiki doesn\'t use marks to split content' - sys.exit() - return raw - -def handleStatusCode(response): - statuscode = response.status_code - if statuscode >= 200 and statuscode < 300: - return - - print "HTTP Error %d." % statuscode - if statuscode >= 300 and statuscode < 400: - print "Redirect should happen automatically: please report this as a bug." - print response.url - - elif statuscode == 400: - print "Bad Request: The wiki may be malfunctioning." - print "Please try again later." - print response.url - sys.exit(1) - - elif statuscode == 401 or statuscode == 403: - print "Authentication required." - print "Please use --userpass." - print response.url - - elif statuscode == 404: - print "Not found. Is Special:Export enabled for this wiki?" - print response.url - sys.exit(1) - - elif statuscode == 429 or (statuscode >= 500 and statuscode < 600): - print "Server error, max retries exceeded." - print "Please resume the dump later." - print response.url - sys.exit(1) - -def getNamespacesScraper(config={}, session=None): - """ Hackishly gets the list of namespaces names and ids from the dropdown in the HTML of Special:AllPages """ - """ Function called if no API is available """ - namespaces = config['namespaces'] - namespacenames = {0:''} # main is 0, no prefix - if namespaces: - r = session.post(url=config['index'], data={'title': 'Special:Allpages'}) - raw = r.text - delay(config=config, session=session) - - m = re.compile(r'').finditer(raw) # [^>]*? to include selected="selected" - if 'all' in namespaces: - namespaces = [] - for i in m: - namespaces.append(int(i.group("namespaceid"))) - namespacenames[int(i.group("namespaceid"))] = i.group("namespacename") - else: - #check if those namespaces really exist in this wiki - namespaces2 = [] - for i in m: - if int(i.group("namespaceid")) in namespaces: - namespaces2.append(int(i.group("namespaceid"))) - namespacenames[int(i.group("namespaceid"))] = i.group("namespacename") - namespaces = namespaces2 - else: - namespaces = [0] - - namespaces = list(set(namespaces)) #uniques - print '%d namespaces found' % (len(namespaces)) - return namespaces, namespacenames - -def getNamespacesAPI(config={}, session=None): - """ Uses the API to get the list of namespaces names and ids """ - namespaces = config['namespaces'] - namespacenames = {0:''} # main is 0, no prefix - if namespaces: - r = session.post(url=config['api'], data={'action': 'query', 'meta': 'siteinfo', 'siprop': 'namespaces', 'format': 'json'}) - result = json.loads(r.text) - delay(config=config, session=session) - - if 'all' in namespaces: - namespaces = [] - for i in result['query']['namespaces'].keys(): - if int(i) < 0: # -1: Special, -2: Media, excluding - continue - namespaces.append(int(i)) - namespacenames[int(i)] = result['query']['namespaces'][i]['*'] - else: - #check if those namespaces really exist in this wiki - namespaces2 = [] - for i in result['query']['namespaces'].keys(): - if int(i) < 0: # -1: Special, -2: Media, excluding - continue - if int(i) in namespaces: - namespaces2.append(int(i)) - namespacenames[int(i)] = result['query']['namespaces'][i]['*'] - namespaces = namespaces2 - else: - namespaces = [0] - - namespaces = list(set(namespaces)) #uniques - print '%d namespaces found' % (len(namespaces)) - return namespaces, namespacenames - -def getPageTitlesAPI(config={}, session=None): - """ Uses the API to get the list of page titles """ - titles = [] - namespaces, namespacenames = getNamespacesAPI(config=config, 
session=session) - for namespace in namespaces: - if namespace in config['exnamespaces']: - print ' Skipping namespace = %d' % (namespace) - continue - - c = 0 - print ' Retrieving titles in the namespace %d' % (namespace) - apfrom = '!' - while apfrom: - sys.stderr.write('.') #progress - params = {'action': 'query', 'list': 'allpages', 'apnamespace': namespace, 'apfrom': apfrom.encode('utf-8'), 'format': 'json', 'aplimit': 500} - r = session.post(url=config['api'], data=params) - handleStatusCode(r) - #FIXME Handle HTTP errors here! - jsontitles = json.loads(r.text) - apfrom = '' - if jsontitles.has_key('query-continue') and jsontitles['query-continue'].has_key('allpages'): - if jsontitles['query-continue']['allpages'].has_key('apcontinue'): - apfrom = jsontitles['query-continue']['allpages']['apcontinue'] - elif jsontitles['query-continue']['allpages'].has_key('apfrom'): - apfrom = jsontitles['query-continue']['allpages']['apfrom'] - #print apfrom - #print jsontitles - titles += [page['title'] for page in jsontitles['query']['allpages']] - if len(titles) != len(set(titles)): - #probably we are in a loop, server returning dupe titles, stop it - print 'Probably a loop, finishing' - titles = list(set(titles)) - apfrom = '' - c += len(jsontitles['query']['allpages']) - delay(config=config, session=session) - print ' %d titles retrieved in the namespace %d' % (c, namespace) - return titles - -def getPageTitlesScraper(config={}, session=None): - """ """ - titles = [] - namespaces, namespacenames = getNamespacesScraper(config=config, session=session) - for namespace in namespaces: - print ' Retrieving titles in the namespace', namespace - url = '%s?title=Special:Allpages&namespace=%s' % (config['index'], namespace) - r = session.get(url=url) - raw = r.text - raw = cleanHTML(raw) - - r_title = r'title="(?P[^>]+)">' - r_suballpages = '' - r_suballpages1 = r'&from=(?P<from>[^>]+)&to=(?P<to>[^>]+)">' - r_suballpages2 = r'Special:Allpages/(?P<from>[^>]+)">' - if re.search(r_suballpages1, raw): - r_suballpages = r_suballpages1 - elif re.search(r_suballpages2, raw): - r_suballpages = r_suballpages2 - else: - pass #perhaps no subpages - - deep = 3 # 3 is the current deep of English Wikipedia for Special:Allpages, 3 levels - c = 0 - checked_suballpages = [] - rawacum = raw - while r_suballpages and re.search(r_suballpages, raw) and c < deep: - #load sub-Allpages - m = re.compile(r_suballpages).finditer(raw) - for i in m: - fr = i.group('from') - - if r_suballpages == r_suballpages1: - to = i.group('to') - name = '%s-%s' % (fr, to) - url = '%s?title=Special:Allpages&namespace=%s&from=%s&to=%s' % (config['index'], namespace, fr, to) #do not put urllib.quote in fr or to - elif r_suballpages == r_suballpages2: #fix, esta regexp no carga bien todas? o falla el r_title en este tipo de subpag? 
(wikiindex) - fr = fr.split('&namespace=')[0] #clean &namespace=\d, sometimes happens - name = fr - url = '%s?title=Special:Allpages/%s&namespace=%s' % (config['index'], name, namespace) - - if not name in checked_suballpages: - checked_suballpages.append(name) #to avoid reload dupe subpages links - delay(config=config, session=session) - r2 = session.get(url=url) - raw2 = r2.text - raw2 = cleanHTML(raw2) - rawacum += raw2 #merge it after removed junk - print ' Reading', name, len(raw2), 'bytes', len(re.findall(r_suballpages, raw2)), 'subpages', len(re.findall(r_title, raw2)), 'pages' - - delay(config=config, session=session) - c += 1 - - c = 0 - m = re.compile(r_title).finditer(rawacum) - for i in m: - t = undoHTMLEntities(text=i.group('title')) - if not t.startswith('Special:'): - if not t in titles: - titles.append(t) - c += 1 - print ' %d titles retrieved in the namespace %d' % (c, namespace) - return titles - -def getPageTitles(config={}, session=None): - """ Get list of page titles """ - #http://en.wikipedia.org/wiki/Special:AllPages - #http://archiveteam.org/index.php?title=Special:AllPages - #http://www.wikanda.es/wiki/Especial:Todas - print 'Loading page titles from namespaces = %s' % (config['namespaces'] and ','.join([str(i) for i in config['namespaces']]) or 'None') - print 'Excluding titles from namespaces = %s' % (config['exnamespaces'] and ','.join([str(i) for i in config['exnamespaces']]) or 'None') - - titles = [] - if config['api']: - titles = getPageTitlesAPI(config=config, session=session) - elif config['index']: - titles = getPageTitlesScraper(config=config, session=session) - - titles = list(set(titles)) #removing dupes (e.g. in CZ appears Widget:AddThis two times (main namespace and widget namespace)) - titles.sort() #sorting - - print '%d page titles loaded' % (len(titles)) - return titles - -def getXMLHeader(config={}, session=None): - """ Retrieve a random page to extract XML headers (namespace info, etc) """ - #get the header of a random page, to attach it in the complete XML backup - #similar to: <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/" xmlns:x.... - randomtitle = 'Main_Page' #previously AMF5LKE43MNFGHKSDMRTJ - xml = getXMLPage(config=config, title=randomtitle, verbose=False, session=session) - header = xml.split('</mediawiki>')[0] - if not xml: - print 'XML export on this wiki is broken, quitting.' 
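# --- Illustrative aside (not part of the patch) --------------------------------
# A minimal sketch of the continuation loop that the removed getPageTitlesAPI()
# uses to page through list=allpages. It assumes only the 'requests' module and a
# placeholder API URL; parameter names mirror the code above, using the older
# 'query-continue' format and falling back from 'apcontinue' to 'apfrom'.
import requests

def list_all_titles(api_url, namespace=0, session=None):
    session = session or requests.Session()
    titles = []
    apfrom = '!'
    while apfrom:
        params = {'action': 'query', 'list': 'allpages', 'apnamespace': namespace,
                  'apfrom': apfrom, 'format': 'json', 'aplimit': 500}
        data = session.post(api_url, data=params).json()
        titles += [page['title'] for page in data['query']['allpages']]
        cont = data.get('query-continue', {}).get('allpages', {})
        apfrom = cont.get('apcontinue') or cont.get('apfrom') or ''
    return list(set(titles))
# --------------------------------------------------------------------------------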
- sys.exit() - return header - -def getXMLFileDesc(config={}, title='', session=None): - """ Get XML for image description page """ - config['curonly'] = 1 #tricky to get only the most recent desc - return getXMLPage(config=config, title=title, verbose=False, session=session) - -def getUserAgent(): - """ Return a cool user-agent to hide Python user-agent """ - useragents = [ - #firefox - 'Mozilla/5.0 (X11; Linux i686; rv:24.0) Gecko/20100101 Firefox/24.0', - 'Mozilla/5.0 (X11; Linux x86_64; rv:28.0) Gecko/20100101 Firefox/28.0', - ] - return useragents[0] - -def logerror(config={}, text=''): - """ Log error in file """ - if text: - with open('%s/errors.log' % (config['path']), 'a') as outfile: - output = u'%s: %s\n' % (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), text) - outfile.write(output.encode('utf-8')) - -def getXMLPageCore(headers={}, params={}, config={}, session=None): - """ """ - #returns a XML containing params['limit'] revisions (or current only), ending in </mediawiki> - #if retrieving params['limit'] revisions fails, returns a current only version - #if all fail, returns the empty string - xml = '' - c = 0 - maxseconds = 100 #max seconds to wait in a single sleeping - maxretries = 5 # x retries and skip - increment = 20 #increment every retry - while not re.search(r'</mediawiki>', xml): - if c > 0 and c < maxretries: - wait = increment * c < maxseconds and increment * c or maxseconds # incremental until maxseconds - print ' XML for "%s" is wrong. Waiting %d seconds and reloading...' % (params['pages'], wait) - time.sleep(wait) - if params['limit'] > 1: # reducing server load requesting smallest chunks (if curonly then limit = 1 from mother function) - params['limit'] = params['limit'] / 2 # half - if c >= maxretries: - print ' We have retried %d times' % (c) - print ' MediaWiki error for "%s", network error or whatever...' % (params['pages']) - # If it's not already what we tried: our last chance, preserve only the last revision... - # config['curonly'] means that the whole dump is configured to save nonly the last - # params['curonly'] should mean that we've already tried this fallback, because it's set by the following if and passed to getXMLPageCore - if not config['curonly']: - print ' Trying to save only the last revision for this page...' - params['curonly'] = 1 - logerror(config=config, text='Error while retrieving the full history of "%s". Trying to save only the last revision for this page' % (params['pages'])) - return getXMLPageCore(headers=headers, params=params, config=config) - else: - print ' Saving in the errors log, and skipping...' - logerror(config=config, text='Error while retrieving the last revision of "%s". Skipping.' 
% (params['pages'])) - return '' # empty xml - #FIXME HANDLE HTTP Errors HERE - r = session.post(url=config['index'], data=params, headers=headers) - handleStatusCode(r) - xml = r.text - c += 1 - - return xml - -def getXMLPage(config={}, title='', verbose=True, session=None): - """ Get the full history (or current only) of a page """ - - #if server errors occurs while retrieving the full page history, it may return [oldest OK versions] + last version, excluding middle revisions, so it would be partialy truncated - #http://www.mediawiki.org/wiki/Manual_talk:Parameters_to_Special:Export#Parameters_no_longer_in_use.3F - - limit = 1000 - truncated = False - title_ = title - title_ = re.sub(' ', '_', title_) - #do not convert & into %26, title_ = re.sub('&', '%26', title_) - params = {'title': 'Special:Export', 'pages': title_, 'action': 'submit'} - if config['curonly']: - params['curonly'] = 1 - params['limit'] = 1 - else: - params['offset'] = '1' # 1 always < 2000s - params['limit'] = limit - if config.has_key('templates') and config['templates']: #in other case, do not set params['templates'] - params['templates'] = 1 - - xml = getXMLPageCore(params=params, config=config, session=session) - - #if complete history, check if this page history has > limit edits, if so, retrieve all using offset if available - #else, warning about Special:Export truncating large page histories - r_timestamp = r'<timestamp>([^<]+)</timestamp>' - if not config['curonly'] and re.search(r_timestamp, xml): # search for timestamps in xml to avoid analysing empty pages like Special:Allpages and the random one - while not truncated and params['offset']: #next chunk - params['offset'] = re.findall(r_timestamp, xml)[-1] #get the last timestamp from the acum XML - xml2 = getXMLPageCore(params=params, config=config, session=session) - - if re.findall(r_timestamp, xml2): #are there more edits in this next XML chunk or no <page></page>? - if re.findall(r_timestamp, xml2)[-1] == params['offset']: - #again the same XML, this wiki does not support params in Special:Export, offer complete XML up to X edits (usually 1000) - print 'ATTENTION: This wiki does not allow some parameters in Special:Export, therefore pages with large histories may be truncated' - truncated = True - break - else: - """ </namespaces> - </siteinfo> - <page> - <title>Main Page - 15580374 - edit=sysop:move=sysop (?) 
- - 418009832 - 2011-03-09T19:57:06Z - - """ - #offset is OK in this wiki, merge with the previous chunk of this page history and continue - xml = xml.split('')[0] + ' ' + (''.join(xml2.split('')[1:])) - else: - params['offset'] = '' #no more edits in this page history - - if verbose: - numberofedits = len(re.findall(r_timestamp, xml)) - if (numberofedits == 1): - print ' %s, 1 edit' % (title) - else: - print ' %s, %d edits' % (title, numberofedits) - - return xml - -def cleanXML(xml=''): - """ Trim redundant info """ - #do not touch XML codification, leave AS IS - if re.search(r'\n', xml) and re.search(r'', xml): - xml = xml.split('\n')[1] - xml = xml.split('')[0] - return xml - -def generateXMLDump(config={}, titles=[], start='', session=None): - """ Generates a XML dump for a list of titles """ - - print 'Retrieving the XML for every page from "%s"' % (start and start or 'start') - header = getXMLHeader(config=config, session=session) - footer = '\n' #new line at the end - xmlfilename = '%s-%s-%s.xml' % (domain2prefix(config=config), config['date'], config['curonly'] and 'current' or 'history') - xmlfile = '' - lock = True - if start: - #remove the last chunk of xml dump (it is probably incomplete) - xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'r') - xmlfile2 = open('%s/%s2' % (config['path'], xmlfilename), 'w') - prev = '' - c = 0 - for l in xmlfile: - #removing \n until end of file - if c != 0: #lock to avoid write an empty line at the begining of file - if not re.search(r'%s' % (start), l): - xmlfile2.write(prev) - else: - break - c += 1 - prev = l - xmlfile.close() - xmlfile2.close() - #subst xml with xml2 - os.remove('%s/%s' % (config['path'], xmlfilename)) #remove previous xml dump - os.rename('%s/%s2' % (config['path'], xmlfilename), '%s/%s' % (config['path'], xmlfilename)) #move correctly truncated dump to its real name - else: - #requested complete xml dump - lock = False - xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w') - xmlfile.write(header.encode('utf-8')) - xmlfile.close() - - xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a') - c = 1 - for title in titles: - if not title.strip(): - continue - if title == start: #start downloading from start, included - lock = False - if lock: - continue - delay(config=config, session=session) - if c % 10 == 0: - print 'Downloaded %d pages' % (c) - xml = getXMLPage(config=config, title=title, session=session) - xml = cleanXML(xml=xml) - if not xml: - logerror(config=config, text=u'The page "%s" was missing in the wiki (probably deleted)' % (title)) - #here, XML is a correct chunk or - #an empty string due to a deleted page (logged in errors log) or - #an empty string due to an error while retrieving the page from server (logged in errors log) - xmlfile.write(xml.encode('utf-8')) - c += 1 - xmlfile.write(footer) - xmlfile.close() - print 'XML dump saved at...', xmlfilename - -def saveTitles(config={}, titles=[]): - """ Save title list in a file """ - - titlesfilename = '%s-%s-titles.txt' % (domain2prefix(config=config), config['date']) - titlesfile = open('%s/%s' % (config['path'], titlesfilename), 'w') - output = u"%s\n--END--" % ('\n'.join(titles)) - titlesfile.write(output.encode('utf-8')) - titlesfile.close() - - print 'Titles saved at...', titlesfilename - -def saveImageFilenamesURL(config={}, images=[], session=None): - """ Save image list in a file, including filename, url and uploader """ - - imagesfilename = '%s-%s-images.txt' % (domain2prefix(config=config), config['date']) - imagesfile = 
open('%s/%s' % (config['path'], imagesfilename), 'w') - imagesfile.write(('\n'.join(['%s\t%s\t%s' % (filename, url, uploader) for filename, url, uploader in images]).encode('utf-8'))) - imagesfile.write('\n--END--') - imagesfile.close() - - print 'Image filenames and URLs saved at...', imagesfilename - -def getImageFilenamesURL(config={}, session=None): - """ Retrieve file list: filename, url, uploader """ - - print 'Retrieving image filenames' - r_next = r'(?\d+)&' # (? 10: - print 'Error: listing %d images in a chunk is not possible, trying tiny chunks' % (limit) - limit = limit/10 - continue - elif retries > 0: # waste retries, then exit - retries -= 1 - print 'Retrying...' - continue - else: - print 'No more retries, exit...' - break - - raw = cleanHTML(raw) - #archiveteam 1.15.1 Yahoovideo.jpg (file) - #wikanda 1.15.5 Fernandocg - r_images1 = r'(?im)]+title="[^:>]+:(?P[^>]+)">[^<]+[^<]+[^<]+[^<]+\s*]+>(?P[^<]+)' - #wikijuegos 1.9.5 http://softwarelibre.uca.es/wikijuegos/Especial:Imagelist old mediawiki version - r_images2 = r'(?im)]+title="[^:>]+:(?P[^>]+)">[^<]+[^<]+[^<]+\s*[^<]+\s*[^<]+\s*]+>(?P[^<]+)' - #gentoowiki 1.18 18:15, 3 April 2011Asus eeepc-1201nl.png (file)37 KBYannails 1 - r_images3 = r'(?im)]+title="[^:>]+:(?P[^>]+)">[^<]+[^<]+[^<]+[^<]+]+>]+>[^<]+]+>(?P[^<]+)' - #http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch= - #(desc) 109 0923.JPG . . 885,713 bytes . . Bfalconer . . 18:44, 17 November 2005
- r_images4 = r'(?im)]+ title="[^:>]+:(?P[^>]+)">[^<]+[^<]+[^<]+[^<]+]+>(?P[^<]+)' - m = [] - #different mediawiki versions - if re.search(r_images1, raw): - m = re.compile(r_images1).finditer(raw) - elif re.search(r_images2, raw): - m = re.compile(r_images2).finditer(raw) - elif re.search(r_images3, raw): - m = re.compile(r_images3).finditer(raw) - elif re.search(r_images4, raw): - m = re.compile(r_images4).finditer(raw) - - for i in m: - url = i.group('url') - if url[0] == '/' or (not url.startswith('http://') and not url.startswith('https://')): #is it a relative URL? - if url[0] == '/': #slash is added later - url = url[1:] - domainalone = config['index'].split('://')[1].split('/')[0] #remove from :// (http or https) until the first / after domain - url = u'%s://%s/%s' % (config['index'].split('://')[0], domainalone, url) # concat http(s) + domain + relative url - url = undoHTMLEntities(text=url) - #url = urllib.unquote(url) #do not use unquote with url, it break some urls with odd chars - url = re.sub(' ', '_', url) - filename = re.sub('_', ' ', i.group('filename')) - filename = undoHTMLEntities(text=filename) - filename = urllib.unquote(filename) - uploader = re.sub('_', ' ', i.group('uploader')) - uploader = undoHTMLEntities(text=uploader) - uploader = urllib.unquote(uploader) - images.append([filename, url, uploader]) - #print filename, url - - if re.search(r_next, raw): - offset = re.findall(r_next, raw)[0] - retries += 5 # add more retries if we got a page with offset - else: - offset = '' - - if (len(images) == 1): - print ' Found 1 image' - else: - print ' Found %d images' % (len(images)) - - images.sort() - return images - -def getImageFilenamesURLAPI(config={}, session=None): - """ Retrieve file list: filename, url, uploader """ - - print 'Retrieving image filenames' - aifrom = '!' - images = [] - while aifrom: - sys.stderr.write('.') #progress - params = {'action': 'query', 'list': 'allimages', 'aiprop': 'url|user', 'aifrom': aifrom, 'format': 'json', 'ailimit': 500} - #FIXME Handle HTTP Errors HERE - r = session.post(url=config['api'], data=params) - handleStatusCode(r) - jsonimages = json.loads(r.text) - delay(config=config, session=session) - aifrom = '' - if jsonimages.has_key('query-continue') and jsonimages['query-continue'].has_key('allimages'): - if jsonimages['query-continue']['allimages'].has_key('aicontinue'): - aifrom = jsonimages['query-continue']['allimages']['aicontinue'] - elif jsonimages['query-continue']['allimages'].has_key('aifrom'): - aifrom = jsonimages['query-continue']['allimages']['aifrom'] - #print aifrom - - for image in jsonimages['query']['allimages']: - url = image['url'] - if url[0] == '/' or (not url.startswith('http://') and not url.startswith('https://')): #is it a relative URL? 
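# --- Illustrative aside (not part of the patch) --------------------------------
# A sketch of the relative-URL fix-up performed in both image listers above:
# anything that is not an absolute http(s) URL gets the scheme and host taken
# from config['index'] prepended. 'index_url' is a stand-in for that setting.
def absolutize(url, index_url):
    if url.startswith('http://') or url.startswith('https://'):
        return url
    scheme, rest = index_url.split('://', 1)
    host = rest.split('/', 1)[0]
    return '%s://%s/%s' % (scheme, host, url.lstrip('/'))

# absolutize('/images/a/ab/Example.png', 'http://wiki.example.org/index.php')
# -> 'http://wiki.example.org/images/a/ab/Example.png'
# --------------------------------------------------------------------------------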
- if url[0] == '/': #slash is added later - url = url[1:] - domainalone = config['index'].split('://')[1].split('/')[0] #remove from :// (http or https) until the first / after domain - url = u'%s://%s/%s' % (config['index'].split('://')[0], domainalone, url) # concat http(s) + domain + relative url - url = re.sub(' ', '_', url) - # encoding to ascii is needed to work around this horrible bug: http://bugs.python.org/issue8136 - filename = unicode(urllib.unquote((re.sub('_', ' ', url.split('/')[-1])).encode('ascii','ignore')), 'utf-8') - uploader = re.sub('_', ' ', image['user']) - images.append([filename, url, uploader]) - - if (len(images) == 1): - print ' Found 1 image' - else: - print ' Found %d images' % (len(images)) - - images.sort() - return images - -def undoHTMLEntities(text=''): - """ Undo some HTML codes """ - - text = re.sub('<', '<', text) # i guess only < > & " ' need conversion http://www.w3schools.com/html/html_entities.asp - text = re.sub('>', '>', text) - text = re.sub('&', '&', text) - text = re.sub('"', '"', text) - text = re.sub(''', '\'', text) - - return text - -def generateImageDump(config={}, other={}, images=[], start='', session=None): - """ Save files and descriptions using a file list """ - - #fix use subdirectories md5 - print 'Retrieving images from "%s"' % (start and start or 'start') - imagepath = '%s/images' % (config['path']) - if not os.path.isdir(imagepath): - print 'Creating "%s" directory' % (imagepath) - os.makedirs(imagepath) - - c = 0 - lock = True - if not start: - lock = False - for filename, url, uploader in images: - if filename == start: #start downloading from start (included) - lock = False - if lock: - continue - delay(config=config, session=session) - - #saving file - #truncate filename if length > 100 (100 + 32 (md5) = 132 < 143 (crash limit). Later .desc is added to filename, so better 100 as max) - filename2 = urllib.unquote(filename) - if len(filename2) > other['filenamelimit']: - # split last . (extension) and then merge - filename2 = truncateFilename(other=other, filename=filename2) - print 'Filename is too long, truncating. Now it is:', filename2 - filename3 = u'%s/%s' % (imagepath, filename2) - imagefile = open(filename3, 'wb') - r = requests.get(url=url) - imagefile.write(r.content) - imagefile.close() - #saving description if any - xmlfiledesc = getXMLFileDesc(config=config, title=u'Image:%s' % (filename), session=session) # use Image: for backwards compatibility - f = open('%s/%s.desc' % (imagepath, filename2), 'w') - if not re.search(r'', xmlfiledesc): #Banner featuring SG1, SGA, SGU teams - #failure when retrieving desc? then save it as empty .desc - xmlfiledesc = '' - f.write(xmlfiledesc.encode('utf-8')) - f.close() - delay(config=config, session=session) - c += 1 - if c % 10 == 0: - print ' Downloaded %d images' % (c) - - print 'Downloaded %d images' % (c) - -def saveLogs(config={}, session=None): - """ Save Special:Log """ - #get all logs from Special:Log - """parse - - """ - delay(config=config, session=session) - -def domain2prefix(config={}, session=None): - """ Convert domain name to a valid prefix filename. 
""" - - # At this point, both api and index are supposed to be defined - domain = '' - if config['api']: - domain = config['api'] - elif config['index']: - domain = config['index'] - - domain = domain.lower() - domain = re.sub(r'(https?://|www\.|/index\.php|/api\.php)', '', domain) - domain = re.sub(r'/', '_', domain) - domain = re.sub(r'\.', '', domain) - domain = re.sub(r'[^A-Za-z0-9]', '_', domain) - - return domain - -def loadConfig(config={}, configfilename=''): - """ Load config file """ - - try: - with open('%s/%s' % (config['path'], configfilename), 'r') as infile: - config = cPickle.load(infile) - except: - print 'There is no config file. we can\'t resume. Start a new dump.' - sys.exit() - - return config - -def saveConfig(config={}, configfilename=''): - """ Save config file """ - - with open('%s/%s' % (config['path'], configfilename), 'w') as outfile: - cPickle.dump(config, outfile) - -def welcome(): - message = '' - """ Opening message """ - message += "#"*73 - message += """ -# Welcome to DumpGenerator %s by WikiTeam (GPL v3) # -# More info at: https://github.com/WikiTeam/wikiteam #""" % (getVersion()) - message += "\n" - message += "#"*73 - message += "\n" - message += '' - message += "\n" - message += "#"*73 - message += """ -# Copyright (C) 2011-2014 WikiTeam # -# This program is free software: you can redistribute it and/or modify # -# it under the terms of the GNU General Public License as published by # -# the Free Software Foundation, either version 3 of the License, or # -# (at your option) any later version. # -# # -# This program is distributed in the hope that it will be useful, # -# but WITHOUT ANY WARRANTY; without even the implied warranty of # -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # -# GNU General Public License for more details. # -# # -# You should have received a copy of the GNU General Public License # -# along with this program. If not, see . #""" - message += "\n" - message += "#"*73 - message += "\n" - message += '' - - return message - -def bye(): - """ Closing message """ - print "---> Congratulations! Your dump is complete <---" - print "If you found any bug, report a new issue here: https://github.com/WikiTeam/wikiteam/issues" - print "If this is a public wiki, please, consider publishing this dump. Do it yourself as explained in https://github.com/WikiTeam/wikiteam/wiki/New-Tutorial#Publishing_the_dump or contact us at https://github.com/WikiTeam/wikiteam" - print "Good luck! Bye!" 
- - -def getParameters(params=[]): - if not params: - params = sys.argv - - parser = argparse.ArgumentParser(description='') - - parser.add_argument('-v', '--version', action='version', version=getVersion()) - parser.add_argument('--cookies', metavar="cookies.txt", help="path to a cookies.txt file") - parser.add_argument('--delay', metavar=5, default=0, help="adds a delay (in seconds)") - parser.add_argument('--retries', metavar=5, default=5, help="Maximum number of retries for ") - parser.add_argument('--get-wiki-engine', action='store_true', help="returns the wiki engine") - - groupWikiOrAPIOrIndex = parser.add_mutually_exclusive_group(required=True) - groupWikiOrAPIOrIndex.add_argument('wiki', default='', nargs='?', help="URL to wiki") - groupWikiOrAPIOrIndex.add_argument('--api', help="URL to api.php") - groupWikiOrAPIOrIndex.add_argument('--index', help="URL to index.php") - - groupXMLOrImages = parser.add_argument_group() - groupXMLOrImages.add_argument('--xml', action='store_true', help="generates a full history XML dump (--xml --curonly for current revisions only)") - parser.add_argument('--curonly', action='store_true', help='store only the current version of pages') - - groupXMLOrImages.add_argument('--images', action='store_true', help="generates an image dump") - - parser.add_argument('--path', help='path to store wiki dump at') - parser.add_argument('--resume', action='store_true', help='resumes previous incomplete dump (requires --path)') - parser.add_argument('--force', action='store_true', help='') - parser.add_argument('--namespaces', metavar="1,2,3", help='comma-separated value of namespaces to include (all by default)') - parser.add_argument('--exnamespaces', metavar="1,2,3", help='comma-separated value of namespaces to exclude') - - parser.add_argument('--user', help='Username if authentication is required.') - parser.add_argument('--pass', dest='password', help='Password if authentication is required.') - - args = parser.parse_args() - #print args - - # Execute excluding args - if args.get_wiki_engine and args.wiki and (args.wiki.startswith('http://') or args.wiki.startswith('https://')): - print getWikiEngine(url=args.wiki) - sys.exit() - # End execute excluding args - - # check API URL - if args.api and (not args.api.startswith('http://') and not args.api.startswith('https://')): - print args.api - print 'ERROR: URL to api.php must start with http:// or https://\n' - parser.print_usage() - sys.exit(1) - - # check index URL - if args.index and (not args.index.startswith('http://') and not args.index.startswith('https://')): - print 'ERROR: URL to index.php must start with http:// or https://\n' - parser.print_usage() - sys.exit(1) - - # check user and pass (one requires both) - if (args.user and not args.password) or (args.password and not args.user): - print 'Both --user and --pass are required for authentication.' - parser.print_usage() - sys.exit(1) - - namespaces = ['all'] - exnamespaces = [] - # Process namespace inclusions - if args.namespaces: - if re.search(r'[^\d, \-]', args.namespaces) and args.namespaces.lower() != 'all': #fix, why - ? and... --namespaces= all with a space works? 
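# --- Illustrative aside (not part of the patch) --------------------------------
# How the --namespaces / --exnamespaces values above are interpreted: either the
# keyword 'all' or a comma-separated list of integer namespace IDs. A typical
# invocation of this CLI (the URL is a placeholder) would be:
#   python dumpgenerator.py --api=http://wiki.example.org/w/api.php --xml --images --namespaces=0,4
import re

def parse_namespace_list(value):
    if re.search(r'[^\d, \-]', value) and value.lower() != 'all':
        raise ValueError('Valid format is integer(s) separated by commas, or "all"')
    value = value.replace(' ', '')
    return ['all'] if value.lower() == 'all' else [int(i) for i in value.split(',')]

# parse_namespace_list('0, 4,14') -> [0, 4, 14]
# --------------------------------------------------------------------------------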
- print "Invalid namespace values.\nValid format is integer(s) separated by commas" - sys.exit() - else: - ns = re.sub(' ', '', args.namespaces) - if ns.lower() == 'all': - namespaces = ['all'] - else: - namespaces = [int(i) for i in ns.split(',')] - - # Process namespace exclusions - if args.exnamespaces: - if re.search(r'[^\d, \-]', args.exnamespaces): - print "Invalid namespace values.\nValid format is integer(s) separated by commas" - sys.exit(1) - else: - ns = re.sub(' ', '', args.exnamespaces) - if ns.lower() == 'all': - print 'You cannot exclude all namespaces.' - sys.exit(1) - else: - exnamespaces = [int(i) for i in ns.split(',')] - - # --curonly requires --xml - if args.curonly and not args.xml: - print "--curonly requires --xml\n" - parser.print_usage() - sys.exit(1) - - #user chose --api, but --index it is necessary for special:export: we generate it - if args.api and not args.index: - index = args.api.split('api.php')[0] + 'index.php' - # WARNING: remove index.php here for misconfigured sites like editthis.info, or provide --index directly - print 'You didn\'t provide a path for index.php, using ', index - else: - index = args.index - - cj = cookielib.MozillaCookieJar() - if args.cookies: - cj.load(args.cookies) - print 'Using cookies from %s' % args.cookies - - session = requests.Session() - session.cookies = cj - session.headers = {'User-Agent': getUserAgent()} - if args.user and args.password: - session.auth = (args.user, args.password) - #session.mount(args.api.split('/api.php')[0], HTTPAdapter(max_retries=max_ret)) - - config = { - 'curonly': args.curonly, - 'date': datetime.datetime.now().strftime('%Y%m%d'), - 'api': args.api or '', - 'index': index, - 'images': args.images, - 'logs': False, - 'xml': args.xml, - 'namespaces': namespaces, - 'exnamespaces': exnamespaces, - 'path': args.path or '', - 'cookies': args.cookies or '', - 'delay': args.delay - } - other = { - 'resume': args.resume, - 'filenamelimit': 100, #do not change - 'force': args.force, - 'session': session - } - - if config['api']: - #check api.php - if checkAPI(config['api'], config, session=other['session']): - print 'api.php is OK' - else: - print 'Error in api.php, please, provide a correct path to api.php' - sys.exit() - - if config['index']: - #check index.php - if checkIndexphp(config['index'], config, session=other['session']): - print 'index.php is OK' - else: - print 'Error in index.php, please, provide a correct path to index.php' - sys.exit() - - #calculating path, if not defined by user with --path= - if not config['path']: - config['path'] = './%s-%s-wikidump' % (domain2prefix(config=config, session=session), config['date']) - - return config, other - -def checkAPI(api, config={}, session=None): - """ Checking API availability """ - global cj - r = session.post(url=api, data={'action': 'query', 'meta': 'siteinfo', 'format': 'json'}) - resultText = r.text - print 'Checking api.php...', api - if "MediaWiki API is not enabled for this site." 
in resultText: - return False - result = json.loads(resultText) - delay(config=config, session=session) - if result.has_key('query'): - return True - return False - -def checkIndexphp(indexphp, config={}, session=None): - """ Checking index.php availability """ - r = session.post(url=indexphp, data={'title': 'Special:Version'}) - raw = r.text - delay(config=config, session=session) - print 'Checking index.php...', indexphp - if re.search(r'(Special:Badtitle|class="permissions-errors"|"wgCanonicalSpecialPageName":"Badtitle"|Login Required)', raw) and not config['cookies']: # Workaround for issue 71 - print "ERROR: This wiki requires login and we are not authenticated" - return False - if re.search(r'(This wiki is powered by|
|meta name="generator" content="MediaWiki)', raw): - return True - return False - -def removeIP(raw=''): - """ Remove IP from HTML comments """ - - raw = re.sub(r'\d+\.\d+\.\d+\.\d+', '0.0.0.0', raw) - #http://www.juniper.net/techpubs/software/erx/erx50x/swconfig-routing-vol1/html/ipv6-config5.html - #weird cases as :: are not included - raw = re.sub(r'(?i)[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}', '0:0:0:0:0:0:0:0', raw) - - return raw - -def checkXMLIntegrity(config={}, session=None): - """ Check XML dump integrity, to detect broken XML chunks """ - return - - print 'Verifying dump...' - checktitles = 0 - checkpageopen = 0 - checkpageclose = 0 - checkrevisionopen = 0 - checkrevisionclose = 0 - for line in file('%s/%s-%s-%s.xml' % (config['path'], domain2prefix(config=config, session=session), config['date'], config['curonly'] and 'current' or 'history'), 'r').read().splitlines(): - if "" in line: - checkrevisionopen += 1 - elif "" in line: - checkrevisionclose += 1 - elif "" in line: - checkpageopen += 1 - elif "" in line: - checkpageclose += 1 - elif "" in line: - checktitles += 1 - else: - continue - if (checktitles == checkpageopen and checktitles == checkpageclose and checkrevisionopen == checkrevisionclose): - pass - else: - print 'XML dump seems to be corrupted.' - reply = '' - while reply.lower() not in ['yes', 'y', 'no', 'n']: - reply = raw_input('Regenerate a new dump ([yes, y], [no, n])? ') - if reply.lower() in ['yes', 'y']: - generateXMLDump(config=config, titles=titles) - elif reply.lower() in ['no', 'n']: - print 'Not generating a new dump.' - - -def createNewDump(config={}, other={}): - titles = [] - images = [] - print 'Trying generating a new dump into a new directory...' - if config['xml']: - titles += getPageTitles(config=config, session=other['session']) - saveTitles(config=config, titles=titles) - generateXMLDump(config=config, titles=titles, session=other['session']) - checkXMLIntegrity(config=config) - if config['images']: - if config['api']: - images += getImageFilenamesURLAPI(config=config, session=other['session']) - else: - images += getImageFilenamesURL(config=config, session=other['session']) - saveImageFilenamesURL(config=config, images=images, session=other['session']) - generateImageDump(config=config, other=other, images=images, session=other['session']) - if config['logs']: - saveLogs(config=config, session=session) - -def resumePreviousDump(config={}, other={}): - titles = [] - images = [] - print 'Resuming previous dump process...' - if config['xml']: - #load titles - lasttitle = '' - try: - f = open('%s/%s-%s-titles.txt' % (config['path'], domain2prefix(config=config, session=other['session']), config['date']), 'r') - raw = unicode(f.read(), 'utf-8') - titles = raw.split('\n') - lasttitle = titles[-1] - if not lasttitle: #empty line at EOF ? - lasttitle = titles[-2] - f.close() - except: - pass #probably file doesnot exists - if lasttitle == '--END--': - #titles list is complete - print 'Title list was completed in the previous session' - else: - print 'Title list is incomplete. Reloading...' 
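# --- Illustrative aside (not part of the patch) --------------------------------
# The IP scrubbing that removeIP() above applies to the saved HTML copies
# (index.html and Special:Version.html); regexes copied from the removed function,
# with a made-up example string.
import re

def scrub_ips(raw):
    raw = re.sub(r'\d+\.\d+\.\d+\.\d+', '0.0.0.0', raw)
    raw = re.sub(r'(?i)[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:'
                 r'[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}',
                 '0:0:0:0:0:0:0:0', raw)
    return raw

# scrub_ips('last edited by 203.0.113.7') -> 'last edited by 0.0.0.0'
# --------------------------------------------------------------------------------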
- #do not resume, reload, to avoid inconsistences, deleted pages or so - titles = getPageTitles(config=config, session=other['session']) - saveTitles(config=config, titles=titles) - #checking xml dump - xmliscomplete = False - lastxmltitle = '' - try: - f = open('%s/%s-%s-%s.xml' % (config['path'], domain2prefix(config=config, session=other['session']), config['date'], config['curonly'] and 'current' or 'history'), 'r') - for l in f: - if re.findall('</mediawiki>', l): - #xml dump is complete - xmliscomplete = True - break - xmltitles = re.findall(r'<title>([^<]+)', l) #weird if found more than 1, but maybe - if xmltitles: - lastxmltitle = undoHTMLEntities(text=xmltitles[-1]) - f.close() - except: - pass #probably file doesnot exists - #removing --END-- before getXMLs - while titles and titles[-1] in ['', '--END--']: - titles = titles[:-1] - if xmliscomplete: - print 'XML dump was completed in the previous session' - elif lastxmltitle: - #resuming... - print 'Resuming XML dump from "%s"' % (lastxmltitle) - generateXMLDump(config=config, titles=titles, start=lastxmltitle, session=other['session']) - else: - #corrupt? only has XML header? - print 'XML is corrupt? Regenerating...' - generateXMLDump(config=config, titles=titles, session=other['session']) - - if config['images']: - #load images - lastimage = '' - try: - f = open('%s/%s-%s-images.txt' % (config['path'], domain2prefix(config=config), config['date']), 'r') - raw = unicode(f.read(), 'utf-8').strip() - lines = raw.split('\n') - for l in lines: - if re.search(r'\t', l): - images.append(l.split('\t')) - lastimage = lines[-1] - f.close() - except: - pass #probably file doesnot exists - if lastimage == u'--END--': - print 'Image list was completed in the previous session' - else: - print 'Image list is incomplete. Reloading...' 
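# --- Illustrative aside (not part of the patch) --------------------------------
# A sketch of how resumePreviousDump() above picks the restart point for the XML
# dump: scan the partial file for a closing </mediawiki> (dump already complete)
# and otherwise remember the last <title> seen. 'path' is a placeholder.
import re

def xml_resume_point(path):
    complete, last_title = False, ''
    with open(path, 'r') as f:
        for line in f:
            if '</mediawiki>' in line:
                complete = True
                break
            titles = re.findall(r'<title>([^<]+)</title>', line)
            if titles:
                last_title = titles[-1]
    return complete, last_title
# --------------------------------------------------------------------------------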
- #do not resume, reload, to avoid inconsistences, deleted images or so - if config['api']: - images=getImageFilenamesURLAPI(config=config, session=other['session']) - else: - images = getImageFilenamesURL(config=config, session=other['session']) - saveImageFilenamesURL(config=config, images=images) - #checking images directory - listdir = [] - try: - listdir = os.listdir('%s/images' % (config['path'])) - except: - pass #probably directory does not exist - listdir.sort() - complete = True - lastfilename = '' - lastfilename2 = '' - c = 0 - for filename, url, uploader in images: - lastfilename2 = lastfilename - lastfilename = filename #return always the complete filename, not the truncated - filename2 = filename - if len(filename2) > other['filenamelimit']: - filename2 = truncateFilename(other=other, filename=filename2) - if filename2 not in listdir: - complete = False - break - c +=1 - print '%d images were found in the directory from a previous session' % (c) - if complete: - #image dump is complete - print 'Image dump was completed in the previous session' - else: - generateImageDump(config=config, other=other, images=images, start=lastfilename2, session=other['session']) # we resume from previous image, which may be corrupted (or missing .desc) by the previous session ctrl-c or abort - - if config['logs']: - #fix - pass - -def saveSpecialVersion(config={}, session=None): - """ Save Special:Version as .html, to preserve extensions details """ - - if os.path.exists('%s/Special:Version.html' % (config['path'])): - print 'Special:Version.html exists, do not overwrite' - else: - print 'Downloading Special:Version with extensions and other related info' - r = session.post(url=config['index'], data={'title': 'Special:Version'}) - raw = r.text - delay(config=config, session=session) - raw = removeIP(raw=raw) - with open('%s/Special:Version.html' % (config['path']), 'w') as outfile: - outfile.write(raw.encode('utf-8')) - -def saveIndexPHP(config={}, session=None): - """ Save index.php as .html, to preserve license details available at the botom of the page """ - - if os.path.exists('%s/index.html' % (config['path'])): - print 'index.html exists, do not overwrite' - else: - print 'Downloading index.php (Main Page) as index.html' - r = session.post(url=config['index'], data={}) - raw = r.text - delay(config=config, session=session) - raw = removeIP(raw=raw) - with open('%s/index.html' % (config['path']), 'w') as outfile: - outfile.write(raw.encode('utf-8')) - -def saveSiteInfo(config={}, session=None): - """ Save a file with site info """ - - if config['api']: - if os.path.exists('%s/siteinfo.json' % (config['path'])): - print 'siteinfo.json exists, do not overwrite' - else: - print 'Downloading site info as siteinfo.json' - r = session.post(url=config['api'], data = {'action': 'query', 'meta': 'siteinfo', 'format': 'json'}) - result = json.loads(r.text) - delay(config=config, session=session) - with open('%s/siteinfo.json' % (config['path']), 'w') as outfile: - outfile.write(json.dumps(result, indent=4, sort_keys=True)) - -def avoidWikimediaProjects(config={}, other={}): - """ Skip Wikimedia projects and redirect to the dumps website """ - - #notice about wikipedia dumps - if re.findall(r'(?i)(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews|wikidata|wikivoyage)\.org', config['api']+config['index']): - print 'PLEASE, DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!' 
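# --- Illustrative aside (not part of the patch) --------------------------------
# The siteinfo request that saveSiteInfo() above stores as siteinfo.json, shown as
# a stand-alone call; 'api_url' is a placeholder for config['api'].
import json
import requests

def fetch_siteinfo(api_url):
    r = requests.post(api_url, data={'action': 'query', 'meta': 'siteinfo', 'format': 'json'})
    return json.dumps(r.json(), indent=4, sort_keys=True)
# --------------------------------------------------------------------------------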
- print 'Download the dumps from http://dumps.wikimedia.org' - if not other['force']: - print 'Thanks!' - sys.exit() - -def getWikiEngine(url=''): - """ Returns the wiki engine of a URL, if known """ - - session = requests.Session() - session.headers = {'User-Agent': getUserAgent()} - r = session.post(url=url) - result = r.text - - wikiengine = 'Unknown' - if re.search(ur'(?im)( Congratulations! Your dump is complete <--- +If you have suggestions, file a new issue here (Google account required): http://code.google.com/p/wikiteam/issues/list +If this is a public wiki, do consider publishing this dump so others can benefit from it. Follow the steps as explained in http://code.google.com/p/wikiteam/wiki/NewTutorial#Publishing_the_dump or contact us at http://code.google.com/p/wikiteam. +Thank you for using DumpGenerator %s by WikiTeam, good bye!""" % ( self.Version ) + return message + + def checkAPI(self): + """ + Checks the validity of the api.php. + """ + query = { + "meta": "siteinfo", + "siprop": "general" } + sitestats = json.loads( RequestAPI.query( query ) ) + try: + if ( sitestats[ "query" ][ "general" ][ "server" ] in self.urltoapi ): + return True + except: + try: + if ( sitestats[ "error" ][ "code" ] == "readapidenied" ) and ( self.cookies == "" ): + Output.warn( "The wiki is private and we do not have proper authentication information!" ) + return False + except: + Output.warn( "This api.php seems weird or is not valid." ) + return False + + def checkIndex(self): + """ + Checks the validity of the index.php. + """ + # TODO: Screen scraping is involved here, need backward compact for older version of MediaWiki. + parameters = { "title": "Special:Version" } + request = RequestIndex.query( parameters ) + # Since we are at Special:Version, we should not be getting Special:BadTitle unless we are not logged in + if ( re.search( r'(Special:Badtitle)', request ) ) and ( self.cookies == "" ): + Output.error( "The wiki is private and we do not have proper authentication information!" ) + sys.exit(1) + + # Check for some tags within the Special:Version page, must be language-independent + if ( re.search( r'(
|meta name="generator" content="MediaWiki)', request ) ): + return True + + def debug(self): + """ + A temporary debug mode for testing purposes. + REMOVE WHEN COMPLETE! + """ + print "DEBUG MODE ON" + print "Date: %s" % (self.date) + print "URL to api.php: %s" % (self.urltoapi) + print "URL to index.php: %s" % (self.urltoindex) + print "Current revision only: %s" % (self.curonly) + print "Image dump: %s" % (self.images) + print "Log dump: %s" % (self.logs) + print "XML dump: %s" % (self.xml) + print "Resume: %s" % (self.resume) + print "Path for resuming: %s" % (self.path) + print "Delay: %s" % (self.delay) + print "Cookies file: %s" % (self.cookies) + print "Excluded namespaces: %s" % (self.exnamespaces) + print "Debug mode on: %s" % (self.debugmode) + self.tasklist = sorted( self.tasklist ) + for task in self.tasklist: + if ( task == "axml" ): + DumpXML.run() + elif ( task == "bimages" ): + DumpImages.run() + elif ( task == "clogs" ): + DumpLogs.run() + sys.exit(0) + + def downloadHtmlPages(self): + """ + Downloads the HTML pages such as the main page and Special:Version. + """ + # Download the main page + Output.message( "Downloading index.php (Main Page) as index.html." ) + query = {} + index = RequestIndex.query( query ) + index = RequestIndex.removeIP( index ) + if ( os.path.exists( "Special:Version.html" ) ): + os.remove( "index.html" ) + else: + pass + for line in index: + Output.appendToFile( "index.html", line ) + + # Download Special:Version or its respective localized version + Output.message( "Downloading Special:Version with extensions and other related info." ) + query = { "title": "Special:Version" } + SpecialVersion = RequestIndex.query( query ) + SpecialVersion = RequestIndex.removeIP( SpecialVersion ) + if ( os.path.exists( "Special:Version.html" ) ): + os.remove( "Special:Version.html" ) + else: + pass + for line in SpecialVersion: + Output.appendToFile( "Special:Version.html", line ) + + def fixHTMLEntities(self, text): + """ + Convert some HTML entities to their regular characters. + """ + text = re.sub('<', '<', text) + text = re.sub('>', '>', text) + text = re.sub('&', '&', text) + text = re.sub('"', '"', text) + text = re.sub(''', '\'', text) + return text + + def help(self): + """ + Provides vital help information to the user. This function + directly uses the "print" function because it is harmless and + what needs to be logged has already been done so. + + Returns: Help message text + """ + message = """DumpGenerator %s, a script to generate backups of MediaWiki wikis. +For more information, please see: http://code.google.com/p/wikiteam/wiki/NewTutorial + +Startup: + -h, --help Displays this help information and exits. + -v, --version Displays the version of this script, with additional credits. + +Wiki information: + --api=URL The URL to the wiki's api.php, not to be used with --index. + --index=URL The URL to the wiki's index.php, not to be used with --api. + +Options: + --xml Creates an XML dump. + --images Creates an image dump. + --logs Creates a dump of all log pages (not yet supported). + +XML dump (only if --xml is used): + --curonly Download only the current revision. + --exnamespaces The unique system number(s) for namespaces to exclude, separated by commas. + --titlesonly Download only the page titles without the actual content. + --titles Path to a file containing list of titles, requires "--END--" to be on the last line. + +Other: + --auto Enable auto pilot mode (select options that ensures that the script creates a new dump). 
+ --resume Resume an incomplete dump (requires --path to be given). + --path=PATH Path to the incomplete dump. + --delay=SECONDS Adds a delay (in seconds) between requests. + --cookies=PATH Path to a Mozilla cookies.txt file for authentication cookies. + --nolog Disable logging to dumpgenerator.log (does not affect output in terminal). + +Report any issues to our issue tracker: https://code.google.com/p/wikiteam.""" % (self.Version) + return message + + def loadConfig(self): + """ + Load a config file from a partially-made dump. + """ + config = json.loads( self.configfile ) + self.date = config[ "date" ] + self.useAPI = config[ "useAPI" ] + self.useIndex = config[ "useIndex" ] + self.urltoapi = config[ "urltoapi" ] + self.urltoindex = config[ "urltoindex" ] + self.images = config[ "images" ] + self.logs = config[ "logs" ] + self.xml = config[ "xml" ] + self.curonly = config[ "curonly" ] + self.exnamespaces = config[ "exnamespaces" ] + self.titlesonly = config[ "titlesonly" ] + + if ( self.images == True ): + self.tasklist.append( "bimage" ) + if ( self.logs == True ): + self.tasklist.append( "clogs" ) + if ( self.xml == True ): + self.tasklist.append( "axml" ) + + if ( self.useAPI == True ): + domain = self.urltoapi + elif ( self.useIndex == True ): + domain = self.urltoindex + + def makePrefix(self, domain): + """ + Converts a domain to a prefix. + + Inputs: + - domain: The domain to change, may contain api.php or index.php as suffix. + + Returns: + - string with slashes and stray characters changed to underscores, suffix + removed and URL protocol removed. + """ + domain = domain.lower() + # Remove unnecessary prefixes and suffixes + domain = re.sub(r'(https?://|www\.|/index\.php|/api\.php)', '', domain) + # Substitute directory slashes with underscores + domain = re.sub(r'/', '_', domain) + # Convert any stray character that is not in the alphabet to underscores + domain = re.sub(r'[^-.A-Za-z0-9]', '_', domain) + return domain + + def makeNiceURL(self, domain): + """ + Converts a domain to a more human-readable format (used for uploading). + + Inputs: + - domain: The domain to change, may contain api.php or index.php as suffix. + + Returns: + - string with suffix removed. + """ + domain = domain.lower() + # Remove the suffixes + domain = re.sub(r'(/index\.php|/api\.php)', '', domain) + return domain + + def processargs(self): + """ + Processing arguments and options provided by the user. + """ + try: + options, answers = getopt.getopt( sys.argv[1:], self.shortoptions, self.longoptions ) + except getopt.GetoptError: + Output.error( "An unknown option has been specified, please check your arguments before re-running!" 
) + sys.exit(1) + + # First accept all arguments and store them in a variable + for option, answer in options: + # Startup + if ( option in ( "-h", "--help" ) ): + # Display the help guide and exit + print self.help() + os.remove( Output.logfile ) + sys.exit(0) + elif ( option in ( "-v", "--version" ) ): + # Display the version of this script + print self.version() + os.remove( Output.logfile ) + sys.exit(0) + + # Wiki information + elif ( option in "--api" ): + self.urltoapi = answer + self.configoptions[ "urltoapi" ] = self.urltoapi + elif ( option in "--index" ): + self.urltoindex = answer + self.configoptions[ "urltoindex" ] = self.urltoindex + + # Dump options + elif ( option == "--images" ): + self.images = True + self.configoptions[ "images" ] = True + self.tasklist.append( "bimages" ) + elif ( option == "--logs" ): + self.logs = True + self.configoptions[ "logs" ] = True + self.tasklist.append( "clogs" ) + elif ( option == "--xml" ): + self.xml = True + self.configoptions[ "xml" ] = True + self.tasklist.append( "axml" ) + + # XML dump options + elif ( option == "--curonly" ): + self.curonly = True + self.configoptions[ "curonly" ] = True + elif ( option in "--exnamespaces" ): + self.exnamespaces = answer + self.configoptions[ "exnamespaces" ] = self.exnamespaces + elif ( option == "--titlesonly" ): + self.titlesonly = True + self.configoptions[ "titlesonly" ] = True + elif ( option in "--titles" ): + self.titles = os.path.abspath( answer ) + + # Other options + elif ( option == "--auto" ): + self.autonomous = True + elif ( option in "--cookies" ): + self.cookies = answer + elif ( option in "--delay" ): + self.delay = answer + elif ( option == "--nolog" ): + self.nolog = True + elif ( option in "--path" ): + self.path = answer + elif ( option == "--resume" ): + self.resume = True + + # Private options (i.e. usable but not documented in --help) + elif ( option == "--debug" ): + self.debugmode = True + else: + Output.error( "An unknown option has been specified, please check your arguments before re-running!" ) + sys.exit(1) + + # Now to verify that the user is not messing around + if ( self.urltoapi == "" and self.urltoindex == "" ): + # User did not specify either --api= or --index= + if ( self.resume == True and self.path != "" ): + # ...but specified --resume and --path= accordingly + self.resumeDump() + elif ( self.resume == True and self.path == "" ): + # ...and specified --resume without --path= + Output.error( "--resume was provided, but you still need to tell me the path to the incomplete dump!" ) + sys.exit(1) + else: + Output.error( "You need to tell me the URL to either the api.php or to index.php!" ) + sys.exit(1) + elif ( self.resume == True ) and ( self.path == "" ): + # User specified --resume, but no --path= was given + Output.error( "--resume was provided, but you still need to tell me the path to the incomplete dump!" ) + sys.exit(1) + elif ( self.urltoapi != "" and self.urltoindex != "" ): + # User specified both --api= and --index= + self.useAPI = True + elif ( self.xml == False and ( self.curonly == True or self.exnamespaces != "" ) ): + # User specified --curonly and --exnamespaces without --xml + Output.error( "You did not specify to make an XML dump using --xml, so why write --curonly or --exnamespaces? Remove them before re-running!" ) + sys.exit(1) + + if ( self.urltoapi != "" ): + self.useAPI = True + elif ( self.urltoindex != "" ): + self.useIndex = True + + if ( self.useAPI == True ): + Output.message( "Checking api.php..." 
) + if not ( self.urltoapi.startswith( "http://" ) ) and not ( self.urltoapi.startswith( "https://" ) ): + Output.error( "The URL to api.php must start with either http:// or https://!" ) + sys.exit(1) + elif ( self.checkAPI() ): + Output.message( "api.php is okay" ) + else: + Output.error( "There is an error with api.php, please provide a correct path to it." ) + sys.exit(1) + elif ( self.useIndex == True ): + Output.message( "Checking index.php..." ) + if not ( self.urltoindex.startswith( "http://" ) ) and not ( self.urltoindex.startswith( "https://" ) ): + Output.error( "The URL to index.php must start with either http:// or https://!" ) + sys.exit(1) + elif ( self.checkIndex() ): + Output.message( "index.php is okay" ) + else: + Output.error( "There is an error with index.php, please provide a correct path to it." ) + sys.exit(1) + + def resumeDump(self): + """ + Resume an incomplete dump defined in self.path. + """ + # TODO: Add support for resuming dumps. + os.chdir( self.path ) + self.loadConfig() + self.prefix = "%s-%s" % ( self.makePrefix( domain ), self.date ) + self.domain = self.makeNiceURL( domain ) + if ( self.useAPI == True ): + self.urltoindex = "%s/index.php" % ( self.domain ) + self.tasklist = sorted( self.tasklist ) + for task in self.tasklist: + if ( task == "axml" ): + DumpXML.run() + elif ( task == "bimages" ): + DumpImages.run() + elif ( task == "clogs" ): + DumpLogs.run() + + def run(self): + """ + Run the whole script itself and excute important functions. + """ + print self.welcome() + Updater.checkRevision() + # Check if previously there was a log file in the working directory and remove it if exists + # This is followed by the equivalent of "touch" in Unix to create an empty file + if ( os.path.exists( Output.logfile ) ): + os.remove( Output.logfile ) + open( Output.logfile, "a" ).close() + else: + open( Output.logfile, "a" ).close() + self.processargs() + if ( DumpGenerator.nolog or DumpGenerator.debugmode): + # Remove the dumpgenerator.log file + os.remove( Output.logfile ) + if ( self.useAPI == True ): + domain = self.urltoapi + elif ( self.useIndex == True ): + domain = self.urltoindex + directories = os.walk( "." ).next()[1] + for directory in directories: + # Check if there is a dump that already exists in the current working directory + if ( directory.startswith( self.makePrefix( domain ) ) and directory.endswith( "-wikidump" ) ): + print "" # Create a blank line + Output.warn( "There seems to be a similar dump at %s which might be incomplete." % ( directory ) ) + if ( self.autonomous == True ): + Output.message( "Since auto pilot mode is enabled, that dump will not be resumed." ) + self.resume = False + else: + Output.warn( "Do you wish to resume using configuration from that dump? [yes, y], [no, n]" ) + reply = "" + while reply.lower() not in [ "yes", "y", "no", "n" ]: + reply = raw_input( "Answer: " ) + if ( reply.lower() in [ "yes", "y" ] ): + if not ( os.path.isfile( "%s/%s" % ( directory, self.configfile ) ) ): + Output.error( "I cannot find a %s in the directory! Please delete that directory before re-running!" % ( self.configfile ) ) + sys.exit(1) + else: + Output.warn( "Resuming dump and ignoring configuration given in this session..." ) + self.resume = True + self.path = directory + break + elif ( reply.lower() in [ "no", "n" ] ): + Output.message( "Not resuming..." 
) + self.resume = False + else: + continue + if ( self.resume == True ): + self.resumeDump() + else: + self.prefix = "%s-%s" % ( self.makePrefix( domain ), self.date ) + self.domain = self.makeNiceURL( domain ) + workingdir = "%s-wikidump" % ( self.prefix ) + if ( os.path.exists( workingdir ) ): + if ( self.autonomous == True ): + Output.message( "Since auto pilot mode is enabled, the directory with the same name will be deleted." ) + reply = "yes" + else: + Output.warn( "\nThere seems to be a directory with the same name, delete the old one? [yes, y], [no, n]" ) + reply = "" + while reply.lower() not in [ "yes", "y", "no", "n" ]: + reply = raw_input( "Answer: " ) + if ( reply.lower() in [ "yes", "y" ] ): + try: + shutil.rmtree( workingdir ) + except: + Output.error( "There was a problem deleting the directory, please manually delete it before re-running!" ) + sys.exit(1) + print "" # Create a blank line + elif ( reply.lower() in [ "no", "n" ] ): + Output.error( "Existing directory exists, either delete that directory or rename it before re-running!" ) + sys.exit(1) + else: + pass + Output.message( "Generating a new dump into a new directory..." ) + os.mkdir( workingdir ) + os.rename( Output.logfile, "%s/%s" % ( workingdir, Output.logfile ) ) + os.chdir( workingdir ) + self.saveConfig() + # Guess the URL to index.php + if ( self.useAPI == True ): + self.urltoindex = "%s/index.php" % ( self.domain ) + if ( self.debugmode == True ): + self.debug() + else: + # Run every single task that we are assigned to do in order: xml, images, logs + # The "a", "b" and "c" prefix is just to force the order. + self.tasklist = sorted( self.tasklist ) + if ( self.tasklist == [] ): + Output.error( "You did not tell me what dump to create!" ) + else: + for task in self.tasklist: + if ( task == "axml" ): + DumpXML.run() + elif ( task == "bimages" ): + DumpImages.run() + elif ( task == "clogs" ): + DumpLogs.run() + self.downloadHtmlPages() + print self.bye() + + def saveConfig(self): + """ + Save the configuration settings provided. + """ + self.configoptions[ "date" ] = self.date + output = open( self.configfile, "w" ) + json.dump( self.configoptions, output, indent=4 ) + + def version(self): + """ + Displays the version information and credits of the script. + + Returns: Version information and credits + """ + message = """DumpGenerator %s by WikiTeam + +Copyright (C) 2013 Hydriz Scholz +Copyright (C) 2014 WikiTeam + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along +with this program. If not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA, or visit + +""" % (self.Version) + return message + + def welcome(self): + """ + Welcomes the user at the very beginning of the script running process. + + Returns: Welcome message. + """ + message = """########## Welcome to DumpGenerator %s by WikiTeam ##########\n""" % (self.Version) + return message + +class DumpImages: + """ + The class for generating an image dump. 
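+
+    Rough flow: getFileListAPI() (or getFileListIndex(), not yet implemented) fills
+    self.files from the API's list=allimages results, then dumpImages() downloads
+    each file into the "images" directory together with its XML description page.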
+    """
+    def __init__(self):
+        """
+        The constructor function.
+        """
+        self.files = []
+
+    def dumpImages(self):
+        """
+        Download all the images on the wiki with their corresponding XML.
+        """
+        if ( DumpGenerator.useAPI == True ):
+            self.getFileListAPI()
+        else:
+            self.getFileListIndex()
+        filecount = 0
+        if ( self.files == [] ):
+            pass
+        else:
+            Output.message( "Downloading files and their descriptions into \"images\" directory..." )
+            for media in self.files:
+                time.sleep( DumpGenerator.delay ) # Delay between requests
+                urllib.urlretrieve( media[ "url" ], "images/%s" % ( media[ "name" ] ) )
+                title = DumpGenerator.fixHTMLEntities( media[ "title" ].encode( "utf-8" ) )
+                contentsfile = DumpXML.getXMLPage( title, siteinfo=True )
+                destfile = "images/%s.xml" % ( media[ "name" ] )
+                shutil.move( contentsfile, destfile )
+                # Close the XML description, since getXMLPage() strips the closing tag
+                Output.appendToFile( destfile, "</mediawiki>\n" )
+                filecount += 1
+                if ( filecount % 10 == 0 ):
+                    # Give the user a regular status report so that it does not look stuck
+                    Output.message( " Downloaded %d files." % ( filecount ) )
+            if ( filecount == 1 ):
+                Output.message( "Downloaded 1 file." )
+            else:
+                Output.message( "Downloaded %d files." % ( filecount ) )
+
+    def getFileListAPI(self):
+        """
+        Download the list of files on the wiki via the API.
+        """
+        files = []
+        dumpfile = "%s-images.txt" % ( DumpGenerator.prefix )
+        filecount = 0
+        Output.message( "Getting list of files on the wiki..." )
+        aifrom = "!" # Very first page of a wiki
+        while aifrom:
+            sys.stderr.write('.') # Tell the user that downloading is in progress
+            query = {
+                "list": "allimages",
+                "aifrom": aifrom,
+                "ailimit": 500 } # The default limit for anonymous users of the API is 500 pages per request
+            time.sleep( DumpGenerator.delay ) # Delay between requests
+            filesmeta = json.loads( RequestAPI.query( query ) )
+            # Store what the server tells us to continue from
+            try:
+                serveraifrom = filesmeta[ "query-continue" ][ "allimages" ][ "aicontinue" ]
+                aifrom = DumpGenerator.fixHTMLEntities( serveraifrom )
+            except:
+                # Reached the end of having to keep continuing, exit the while condition
+                aifrom = ""
+            # TODO: On a wiki with a lot of files, this can cause huge memory problems
+            files.extend( filesmeta[ "query" ][ "allimages" ] )
+            for media in filesmeta[ "query" ][ "allimages" ]:
+                outputline = "%s\t%s\n" % ( media[ "title" ], media[ "url" ] )
+                Output.appendToFile( dumpfile, outputline )
+            # Add only the new batch to the file count, not the whole list so far
+            filecount += len( filesmeta[ "query" ][ "allimages" ] )
+        Output.appendToFile( dumpfile, "--END--" )
+        if ( filecount == 1 ):
+            Output.message( " Got 1 file" )
+        else:
+            Output.message( " Got %d files" % ( filecount ) )
+
+        if ( filecount == 0 ):
+            Output.warn( "There are no files on the wiki to download!" )
+        else:
+            Output.message( "File names and URLs saved at %s." % ( dumpfile ) )
+        self.files = files
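+
+    # Note on output format: the <prefix>-images.txt file written by
+    # getFileListAPI() above holds one "<title>\t<url>" line per file,
+    # e.g. "File:Example.png\thttp://wiki.example.org/images/a/ab/Example.png"
+    # (hypothetical name and URL), followed by a final "--END--" marker.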
+
+    def getFileListIndex(self):
+        """
+        Download the list of files on the wiki via index.php.
+        """
+        # TODO: Add code here
+
+    def run(self):
+        """
+        Execute the process of producing an image dump.
+        """
+        if ( os.path.isdir( "images" ) == False ):
+            os.mkdir( "images" )
+        self.dumpImages()
+
+class DumpLogs:
+    """
+    The class for generating a log pages dump (pages in Special:Log).
+    """
+    def __init__(self):
+        """
+        The constructor function.
+        """
+
+    def run(self):
+        """
+        Execute the process of producing a log pages dump.
+        """
+        # TODO: Support downloading of log pages
+        Output.warn( "Sorry, downloading of log pages is not yet supported!" )
+
+class DumpXML:
+    """
+    The class for generating an XML dump.
+    """
+    def __init__(self):
+        """
+        The constructor function.
+        """
+        self.lennamespaces = 0
+        self.namespaces = {}
+        self.pagetitles = []
+        self.titlesdumpfile = ""
+        self.dumpretrycount = 0
+
+    def dumpPageTitlesAPI(self):
+        """
+        Get a list of page titles and output it to a file.
+        """
+        self.getNamespacesAPI()
+        self.getPageTitlesAPI()
+        Output.message( "Saving list of page titles..." )
+        Output.appendToFile( self.titlesdumpfile, "--END--" )
+        Output.message( "List of page titles saved at %s." % ( self.titlesdumpfile ) )
+
+    def dumpXML(self):
+        """
+        Get the whole wiki in an XML file.
+        """
+        Output.message( "Downloading the XML of every page..." )
+        if ( DumpGenerator.curonly == True ):
+            dumpfile = "%s-curonly.xml" % ( DumpGenerator.prefix )
+        else:
+            dumpfile = "%s-history.xml" % ( DumpGenerator.prefix )
+        pagecount = 0
+        # To reduce memory usage, we are storing the title into memory only when we need it
+        for title in file( self.titlesdumpfile, "r" ).read().splitlines():
+            pagecount += 1
+            numberofedits = 0
+            # Add the initial siteinfo and header tags for the first page
+            if ( pagecount == 1 ):
+                contentsfile = self.getXMLPage( title, siteinfo=True )
+                contents = file( contentsfile, "r" ).readlines()
+                open( dumpfile, "a" ).close() # "touch" the file
+                os.remove( contentsfile )
+            elif ( title == "--END--" ):
+                # Close the root element once the end marker is reached
+                contents = [ "</mediawiki>\n" ]
+            else:
+                contentsfile = self.getXMLPage( title )
+                contents = file( contentsfile, "r" ).readlines()
+                os.remove( contentsfile )
+
+            for content in contents:
+                # Count the number of occurrences of "</revision>" to determine the number of revisions
+                if ( "</revision>" in content ):
+                    numberofedits += 1
+                Output.appendToFile( dumpfile, content )
+            if ( title == "--END--" ):
+                pass
+            else:
+                if ( numberofedits == 1 ):
+                    Output.message( " %s, 1 edit" % ( title ) )
+                else:
+                    Output.message( " %s, %s edits" % ( title, numberofedits ) )
+            if ( pagecount % 10 == 0 ):
+                Output.message( "Downloaded %d pages" % ( pagecount ) )
+        Output.message( "XML dump saved at %s." % ( dumpfile ) )
+        self.integrityCheck( dumpfile )
+
+    def getNamespacesAPI(self):
+        """
+        Download the list of namespaces with their names and IDs
+        via the API.
+        """
+        query = {
+            "meta": "siteinfo",
+            "siprop": "namespaces" }
+        namespacedetails = json.loads( RequestAPI.query( query ) )
+        namespacenums = namespacedetails[ "query" ][ "namespaces" ].keys()
+        # Remove the system namespaces ("Media" and "Special"), which cannot be exported
+        namespacenums.remove( "-2" )
+        namespacenums.remove( "-1" )
+        namespaces = {}
+        for namespacenum in namespacenums:
+            namespacename = namespacedetails[ "query" ][ "namespaces" ][ namespacenum ][ "*" ]
+            namespaces[ namespacenum ] = namespacename
+        self.lennamespaces = len( namespacenums )
+        Output.message( "%d namespaces found." % ( self.lennamespaces ) )
+        self.namespaces = namespaces
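+
+    # Illustrative sketch (comments only) of the JSON parsed by getNamespacesAPI(),
+    # trimmed to the relevant keys; the "*" entry holds the localised namespace name:
+    #   { "query": { "namespaces": {
+    #       "-2": { "id": -2, "*": "Media" },
+    #       "-1": { "id": -1, "*": "Special" },
+    #       "0":  { "id": 0,  "*": "" },
+    #       "1":  { "id": 1,  "*": "Talk" } } } }
+    # "-2" and "-1" are dropped because Special:Export cannot export those namespaces.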
+
+    def getPageTitlesAPI(self):
+        """
+        Grab a list of page titles in each namespace via the API.
+
+        There are leading spaces in the outputs so as to make things neater on the terminal.
+        """
+        titles = []
+        self.titlesdumpfile = "%s-titles.txt" % ( DumpGenerator.prefix )
+        totalpagecount = 0
+        for namespace in self.namespaces:
+            if namespace in DumpGenerator.exnamespaces:
+                Output.warn( " Skipping namespace %s" % ( namespace ) )
+            else:
+                pagecount = 0
+                Output.message( " Getting titles in namespace %s" % ( namespace ) )
+                apfrom = "!" # Very first page of a wiki
+                while apfrom:
+                    sys.stderr.write( "." ) # Tell the user that downloading is in progress
+                    query = {
+                        "list": "allpages",
+                        "apnamespace": namespace,
+                        "apfrom": apfrom,
+                        "aplimit": 500 } # The default limit for anonymous users of the API is 500 pages per request
+                    time.sleep( DumpGenerator.delay ) # Delay between requests
+                    pagetitles = json.loads( RequestAPI.query( query ) )
+                    # Store what the server tells us to continue from
+                    try:
+                        serverapfrom = pagetitles[ "query-continue" ][ "allpages" ][ "apcontinue" ]
+                        apfrom = DumpGenerator.fixHTMLEntities( serverapfrom )
+                    except:
+                        try:
+                            serverapfrom = pagetitles[ "query-continue" ][ "allpages" ][ "apfrom" ]
+                            apfrom = DumpGenerator.fixHTMLEntities( serverapfrom )
+                        except:
+                            # Reached the end of having to keep continuing, exit the while condition
+                            apfrom = ""
+                    pages = pagetitles[ "query" ][ "allpages" ]
+                    # Add to namespace page count
+                    pagecount += len( pages )
+                    for page in pages:
+                        title = "%s\n" % ( page[ "title" ] )
+                        Output.appendToFile( self.titlesdumpfile, title )
+                if ( pagecount == 1 ):
+                    Output.message( " Got 1 page title in namespace %s" % ( namespace ) )
+                else:
+                    Output.message( " Got %d page titles in namespace %s" % ( pagecount, namespace ) )
+                # Add to total page count
+                totalpagecount += pagecount
+        if ( totalpagecount == 1 ):
+            Output.message( "Got 1 page title in total." )
+        else:
+            Output.message( "Got %d page titles in total." % ( totalpagecount ) )
+
+    def getXMLPage(self, page, siteinfo=False):
+        """
+        Get the XML of one page.
+
+        Input:
+        - page: The title of the page to download.
+        - siteinfo: Whether to include the siteinfo header in the XML.
+        """
+        parameters = {
+            "title": "Special:Export",
+            "pages": page,
+            "action": "submit" }
+        if ( DumpGenerator.curonly == True ):
+            parameters[ "curonly" ] = 1
+            parameters[ "limit" ] = 1
+        else:
+            # Make the wiki download the actual full history
+            parameters[ "history" ] = "1"
+        # TODO: Can cause memory problems if the page has a huge history
+        result = RequestIndex.query( parameters )
+        pagehash = hashlib.sha256( page ).hexdigest()[:8]
+        tempfile = "%s.xml.tmp" % ( pagehash )
+        tempfile2 = "%s.xml" % ( pagehash )
+        Output.appendToFile( tempfile, result )
+        result = "" # Free up memory
+        # Warning: The following is NOT compatible with MediaWiki XML Schema Description version 0.3 and below!
+        # See http://wikiteam.googlecode.com/svn/trunk/schema/README.md for more information about MediaWiki versions
+        # this will affect and ways to overcome it.
+        if ( siteinfo == False ):
+            linecount = 0
+            # The 11 comes from the <siteinfo> header lines (<sitename>, <base>, <generator> and so on),
+            # the two "special" namespaces and the very first <mediawiki> line
+            # TODO: Hacky way of removing the siteinfo, check for backward compatibility!
+            linestoskip = 11 + self.lennamespaces
+            for line in open( tempfile, "r" ).read().splitlines():
+                linecount += 1
+                if linecount > linestoskip:
+                    if ( "</mediawiki>" in line ):
+                        # Skip the closing tag; it is added once at the very end of the dump
+                        pass
+                    else:
+                        line = "%s\n" % ( line )
+                        Output.appendToFile( tempfile2, line )
+                else:
+                    continue
+        else:
+            for line in open( tempfile, "r" ).read().splitlines():
+                if ( "</mediawiki>" in line ):
+                    pass
+                else:
+                    line = "%s\n" % ( line )
+                    Output.appendToFile( tempfile2, line )
+        os.remove( tempfile )
+        return tempfile2
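+
+    # For reference (comments only), a Special:Export dump is shaped roughly like this
+    # (attributes omitted), which is what integrityCheck() below relies on:
+    #   <mediawiki>
+    #     <siteinfo> ... </siteinfo>
+    #     <page>
+    #       <title>...</title>
+    #       <revision> ... </revision>
+    #     </page>
+    #     ...
+    #   </mediawiki>
+    # so <title>, <page> and </page> counts should match, as should <revision> and </revision>.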
+
+    def integrityCheck(self, dumpfile):
+        """
+        Checks the integrity of the XML dump and ensures that it is not corrupted.
+        """
+        Output.message( "Checking the integrity of the XML dump..." )
+        checktitles = 0
+        checkpageopen = 0
+        checkpageclose = 0
+        checkrevisionopen = 0
+        checkrevisionclose = 0
+        # Check the number of instances of the following tags
+        # By logic they should be the same number
+        for line in file( dumpfile, "r" ).read().splitlines():
+            if "<title>" in line:
+                checktitles += 1
+            elif "<page>" in line:
+                checkpageopen += 1
+            elif "</page>" in line:
+                checkpageclose += 1
+            elif "<revision>" in line:
+                checkrevisionopen += 1
+            elif "</revision>" in line:
+                checkrevisionclose += 1
+            else:
+                continue
+
+        if ( checktitles == checkpageopen and checktitles == checkpageclose and checkrevisionopen == checkrevisionclose ):
+            Output.message( "Excellent, the XML dump is not corrupted." )
+        else:
+            Output.warn( "WARNING: XML dump seems to be corrupted." )
+            if ( DumpGenerator.autonomous == True ):
+                reply = "yes"
+            else:
+                reply = ""
+                while reply.lower() not in [ "yes", "y", "no", "n" ]:
+                    reply = raw_input( 'Regenerate a new dump ([yes, y], [no, n])? ' )
+            if reply.lower() in [ "yes", "y" ]:
+                self.dumpretrycount += 1
+                if ( self.dumpretrycount < 3 ):
+                    Output.warn( "Generating a new dump..." )
+                    os.remove( dumpfile )
+                    self.dumpXML()
+                else:
+                    Output.warn( "We have tried dumping the wiki 3 times, but the dump is still corrupted. Not going to carry on since it is probably a problem on the wiki." )
+                    # Encourage the user to tell us about this faulty wiki
+                    print "Please tell us about this by reporting an issue here: https://code.google.com/p/wikiteam/issues/list. Thank you!"
+                    print "Giving you a little time to see this message..."
+                    time.sleep(3) # Give time for the user to see the message
+            elif reply.lower() in [ "no", "n" ]:
+                Output.warn( "Not generating a new dump. Note: Your dump is corrupted and might not work with MediaWiki!" )
+
+    def run(self):
+        """
+        Execute the process of producing an XML dump.
+        """
+        if ( DumpGenerator.useAPI == True ):
+            if ( DumpGenerator.titlesonly == True ):
+                self.dumpPageTitlesAPI()
+            else:
+                if ( DumpGenerator.titles != "" ):
+                    Output.message( "Using the list of page titles provided at %s." % ( DumpGenerator.titles ) )
+                    self.titlesdumpfile = DumpGenerator.titles
+                else:
+                    self.dumpPageTitlesAPI()
+                self.dumpXML()
+        else:
+            # TODO: dumpPageTitlesIndex() is not implemented yet
+            if ( DumpGenerator.titlesonly == True ):
+                self.dumpPageTitlesIndex()
+            else:
+                if ( DumpGenerator.titles != "" ):
+                    self.titlesdumpfile = DumpGenerator.titles
+                else:
+                    self.dumpPageTitlesIndex()
+                self.dumpXML()
+
+class Output:
+    """
+    The class to output anything to the user or to a place not within the script.
+
+    For doing outputs to the user:
+    This is used instead of the "print" function directly because it is intended
+    to log everything that is told to the user, so that it is possible to check
+    when and where things went wrong.
+
+    For doing outputs to elsewhere:
+    This is to reduce memory usage by storing large chunks of data on disk,
+    reducing the risk of getting a MemoryError.
+    """
+    def __init__(self):
+        self.logfile = "dumpgenerator.log"
+
+    # Output to disk
+    def appendToFile(self, outputfile, contents):
+        """
+        Output contents to file.
+
+        Inputs:
+        - outputfile: The file to output to.
+        - contents: The content to add for each line.
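+
+        Example (illustrative file name): Output.appendToFile( "wiki-titles.txt", "Main Page\n" )
+        creates wiki-titles.txt if it does not exist yet and appends the line to it.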
+ """ + if ( os.path.exists( outputfile ) == False ): + open( outputfile, "a" ).close() # "touch" the file + else: + pass + thefile = open( outputfile, "a" ) + try: + contents = contents.encode( "utf-8", "ignore" ) + # TODO: During a test phase, this error kept coming up, though the final output was no different from + # what was produced using dumpBackup.php and using Special:Export itself. + except UnicodeDecodeError: + pass + thefile.write( contents ) + thefile.close() + + # Output to user + def error(self, message): + print message + print "Write --help for more information." + self.log( "An error occurred: %s" % (message) ) + + def log(self, message): + if ( DumpGenerator.nolog or DumpGenerator.debugmode): + # Skip logging + time.sleep(0) + else: + timestamp = datetime.datetime.fromtimestamp( time.time() ).strftime( "%Y-%m-%d %H:%M:%S" ) + logline = "%s: %s\n" % (timestamp, message) + self.appendToFile( self.logfile, logline ) + + def message(self, message): + print message + self.log( "Told the user: %s" % (message) ) + + def warn(self, message): + print message + self.log( "Warned the user: %s" % (message) ) + +class RequestAPI: + """ + The RequestAPI class, to submit APi request calls to the server. + """ + def __init__(self): + """ + The constructor function. + """ + + def query(self, params, url=""): + """ + The function to send an API call to the server given in the "url" + parameter using the parameters found in params. If url is empty, + DumpGenerator.urltoapi is used instead. + + Note: This function will assume action=query, other functions provides + the other query forms, but not this one. + + Input: + - params: Parameters to API call as an array (excluding action=query and format=json) + + Returns + - Result of API call in JSON format. + """ + if ( url == "" ): + url = DumpGenerator.urltoapi + else: + url = url + queryurl = "%s?action=query&format=json" % ( url ) + headers = { "User-Agent": DumpGenerator.UserAgent } + # Convert the array to a proper URL + paras = urllib.urlencode( params ) + # POST the parameters to the server + request = urllib2.Request( queryurl, paras, headers ) + try: + result = urllib2.urlopen( request ) + except: + try: + # Add a little delay between requests if server is slow + sleeptime = DumpGenerator.delay + 10 + Output.warn( "Failed to get a response from the server, retrying in %d seconds..." % (sleeptime) ) + time.sleep( sleeptime ) + result = urllib2.urlopen( request ) + except: + Output.error( "An error occurred when trying to get a response from the server. Please resume the dump with --resume." ) + sys.exit(2) + output = result.read() + result.close() + return output + +class RequestIndex: + def __init__(self): + """ + The constructor function. + """ + + def query(self, params, url=""): + """ + The function to send an request to the server given in the "url" + parameter using the parameters found in params. If url is empty, + DumpGenerator.urltoindex is used instead. + + Input: + - params: Parameters to the request to send, appended to url as + a GET request. + + Returns + - Result of GET request. 
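+
+        Example (illustrative): query( { "title": "Special:Export", "pages": "Main Page", "action": "submit" } )
+        is fetched from DumpGenerator.urltoindex?title=Special%3AExport&pages=Main+Page&action=submit
+        (urllib.urlencode percent-encodes the values; parameter order may differ).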
+ """ + if ( url == "" ): + url = DumpGenerator.urltoindex + else: + url = url + headers = { "User-Agent": DumpGenerator.UserAgent } + paras = urllib.urlencode( params ) + # index.php does not support POST request, formulating a correct GET URL here + queryurl = "%s?%s" % ( url, paras ) + request = urllib2.Request( queryurl, headers=headers ) + # TODO: Make urlopen follow redirects + try: + result = urllib2.urlopen( request ) + except: + try: + # Add a little delay between requests if server is slow + sleeptime = DumpGenerator.delay + 10 + Output.warn( "Failed to get a response from the server, retrying in %d seconds..." % (sleeptime) ) + time.sleep( sleeptime ) + result = urllib2.urlopen( request ) + except: + Output.error( "An error occurred when trying to get a response from the server. Please resume the dump with --resume." ) + sys.exit(2) + output = result.read() + result.close() + return output + + def removeIP(self, content): + """ + Remove the user's IP address while fetching HTML pages. + """ + # Remove IPv4 addresses + content = re.sub( r"\d+\.\d+\.\d+\.\d+", "0.0.0.0", content ) + # Remove IPv6 addresses + content = re.sub( r"(?i)[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}", "0:0:0:0:0:0:0:0", content ) + return content + +class Updater: + """ + The class to auto-update the user's script to the latest version of DumpGenerator. + """ + # TODO: Get the script to check only occasionally, this is a performance concern + def __init__(self): + """ + The constructor function. + """ + self.controlUrl = "http://wikiteam.googlecode.com/svn/trunk/revnum.json" + self.controlUrl2 = "https://raw.github.com/dumps/DumpGenerator/master/revnum.json" + self.result = {} + + def checkRevision(self): + """ + Check the current revision and ensure that it is up-to-date. + """ + jsonresult = self.getRevisionJson() + if ( jsonresult == False ): + pass + else: + result = json.loads( jsonresult ) + self.result = result + if ( result[ "latest" ] == DumpGenerator.Version ): + if ( result[ "releases" ][ DumpGenerator.Version ][ "revision" ] == DumpGenerator.revision ): + pass + else: + self.update() + else: + self.update() + + def getRevisionJson(self): + """ + Download the controlling JSON file. + """ + headers = {'User-Agent': DumpGenerator.UserAgent} + skip = False + # TODO: Handle 404 errors + try: + revjson = urllib2.urlopen( urllib2.Request( self.controlUrl, headers=headers ) ) + except: + try: + revjson = urllib2.urlopen( urllib2.Request( self.controlUrl2, headers=headers ) ) + except: + Output.warn( "Unable to check if a new version of dumpgenerator.py is available, continuing..." ) + skip = True + if ( skip == False ): + output = revjson.read() + revjson.close() + return output + else: + return False + + def update(self): + """ + Update DumpGenerator.py to the current latest version + """ + currentfile = sys.argv[0] + latestver = self.result[ "latest" ] + latestrev = self.result[ "releases" ][ latestver ][ "revision" ] + latesturl = self.result[ "releases" ][ latestver ][ "downloadurl" ] + latesturl2 = self.result[ "releases" ][ latestver ][ "downloadurl2" ] + updated = True + # TODO: Handle 404 errors + try: + urllib.urlretrieve( latesturl, currentfile ) + except: + try: + urllib.urlretrieve( latesturl2, currentfile ) + except: + updated = False + if ( updated == False ): + Output.warn( "Unable to update DumpGenerator, skipping update for now..." ) + else: + Output.message( "DumpGenerator was updated to %s (revision %s)! 
Changes will take effect on next run." % ( latestver, latestrev ) ) if __name__ == "__main__": - main() + # Class registry, for use throughout the whole script + RequestAPI = RequestAPI() + RequestIndex = RequestIndex() + DumpGenerator = DumpGenerator() + DumpImages = DumpImages() + DumpLogs = DumpLogs() + DumpXML = DumpXML() + Output = Output() + Updater = Updater() + + # Start everything up + DumpGenerator.run()
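+
+    # Example invocations (illustrative; the wiki URL is hypothetical):
+    #   python dumpgenerator.py --api=http://wiki.example.org/w/api.php --xml --images
+    #     creates a directory such as wiki.example.org_w-<date>-wikidump/ with the XML and image dumps
+    #   python dumpgenerator.py --resume --path=wiki.example.org_w-<date>-wikidump
+    #     resumes the incomplete dump found in that directory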