diff --git a/dumpgenerator.py b/dumpgenerator.py
index cd6a58e..ef3ba66 100644
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -1,1291 +1,1292 @@
-#!/usr/bin/env python2
# -*- coding: utf-8 -*-
-# dumpgenerator.py A generator of dumps for wikis
-# Copyright (C) 2011-2014 WikiTeam developers
+# Copyright (C) 2013 Hydriz Scholz
+# Copyright (C) 2014 WikiTeam
+#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
-#
+#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program. If not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA, or visit
+# <http://www.gnu.org/licenses/>.
+
+#######################################################################
+# dumpgenerator.py is a script to generate backups of MediaWiki wikis #
+# To learn more, read the documentation: #
+# http://code.google.com/p/wikiteam/wiki/NewTutorial #
+#######################################################################
+
+# For developers:
+# * All functions and classes are listed in alphabetical order for easier navigation.
+# * Script exit codes reference:
+# * 0 - Script ran well without problems
+# * 1 - Script failed due to incorrect usage by the user
+# * 2 - Script failed due to destination server issue
+# * For testing purposes, add the --debug parameter and edit DumpGenerator.debug() accordingly.
+
+######
+# TODO LIST
+# 0. Download index.html and Special:Version.html
+# 1. Index.php support.
+# 2. Special:Log pages support
+# 3. GUI (Question and Answer if no parameters are given)
+# 4. Resuming of dump
+# 5. Place the images in several folders to avoid hitting the limit on the number of files in a directory
+# 6. Speed up the script. A run with --xml --images on test.wikidata.org came up with 9 min 23 sec on 2.0 and 3 min 58 sec on 1.0
+
+# WHAT IS WORKING
+# 1. XML dumping
+# 2. Complete dumping using API (except for --logs)
+# 3. Automatic updating
+# 4. Dumping of XML based on a list of titles
+# 5. Integrity check for XML dump
-# To learn more, read the documentation:
-# https://github.com/WikiTeam/wikiteam/wiki
-
-import cookielib
-import cPickle
import datetime
-import sys
-try:
- import argparse
-except ImportError:
- print "Please install the argparse module."
- sys.exit(1)
+import getopt
+import hashlib
import json
-try:
- from hashlib import md5
-except ImportError: # Python 2.4 compatibility
- from md5 import new as md5
import os
import re
-try:
- import requests
-except ImportError:
- print "Please install or update the Requests module."
- sys.exit(1)
-import subprocess
+import shutil
+import sys
import time
import urllib
-
-__VERSION__ = '0.2.2' #major, minor, micro
-
-def getVersion():
- return(__VERSION__)
-
-
-def truncateFilename(other={}, filename=''):
- """ Truncate filenames when downloading images with large filenames """
- return filename[:other['filenamelimit']] + md5(filename).hexdigest() + '.' + filename.split('.')[-1]
-
-def delay(config={}, session=None):
- """ Add a delay if configured for that """
- if config['delay'] > 0:
- print 'Sleeping... %d seconds...' % (config['delay'])
- time.sleep(config['delay'])
-
-def cleanHTML(raw=''):
- """ Extract only the real wiki content and remove rubbish """
- """ This function is ONLY used to retrieve page titles and file names when no API is available """
- """ DO NOT use this function to extract page content """
- #different "tags" used by different MediaWiki versions to mark where content starts and ends
- if re.search('<!-- bodytext -->', raw):
- raw = raw.split('<!-- bodytext -->')[1].split('<!-- /bodytext -->')[0]
- elif re.search('<!-- start content -->', raw):
- raw = raw.split('<!-- start content -->')[1].split('<!-- end content -->')[0]
- elif re.search('<!-- Begin Content Area -->', raw):
- raw = raw.split('<!-- Begin Content Area -->')[1].split('<!-- End Content Area -->')[0]
- elif re.search('<!-- content -->', raw):
- raw = raw.split('<!-- content -->')[1].split('<!-- mw_content -->')[0]
- elif re.search('<article id="WikiaMainContent" class="WikiaMainContent">', raw):
- raw = raw.split('<article id="WikiaMainContent" class="WikiaMainContent">')[1].split('</article>')[0]
- else:
- print raw[:250]
- print 'This wiki doesn\'t use marks to split content'
- sys.exit()
- return raw
-
-def handleStatusCode(response):
- statuscode = response.status_code
- if statuscode >= 200 and statuscode < 300:
- return
-
- print "HTTP Error %d." % statuscode
- if statuscode >= 300 and statuscode < 400:
- print "Redirect should happen automatically: please report this as a bug."
- print response.url
-
- elif statuscode == 400:
- print "Bad Request: The wiki may be malfunctioning."
- print "Please try again later."
- print response.url
- sys.exit(1)
-
- elif statuscode == 401 or statuscode == 403:
- print "Authentication required."
- print "Please use --userpass."
- print response.url
-
- elif statuscode == 404:
- print "Not found. Is Special:Export enabled for this wiki?"
- print response.url
- sys.exit(1)
-
- elif statuscode == 429 or (statuscode >= 500 and statuscode < 600):
- print "Server error, max retries exceeded."
- print "Please resume the dump later."
- print response.url
- sys.exit(1)
-
-def getNamespacesScraper(config={}, session=None):
- """ Hackishly gets the list of namespaces names and ids from the dropdown in the HTML of Special:AllPages """
- """ Function called if no API is available """
- namespaces = config['namespaces']
- namespacenames = {0:''} # main is 0, no prefix
- if namespaces:
- r = session.post(url=config['index'], data={'title': 'Special:Allpages'})
- raw = r.text
- delay(config=config, session=session)
-
- m = re.compile(r'<option [^>]*?value="(?P<namespaceid>\d+)"[^>]*?>(?P<namespacename>[^<]+)</option>').finditer(raw) # [^>]*? to include selected="selected"
- if 'all' in namespaces:
- namespaces = []
- for i in m:
- namespaces.append(int(i.group("namespaceid")))
- namespacenames[int(i.group("namespaceid"))] = i.group("namespacename")
- else:
- #check if those namespaces really exist in this wiki
- namespaces2 = []
- for i in m:
- if int(i.group("namespaceid")) in namespaces:
- namespaces2.append(int(i.group("namespaceid")))
- namespacenames[int(i.group("namespaceid"))] = i.group("namespacename")
- namespaces = namespaces2
- else:
- namespaces = [0]
-
- namespaces = list(set(namespaces)) #uniques
- print '%d namespaces found' % (len(namespaces))
- return namespaces, namespacenames
-
-def getNamespacesAPI(config={}, session=None):
- """ Uses the API to get the list of namespaces names and ids """
- namespaces = config['namespaces']
- namespacenames = {0:''} # main is 0, no prefix
- if namespaces:
- r = session.post(url=config['api'], data={'action': 'query', 'meta': 'siteinfo', 'siprop': 'namespaces', 'format': 'json'})
- result = json.loads(r.text)
- delay(config=config, session=session)
-
- if 'all' in namespaces:
- namespaces = []
- for i in result['query']['namespaces'].keys():
- if int(i) < 0: # -1: Special, -2: Media, excluding
- continue
- namespaces.append(int(i))
- namespacenames[int(i)] = result['query']['namespaces'][i]['*']
- else:
- #check if those namespaces really exist in this wiki
- namespaces2 = []
- for i in result['query']['namespaces'].keys():
- if int(i) < 0: # -1: Special, -2: Media, excluding
- continue
- if int(i) in namespaces:
- namespaces2.append(int(i))
- namespacenames[int(i)] = result['query']['namespaces'][i]['*']
- namespaces = namespaces2
- else:
- namespaces = [0]
-
- namespaces = list(set(namespaces)) #uniques
- print '%d namespaces found' % (len(namespaces))
- return namespaces, namespacenames
-
-def getPageTitlesAPI(config={}, session=None):
- """ Uses the API to get the list of page titles """
- titles = []
- namespaces, namespacenames = getNamespacesAPI(config=config, session=session)
- for namespace in namespaces:
- if namespace in config['exnamespaces']:
- print ' Skipping namespace = %d' % (namespace)
- continue
-
- c = 0
- print ' Retrieving titles in the namespace %d' % (namespace)
- apfrom = '!'
- while apfrom:
- sys.stderr.write('.') #progress
- params = {'action': 'query', 'list': 'allpages', 'apnamespace': namespace, 'apfrom': apfrom.encode('utf-8'), 'format': 'json', 'aplimit': 500}
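- # list=allpages is paginated: the response carries a 'query-continue' block whose 'apcontinue'/'apfrom' value seeds the next request (handled below)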
- r = session.post(url=config['api'], data=params)
- handleStatusCode(r)
- #FIXME Handle HTTP errors here!
- jsontitles = json.loads(r.text)
- apfrom = ''
- if jsontitles.has_key('query-continue') and jsontitles['query-continue'].has_key('allpages'):
- if jsontitles['query-continue']['allpages'].has_key('apcontinue'):
- apfrom = jsontitles['query-continue']['allpages']['apcontinue']
- elif jsontitles['query-continue']['allpages'].has_key('apfrom'):
- apfrom = jsontitles['query-continue']['allpages']['apfrom']
- #print apfrom
- #print jsontitles
- titles += [page['title'] for page in jsontitles['query']['allpages']]
- if len(titles) != len(set(titles)):
- #probably we are in a loop, server returning dupe titles, stop it
- print 'Probably a loop, finishing'
- titles = list(set(titles))
- apfrom = ''
- c += len(jsontitles['query']['allpages'])
- delay(config=config, session=session)
- print ' %d titles retrieved in the namespace %d' % (c, namespace)
- return titles
-
-def getPageTitlesScraper(config={}, session=None):
- """ """
- titles = []
- namespaces, namespacenames = getNamespacesScraper(config=config, session=session)
- for namespace in namespaces:
- print ' Retrieving titles in the namespace', namespace
- url = '%s?title=Special:Allpages&namespace=%s' % (config['index'], namespace)
- r = session.get(url=url)
- raw = r.text
- raw = cleanHTML(raw)
-
- r_title = r'title="(?P<title>[^>]+)">'
- r_suballpages = ''
- r_suballpages1 = r'&amp;from=(?P<from>[^>]+)&amp;to=(?P<to>[^>]+)">'
- r_suballpages2 = r'Special:Allpages/(?P<from>[^>]+)">'
- if re.search(r_suballpages1, raw):
- r_suballpages = r_suballpages1
- elif re.search(r_suballpages2, raw):
- r_suballpages = r_suballpages2
- else:
- pass #perhaps no subpages
-
- deep = 3 # 3 is the current depth of English Wikipedia's Special:Allpages (3 levels)
- c = 0
- checked_suballpages = []
- rawacum = raw
- while r_suballpages and re.search(r_suballpages, raw) and c < deep:
- #load sub-Allpages
- m = re.compile(r_suballpages).finditer(raw)
- for i in m:
- fr = i.group('from')
-
- if r_suballpages == r_suballpages1:
- to = i.group('to')
- name = '%s-%s' % (fr, to)
- url = '%s?title=Special:Allpages&namespace=%s&from=%s&to=%s' % (config['index'], namespace, fr, to) #do not put urllib.quote in fr or to
- elif r_suballpages == r_suballpages2: #FIXME: does this regexp miss some subpages, or does r_title fail on this kind of subpage? (wikiindex)
- fr = fr.split('&namespace=')[0] #clean &namespace=\d, sometimes happens
- name = fr
- url = '%s?title=Special:Allpages/%s&namespace=%s' % (config['index'], name, namespace)
-
- if not name in checked_suballpages:
- checked_suballpages.append(name) #to avoid reload dupe subpages links
- delay(config=config, session=session)
- r2 = session.get(url=url)
- raw2 = r2.text
- raw2 = cleanHTML(raw2)
- rawacum += raw2 #merge it after removed junk
- print ' Reading', name, len(raw2), 'bytes', len(re.findall(r_suballpages, raw2)), 'subpages', len(re.findall(r_title, raw2)), 'pages'
-
- delay(config=config, session=session)
- c += 1
-
- c = 0
- m = re.compile(r_title).finditer(rawacum)
- for i in m:
- t = undoHTMLEntities(text=i.group('title'))
- if not t.startswith('Special:'):
- if not t in titles:
- titles.append(t)
- c += 1
- print ' %d titles retrieved in the namespace %d' % (c, namespace)
- return titles
-
-def getPageTitles(config={}, session=None):
- """ Get list of page titles """
- #http://en.wikipedia.org/wiki/Special:AllPages
- #http://archiveteam.org/index.php?title=Special:AllPages
- #http://www.wikanda.es/wiki/Especial:Todas
- print 'Loading page titles from namespaces = %s' % (config['namespaces'] and ','.join([str(i) for i in config['namespaces']]) or 'None')
- print 'Excluding titles from namespaces = %s' % (config['exnamespaces'] and ','.join([str(i) for i in config['exnamespaces']]) or 'None')
-
- titles = []
- if config['api']:
- titles = getPageTitlesAPI(config=config, session=session)
- elif config['index']:
- titles = getPageTitlesScraper(config=config, session=session)
-
- titles = list(set(titles)) #removing dupes (e.g. in CZ appears Widget:AddThis two times (main namespace and widget namespace))
- titles.sort() #sorting
-
- print '%d page titles loaded' % (len(titles))
- return titles
-
-def getXMLHeader(config={}, session=None):
- """ Retrieve a random page to extract XML headers (namespace info, etc) """
- #get the header of a random page, to attach it in the complete XML backup
- #similar to: <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/" version="0.3" xml:lang="en">
- randomtitle = 'Main_Page' #previously AMF5LKE43MNFGHKSDMRTJ
- xml = getXMLPage(config=config, title=randomtitle, verbose=False, session=session)
- header = xml.split('</mediawiki>')[0]
- if not xml:
- print 'XML export on this wiki is broken, quitting.'
- sys.exit()
- return header
-
-def getXMLFileDesc(config={}, title='', session=None):
- """ Get XML for image description page """
- config['curonly'] = 1 #tricky to get only the most recent desc
- return getXMLPage(config=config, title=title, verbose=False, session=session)
-
-def getUserAgent():
- """ Return a cool user-agent to hide Python user-agent """
- useragents = [
- #firefox
- 'Mozilla/5.0 (X11; Linux i686; rv:24.0) Gecko/20100101 Firefox/24.0',
- 'Mozilla/5.0 (X11; Linux x86_64; rv:28.0) Gecko/20100101 Firefox/28.0',
- ]
- return useragents[0]
-
-def logerror(config={}, text=''):
- """ Log error in file """
- if text:
- with open('%s/errors.log' % (config['path']), 'a') as outfile:
- output = u'%s: %s\n' % (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), text)
- outfile.write(output.encode('utf-8'))
-
-def getXMLPageCore(headers={}, params={}, config={}, session=None):
- """ """
- #returns a XML containing params['limit'] revisions (or current only), ending in
- #if retrieving params['limit'] revisions fails, returns a current only version
- #if all fail, returns the empty string
- xml = ''
- c = 0
- maxseconds = 100 #max seconds to wait in a single sleeping
- maxretries = 5 # x retries and skip
- increment = 20 #increment every retry
- while not re.search(r'</mediawiki>', xml):
- if c > 0 and c < maxretries:
- wait = increment * c < maxseconds and increment * c or maxseconds # incremental until maxseconds
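- # Python 2 and/or idiom: the wait grows by 'increment' seconds per retry (c=1 -> 20s, c=2 -> 40s) and is capped at maxseconds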
- print ' XML for "%s" is wrong. Waiting %d seconds and reloading...' % (params['pages'], wait)
- time.sleep(wait)
- if params['limit'] > 1: # reducing server load requesting smallest chunks (if curonly then limit = 1 from mother function)
- params['limit'] = params['limit'] / 2 # half
- if c >= maxretries:
- print ' We have retried %d times' % (c)
- print ' MediaWiki error for "%s", network error or whatever...' % (params['pages'])
- # If it's not already what we tried: our last chance, preserve only the last revision...
- # config['curonly'] means that the whole dump is configured to save only the last revision
- # params['curonly'] should mean that we've already tried this fallback, because it's set by the following if and passed to getXMLPageCore
- if not config['curonly']:
- print ' Trying to save only the last revision for this page...'
- params['curonly'] = 1
- logerror(config=config, text='Error while retrieving the full history of "%s". Trying to save only the last revision for this page' % (params['pages']))
- return getXMLPageCore(headers=headers, params=params, config=config)
- else:
- print ' Saving in the errors log, and skipping...'
- logerror(config=config, text='Error while retrieving the last revision of "%s". Skipping.' % (params['pages']))
- return '' # empty xml
- #FIXME HANDLE HTTP Errors HERE
- r = session.post(url=config['index'], data=params, headers=headers)
- handleStatusCode(r)
- xml = r.text
- c += 1
-
- return xml
-
-def getXMLPage(config={}, title='', verbose=True, session=None):
- """ Get the full history (or current only) of a page """
-
- #if server errors occur while retrieving the full page history, it may return [oldest OK versions] + last version, excluding middle revisions, so it would be partially truncated
- #http://www.mediawiki.org/wiki/Manual_talk:Parameters_to_Special:Export#Parameters_no_longer_in_use.3F
-
- limit = 1000
- truncated = False
- title_ = title
- title_ = re.sub(' ', '_', title_)
- #do not convert & into %26, title_ = re.sub('&', '%26', title_)
- params = {'title': 'Special:Export', 'pages': title_, 'action': 'submit'}
- if config['curonly']:
- params['curonly'] = 1
- params['limit'] = 1
- else:
- params['offset'] = '1' # 1 always < 2000s
- params['limit'] = limit
- if config.has_key('templates') and config['templates']: #in other case, do not set params['templates']
- params['templates'] = 1
-
- xml = getXMLPageCore(params=params, config=config, session=session)
-
- #if complete history, check if this page history has > limit edits, if so, retrieve all using offset if available
- #else, warning about Special:Export truncating large page histories
- r_timestamp = r'<timestamp>([^<]+)</timestamp>'
- if not config['curonly'] and re.search(r_timestamp, xml): # search for timestamps in xml to avoid analysing empty pages like Special:Allpages and the random one
- while not truncated and params['offset']: #next chunk
- params['offset'] = re.findall(r_timestamp, xml)[-1] #get the last timestamp from the acum XML
- xml2 = getXMLPageCore(params=params, config=config, session=session)
-
- if re.findall(r_timestamp, xml2): #are there more edits in this next XML chunk or no ?
- if re.findall(r_timestamp, xml2)[-1] == params['offset']:
- #again the same XML, this wiki does not support params in Special:Export, offer complete XML up to X edits (usually 1000)
- print 'ATTENTION: This wiki does not allow some parameters in Special:Export, therefore pages with large histories may be truncated'
- truncated = True
- break
- else:
- """
-
-
- Main Page
- 15580374
- edit=sysop:move=sysop (?)
-
- 418009832
- 2011-03-09T19:57:06Z
-
- """
- #offset is OK in this wiki, merge with the previous chunk of this page history and continue
- xml = xml.split('</page>')[0] + '    <page>\n' + ('<page>\n'.join(xml2.split('<page>\n')[1:]))
- else:
- params['offset'] = '' #no more edits in this page history
-
- if verbose:
- numberofedits = len(re.findall(r_timestamp, xml))
- if (numberofedits == 1):
- print ' %s, 1 edit' % (title)
- else:
- print ' %s, %d edits' % (title, numberofedits)
-
- return xml
-
-def cleanXML(xml=''):
- """ Trim redundant info """
- #do not touch XML codification, leave AS IS
- if re.search(r'</siteinfo>\n', xml) and re.search(r'</mediawiki>', xml):
- xml = xml.split('</siteinfo>\n')[1]
- xml = xml.split('</mediawiki>')[0]
- return xml
-
-def generateXMLDump(config={}, titles=[], start='', session=None):
- """ Generates a XML dump for a list of titles """
-
- print 'Retrieving the XML for every page from "%s"' % (start and start or 'start')
- header = getXMLHeader(config=config, session=session)
- footer = '</mediawiki>\n' #new line at the end
- xmlfilename = '%s-%s-%s.xml' % (domain2prefix(config=config), config['date'], config['curonly'] and 'current' or 'history')
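- # e.g. (hypothetical) wikiexampleorg-20140101-history.xml; 'current' replaces 'history' when --curonly is set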
- xmlfile = ''
- lock = True
- if start:
- #remove the last chunk of xml dump (it is probably incomplete)
- xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'r')
- xmlfile2 = open('%s/%s2' % (config['path'], xmlfilename), 'w')
- prev = ''
- c = 0
- for l in xmlfile:
- #removing \n until end of file
- if c != 0: #lock to avoid writing an empty line at the beginning of the file
- if not re.search(r'<title>%s</title>' % (start), l):
- xmlfile2.write(prev)
- else:
- break
- c += 1
- prev = l
- xmlfile.close()
- xmlfile2.close()
- #subst xml with xml2
- os.remove('%s/%s' % (config['path'], xmlfilename)) #remove previous xml dump
- os.rename('%s/%s2' % (config['path'], xmlfilename), '%s/%s' % (config['path'], xmlfilename)) #move correctly truncated dump to its real name
- else:
- #requested complete xml dump
- lock = False
- xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
- xmlfile.write(header.encode('utf-8'))
- xmlfile.close()
-
- xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
- c = 1
- for title in titles:
- if not title.strip():
- continue
- if title == start: #start downloading from start, included
- lock = False
- if lock:
- continue
- delay(config=config, session=session)
- if c % 10 == 0:
- print 'Downloaded %d pages' % (c)
- xml = getXMLPage(config=config, title=title, session=session)
- xml = cleanXML(xml=xml)
- if not xml:
- logerror(config=config, text=u'The page "%s" was missing in the wiki (probably deleted)' % (title))
- #here, XML is a correct chunk or
- #an empty string due to a deleted page (logged in errors log) or
- #an empty string due to an error while retrieving the page from server (logged in errors log)
- xmlfile.write(xml.encode('utf-8'))
- c += 1
- xmlfile.write(footer)
- xmlfile.close()
- print 'XML dump saved at...', xmlfilename
-
-def saveTitles(config={}, titles=[]):
- """ Save title list in a file """
-
- titlesfilename = '%s-%s-titles.txt' % (domain2prefix(config=config), config['date'])
- titlesfile = open('%s/%s' % (config['path'], titlesfilename), 'w')
- output = u"%s\n--END--" % ('\n'.join(titles))
- titlesfile.write(output.encode('utf-8'))
- titlesfile.close()
-
- print 'Titles saved at...', titlesfilename
-
-def saveImageFilenamesURL(config={}, images=[], session=None):
- """ Save image list in a file, including filename, url and uploader """
-
- imagesfilename = '%s-%s-images.txt' % (domain2prefix(config=config), config['date'])
- imagesfile = open('%s/%s' % (config['path'], imagesfilename), 'w')
- imagesfile.write(('\n'.join(['%s\t%s\t%s' % (filename, url, uploader) for filename, url, uploader in images]).encode('utf-8')))
- imagesfile.write('\n--END--')
- imagesfile.close()
-
- print 'Image filenames and URLs saved at...', imagesfilename
-
-def getImageFilenamesURL(config={}, session=None):
- """ Retrieve file list: filename, url, uploader """
-
- print 'Retrieving image filenames'
- r_next = r'(?<!&amp;dir=prev)&amp;offset=(?P<offset>\d+)&amp;' # (?<! is a negative lookbehind, see http://docs.python.org/library/re.html
- images = []
- offset = '29990101000000' #january 1, 2999
- limit = 5000
- retries = 5
- while offset:
- #5000 overloads some servers, but it is needed for sites with no "next" links such as http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
- r = session.post(url=config['index'], data={'title': 'Special:Imagelist', 'limit': limit, 'offset': offset})
- raw = r.text
- delay(config=config, session=session)
- if re.search(ur'(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)', raw): # delicate wiki
- if limit > 10:
- print 'Error: listing %d images in a chunk is not possible, trying tiny chunks' % (limit)
- limit = limit/10
- continue
- elif retries > 0: # waste retries, then exit
- retries -= 1
- print 'Retrying...'
- continue
- else:
- print 'No more retries, exit...'
- break
-
- raw = cleanHTML(raw)
- #archiveteam 1.15.1 <td class="TablePager_col_img_name"><a href="/index.php?title=File:Yahoovideo.jpg" title="File:Yahoovideo.jpg">Yahoovideo.jpg</a> (<a href="/images/2/2b/Yahoovideo.jpg">file</a>)</td>
- r_images1 = r'(?im)<td class="TablePager_col_img_name"><a href[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+)">[^<]+</a>[^<]+</td>\s*<td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
- #wikijuegos 1.9.5 http://softwarelibre.uca.es/wikijuegos/Especial:Imagelist old mediawiki version
- r_images2 = r'(?im)<td class="TablePager_col_links"><a href="[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+)">[^<]+</a></td>\s*<td class="TablePager_col_img_timestamp">[^<]+</td>\s*<td class="TablePager_col_img_name">[^<]+</td>\s*<td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
- #gentoowiki 1.18
- r_images3 = r'(?im)<td class="TablePager_col_img_name"><a[^>]+title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+)">[^<]+</a>[^<]+</td><td class="TablePager_col_thumb"><a[^>]+><img[^>]+></a></td><td class="TablePager_col_img_size">[^<]+</td><td class="TablePager_col_img_user_text"><a[^>]+>(?P<uploader>[^<]+)</a></td>'
- #http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
- #(desc) 109 0923.JPG . . 885,713 bytes . . Bfalconer . . 18:44, 17 November 2005
- r_images4 = r'(?im)<a href=[^>]+ title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+)">[^<]+</a>[^<]+<a[^>]+>(?P<uploader>[^<]+)</a>'
- m = []
- #different mediawiki versions
- if re.search(r_images1, raw):
- m = re.compile(r_images1).finditer(raw)
- elif re.search(r_images2, raw):
- m = re.compile(r_images2).finditer(raw)
- elif re.search(r_images3, raw):
- m = re.compile(r_images3).finditer(raw)
- elif re.search(r_images4, raw):
- m = re.compile(r_images4).finditer(raw)
-
- for i in m:
- url = i.group('url')
- if url[0] == '/' or (not url.startswith('http://') and not url.startswith('https://')): #is it a relative URL?
- if url[0] == '/': #slash is added later
- url = url[1:]
- domainalone = config['index'].split('://')[1].split('/')[0] #remove from :// (http or https) until the first / after domain
- url = u'%s://%s/%s' % (config['index'].split('://')[0], domainalone, url) # concat http(s) + domain + relative url
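- # e.g. (hypothetical) a relative '/images/a/ab/Foo.png' on http://wiki.example.org becomes 'http://wiki.example.org/images/a/ab/Foo.png'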
- url = undoHTMLEntities(text=url)
- #url = urllib.unquote(url) #do not use unquote with url, it breaks some URLs with odd chars
- url = re.sub(' ', '_', url)
- filename = re.sub('_', ' ', i.group('filename'))
- filename = undoHTMLEntities(text=filename)
- filename = urllib.unquote(filename)
- uploader = re.sub('_', ' ', i.group('uploader'))
- uploader = undoHTMLEntities(text=uploader)
- uploader = urllib.unquote(uploader)
- images.append([filename, url, uploader])
- #print filename, url
-
- if re.search(r_next, raw):
- offset = re.findall(r_next, raw)[0]
- retries += 5 # add more retries if we got a page with offset
- else:
- offset = ''
-
- if (len(images) == 1):
- print ' Found 1 image'
- else:
- print ' Found %d images' % (len(images))
-
- images.sort()
- return images
-
-def getImageFilenamesURLAPI(config={}, session=None):
- """ Retrieve file list: filename, url, uploader """
-
- print 'Retrieving image filenames'
- aifrom = '!'
- images = []
- while aifrom:
- sys.stderr.write('.') #progress
- params = {'action': 'query', 'list': 'allimages', 'aiprop': 'url|user', 'aifrom': aifrom, 'format': 'json', 'ailimit': 500}
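- # list=allimages paginates like allpages above: 'query-continue' returns an 'aicontinue'/'aifrom' value for the next batch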
- #FIXME Handle HTTP Errors HERE
- r = session.post(url=config['api'], data=params)
- handleStatusCode(r)
- jsonimages = json.loads(r.text)
- delay(config=config, session=session)
- aifrom = ''
- if jsonimages.has_key('query-continue') and jsonimages['query-continue'].has_key('allimages'):
- if jsonimages['query-continue']['allimages'].has_key('aicontinue'):
- aifrom = jsonimages['query-continue']['allimages']['aicontinue']
- elif jsonimages['query-continue']['allimages'].has_key('aifrom'):
- aifrom = jsonimages['query-continue']['allimages']['aifrom']
- #print aifrom
-
- for image in jsonimages['query']['allimages']:
- url = image['url']
- if url[0] == '/' or (not url.startswith('http://') and not url.startswith('https://')): #is it a relative URL?
- if url[0] == '/': #slash is added later
- url = url[1:]
- domainalone = config['index'].split('://')[1].split('/')[0] #remove from :// (http or https) until the first / after domain
- url = u'%s://%s/%s' % (config['index'].split('://')[0], domainalone, url) # concat http(s) + domain + relative url
- url = re.sub(' ', '_', url)
- # encoding to ascii is needed to work around this horrible bug: http://bugs.python.org/issue8136
- filename = unicode(urllib.unquote((re.sub('_', ' ', url.split('/')[-1])).encode('ascii','ignore')), 'utf-8')
- uploader = re.sub('_', ' ', image['user'])
- images.append([filename, url, uploader])
-
- if (len(images) == 1):
- print ' Found 1 image'
- else:
- print ' Found %d images' % (len(images))
-
- images.sort()
- return images
-
-def undoHTMLEntities(text=''):
- """ Undo some HTML codes """
-
- text = re.sub('&lt;', '<', text) # i guess only < > & " ' need conversion http://www.w3schools.com/html/html_entities.asp
- text = re.sub('&gt;', '>', text)
- text = re.sub('&amp;', '&', text)
- text = re.sub('&quot;', '"', text)
- text = re.sub('&#039;', '\'', text)
-
- return text
-
-def generateImageDump(config={}, other={}, images=[], start='', session=None):
- """ Save files and descriptions using a file list """
-
- #fix use subdirectories md5
- print 'Retrieving images from "%s"' % (start and start or 'start')
- imagepath = '%s/images' % (config['path'])
- if not os.path.isdir(imagepath):
- print 'Creating "%s" directory' % (imagepath)
- os.makedirs(imagepath)
-
- c = 0
- lock = True
- if not start:
- lock = False
- for filename, url, uploader in images:
- if filename == start: #start downloading from start (included)
- lock = False
- if lock:
- continue
- delay(config=config, session=session)
-
- #saving file
- #truncate filename if length > 100 (100 + 32 (md5) = 132 < 143 (crash limit). Later .desc is added to filename, so better 100 as max)
- filename2 = urllib.unquote(filename)
- if len(filename2) > other['filenamelimit']:
- # split last . (extension) and then merge
- filename2 = truncateFilename(other=other, filename=filename2)
- print 'Filename is too long, truncating. Now it is:', filename2
- filename3 = u'%s/%s' % (imagepath, filename2)
- imagefile = open(filename3, 'wb')
- r = requests.get(url=url)
- imagefile.write(r.content)
- imagefile.close()
- #saving description if any
- xmlfiledesc = getXMLFileDesc(config=config, title=u'Image:%s' % (filename), session=session) # use Image: for backwards compatibility
- f = open('%s/%s.desc' % (imagepath, filename2), 'w')
- if not re.search(r'</mediawiki>', xmlfiledesc): #<text xml:space="preserve">Banner featuring SG1, SGA, SGU teams</text>
- #failure when retrieving desc? then save it as empty .desc
- xmlfiledesc = ''
- f.write(xmlfiledesc.encode('utf-8'))
- f.close()
- delay(config=config, session=session)
- c += 1
- if c % 10 == 0:
- print ' Downloaded %d images' % (c)
-
- print 'Downloaded %d images' % (c)
-
-def saveLogs(config={}, session=None):
- """ Save Special:Log """
- #get all logs from Special:Log
- """parse
-
- """
- delay(config=config, session=session)
-
-def domain2prefix(config={}, session=None):
- """ Convert domain name to a valid prefix filename. """
-
- # At this point, both api and index are supposed to be defined
- domain = ''
- if config['api']:
- domain = config['api']
- elif config['index']:
- domain = config['index']
-
- domain = domain.lower()
- domain = re.sub(r'(https?://|www\.|/index\.php|/api\.php)', '', domain)
- domain = re.sub(r'/', '_', domain)
- domain = re.sub(r'\.', '', domain)
- domain = re.sub(r'[^A-Za-z0-9]', '_', domain)
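- # e.g. 'http://archiveteam.org/index.php' -> 'archiveteamorg'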
-
- return domain
-
-def loadConfig(config={}, configfilename=''):
- """ Load config file """
-
- try:
- with open('%s/%s' % (config['path'], configfilename), 'r') as infile:
- config = cPickle.load(infile)
- except:
- print 'There is no config file. We can\'t resume. Start a new dump.'
- sys.exit()
-
- return config
-
-def saveConfig(config={}, configfilename=''):
- """ Save config file """
-
- with open('%s/%s' % (config['path'], configfilename), 'w') as outfile:
- cPickle.dump(config, outfile)
-
-def welcome():
- message = ''
- """ Opening message """
- message += "#"*73
- message += """
-# Welcome to DumpGenerator %s by WikiTeam (GPL v3) #
-# More info at: https://github.com/WikiTeam/wikiteam #""" % (getVersion())
- message += "\n"
- message += "#"*73
- message += "\n"
- message += ''
- message += "\n"
- message += "#"*73
- message += """
-# Copyright (C) 2011-2014 WikiTeam #
-# This program is free software: you can redistribute it and/or modify #
-# it under the terms of the GNU General Public License as published by #
-# the Free Software Foundation, either version 3 of the License, or #
-# (at your option) any later version. #
-# #
-# This program is distributed in the hope that it will be useful, #
-# but WITHOUT ANY WARRANTY; without even the implied warranty of #
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
-# GNU General Public License for more details. #
-# #
-# You should have received a copy of the GNU General Public License #
-# along with this program. If not, see . #"""
- message += "\n"
- message += "#"*73
- message += "\n"
- message += ''
-
- return message
-
-def bye():
- """ Closing message """
- print "---> Congratulations! Your dump is complete <---"
- print "If you found any bug, report a new issue here: https://github.com/WikiTeam/wikiteam/issues"
- print "If this is a public wiki, please, consider publishing this dump. Do it yourself as explained in https://github.com/WikiTeam/wikiteam/wiki/New-Tutorial#Publishing_the_dump or contact us at https://github.com/WikiTeam/wikiteam"
- print "Good luck! Bye!"
-
-
-def getParameters(params=[]):
- if not params:
- params = sys.argv
-
- parser = argparse.ArgumentParser(description='')
-
- parser.add_argument('-v', '--version', action='version', version=getVersion())
- parser.add_argument('--cookies', metavar="cookies.txt", help="path to a cookies.txt file")
- parser.add_argument('--delay', metavar=5, default=0, help="adds a delay (in seconds)")
- parser.add_argument('--retries', metavar=5, default=5, help="Maximum number of retries for ")
- parser.add_argument('--get-wiki-engine', action='store_true', help="returns the wiki engine")
-
- groupWikiOrAPIOrIndex = parser.add_mutually_exclusive_group(required=True)
- groupWikiOrAPIOrIndex.add_argument('wiki', default='', nargs='?', help="URL to wiki")
- groupWikiOrAPIOrIndex.add_argument('--api', help="URL to api.php")
- groupWikiOrAPIOrIndex.add_argument('--index', help="URL to index.php")
-
- groupXMLOrImages = parser.add_argument_group()
- groupXMLOrImages.add_argument('--xml', action='store_true', help="generates a full history XML dump (--xml --curonly for current revisions only)")
- parser.add_argument('--curonly', action='store_true', help='store only the current version of pages')
-
- groupXMLOrImages.add_argument('--images', action='store_true', help="generates an image dump")
-
- parser.add_argument('--path', help='path to store wiki dump at')
- parser.add_argument('--resume', action='store_true', help='resumes previous incomplete dump (requires --path)')
- parser.add_argument('--force', action='store_true', help='')
- parser.add_argument('--namespaces', metavar="1,2,3", help='comma-separated value of namespaces to include (all by default)')
- parser.add_argument('--exnamespaces', metavar="1,2,3", help='comma-separated value of namespaces to exclude')
-
- parser.add_argument('--user', help='Username if authentication is required.')
- parser.add_argument('--pass', dest='password', help='Password if authentication is required.')
-
- args = parser.parse_args()
- #print args
-
- # Execute excluding args
- if args.get_wiki_engine and args.wiki and (args.wiki.startswith('http://') or args.wiki.startswith('https://')):
- print getWikiEngine(url=args.wiki)
- sys.exit()
- # End execute excluding args
-
- # check API URL
- if args.api and (not args.api.startswith('http://') and not args.api.startswith('https://')):
- print args.api
- print 'ERROR: URL to api.php must start with http:// or https://\n'
- parser.print_usage()
- sys.exit(1)
-
- # check index URL
- if args.index and (not args.index.startswith('http://') and not args.index.startswith('https://')):
- print 'ERROR: URL to index.php must start with http:// or https://\n'
- parser.print_usage()
- sys.exit(1)
-
- # check user and pass (one requires both)
- if (args.user and not args.password) or (args.password and not args.user):
- print 'Both --user and --pass are required for authentication.'
- parser.print_usage()
- sys.exit(1)
-
- namespaces = ['all']
- exnamespaces = []
- # Process namespace inclusions
- if args.namespaces:
- if re.search(r'[^\d, \-]', args.namespaces) and args.namespaces.lower() != 'all': #fix, why - ? and... --namespaces= all with a space works?
- print "Invalid namespace values.\nValid format is integer(s) separated by commas"
- sys.exit()
- else:
- ns = re.sub(' ', '', args.namespaces)
- if ns.lower() == 'all':
- namespaces = ['all']
- else:
- namespaces = [int(i) for i in ns.split(',')]
-
- # Process namespace exclusions
- if args.exnamespaces:
- if re.search(r'[^\d, \-]', args.exnamespaces):
- print "Invalid namespace values.\nValid format is integer(s) separated by commas"
- sys.exit(1)
- else:
- ns = re.sub(' ', '', args.exnamespaces)
- if ns.lower() == 'all':
- print 'You cannot exclude all namespaces.'
- sys.exit(1)
- else:
- exnamespaces = [int(i) for i in ns.split(',')]
-
- # --curonly requires --xml
- if args.curonly and not args.xml:
- print "--curonly requires --xml\n"
- parser.print_usage()
- sys.exit(1)
-
- #user chose --api, but --index is necessary for Special:Export, so we generate it
- if args.api and not args.index:
- index = args.api.split('api.php')[0] + 'index.php'
- # WARNING: remove index.php here for misconfigured sites like editthis.info, or provide --index directly
- print 'You didn\'t provide a path for index.php, using ', index
- else:
- index = args.index
-
- cj = cookielib.MozillaCookieJar()
- if args.cookies:
- cj.load(args.cookies)
- print 'Using cookies from %s' % args.cookies
-
- session = requests.Session()
- session.cookies = cj
- session.headers = {'User-Agent': getUserAgent()}
- if args.user and args.password:
- session.auth = (args.user, args.password)
- #session.mount(args.api.split('/api.php')[0], HTTPAdapter(max_retries=max_ret))
-
- config = {
- 'curonly': args.curonly,
- 'date': datetime.datetime.now().strftime('%Y%m%d'),
- 'api': args.api or '',
- 'index': index,
- 'images': args.images,
- 'logs': False,
- 'xml': args.xml,
- 'namespaces': namespaces,
- 'exnamespaces': exnamespaces,
- 'path': args.path or '',
- 'cookies': args.cookies or '',
- 'delay': args.delay
- }
- other = {
- 'resume': args.resume,
- 'filenamelimit': 100, #do not change
- 'force': args.force,
- 'session': session
- }
-
- if config['api']:
- #check api.php
- if checkAPI(config['api'], config, session=other['session']):
- print 'api.php is OK'
- else:
- print 'Error in api.php, please, provide a correct path to api.php'
- sys.exit()
-
- if config['index']:
- #check index.php
- if checkIndexphp(config['index'], config, session=other['session']):
- print 'index.php is OK'
- else:
- print 'Error in index.php, please, provide a correct path to index.php'
- sys.exit()
-
- #calculating path, if not defined by user with --path=
- if not config['path']:
- config['path'] = './%s-%s-wikidump' % (domain2prefix(config=config, session=session), config['date'])
-
- return config, other
-
-def checkAPI(api, config={}, session=None):
- """ Checking API availability """
- global cj
- r = session.post(url=api, data={'action': 'query', 'meta': 'siteinfo', 'format': 'json'})
- resultText = r.text
- print 'Checking api.php...', api
- if "MediaWiki API is not enabled for this site." in resultText:
- return False
- result = json.loads(resultText)
- delay(config=config, session=session)
- if result.has_key('query'):
- return True
- return False
-
-def checkIndexphp(indexphp, config={}, session=None):
- """ Checking index.php availability """
- r = session.post(url=indexphp, data={'title': 'Special:Version'})
- raw = r.text
- delay(config=config, session=session)
- print 'Checking index.php...', indexphp
- if re.search(r'(Special:Badtitle|class="permissions-errors"|"wgCanonicalSpecialPageName":"Badtitle"|Login Required)', raw) and not config['cookies']: # Workaround for issue 71
- print "ERROR: This wiki requires login and we are not authenticated"
- return False
- if re.search(r'(This wiki is powered by|<h2 id="mw-version-license">|meta name="generator" content="MediaWiki)', raw):
- return True
- return False
-
-def removeIP(raw=''):
- """ Remove IP from HTML comments """
-
- raw = re.sub(r'\d+\.\d+\.\d+\.\d+', '0.0.0.0', raw)
- #http://www.juniper.net/techpubs/software/erx/erx50x/swconfig-routing-vol1/html/ipv6-config5.html
- #weird cases as :: are not included
- raw = re.sub(r'(?i)[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}', '0:0:0:0:0:0:0:0', raw)
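- # e.g. '2001:0db8:85a3:0000:0000:8a2e:0370:7334' -> '0:0:0:0:0:0:0:0'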
-
- return raw
-
-def checkXMLIntegrity(config={}, session=None):
- """ Check XML dump integrity, to detect broken XML chunks """
- return # FIXME: integrity check is disabled for now; the code below is never reached
-
- print 'Verifying dump...'
- checktitles = 0
- checkpageopen = 0
- checkpageclose = 0
- checkrevisionopen = 0
- checkrevisionclose = 0
- for line in file('%s/%s-%s-%s.xml' % (config['path'], domain2prefix(config=config, session=session), config['date'], config['curonly'] and 'current' or 'history'), 'r').read().splitlines():
- if "" in line:
- checkrevisionopen += 1
- elif "" in line:
- checkrevisionclose += 1
- elif "" in line:
- checkpageopen += 1
- elif "" in line:
- checkpageclose += 1
- elif "" in line:
- checktitles += 1
- else:
- continue
- if (checktitles == checkpageopen and checktitles == checkpageclose and checkrevisionopen == checkrevisionclose):
- pass
- else:
- print 'XML dump seems to be corrupted.'
- reply = ''
- while reply.lower() not in ['yes', 'y', 'no', 'n']:
- reply = raw_input('Regenerate a new dump ([yes, y], [no, n])? ')
- if reply.lower() in ['yes', 'y']:
- generateXMLDump(config=config, titles=titles)
- elif reply.lower() in ['no', 'n']:
- print 'Not generating a new dump.'
-
-
-def createNewDump(config={}, other={}):
- titles = []
- images = []
- print 'Trying to generate a new dump into a new directory...'
- if config['xml']:
- titles += getPageTitles(config=config, session=other['session'])
- saveTitles(config=config, titles=titles)
- generateXMLDump(config=config, titles=titles, session=other['session'])
- checkXMLIntegrity(config=config)
- if config['images']:
- if config['api']:
- images += getImageFilenamesURLAPI(config=config, session=other['session'])
- else:
- images += getImageFilenamesURL(config=config, session=other['session'])
- saveImageFilenamesURL(config=config, images=images, session=other['session'])
- generateImageDump(config=config, other=other, images=images, session=other['session'])
- if config['logs']:
- saveLogs(config=config, session=other['session'])
-
-def resumePreviousDump(config={}, other={}):
- titles = []
- images = []
- print 'Resuming previous dump process...'
- if config['xml']:
- #load titles
- lasttitle = ''
- try:
- f = open('%s/%s-%s-titles.txt' % (config['path'], domain2prefix(config=config, session=other['session']), config['date']), 'r')
- raw = unicode(f.read(), 'utf-8')
- titles = raw.split('\n')
- lasttitle = titles[-1]
- if not lasttitle: #empty line at EOF ?
- lasttitle = titles[-2]
- f.close()
- except:
- pass #probably the file does not exist
- if lasttitle == '--END--':
- #titles list is complete
- print 'Title list was completed in the previous session'
- else:
- print 'Title list is incomplete. Reloading...'
- #do not resume; reload to avoid inconsistencies such as deleted pages
- titles = getPageTitles(config=config, session=other['session'])
- saveTitles(config=config, titles=titles)
- #checking xml dump
- xmliscomplete = False
- lastxmltitle = ''
- try:
- f = open('%s/%s-%s-%s.xml' % (config['path'], domain2prefix(config=config, session=other['session']), config['date'], config['curonly'] and 'current' or 'history'), 'r')
- for l in f:
- if re.findall('</mediawiki>', l):
- #xml dump is complete
- xmliscomplete = True
- break
- xmltitles = re.findall(r'<title>([^<]+)</title>', l) #weird if found more than 1, but maybe
- if xmltitles:
- lastxmltitle = undoHTMLEntities(text=xmltitles[-1])
- f.close()
- except:
- pass #probably the file does not exist
- #removing --END-- before getXMLs
- while titles and titles[-1] in ['', '--END--']:
- titles = titles[:-1]
- if xmliscomplete:
- print 'XML dump was completed in the previous session'
- elif lastxmltitle:
- #resuming...
- print 'Resuming XML dump from "%s"' % (lastxmltitle)
- generateXMLDump(config=config, titles=titles, start=lastxmltitle, session=other['session'])
- else:
- #corrupt? only has XML header?
- print 'XML is corrupt? Regenerating...'
- generateXMLDump(config=config, titles=titles, session=other['session'])
-
- if config['images']:
- #load images
- lastimage = ''
- try:
- f = open('%s/%s-%s-images.txt' % (config['path'], domain2prefix(config=config), config['date']), 'r')
- raw = unicode(f.read(), 'utf-8').strip()
- lines = raw.split('\n')
- for l in lines:
- if re.search(r'\t', l):
- images.append(l.split('\t'))
- lastimage = lines[-1]
- f.close()
- except:
- pass #probably the file does not exist
- if lastimage == u'--END--':
- print 'Image list was completed in the previous session'
- else:
- print 'Image list is incomplete. Reloading...'
- #do not resume; reload to avoid inconsistencies such as deleted images
- if config['api']:
- images=getImageFilenamesURLAPI(config=config, session=other['session'])
- else:
- images = getImageFilenamesURL(config=config, session=other['session'])
- saveImageFilenamesURL(config=config, images=images)
- #checking images directory
- listdir = []
- try:
- listdir = os.listdir('%s/images' % (config['path']))
- except:
- pass #probably directory does not exist
- listdir.sort()
- complete = True
- lastfilename = ''
- lastfilename2 = ''
- c = 0
- for filename, url, uploader in images:
- lastfilename2 = lastfilename
- lastfilename = filename #return always the complete filename, not the truncated
- filename2 = filename
- if len(filename2) > other['filenamelimit']:
- filename2 = truncateFilename(other=other, filename=filename2)
- if filename2 not in listdir:
- complete = False
- break
- c +=1
- print '%d images were found in the directory from a previous session' % (c)
- if complete:
- #image dump is complete
- print 'Image dump was completed in the previous session'
- else:
- generateImageDump(config=config, other=other, images=images, start=lastfilename2, session=other['session']) # we resume from previous image, which may be corrupted (or missing .desc) by the previous session ctrl-c or abort
-
- if config['logs']:
- #fix
- pass
-
-def saveSpecialVersion(config={}, session=None):
- """ Save Special:Version as .html, to preserve extensions details """
-
- if os.path.exists('%s/Special:Version.html' % (config['path'])):
- print 'Special:Version.html exists, do not overwrite'
- else:
- print 'Downloading Special:Version with extensions and other related info'
- r = session.post(url=config['index'], data={'title': 'Special:Version'})
- raw = r.text
- delay(config=config, session=session)
- raw = removeIP(raw=raw)
- with open('%s/Special:Version.html' % (config['path']), 'w') as outfile:
- outfile.write(raw.encode('utf-8'))
-
-def saveIndexPHP(config={}, session=None):
- """ Save index.php as .html, to preserve license details available at the botom of the page """
-
- if os.path.exists('%s/index.html' % (config['path'])):
- print 'index.html exists, do not overwrite'
- else:
- print 'Downloading index.php (Main Page) as index.html'
- r = session.post(url=config['index'], data={})
- raw = r.text
- delay(config=config, session=session)
- raw = removeIP(raw=raw)
- with open('%s/index.html' % (config['path']), 'w') as outfile:
- outfile.write(raw.encode('utf-8'))
-
-def saveSiteInfo(config={}, session=None):
- """ Save a file with site info """
-
- if config['api']:
- if os.path.exists('%s/siteinfo.json' % (config['path'])):
- print 'siteinfo.json exists, do not overwrite'
- else:
- print 'Downloading site info as siteinfo.json'
- r = session.post(url=config['api'], data = {'action': 'query', 'meta': 'siteinfo', 'format': 'json'})
- result = json.loads(r.text)
- delay(config=config, session=session)
- with open('%s/siteinfo.json' % (config['path']), 'w') as outfile:
- outfile.write(json.dumps(result, indent=4, sort_keys=True))
-
-def avoidWikimediaProjects(config={}, other={}):
- """ Skip Wikimedia projects and redirect to the dumps website """
-
- #notice about wikipedia dumps
- if re.findall(r'(?i)(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews|wikidata|wikivoyage)\.org', config['api']+config['index']):
- print 'PLEASE, DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!'
- print 'Download the dumps from http://dumps.wikimedia.org'
- if not other['force']:
- print 'Thanks!'
- sys.exit()
-
-def getWikiEngine(url=''):
- """ Returns the wiki engine of a URL, if known """
-
- session = requests.Session()
- session.headers = {'User-Agent': getUserAgent()}
- r = session.post(url=url)
- result = r.text
-
- wikiengine = 'Unknown'
- if re.search(ur'(?im)(<meta name="generator" content="MediaWiki)', result):
- wikiengine = 'MediaWiki'
- return wikiengine
+
+ def bye(self):
+ """
+ Closing message shown once the dump is complete.
+ """
+ message = """---> Congratulations! Your dump is complete <---
+If you have suggestions, file a new issue here (Google account required): http://code.google.com/p/wikiteam/issues/list
+If this is a public wiki, do consider publishing this dump so others can benefit from it. Follow the steps as explained in http://code.google.com/p/wikiteam/wiki/NewTutorial#Publishing_the_dump or contact us at http://code.google.com/p/wikiteam.
+Thank you for using DumpGenerator %s by WikiTeam, good bye!""" % ( self.Version )
+ return message
+
+ def checkAPI(self):
+ """
+ Checks the validity of the api.php.
+ """
+ query = {
+ "meta": "siteinfo",
+ "siprop": "general" }
+ sitestats = json.loads( RequestAPI.query( query ) )
+ try:
+ if ( sitestats[ "query" ][ "general" ][ "server" ] in self.urltoapi ):
+ return True
+ except:
+ try:
+ if ( sitestats[ "error" ][ "code" ] == "readapidenied" ) and ( self.cookies == "" ):
+ Output.warn( "The wiki is private and we do not have proper authentication information!" )
+ return False
+ except:
+ Output.warn( "This api.php seems weird or is not valid." )
+ return False
+
+ def checkIndex(self):
+ """
+ Checks the validity of the index.php.
+ """
+ # TODO: Screen scraping is involved here; we need backward compatibility with older versions of MediaWiki.
+ parameters = { "title": "Special:Version" }
+ request = RequestIndex.query( parameters )
+ # Since we are at Special:Version, we should not be getting Special:BadTitle unless we are not logged in
+ if ( re.search( r'(Special:Badtitle)', request ) ) and ( self.cookies == "" ):
+ Output.error( "The wiki is private and we do not have proper authentication information!" )
+ sys.exit(1)
+
+ # Check for some tags within the Special:Version page, must be language-independent
+ if ( re.search( r'(<h2 id="mw-version-license">|meta name="generator" content="MediaWiki)', request ) ):
+ return True
+
+ def debug(self):
+ """
+ A temporary debug mode for testing purposes.
+ REMOVE WHEN COMPLETE!
+ """
+ print "DEBUG MODE ON"
+ print "Date: %s" % (self.date)
+ print "URL to api.php: %s" % (self.urltoapi)
+ print "URL to index.php: %s" % (self.urltoindex)
+ print "Current revision only: %s" % (self.curonly)
+ print "Image dump: %s" % (self.images)
+ print "Log dump: %s" % (self.logs)
+ print "XML dump: %s" % (self.xml)
+ print "Resume: %s" % (self.resume)
+ print "Path for resuming: %s" % (self.path)
+ print "Delay: %s" % (self.delay)
+ print "Cookies file: %s" % (self.cookies)
+ print "Excluded namespaces: %s" % (self.exnamespaces)
+ print "Debug mode on: %s" % (self.debugmode)
+ self.tasklist = sorted( self.tasklist )
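+ # Task names carry an a/b/c prefix ("axml", "bimages", "clogs") so that sorting runs the XML dump first, then images, then logs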
+ for task in self.tasklist:
+ if ( task == "axml" ):
+ DumpXML.run()
+ elif ( task == "bimages" ):
+ DumpImages.run()
+ elif ( task == "clogs" ):
+ DumpLogs.run()
+ sys.exit(0)
+
+ def downloadHtmlPages(self):
+ """
+ Downloads the HTML pages such as the main page and Special:Version.
+ """
+ # Download the main page
+ Output.message( "Downloading index.php (Main Page) as index.html." )
+ query = {}
+ index = RequestIndex.query( query )
+ index = RequestIndex.removeIP( index )
+ if ( os.path.exists( "Special:Version.html" ) ):
+ os.remove( "index.html" )
+ else:
+ pass
+ for line in index:
+ Output.appendToFile( "index.html", line )
+
+ # Download Special:Version or its respective localized version
+ Output.message( "Downloading Special:Version with extensions and other related info." )
+ query = { "title": "Special:Version" }
+ SpecialVersion = RequestIndex.query( query )
+ SpecialVersion = RequestIndex.removeIP( SpecialVersion )
+ if ( os.path.exists( "Special:Version.html" ) ):
+ os.remove( "Special:Version.html" )
+ else:
+ pass
+ for line in SpecialVersion:
+ Output.appendToFile( "Special:Version.html", line )
+
+ def fixHTMLEntities(self, text):
+ """
+ Convert some HTML entities to their regular characters.
+ """
+ text = re.sub('&lt;', '<', text)
+ text = re.sub('&gt;', '>', text)
+ text = re.sub('&amp;', '&', text)
+ text = re.sub('&quot;', '"', text)
+ text = re.sub('&#039;', '\'', text)
+ return text
+
+ def help(self):
+ """
+ Provides vital help information to the user. This function
+ uses "print" directly because its output is harmless and
+ everything that needs to be logged has already been logged.
+
+ Returns: Help message text
+ """
+ message = """DumpGenerator %s, a script to generate backups of MediaWiki wikis.
+For more information, please see: http://code.google.com/p/wikiteam/wiki/NewTutorial
+
+Startup:
+ -h, --help Displays this help information and exits.
+ -v, --version Displays the version of this script, with additional credits.
+
+Wiki information:
+ --api=URL The URL to the wiki's api.php, not to be used with --index.
+ --index=URL The URL to the wiki's index.php, not to be used with --api.
+
+Options:
+ --xml Creates an XML dump.
+ --images Creates an image dump.
+ --logs Creates a dump of all log pages (not yet supported).
+
+XML dump (only if --xml is used):
+ --curonly Download only the current revision.
+ --exnamespaces The unique system number(s) for namespaces to exclude, separated by commas.
+ --titlesonly Download only the page titles without the actual content.
+ --titles Path to a file containing list of titles, requires "--END--" to be on the last line.
+
+Other:
+ --auto Enable auto pilot mode (selects options that ensure the script creates a new dump).
+ --resume Resume an incomplete dump (requires --path to be given).
+ --path=PATH Path to the incomplete dump.
+ --delay=SECONDS Adds a delay (in seconds) between requests.
+ --cookies=PATH Path to a Mozilla cookies.txt file for authentication cookies.
+ --nolog Disable logging to dumpgenerator.log (does not affect output in terminal).
+
+Report any issues to our issue tracker: https://code.google.com/p/wikiteam.""" % (self.Version)
+ return message
+
+ def loadConfig(self):
+ """
+ Load a config file from a partially-made dump.
+ """
+ config = json.loads( self.configfile )
+ self.date = config[ "date" ]
+ self.useAPI = config[ "useAPI" ]
+ self.useIndex = config[ "useIndex" ]
+ self.urltoapi = config[ "urltoapi" ]
+ self.urltoindex = config[ "urltoindex" ]
+ self.images = config[ "images" ]
+ self.logs = config[ "logs" ]
+ self.xml = config[ "xml" ]
+ self.curonly = config[ "curonly" ]
+ self.exnamespaces = config[ "exnamespaces" ]
+ self.titlesonly = config[ "titlesonly" ]
+
+ if ( self.images == True ):
+ self.tasklist.append( "bimage" )
+ if ( self.logs == True ):
+ self.tasklist.append( "clogs" )
+ if ( self.xml == True ):
+ self.tasklist.append( "axml" )
+
+ if ( self.useAPI == True ):
+ domain = self.urltoapi
+ elif ( self.useIndex == True ):
+ domain = self.urltoindex
+ # Return the domain so that resumeDump() can rebuild the prefix from it
+ return domain
+
+ def makePrefix(self, domain):
+ """
+ Converts a domain to a prefix.
+
+ Inputs:
+ - domain: The domain to change, may contain api.php or index.php as suffix.
+
+ Returns:
+ - string with slashes and stray characters changed to underscores, suffix
+ removed and URL protocol removed.
+ """
+ domain = domain.lower()
+ # Remove unnecessary prefixes and suffixes
+ domain = re.sub(r'(https?://|www\.|/index\.php|/api\.php)', '', domain)
+ # Substitute directory slashes with underscores
+ domain = re.sub(r'/', '_', domain)
+ # Convert any stray character that is not in the alphabet to underscores
+ domain = re.sub(r'[^-.A-Za-z0-9]', '_', domain)
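+ # e.g. (hypothetical) "http://www.example.org/w/api.php" -> "example.org_w"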
+ return domain
+
+ def makeNiceURL(self, domain):
+ """
+ Converts a domain to a more human-readable format (used for uploading).
+
+ Inputs:
+ - domain: The domain to change, may contain api.php or index.php as suffix.
+
+ Returns:
+ - string with suffix removed.
+ """
+ domain = domain.lower()
+ # Remove the suffixes
+ domain = re.sub(r'(/index\.php|/api\.php)', '', domain)
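+ # e.g. (hypothetical) "http://example.org/w/api.php" -> "http://example.org/w"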
+ return domain
+
+ def processargs(self):
+ """
+ Processing arguments and options provided by the user.
+ """
+ try:
+ options, answers = getopt.getopt( sys.argv[1:], self.shortoptions, self.longoptions )
+ except getopt.GetoptError:
+ Output.error( "An unknown option has been specified, please check your arguments before re-running!" )
+ sys.exit(1)
+
+ # First accept all arguments and store them in a variable
+ for option, answer in options:
+ # Startup
+ if ( option in ( "-h", "--help" ) ):
+ # Display the help guide and exit
+ print self.help()
+ os.remove( Output.logfile )
+ sys.exit(0)
+ elif ( option in ( "-v", "--version" ) ):
+ # Display the version of this script
+ print self.version()
+ os.remove( Output.logfile )
+ sys.exit(0)
+
+ # Wiki information
+ elif ( option in "--api" ):
+ self.urltoapi = answer
+ self.configoptions[ "urltoapi" ] = self.urltoapi
+ elif ( option in "--index" ):
+ self.urltoindex = answer
+ self.configoptions[ "urltoindex" ] = self.urltoindex
+
+ # Dump options
+ elif ( option == "--images" ):
+ self.images = True
+ self.configoptions[ "images" ] = True
+ self.tasklist.append( "bimages" )
+ elif ( option == "--logs" ):
+ self.logs = True
+ self.configoptions[ "logs" ] = True
+ self.tasklist.append( "clogs" )
+ elif ( option == "--xml" ):
+ self.xml = True
+ self.configoptions[ "xml" ] = True
+ self.tasklist.append( "axml" )
+
+ # XML dump options
+ elif ( option == "--curonly" ):
+ self.curonly = True
+ self.configoptions[ "curonly" ] = True
+ elif ( option in "--exnamespaces" ):
+ self.exnamespaces = answer
+ self.configoptions[ "exnamespaces" ] = self.exnamespaces
+ elif ( option == "--titlesonly" ):
+ self.titlesonly = True
+ self.configoptions[ "titlesonly" ] = True
+ elif ( option in "--titles" ):
+ self.titles = os.path.abspath( answer )
+
+ # Other options
+ elif ( option == "--auto" ):
+ self.autonomous = True
+ elif ( option in "--cookies" ):
+ self.cookies = answer
+ elif ( option in "--delay" ):
+ self.delay = answer
+ elif ( option == "--nolog" ):
+ self.nolog = True
+ elif ( option in "--path" ):
+ self.path = answer
+ elif ( option == "--resume" ):
+ self.resume = True
+
+ # Private options (i.e. usable but not documented in --help)
+ elif ( option == "--debug" ):
+ self.debugmode = True
+ else:
+ Output.error( "An unknown option has been specified, please check your arguments before re-running!" )
+ sys.exit(1)
+
+ # Now to verify that the user is not messing around
+ if ( self.urltoapi == "" and self.urltoindex == "" ):
+ # User did not specify either --api= or --index=
+ if ( self.resume == True and self.path != "" ):
+ # ...but specified --resume and --path= accordingly
+ self.resumeDump()
+ elif ( self.resume == True and self.path == "" ):
+ # ...and specified --resume without --path=
+ Output.error( "--resume was provided, but you still need to tell me the path to the incomplete dump!" )
+ sys.exit(1)
+ else:
+                Output.error( "You need to tell me the URL to either api.php or index.php!" )
+ sys.exit(1)
+ elif ( self.resume == True ) and ( self.path == "" ):
+ # User specified --resume, but no --path= was given
+ Output.error( "--resume was provided, but you still need to tell me the path to the incomplete dump!" )
+ sys.exit(1)
+ elif ( self.urltoapi != "" and self.urltoindex != "" ):
+ # User specified both --api= and --index=
+ self.useAPI = True
+ elif ( self.xml == False and ( self.curonly == True or self.exnamespaces != "" ) ):
+ # User specified --curonly and --exnamespaces without --xml
+ Output.error( "You did not specify to make an XML dump using --xml, so why write --curonly or --exnamespaces? Remove them before re-running!" )
+ sys.exit(1)
+
+ if ( self.urltoapi != "" ):
+ self.useAPI = True
+ elif ( self.urltoindex != "" ):
+ self.useIndex = True
+
+ if ( self.useAPI == True ):
+ Output.message( "Checking api.php..." )
+ if not ( self.urltoapi.startswith( "http://" ) ) and not ( self.urltoapi.startswith( "https://" ) ):
+ Output.error( "The URL to api.php must start with either http:// or https://!" )
+ sys.exit(1)
+ elif ( self.checkAPI() ):
+ Output.message( "api.php is okay" )
+ else:
+ Output.error( "There is an error with api.php, please provide a correct path to it." )
+ sys.exit(1)
+ elif ( self.useIndex == True ):
+ Output.message( "Checking index.php..." )
+ if not ( self.urltoindex.startswith( "http://" ) ) and not ( self.urltoindex.startswith( "https://" ) ):
+ Output.error( "The URL to index.php must start with either http:// or https://!" )
+ sys.exit(1)
+ elif ( self.checkIndex() ):
+ Output.message( "index.php is okay" )
+ else:
+ Output.error( "There is an error with index.php, please provide a correct path to it." )
+ sys.exit(1)
+
+ def resumeDump(self):
+ """
+ Resume an incomplete dump defined in self.path.
+ """
+ # TODO: Add support for resuming dumps.
+ os.chdir( self.path )
+ self.loadConfig()
+        # Rebuild the prefix and domain from the restored configuration
+        domain = self.urltoapi or self.urltoindex
+        self.prefix = "%s-%s" % ( self.makePrefix( domain ), self.date )
+        self.domain = self.makeNiceURL( domain )
+ if ( self.useAPI == True ):
+ self.urltoindex = "%s/index.php" % ( self.domain )
+ self.tasklist = sorted( self.tasklist )
+ for task in self.tasklist:
+ if ( task == "axml" ):
+ DumpXML.run()
+ elif ( task == "bimages" ):
+ DumpImages.run()
+ elif ( task == "clogs" ):
+ DumpLogs.run()
+
+ def run(self):
+ """
+        Run the whole script and execute the important functions.
+ """
+ print self.welcome()
+ Updater.checkRevision()
+        # Remove any log file left behind by a previous run in the working directory,
+        # then do the equivalent of "touch" in Unix to create an empty file
+        if ( os.path.exists( Output.logfile ) ):
+            os.remove( Output.logfile )
+        open( Output.logfile, "a" ).close()
+ self.processargs()
+        if ( self.nolog or self.debugmode ):
+ # Remove the dumpgenerator.log file
+ os.remove( Output.logfile )
+ if ( self.useAPI == True ):
+ domain = self.urltoapi
+ elif ( self.useIndex == True ):
+ domain = self.urltoindex
+ directories = os.walk( "." ).next()[1]
+ for directory in directories:
+ # Check if there is a dump that already exists in the current working directory
+ if ( directory.startswith( self.makePrefix( domain ) ) and directory.endswith( "-wikidump" ) ):
+ print "" # Create a blank line
+ Output.warn( "There seems to be a similar dump at %s which might be incomplete." % ( directory ) )
+ if ( self.autonomous == True ):
+ Output.message( "Since auto pilot mode is enabled, that dump will not be resumed." )
+ self.resume = False
+ else:
+ Output.warn( "Do you wish to resume using configuration from that dump? [yes, y], [no, n]" )
+ reply = ""
+ while reply.lower() not in [ "yes", "y", "no", "n" ]:
+ reply = raw_input( "Answer: " )
+ if ( reply.lower() in [ "yes", "y" ] ):
+ if not ( os.path.isfile( "%s/%s" % ( directory, self.configfile ) ) ):
+ Output.error( "I cannot find a %s in the directory! Please delete that directory before re-running!" % ( self.configfile ) )
+ sys.exit(1)
+ else:
+ Output.warn( "Resuming dump and ignoring configuration given in this session..." )
+ self.resume = True
+ self.path = directory
+ break
+ elif ( reply.lower() in [ "no", "n" ] ):
+ Output.message( "Not resuming..." )
+ self.resume = False
+ else:
+ continue
+ if ( self.resume == True ):
+ self.resumeDump()
+ else:
+ self.prefix = "%s-%s" % ( self.makePrefix( domain ), self.date )
+ self.domain = self.makeNiceURL( domain )
+ workingdir = "%s-wikidump" % ( self.prefix )
+ if ( os.path.exists( workingdir ) ):
+ if ( self.autonomous == True ):
+ Output.message( "Since auto pilot mode is enabled, the directory with the same name will be deleted." )
+ reply = "yes"
+ else:
+ Output.warn( "\nThere seems to be a directory with the same name, delete the old one? [yes, y], [no, n]" )
+ reply = ""
+ while reply.lower() not in [ "yes", "y", "no", "n" ]:
+ reply = raw_input( "Answer: " )
+ if ( reply.lower() in [ "yes", "y" ] ):
+ try:
+ shutil.rmtree( workingdir )
+ except:
+ Output.error( "There was a problem deleting the directory, please manually delete it before re-running!" )
+ sys.exit(1)
+ print "" # Create a blank line
+ elif ( reply.lower() in [ "no", "n" ] ):
+                    Output.error( "A directory with that name already exists; either delete it or rename it before re-running!" )
+ sys.exit(1)
+ else:
+ pass
+ Output.message( "Generating a new dump into a new directory..." )
+ os.mkdir( workingdir )
+ os.rename( Output.logfile, "%s/%s" % ( workingdir, Output.logfile ) )
+ os.chdir( workingdir )
+ self.saveConfig()
+ # Guess the URL to index.php
+ if ( self.useAPI == True ):
+ self.urltoindex = "%s/index.php" % ( self.domain )
+ if ( self.debugmode == True ):
+ self.debug()
+ else:
+ # Run every single task that we are assigned to do in order: xml, images, logs
+ # The "a", "b" and "c" prefix is just to force the order.
+ self.tasklist = sorted( self.tasklist )
+ if ( self.tasklist == [] ):
+ Output.error( "You did not tell me what dump to create!" )
+ else:
+ for task in self.tasklist:
+ if ( task == "axml" ):
+ DumpXML.run()
+ elif ( task == "bimages" ):
+ DumpImages.run()
+ elif ( task == "clogs" ):
+ DumpLogs.run()
+ self.downloadHtmlPages()
+ print self.bye()
+
+ def saveConfig(self):
+ """
+ Save the configuration settings provided.
+ """
+ self.configoptions[ "date" ] = self.date
+ output = open( self.configfile, "w" )
+ json.dump( self.configoptions, output, indent=4 )
+
+ def version(self):
+ """
+ Displays the version information and credits of the script.
+
+ Returns: Version information and credits
+ """
+ message = """DumpGenerator %s by WikiTeam
+
+Copyright (C) 2013 Hydriz Scholz
+Copyright (C) 2014 WikiTeam
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program. If not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA, or visit
+
+""" % (self.Version)
+ return message
+
+ def welcome(self):
+ """
+ Welcomes the user at the very beginning of the script running process.
+
+ Returns: Welcome message.
+ """
+ message = """########## Welcome to DumpGenerator %s by WikiTeam ##########\n""" % (self.Version)
+ return message
+
+class DumpImages:
+ """
+ The class for generating an image dump.
+ """
+ def __init__(self):
+ """
+ The constructor function.
+ """
+ self.files = []
+
+ def dumpImages(self):
+ """
+ Download all the images on the wiki with their corresponding XML.
+ """
+ if ( DumpGenerator.useAPI == True ):
+ self.getFileListAPI()
+ else:
+ self.getFileListIndex()
+ filecount = 0
+ if ( self.files == [] ):
+ pass
+ else:
+ Output.message( "Downloading files and their descriptions into \"images\" directory..." )
+ for media in self.files:
+ time.sleep( DumpGenerator.delay ) # Delay between requests
+ urllib.urlretrieve( media[ "url" ], "images/%s" % (media[ "name" ] ) )
+ title = DumpGenerator.fixHTMLEntities( media[ "title" ].encode( "utf-8" ) )
+ contentsfile = DumpXML.getXMLPage( title, siteinfo=True )
+ destfile = "images/%s.xml" % ( media[ "name" ] )
+ shutil.move( contentsfile, destfile )
+                Output.appendToFile( destfile, "</mediawiki>\n" ) # Restore the closing tag that getXMLPage() strips
+ filecount += 1
+ if ( filecount % 10 == 0 ):
+ # Give the user a regular status report so that it does not look stuck
+ Output.message( " Downloaded %d files." % ( filecount ) )
+            if ( filecount == 1 ):
+                Output.message( "Downloaded 1 file." )
+ else:
+ Output.message( "Downloaded %d files." % ( filecount ) )
+
+ def getFileListAPI(self):
+ """
+ Download the list of files on the wiki via the API.
+ """
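+        # Each item returned by list=allimages looks roughly like this
+        # (illustrative values; only "title", "url" and "name" are used here):
+        #   { "name": "Example.png", "title": "File:Example.png",
+        #     "url": "http://example.org/images/a/ab/Example.png" }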
+ files = []
+ dumpfile = "%s-images.txt" % ( DumpGenerator.prefix )
+ filecount = 0
+ Output.message( "Getting list of files on the wiki..." )
+ aifrom = "!" # Very first page of a wiki
+ while aifrom:
+ sys.stderr.write('.') # Tell the user that downloading is in progress
+ query = {
+ "list": "allimages",
+ "aifrom": aifrom,
+ "ailimit": 500 } # The default limit for anonymous users of the API is 500 pages per request
+ time.sleep( DumpGenerator.delay ) # Delay between requests
+ filesmeta = json.loads( RequestAPI.query( query ) )
+ # Store what the server tells us to continue from
+ try:
+ serveraifrom = filesmeta[ "query-continue" ][ "allimages" ][ "aicontinue" ]
+ aifrom = DumpGenerator.fixHTMLEntities( serveraifrom )
+ except:
+ # Reached the end of having to keep continuing, exit the while condition
+ aifrom = ""
+ # TODO: On a wiki with a lot of files, this can cause huge memory problems
+ files.extend( filesmeta[ "query" ][ "allimages" ] )
+ for media in filesmeta[ "query" ][ "allimages" ]:
+ outputline = "%s\t%s\n" % ( media[ "title" ], media[ "url" ] )
+ Output.appendToFile( dumpfile, outputline )
+            # Add to the file count (only the files from this request, to avoid double-counting)
+            filecount += len( filesmeta[ "query" ][ "allimages" ] )
+ Output.appendToFile( dumpfile, "--END--" )
+ if ( filecount == 1 ):
+ Output.message( " Got 1 file" )
+ else:
+ Output.message( " Got %d files" % ( filecount ) )
+
+ if ( filecount == 0 ):
+ Output.warn( "There are no files on the wiki to download!" )
+ else:
+ Output.message( "File names and URLs saved at %s." % ( dumpfile ) )
+ self.files = files
+
+ def getFileListIndex(self):
+ """
+ Download the list of files on the wiki via index.php.
+ """
+ # TODO: Add code here
+
+ def run(self):
+ """
+ Execute the process of producing an image dump.
+ """
+        if not ( os.path.isdir( "images" ) ):
+            os.mkdir( "images" )
+ self.dumpImages()
+
+class DumpLogs:
+ """
+ The class for generating a log pages dump (pages in Special:Log).
+ """
+ def __init__(self):
+ """
+ The constructor function.
+ """
+
+ def run(self):
+ """
+ Execute the process of producing a log pages dump.
+ """
+ # TODO: Support downloading of log pages
+        Output.warn( "Sorry, downloading of log pages is not yet supported!" )
+
+class DumpXML:
+ """
+ The class for generating an XML dump.
+ """
+ def __init__(self):
+ """
+ The constructor function.
+ """
+ self.lennamespaces = 0
+ self.namespaces = {}
+ self.pagetitles = []
+ self.titlesdumpfile = ""
+ self.dumpretrycount = 0
+
+ def dumpPageTitlesAPI(self):
+ """
+        Get a list of page titles and output it to a file.
+ """
+ self.getNamespacesAPI()
+ self.getPageTitlesAPI()
+ Output.message( "Saving list of page titles..." )
+ Output.appendToFile( self.titlesdumpfile, "--END--" )
+ Output.message( "List of page titles saved at %s." % ( self.titlesdumpfile ) )
+
+ def dumpXML(self):
+ """
+ Get the whole wiki in an XML file.
+ """
+ Output.message( "Downloading the XML of every page..." )
+ if ( DumpGenerator.curonly == True ):
+ dumpfile = "%s-curonly.xml" % ( DumpGenerator.prefix )
+ else:
+ dumpfile = "%s-history.xml" % ( DumpGenerator.prefix )
+ pagecount = 0
+ # To reduce memory usage, we are storing the title into memory only when we need it
+ for title in file( self.titlesdumpfile, "r" ).read().splitlines():
+ pagecount += 1
+ numberofedits = 0
+ # Add the initial siteinfo and header tags for the first page
+ if ( pagecount == 1 ):
+ contentsfile = self.getXMLPage( title, siteinfo=True )
+ contents = file( contentsfile, "r" ).readlines()
+ open( dumpfile, "a" ).close() # "touch" the file
+ os.remove( contentsfile )
+            elif ( title == "--END--" ):
+                contents = [ "</mediawiki>\n" ] # Close the dump with the tag that getXMLPage() strips
+ else:
+ contentsfile = self.getXMLPage( title )
+ contents = file( contentsfile, "r" ).readlines()
+ os.remove( contentsfile )
+
+ for content in contents:
+                # Count the number of occurrences of "</revision>" to determine the number of revisions
+                if ( "</revision>" in content ):
+ numberofedits += 1
+ Output.appendToFile( dumpfile, content )
+ if ( title == "--END--" ):
+ pass
+ else:
+ if ( numberofedits == 1 ):
+ Output.message( " %s, 1 edit" % ( title ) )
+ else:
+ Output.message( " %s, %s edits" % ( title, numberofedits ) )
+ if ( pagecount % 10 == 0 ):
+ Output.message( "Downloaded %d pages" % ( pagecount ) )
+ Output.message( "XML dump saved at %s." % ( dumpfile ) )
+ self.integrityCheck( dumpfile )
+
+ def getNamespacesAPI(self):
+ """
+ Download the list of namespaces with their names and IDs
+ via the API.
+ """
+ query = {
+ "meta": "siteinfo",
+ "siprop": "namespaces" }
+ namespacedetails = json.loads( RequestAPI.query( query ) )
+ namespacenums = namespacedetails[ "query" ][ "namespaces" ].keys()
+ # Remove the system namespaces ("Media" and "Special")
+ namespacenums.remove( "-2" )
+ namespacenums.remove( "-1" )
+ namespaces = {}
+ for namespacenum in namespacenums:
+ namespacename = namespacedetails[ "query" ][ "namespaces" ][ namespacenum ][ "*" ]
+ namespaces[ namespacenum ] = namespacename
+        self.lennamespaces = len( namespacenums )
+ Output.message( "%d namespaces found." % ( self.lennamespaces ) )
+ self.namespaces = namespaces
+
+ def getPageTitlesAPI(self):
+ """
+ Grab a list of page titles in each namespace via the API.
+
+ There are leading spaces in the outputs so as to make things neater on the terminal.
+ """
+ titles = []
+ self.titlesdumpfile = "%s-titles.txt" % ( DumpGenerator.prefix )
+ totalpagecount = 0
+ for namespace in self.namespaces:
+ if namespace in DumpGenerator.exnamespaces:
+ Output.warn( " Skipping namespace %s" % (namespace) )
+ else:
+ pagecount = 0
+ Output.message( " Getting titles in namespace %s" % (namespace) )
+ apfrom = "!" # Very first page of a wiki
+ while apfrom:
+ sys.stderr.write( "." ) # Tell the user that downloading is in progress
+ query = {
+ "list": "allpages",
+ "apnamespace": namespace,
+ "apfrom": apfrom,
+ "aplimit": 500 } # The default limit for anonymous users of the API is 500 pages per request
+ time.sleep( DumpGenerator.delay ) # Delay between requests
+ pagetitles = json.loads( RequestAPI.query( query ) )
+ # Store what the server tells us to continue from
+ try:
+ serverapfrom = pagetitles[ "query-continue" ][ "allpages" ][ "apcontinue" ]
+ apfrom = DumpGenerator.fixHTMLEntities( serverapfrom )
+ except:
+ try:
+ serverapfrom = pagetitles[ "query-continue" ][ "allpages" ][ "apfrom" ]
+ apfrom = DumpGenerator.fixHTMLEntities( serverapfrom )
+ except:
+ # Reached the end of having to keep continuing, exit the while condition
+ apfrom = ""
+ pages = pagetitles[ "query" ][ "allpages" ]
+ # Add to namespace page count
+ pagecount += len( pages )
+ for page in pages:
+ title = "%s\n" % ( page[ "title" ] )
+ Output.appendToFile( self.titlesdumpfile, title )
+ if ( pagecount == 1 ):
+ Output.message( " Got 1 page title in namespace %s" % ( namespace ) )
+ else:
+ Output.message( " Got %d page titles in namespace %s" % ( pagecount, namespace ) )
+ # Add to total page count
+ totalpagecount += pagecount
+ if ( totalpagecount == 1 ):
+            Output.message( "Got 1 page title in total." )
+ else:
+ Output.message( "Got %d page titles in total." % ( totalpagecount ) )
+
+ def getXMLPage(self, page, siteinfo=False):
+ """
+ Get the XML of one page.
+
+ Input:
+ - page: The title of the page to download.
+ - siteinfo: Whether to include the siteinfo header in the XML.
+ """
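+        # Illustrative use (the page title is an example, not from the source):
+        #   getXMLPage( "Main Page", siteinfo=True ) downloads the page via
+        #   Special:Export and returns the name of a temporary .xml file.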
+ parameters = {
+ "title": "Special:Export",
+ "pages": page,
+ "action": "submit" }
+ if ( DumpGenerator.curonly == True ):
+ parameters[ "curonly" ] = 1
+ parameters[ "limit" ] = 1
+ else:
+ # Make the wiki download the actual full history
+ parameters["history"] = "1"
+ # TODO: Can cause memory problems if the page has a huge history
+ result = RequestIndex.query( parameters )
+ pagehash = hashlib.sha256( page ).hexdigest()[:8]
+ tempfile = "%s.xml.tmp" % ( pagehash )
+ tempfile2 = "%s.xml" % ( pagehash )
+ Output.appendToFile( tempfile, result )
+ result = "" # Free up memory
+ # Warning: The following is NOT compatible with MediaWiki XML Schema Description version 0.3 and below!
+ # See http://wikiteam.googlecode.com/svn/trunk/schema/README.md for more information about MediaWiki versions
+ # this will affect and ways to overcome it.
+ if ( siteinfo == False ):
+ linecount = 0
+            # The 11 comes from the fixed siteinfo header lines (such as <sitename>),
+            # the two "special" namespaces and the very first line
+ # TODO: Hacky way of removing the siteinfo, check for backward compatibility!
+ linestoskip = 11 + self.lennamespaces
+ for line in open( tempfile, "r" ).read().splitlines():
+ linecount += 1
+ if linecount > linestoskip:
+                    if ( "</mediawiki>" in line ):
+ pass
+ else:
+ line = "%s\n" % ( line )
+ Output.appendToFile( tempfile2, line )
+ else:
+ continue
+ else:
+ for line in open( tempfile, "r" ).read().splitlines():
+                if ( "</mediawiki>" in line ):
+ pass
+ else:
+ line = "%s\n" % ( line )
+ Output.appendToFile( tempfile2, line )
+ os.remove( tempfile )
+ return tempfile2
+
+ def integrityCheck(self, dumpfile):
+ """
+ Checks the integrity of the XML dump and ensures that it is not corrupted.
+ """
+ Output.message( "Checking the integrity of the XML dump..." )
+ checktitles = 0
+ checkpageopen = 0
+ checkpageclose = 0
+ checkrevisionopen = 0
+ checkrevisionclose = 0
+        # Count the number of instances of the following tags;
+        # matching open and close tags should appear the same number of times
+        for line in file( dumpfile, "r" ).read().splitlines():
+            if "<title>" in line:
+                checktitles += 1
+            elif "<page>" in line:
+                checkpageopen += 1
+            elif "</page>" in line:
+                checkpageclose += 1
+            elif "<revision>" in line:
+                checkrevisionopen += 1
+            elif "</revision>" in line:
+                checkrevisionclose += 1
+            else:
+                continue
+ else:
+ continue
+
+ if ( checktitles == checkpageopen and checktitles == checkpageclose and checkrevisionopen == checkrevisionclose ):
+ Output.message( "Excellent, the XML dump is not corrupted." )
+ else:
+ Output.warn( "WARNING: XML dump seems to be corrupted." )
+ if ( DumpGenerator.autonomous == True ):
+ reply = "yes"
+ else:
+ reply = ""
+ while reply.lower() not in [ "yes", "y", "no", "n" ]:
+ reply = raw_input( 'Regenerate a new dump ([yes, y], [no, n])? ' )
+ if reply.lower() in [ "yes", "y" ]:
+ self.dumpretrycount += 1
+ if ( self.dumpretrycount < 3 ):
+ Output.warn( "Generating a new dump..." )
+ os.remove( dumpfile )
+ self.dumpXML()
+ else:
+ Output.warn( "We have tried dumping the wiki 3 times, but the dump is still corrupted. Not going to carry on since it is probably a problem on the wiki." )
+ # Encourage the user to tell us about this faulty wiki
+ print "Please tell us about this by reporting an issue here: https://code.google.com/p/wikiteam/issues/list. Thank you!"
+ print "Giving you a little time to see this message..."
+ time.sleep(3) # Give time for the user to see the message
+ elif reply.lower() in [ "no", "n" ]:
+ Output.warn( "Not generating a new dump. Note: Your dump is corrupted and might not work with MediaWiki!" )
+
+ def run(self):
+ """
+ Execute the process of producing an XML dump.
+ """
+ if ( DumpGenerator.useAPI == True ):
+ if ( DumpGenerator.titlesonly == True ):
+ self.dumpPageTitlesAPI()
+ else:
+ if ( DumpGenerator.titles != "" ):
+ Output.message( "Using the list of page titles provided at %s." % ( DumpGenerator.titles ) )
+ self.titlesdumpfile = DumpGenerator.titles
+ else:
+ self.dumpPageTitlesAPI()
+ self.dumpXML()
+ else:
+ if ( DumpGenerator.titlesonly == True ):
+ self.dumpPageTitlesIndex()
+ else:
+ if ( DumpGenerator.titles != "" ):
+ self.titlesdumpfile = DumpGenerator.titles
+ else:
+ self.dumpPageTitlesIndex()
+ self.dumpXML()
+
+class Output:
+ """
+ The class to output anything to the user or to a place not within the script.
+
+    For doing outputs to the user:
+    This is used instead of the "print" function directly because it is
+    intended to log everything that is told to the user, so that it is
+    possible to check when and where things went wrong.
+
+ For doing outputs to elsewhere:
+ This is to reduce memory usage by storing large chunks of data into disk
+ and reducing the risk of getting a MemoryError.
+ """
+ def __init__(self):
+ self.logfile = "dumpgenerator.log"
+
+ # Output to disk
+ def appendToFile(self, outputfile, contents):
+ """
+ Output contents to file.
+
+ Inputs:
+ - outputfile: The file to output to.
+        - contents: The content to append to the file.
+ """
+        # Note: open() in append mode creates the file if it does not exist yet
+ thefile = open( outputfile, "a" )
+ try:
+ contents = contents.encode( "utf-8", "ignore" )
+ # TODO: During a test phase, this error kept coming up, though the final output was no different from
+ # what was produced using dumpBackup.php and using Special:Export itself.
+ except UnicodeDecodeError:
+ pass
+ thefile.write( contents )
+ thefile.close()
+
+ # Output to user
+ def error(self, message):
+ print message
+ print "Write --help for more information."
+ self.log( "An error occurred: %s" % (message) )
+
+ def log(self, message):
+        if ( DumpGenerator.nolog or DumpGenerator.debugmode ):
+            # Skip logging
+            pass
+ else:
+ timestamp = datetime.datetime.fromtimestamp( time.time() ).strftime( "%Y-%m-%d %H:%M:%S" )
+ logline = "%s: %s\n" % (timestamp, message)
+ self.appendToFile( self.logfile, logline )
+
+ def message(self, message):
+ print message
+ self.log( "Told the user: %s" % (message) )
+
+ def warn(self, message):
+ print message
+ self.log( "Warned the user: %s" % (message) )
+
+class RequestAPI:
+ """
+    The RequestAPI class, used to submit API requests to the server.
+ """
+ def __init__(self):
+ """
+ The constructor function.
+ """
+
+ def query(self, params, url=""):
+ """
+ The function to send an API call to the server given in the "url"
+ parameter using the parameters found in params. If url is empty,
+ DumpGenerator.urltoapi is used instead.
+
+        Note: This function assumes action=query; the other query forms are
+        provided by other functions, but not this one.
+
+ Input:
+ - params: Parameters to API call as an array (excluding action=query and format=json)
+
+ Returns
+ - Result of API call in JSON format.
+ """
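+        # Illustrative call (parameter values are examples only):
+        #   RequestAPI.query( { "meta": "siteinfo", "siprop": "general" } )
+        # POSTs action=query&format=json&meta=siteinfo&siprop=general to the
+        # api.php given in DumpGenerator.urltoapi and returns the raw JSON string.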
+        if ( url == "" ):
+            url = DumpGenerator.urltoapi
+ queryurl = "%s?action=query&format=json" % ( url )
+ headers = { "User-Agent": DumpGenerator.UserAgent }
+ # Convert the array to a proper URL
+ paras = urllib.urlencode( params )
+ # POST the parameters to the server
+ request = urllib2.Request( queryurl, paras, headers )
+ try:
+ result = urllib2.urlopen( request )
+ except:
+ try:
+ # Add a little delay between requests if server is slow
+ sleeptime = DumpGenerator.delay + 10
+ Output.warn( "Failed to get a response from the server, retrying in %d seconds..." % (sleeptime) )
+ time.sleep( sleeptime )
+ result = urllib2.urlopen( request )
+ except:
+ Output.error( "An error occurred when trying to get a response from the server. Please resume the dump with --resume." )
+ sys.exit(2)
+ output = result.read()
+ result.close()
+ return output
+
+class RequestIndex:
+ def __init__(self):
+ """
+ The constructor function.
+ """
+
+ def query(self, params, url=""):
+ """
+        The function to send a request to the server given in the "url"
+ parameter using the parameters found in params. If url is empty,
+ DumpGenerator.urltoindex is used instead.
+
+ Input:
+ - params: Parameters to the request to send, appended to url as
+ a GET request.
+
+ Returns
+ - Result of GET request.
+ """
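+        # Illustrative call (parameter values are examples only):
+        #   RequestIndex.query( { "title": "Special:Export", "pages": "Main Page", "action": "submit" } )
+        # fetches index.php?title=Special%3AExport&pages=Main+Page&action=submit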
+        if ( url == "" ):
+            url = DumpGenerator.urltoindex
+ headers = { "User-Agent": DumpGenerator.UserAgent }
+ paras = urllib.urlencode( params )
+        # index.php does not support POST requests, so formulate a correct GET URL here
+ queryurl = "%s?%s" % ( url, paras )
+ request = urllib2.Request( queryurl, headers=headers )
+ # TODO: Make urlopen follow redirects
+ try:
+ result = urllib2.urlopen( request )
+ except:
+ try:
+ # Add a little delay between requests if server is slow
+ sleeptime = DumpGenerator.delay + 10
+ Output.warn( "Failed to get a response from the server, retrying in %d seconds..." % (sleeptime) )
+ time.sleep( sleeptime )
+ result = urllib2.urlopen( request )
+ except:
+ Output.error( "An error occurred when trying to get a response from the server. Please resume the dump with --resume." )
+ sys.exit(2)
+ output = result.read()
+ result.close()
+ return output
+
+ def removeIP(self, content):
+ """
+ Remove the user's IP address while fetching HTML pages.
+ """
+ # Remove IPv4 addresses
+ content = re.sub( r"\d+\.\d+\.\d+\.\d+", "0.0.0.0", content )
+ # Remove IPv6 addresses
+ content = re.sub( r"(?i)[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}", "0:0:0:0:0:0:0:0", content )
+ return content
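+    # A minimal illustration (the address is made up, from a documentation range):
+    #   removeIP( "Fetched for 192.0.2.15" ) -> "Fetched for 0.0.0.0"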
+
+class Updater:
+ """
+ The class to auto-update the user's script to the latest version of DumpGenerator.
+ """
+ # TODO: Get the script to check only occasionally, this is a performance concern
+ def __init__(self):
+ """
+ The constructor function.
+ """
+ self.controlUrl = "http://wikiteam.googlecode.com/svn/trunk/revnum.json"
+ self.controlUrl2 = "https://raw.github.com/dumps/DumpGenerator/master/revnum.json"
+ self.result = {}
+
+ def checkRevision(self):
+ """
+ Check the current revision and ensure that it is up-to-date.
+ """
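+        # The control JSON is expected to look roughly like this (structure
+        # inferred from the keys used below; values are illustrative):
+        #   { "latest": "2.0",
+        #     "releases": { "2.0": { "revision": "123",
+        #                            "downloadurl": "http://example.org/dumpgenerator.py",
+        #                            "downloadurl2": "http://example.org/mirror/dumpgenerator.py" } } }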
+ jsonresult = self.getRevisionJson()
+ if ( jsonresult == False ):
+ pass
+ else:
+ result = json.loads( jsonresult )
+ self.result = result
+ if ( result[ "latest" ] == DumpGenerator.Version ):
+ if ( result[ "releases" ][ DumpGenerator.Version ][ "revision" ] == DumpGenerator.revision ):
+ pass
+ else:
+ self.update()
+ else:
+ self.update()
+
+ def getRevisionJson(self):
+ """
+ Download the controlling JSON file.
+ """
+ headers = {'User-Agent': DumpGenerator.UserAgent}
+ skip = False
+ # TODO: Handle 404 errors
+ try:
+ revjson = urllib2.urlopen( urllib2.Request( self.controlUrl, headers=headers ) )
+ except:
+ try:
+ revjson = urllib2.urlopen( urllib2.Request( self.controlUrl2, headers=headers ) )
+ except:
+ Output.warn( "Unable to check if a new version of dumpgenerator.py is available, continuing..." )
+ skip = True
+ if ( skip == False ):
+ output = revjson.read()
+ revjson.close()
+ return output
+ else:
+ return False
+
+ def update(self):
+ """
+        Update dumpgenerator.py to the latest version.
+ """
+ currentfile = sys.argv[0]
+ latestver = self.result[ "latest" ]
+ latestrev = self.result[ "releases" ][ latestver ][ "revision" ]
+ latesturl = self.result[ "releases" ][ latestver ][ "downloadurl" ]
+ latesturl2 = self.result[ "releases" ][ latestver ][ "downloadurl2" ]
+ updated = True
+ # TODO: Handle 404 errors
+ try:
+ urllib.urlretrieve( latesturl, currentfile )
+ except:
+ try:
+ urllib.urlretrieve( latesturl2, currentfile )
+ except:
+ updated = False
+ if ( updated == False ):
+ Output.warn( "Unable to update DumpGenerator, skipping update for now..." )
+ else:
+ Output.message( "DumpGenerator was updated to %s (revision %s)! Changes will take effect on next run." % ( latestver, latestrev ) )
if __name__ == "__main__":
- main()
+ # Class registry, for use throughout the whole script
+ RequestAPI = RequestAPI()
+ RequestIndex = RequestIndex()
+ DumpGenerator = DumpGenerator()
+ DumpImages = DumpImages()
+ DumpLogs = DumpLogs()
+ DumpXML = DumpXML()
+ Output = Output()
+ Updater = Updater()
+
+ # Start everything up
+ DumpGenerator.run()