wikiteam/dumpgenerator.py

#!/usr/bin/env python2.5
# -*- coding: utf-8 -*-

# Copyright (C) 2011 emijrp
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import datetime
import os
import re
import subprocess
import sys
import urllib
import urllib2

# todo:
# curonly and all history (curonly can batch several pages into a single GET; for full history, request each page one by one)
# use the API, or parse the HTML if the API is not available
# http://www.mediawiki.org/wiki/Manual:Parameters_to_Special:Export
# threads to download faster? request several pages at once
# images?
# Special:Log? uploads, account creations, etc

def cleanHTML(raw=''):
    """Return only the content area of a rendered wiki page.

    The content area is delimited by one of two pairs of HTML comment
    markers, tried in this order:
        <!-- bodytext -->      ... <!-- /bodytext -->
        <!-- start content --> ... <!-- end content -->

    raw -- HTML source of the page.

    Returns the text between the first opening marker and the next closing
    marker. If neither opening marker is present, prints a message and
    terminates the program via sys.exit(1) (raises SystemExit).
    """
    markers = (
        ('<!-- bodytext -->', '<!-- /bodytext -->'),
        ('<!-- start content -->', '<!-- end content -->'),
    )
    for opening, closing in markers:
        if opening in raw:
            return raw.split(opening)[1].split(closing)[0]

    print('This wiki doesn\'t use marks to split contain')
    # Non-zero status: this is a failure, callers/shells must not see success.
    sys.exit(1)

def getAllPageTitles(domain='', namespaces=[]):
    """Scrape Special:Allpages and return the list of page titles.

    domain     -- URL of the wiki's index.php script.
    namespaces -- list of namespace ids to scan, or a list containing 'all'
                  to scan every namespace offered by the wiki. Ids that the
                  wiki does not list are dropped. An empty list means the
                  main namespace (0). The default list is never mutated,
                  only rebound.

    Returns a list of unique titles in order of first appearance, skipping
    titles that start with 'Special:'.

    Terminates the program via sys.exit(1) (raises SystemExit) when domain
    is empty.
    """
    #http://en.wikipedia.org/wiki/Special:AllPages
    #http://archiveteam.org/index.php?title=Special:AllPages
    #http://www.wikanda.es/wiki/Especial:Todas
    if not domain:
        print('Please, use --domain parameter')
        sys.exit(1)

    #namespace checks and stuff
    if namespaces:
        raw = urllib.urlopen('%s?title=Special:Allpages' % (domain)).read()
        # [^>]*? lets extra attributes such as selected="selected" through
        r_namespace = r'<option [^>]*?value="(?P<namespaceid>\d+)"[^>]*?>(?P<namespacename>[^<]+)</option>'
        available = [int(i.group('namespaceid')) for i in re.finditer(r_namespace, raw)]
        if 'all' in namespaces:
            namespaces = available
        else:
            #keep only the requested namespaces that really exist in this wiki
            namespaces = [i for i in available if i in namespaces]
    else:
        namespaces = [0]

    #retrieve all titles from Special:Allpages, if the wiki is big, perhaps there are sub-Allpages to explore
    r_title = re.compile(r'title="(?P<title>[^>]+)">')
    r_suballpages = re.compile(r'&amp;from=(?P<from>[^>]+)&amp;to=(?P<to>[^>]+)">')
    deep = 3 # 3 is the current deep of English Wikipedia for Special:Allpages, 3 levels

    titles = []
    seen_titles = set() # O(1) duplicate check; `titles` keeps the order
    for namespace in sorted(set(namespaces)): #uniques, deterministic order
        print('    Retrieving titles in the namespace %s' % (namespace,))
        url = '%s?title=Special:Allpages&namespace=%s' % (domain, namespace)
        raw = cleanHTML(urllib.urlopen(url).read())

        rawacum = raw
        checked_suballpages = set()
        c = 0
        while c < deep and r_suballpages.search(raw):
            #load the sub-Allpages linked from the current level
            fetched = []
            for i in r_suballpages.finditer(raw):
                fr = i.group('from')
                to = i.group('to')
                name = '%s-%s' % (fr, to)
                if name in checked_suballpages:
                    continue
                checked_suballpages.add(name)
                url = '%s?title=Special:Allpages&namespace=%s&from=%s&to=%s' % (domain, namespace, fr, to) #do not put urllib.quote in fr or to
                raw2 = cleanHTML(urllib.urlopen(url).read())
                fetched.append(raw2)
                print('    Detected sub-Allpages: %s %d bytes %d' % (name, len(raw2), len(r_title.findall(raw2))))
            # Descend: the next pass scans only the pages fetched at this
            # level, so nested range listings are followed instead of the
            # top page being re-scanned.
            raw = ''.join(fetched)
            rawacum += raw #merge it after removed junk
            c += 1

        # NOTE(review): titles are taken verbatim from the HTML attribute, so
        # entities such as &amp; are not unescaped here.
        for i in r_title.finditer(rawacum):
            title = i.group('title')
            if title.startswith('Special:') or title in seen_titles:
                continue
            seen_titles.add(title)
            titles.append(title)
    return titles

def getHeader(domain=''):
    """Return the opening part of an export, for use as the dump header.

    Requests the export of a fixed placeholder title (presumably one that
    does not exist on any wiki) and keeps everything that precedes the
    closing </mediawiki> tag, i.e. something similar to:
    <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/" xmlns:x....

    domain -- URL of the wiki's index.php script, passed on to getXML.
    """
    placeholder = 'AMF5LKE43MNFGHKSDMRTJ'
    export = getXML(domain=domain, title=placeholder)
    head, _sep, _tail = export.partition('</mediawiki>')
    return head

def _requestExport(domain, params, headers):
    # POST params to the wiki and return the raw response body.
    req = urllib2.Request(url=domain, data=urllib.urlencode(params), headers=headers)
    f = urllib2.urlopen(req)
    try:
        return f.read()
    finally:
        f.close()

def getXML(domain='', title='', curonly=False):
    """Download the Special:Export XML of one page.

    domain  -- URL of the wiki's index.php script.
    title   -- title of the page to export.
    curonly -- if true, request only the current revision; otherwise
               request the history in chunks of `limit` revisions and
               merge the chunks into a single <page> element.

    Returns the export XML as a string. When the wiki answers a follow-up
    request with the same last timestamp as the offset that was sent, a
    warning is printed and the XML collected so far is returned.
    """
    #http://www.mediawiki.org/wiki/Manual_talk:Parameters_to_Special:Export#Parameters_no_longer_in_use.3F
    limit = 1000
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.8.0.4) Gecko/20060508 Firefox/1.5.0.4'}
    params = {'title': 'Special:Export', 'pages': title, 'action': 'submit', }
    if curonly:
        params['curonly'] = 1
    else:
        params['offset'] = '1'
        params['limit'] = limit
    xml = _requestExport(domain, params, headers)
    if curonly:
        return xml

    #if complete history, check if this page history has > limit edits, if so, retrieve all using offset if available
    #else, warning about Special:Export truncating large page histories
    r_timestamp = r'<timestamp>([^<]+)</timestamp>'
    timestamps = re.findall(r_timestamp, xml)
    if not timestamps:
        # to avoid empty pages: Special:Allpages and the random one
        return xml

    params['offset'] = timestamps[-1]
    while True:
        xml2 = _requestExport(domain, params, headers)
        timestamps2 = re.findall(r_timestamp, xml2)
        if not timestamps2:
            # no revisions after the offset: the history is complete
            break
        if timestamps2[-1] == params['offset']:
            # same data again: the offset parameter had no effect
            print('ATTENTION: This wiki does not allow some parameters in Special:Export, so, pages with large histories may be truncated')
            break
        # Append only the revisions of the new chunk; its page header
        # (everything before the first <revision>) is already in xml.
        xml = xml.split('</page>')[0] + '    <revision>' + '<revision>'.join(xml2.split('<revision>')[1:])
        # Advance the offset so the next request continues after this chunk.
        params['offset'] = timestamps2[-1]
        print('%s %d %s' % (title, len(xml2), timestamps2))
    return xml

def cleanXML(xml=''):
    """Strip the export wrapper from a Special:Export document.

    Keeps the text that follows the first '</siteinfo>' line (up to the next
    one, if any) and cuts it at the first closing </mediawiki> tag.

    xml -- export XML as a string.

    Raises IndexError when xml contains no '</siteinfo>' followed by a
    newline.
    """
    after_siteinfo = xml.split('</siteinfo>\n')[1]
    body, _sep, _rest = after_siteinfo.partition('</mediawiki>')
    return body

if __name__ == '__main__':
    # Settings are hard-coded for now: target wiki, current-revision-only
    # flag, and the namespaces to dump.
    domain = 'http://archiveteam.org/index.php' # 'http://en.wikipedia.org/w'
    #domain = 'http://wikanda.cadizpedia.eu/w/index.php' # 'http://en.wikipedia.org/w'
    curonly = False
    namespaces = [0]

    # Refuse to run against Wikimedia projects; exit with a failure status.
    if re.search(r'(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews)\.org', domain):
        print('DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!\nDownload the dumps from http://download.wikimedia.org\nThanks!')
        sys.exit(1)

    #get titles
    print('Loading page titles from namespaces = %s' % (','.join([str(i) for i in namespaces])))
    titles = getAllPageTitles(domain=domain, namespaces=namespaces)
    print('%d titles loaded' % (len(titles)))

    #get xml
    print('Retrieving the XML for every title')
    header = getHeader(domain=domain)
    footer = '</mediawiki>'
    # Timestamp without spaces or colons so the name is valid on every
    # common filesystem.
    xmlfilename = 'wikidump-%s.xml' % (datetime.datetime.now().strftime('%Y%m%d-%H%M%S'))
    xmlfile = open(xmlfilename, 'w')
    try:
        xmlfile.write(header)
        c = 0
        for title in titles:
            xml = getXML(domain=domain, title=title, curonly=curonly)
            xmlfile.write(cleanXML(xml=xml))
            # Count after the page is written so the progress message
            # reports pages actually downloaded.
            c += 1
            if c % 10 == 0:
                print('    Downloaded %d pages' % (c))
        xmlfile.write(footer)
    finally:
        # Always release the file handle, even if a download fails.
        xmlfile.close()
coding a xml generator git-svn-id: https://wikiteam.googlecode.com/svn/trunk@2 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`#!/usr/bin/env python2.5`
			`# -- coding: utf-8 --`

			`# Copyright (C) 2011 emijrp`
			`# This program is free software: you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License as published by`
			`# the Free Software Foundation, either version 3 of the License, or`
			`# (at your option) any later version.`
			`#`
			`# This program is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`# GNU General Public License for more details.`
			`#`
			`# You should have received a copy of the GNU General Public License`
			`# along with this program. If not, see <http://www.gnu.org/licenses/>.`

suballpages git-svn-id: https://wikiteam.googlecode.com/svn/trunk@3 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`import datetime`
			`import os`
coding a xml generator git-svn-id: https://wikiteam.googlecode.com/svn/trunk@2 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`import re`
suballpages git-svn-id: https://wikiteam.googlecode.com/svn/trunk@3 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`import subprocess`
coding a xml generator git-svn-id: https://wikiteam.googlecode.com/svn/trunk@2 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`import sys`
			`import urllib`
suballpages git-svn-id: https://wikiteam.googlecode.com/svn/trunk@3 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`import urllib2`
coding a xml generator git-svn-id: https://wikiteam.googlecode.com/svn/trunk@2 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago
			`# todo:`
			`# curonly and all history (curonly si puede acumular varias peticiones en un solo GET, ara full history pedir cada pagina una a una)`
			`# usar api o parsear html si no está disponible`
			`# http://www.mediawiki.org/wiki/Manual:Parameters_to_Special:Export`
git-svn-id: https://wikiteam.googlecode.com/svn/trunk@6 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`# threads para bajar más rápido? pedir varias páginas a la vez`
git-svn-id: https://wikiteam.googlecode.com/svn/trunk@8 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`# images?`
			`# Special:Log? uploads, account creations, etc`
coding a xml generator git-svn-id: https://wikiteam.googlecode.com/svn/trunk@2 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago
suballpages git-svn-id: https://wikiteam.googlecode.com/svn/trunk@3 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`def cleanHTML(raw=''):`
			`if re.search('<!-- bodytext -->', raw): #<!-- bodytext --> <!-- /bodytext --> <!-- start content --> <!-- end content -->`
			`raw = raw.split('<!-- bodytext -->')[1].split('<!-- /bodytext -->')[0]`
			`elif re.search('<!-- start content -->', raw):`
			`raw = raw.split('<!-- start content -->')[1].split('<!-- end content -->')[0]`
			`else:`
			`print 'This wiki doesn\'t use marks to split contain'`
			`sys.exit()`
			`return raw`

coding a xml generator git-svn-id: https://wikiteam.googlecode.com/svn/trunk@2 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`def getAllPageTitles(domain='', namespaces=[]):`
			`#http://en.wikipedia.org/wiki/Special:AllPages`
			`#http://archiveteam.org/index.php?title=Special:AllPages`
			`#http://www.wikanda.es/wiki/Especial:Todas`
			`if not domain:`
			`print 'Please, use --domain parameter'`
			`sys.exit()`

suballpages git-svn-id: https://wikiteam.googlecode.com/svn/trunk@3 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`#namespace checks and stuff`
coding a xml generator git-svn-id: https://wikiteam.googlecode.com/svn/trunk@2 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`namespacenames = {0:''} # main is 0, no prefix`
			`if namespaces:`
suballpages git-svn-id: https://wikiteam.googlecode.com/svn/trunk@3 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`raw = urllib.urlopen('%s?title=Special:Allpages' % (domain)).read()`
coding a xml generator git-svn-id: https://wikiteam.googlecode.com/svn/trunk@2 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`m = re.compile(r'<option [^>]?value="(?P<namespaceid>\d+)"[^>]?>(?P<namespacename>[^<]+)</option>').finditer(raw) # [^>]*? to include selected="selected"`
			`if 'all' in namespaces:`
			`namespaces = []`
			`for i in m:`
			`namespaces.append(int(i.group("namespaceid")))`
			`namespacenames[int(i.group("namespaceid"))] = i.group("namespacename")`
			`else:`
			`#check if those namespaces really exist in this wiki`
			`namespaces2 = []`
			`for i in m:`
			`if int(i.group("namespaceid")) in namespaces:`
			`namespaces2.append(int(i.group("namespaceid")))`
			`namespacenames[int(i.group("namespaceid"))] = i.group("namespacename")`
			`namespaces = namespaces2`
			`else:`
			`namespaces = [0]`

suballpages git-svn-id: https://wikiteam.googlecode.com/svn/trunk@3 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`#retrieve all titles from Special:Allpages, if the wiki is big, perhaps there are sub-Allpages to explore`
coding a xml generator git-svn-id: https://wikiteam.googlecode.com/svn/trunk@2 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`namespaces = [i for i in set(namespaces)] #uniques`
			`titles = []`
			`for namespace in namespaces:`
suballpages git-svn-id: https://wikiteam.googlecode.com/svn/trunk@3 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`print ' Retrieving titles in the namespace', namespace`
			`url = '%s?title=Special:Allpages&namespace=%s' % (domain, namespace)`
			`raw = urllib.urlopen(url).read()`
			`raw = cleanHTML(raw)`
coding a xml generator git-svn-id: https://wikiteam.googlecode.com/svn/trunk@2 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago
suballpages git-svn-id: https://wikiteam.googlecode.com/svn/trunk@3 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`r_title = r'title="(?P<title>[^>]+)">'`
			`r_suballpages = r'&from=(?P<from>[^>]+)&to=(?P<to>[^>]+)">'`
			`deep = 3 # 3 is the current deep of English Wikipedia for Special:Allpages, 3 levels`
			`c = 0`
			`checked_suballpages = []`
suballpages git-svn-id: https://wikiteam.googlecode.com/svn/trunk@4 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`rawacum = raw`
suballpages git-svn-id: https://wikiteam.googlecode.com/svn/trunk@3 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`while re.search(r_suballpages, raw) and c < deep:`
			`#load sub-Allpages`
			`m = re.compile(r_suballpages).finditer(raw)`
			`for i in m:`
			`fr = i.group('from')`
			`to = i.group('to')`
			`name = '%s-%s' % (fr, to)`
			`if not name in checked_suballpages:`
			`checked_suballpages.append(name)`
			`url = '%s?title=Special:Allpages&namespace=%s&from=%s&to=%s' % (domain, namespace, fr, to) #do not put urllib.quote in fr or to`
			`raw2 = urllib.urlopen(url).read()`
			`raw2 = cleanHTML(raw2)`
			`rawacum += raw2 #merge it after removed junk`
			`print ' Detected sub-Allpages:', name, len(raw2), 'bytes', len(re.findall(r_title, raw2))`
			`c += 1`
coding a xml generator git-svn-id: https://wikiteam.googlecode.com/svn/trunk@2 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago
suballpages git-svn-id: https://wikiteam.googlecode.com/svn/trunk@3 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`m = re.compile(r_title).finditer(rawacum)`
coding a xml generator git-svn-id: https://wikiteam.googlecode.com/svn/trunk@2 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`for i in m:`
			`if not i.group('title').startswith('Special:'):`
suballpages git-svn-id: https://wikiteam.googlecode.com/svn/trunk@3 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`if not i.group('title') in titles:`
			`titles.append(i.group('title'))`
coding a xml generator git-svn-id: https://wikiteam.googlecode.com/svn/trunk@2 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`return titles`

suballpages git-svn-id: https://wikiteam.googlecode.com/svn/trunk@3 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`def getHeader(domain=''):`
			`#get the header of a random page, to attach it in the complete XML backup`
			`#similar to: <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/" xmlns:x....`
			`randomtitle = 'AMF5LKE43MNFGHKSDMRTJ'`
			`xml = getXML(domain=domain, title=randomtitle)`
			`header = xml.split('</mediawiki>')[0]`
			`return header`

			`def getXML(domain='', title='', curonly=False):`
			`#http://www.mediawiki.org/wiki/Manual_talk:Parameters_to_Special:Export#Parameters_no_longer_in_use.3F`
git-svn-id: https://wikiteam.googlecode.com/svn/trunk@6 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`limit = 1000`
git-svn-id: https://wikiteam.googlecode.com/svn/trunk@8 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`truncated = False`
suballpages git-svn-id: https://wikiteam.googlecode.com/svn/trunk@3 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`title_ = re.sub(' ', '_', title)`
			`headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.8.0.4) Gecko/20060508 Firefox/1.5.0.4'}`
			`params = {'title': 'Special:Export', 'pages': title, 'action': 'submit', }`
			`if curonly:`
			`params['curonly'] = 1`
			`else:`
git-svn-id: https://wikiteam.googlecode.com/svn/trunk@6 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`params['offset'] = '1'`
suballpages git-svn-id: https://wikiteam.googlecode.com/svn/trunk@3 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`params['limit'] = limit`
git-svn-id: https://wikiteam.googlecode.com/svn/trunk@6 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`data = urllib.urlencode(params)`
			`req = urllib2.Request(url=domain, data=data, headers=headers)`
suballpages git-svn-id: https://wikiteam.googlecode.com/svn/trunk@3 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`f = urllib2.urlopen(req)`
			`xml = f.read()`

git-svn-id: https://wikiteam.googlecode.com/svn/trunk@7 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`#if complete history, check if this page history has > limit edits, if so, retrieve all using offset if available`
			`#else, warning about Special:Export truncating large page histories`
			`r_timestamp = r'<timestamp>([^<]+)</timestamp>'`
			`if not curonly and re.search(r_timestamp, xml): # to avoid empty pages: Special:Allpages and the random one`
			`params['offset'] = re.findall(r_timestamp, xml)[-1]`
git-svn-id: https://wikiteam.googlecode.com/svn/trunk@8 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`while not truncated and params['offset'] == re.findall(r_timestamp, xml)[-1]:`
git-svn-id: https://wikiteam.googlecode.com/svn/trunk@6 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`data = urllib.urlencode(params)`
			`req2 = urllib2.Request(url=domain, data=data, headers=headers)`
suballpages git-svn-id: https://wikiteam.googlecode.com/svn/trunk@3 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`f2 = urllib2.urlopen(req2)`
			`xml2 = f2.read()`
git-svn-id: https://wikiteam.googlecode.com/svn/trunk@7 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`if re.findall(r_timestamp, xml2)[-1] == params['offset']:`
			`print 'ATTENTION: This wiki does not allow some parameters in Special:Export, so, pages with large histories may be truncated'`
git-svn-id: https://wikiteam.googlecode.com/svn/trunk@8 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`truncated = True`
git-svn-id: https://wikiteam.googlecode.com/svn/trunk@6 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`break`
git-svn-id: https://wikiteam.googlecode.com/svn/trunk@7 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`else:`
			`xml = xml.split('</page>')[0]+xml2.split('<page>\n')[1]`
			`print title, len(xml2), re.findall(r_timestamp, xml2)`
suballpages git-svn-id: https://wikiteam.googlecode.com/svn/trunk@3 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`return xml`

			`def cleanXML(xml=''):`
			`xml = xml.split('</siteinfo>\n')[1]`
			`xml = xml.split('</mediawiki>')[0]`
			`return xml`
coding a xml generator git-svn-id: https://wikiteam.googlecode.com/svn/trunk@2 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago
			`if __name__ == '__main__':`
git-svn-id: https://wikiteam.googlecode.com/svn/trunk@6 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`domain = 'http://archiveteam.org/index.php' # 'http://en.wikipedia.org/w'`
			`#domain = 'http://wikanda.cadizpedia.eu/w/index.php' # 'http://en.wikipedia.org/w'`
coding a xml generator git-svn-id: https://wikiteam.googlecode.com/svn/trunk@2 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`curonly = False`
suballpages git-svn-id: https://wikiteam.googlecode.com/svn/trunk@3 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`namespaces = [0]`
coding a xml generator git-svn-id: https://wikiteam.googlecode.com/svn/trunk@2 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago
			`if re.findall(r'(wikipedia\|wikisource\|wiktionary\|wikibooks\|wikiversity\|wikimedia\|wikispecies\|wikiquote\|wikinews)\.org', domain):`
			`print 'DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!\nDownload the dumps from http://download.wikimedia.org\nThanks!'`
			`sys.exit()`

suballpages git-svn-id: https://wikiteam.googlecode.com/svn/trunk@3 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`#get titles`
			`print 'Loading page titles from namespaces =', ','.join([str(i) for i in namespaces])`
coding a xml generator git-svn-id: https://wikiteam.googlecode.com/svn/trunk@2 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`titles = getAllPageTitles(domain=domain, namespaces=namespaces)`
suballpages git-svn-id: https://wikiteam.googlecode.com/svn/trunk@3 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`#print '\n'.join(titles)`
coding a xml generator git-svn-id: https://wikiteam.googlecode.com/svn/trunk@2 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`print '%d titles loaded' % (len(titles))`
suballpages git-svn-id: https://wikiteam.googlecode.com/svn/trunk@3 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago
			`#get xml`
			`print 'Retrieving the XML for every title'`
			`header = getHeader(domain=domain)`
			`footer = '</mediawiki>'`
			`xmlfilename = 'wikidump-%s.xml' % (str(datetime.datetime.now()))`
			`xmlfile = open(xmlfilename, 'w')`
			`xmlfile.write(header)`
git-svn-id: https://wikiteam.googlecode.com/svn/trunk@6 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`c = 1`
suballpages git-svn-id: https://wikiteam.googlecode.com/svn/trunk@3 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`for title in titles:`
git-svn-id: https://wikiteam.googlecode.com/svn/trunk@6 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`if c % 10 == 0:`
			`print ' Downloaded %d pages' % (c)`
suballpages git-svn-id: https://wikiteam.googlecode.com/svn/trunk@3 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`xml = getXML(domain=domain, title=title, curonly=curonly)`
			`xml = cleanXML(xml=xml)`
			`xmlfile.write(xml)`
git-svn-id: https://wikiteam.googlecode.com/svn/trunk@6 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`c += 1`
suballpages git-svn-id: https://wikiteam.googlecode.com/svn/trunk@3 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95 13 years ago			`xmlfile.write(footer)`
			`xmlfile.close()`