diff --git a/wikiteam/__init__.py b/wikiteam/__init__.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/wikiteam/__init__.py
@@ -0,0 +1 @@
+
diff --git a/wikiteam/mediawiki.py b/wikiteam/mediawiki.py
new file mode 100644
index 0000000..5881afd
--- /dev/null
+++ b/wikiteam/mediawiki.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Copyright (C) 2011-2016 WikiTeam developers
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+# Documentation for users: https://github.com/WikiTeam/wikiteam/wiki
+# Documentation for developers: http://wikiteam.readthedocs.com
+
+import re
+
+import wikiteam
+
+def mwGetAPI(url=''):
+    """ Returns API for a MediaWiki wiki, if available """
+
+    api = ''
+    html = wikiteam.getURL(url=url)
+    m = re.findall(
+        r'(?im)<\s*link\s*rel="EditURI"\s*type="application/rsd\+xml"\s*href="([^>]+?)\?action=rsd"\s*/\s*>',
+        html)
+    if m:
+        api = m[0]
+        if api.startswith('//'):  # gentoo wiki and others
+            api = url.split('//')[0] + api
+    return api
+
+def mwGetIndex(url=''):
+    """ Returns Index.php for a MediaWiki wiki, if available """
+
+    api = mwGetAPI(url=url)
+    index = ''
+    html = wikiteam.getURL(url=url)
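+    # Look for the "view source" / "history" tab links in the page HTML;
+    # their href attribute carries the path to index.php.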
+    m = re.findall(r'<li id="ca-viewsource"[^>]*?>\s*(?:<span>)?\s*<a href="([^\?]+?)\?', html)
+    if m:
+        index = m[0]
+    else:
+        m = re.findall(r'<li id="ca-history"[^>]*?>\s*(?:<span>)?\s*<a href="([^\?]+?)\?', html)
+        if m:
+            index = m[0]
+    if not index and api:
+        index = '/'.join(api.split('/')[:-1]) + '/index.php'
+    return index
diff --git a/wikiteam/wikiteam.py b/wikiteam/wikiteam.py
new file mode 100644
--- /dev/null
+++ b/wikiteam/wikiteam.py
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Copyright (C) 2011-2016 WikiTeam developers
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+# Documentation for users: https://github.com/WikiTeam/wikiteam/wiki
+# Documentation for developers: http://wikiteam.readthedocs.com
+
+import argparse
+import datetime
+import os
+import random
+import re
+import requests  # seriously needed?
+import sys
+import urllib.request
+
+if sys.version_info < (3, 0):
+    import cookielib
+else:
+    import http.cookiejar as cookielib
+
+import mediawiki
+
+__version__ = "0.3.0"
+
+def avoidWikimediaProjects(config={}):
+    """ Skip Wikimedia projects and redirect to the dumps website """
+
+    # notice about wikipedia dumps
+    if re.findall(r'(?i)(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews|wikidata|wikivoyage)\.org', config['wiki']):
+        print('PLEASE, DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!')
+        print('Download Wikimedia dumps from https://dumps.wikimedia.org')
+        """if not other['force']:
+            print 'Thanks!'
+            sys.exit()"""
+
+def bye():
+    """ Print closing message """
+
+    message = """
+---> Congratulations! Your dump is complete <---
+If you found any bug, report a new issue here: https://github.com/WikiTeam/wikiteam/issues
+If this is a public wiki, please consider publishing this dump. Do it yourself as explained in https://github.com/WikiTeam/wikiteam/wiki/Tutorial#Publishing_the_dump or contact us at https://github.com/WikiTeam/wikiteam
+Good luck! Bye!"""
+    print(message)
+
+def domain2prefix(config={}):
+    """ Convert domain name to a valid prefix filename. """
+
+    # e.g. 'http://wiki.domain.org/index.php' becomes 'wikidomainorg'
+    domain = ''
+    if config['wiki']:
+        domain = config['wiki']
+    domain = domain.lower()
+    domain = re.sub(r'(https?://|www\.|/index\.php|/api\.php)', '', domain)
+    domain = re.sub(r'/', '_', domain)
+    domain = re.sub(r'\.', '', domain)
+    domain = re.sub(r'[^A-Za-z0-9]', '_', domain)
+    domain = domain.strip('_')
+    return domain
+
+def getAPI(url=''):
+    """ Returns API for a wiki, if available """
+
+    wikiengine = getWikiEngine(url=url)
+    api = ''
+    if wikiengine == 'mediawiki':
+        api = mediawiki.mwGetAPI(url=url)
+
+    return api
+
+def getIndex(url=''):
+    """ Returns Index.php for a wiki, if available """
+
+    wikiengine = getWikiEngine(url=url)
+    index = ''
+    if wikiengine == 'mediawiki':
+        index = mediawiki.mwGetIndex(url=url)
+
+    return index
+
+def getParameters(params=[]):
+    """ Import parameters into variable """
+
+    if not params:
+        params = sys.argv
+
+    config = {}
+    parser = argparse.ArgumentParser(description='Tools for downloading and preserving wikis.')
+
+    # General params
+    parser.add_argument(
+        '-v', '--version', action='version', version=getVersion())
+    parser.add_argument(
+        '--cookies', metavar="cookies.txt", help="Path to a cookies.txt file.")
+    parser.add_argument(
+        '--delay',
+        metavar=5,
+        default=0,
+        type=float,
+        help="Adds a delay (in seconds).")
+    parser.add_argument(
+        '--retries',
+        metavar=5,
+        default=5,
+        help="Maximum number of retries.")
+    parser.add_argument('--path', help='Path to store wiki dump at.')
+    parser.add_argument(
+        '--resume',
+        action='store_true',
+        help='Resumes previous incomplete dump (requires --path).')
+    parser.add_argument('--force', action='store_true', help='')
+    parser.add_argument(
+        '--user', help='Username if authentication is required.')
+    parser.add_argument(
+        '--pass',
+        dest='password',
+        help='Password if authentication is required.')
+
+    # URL params
+    groupWiki = parser.add_argument_group()
+    groupWiki.add_argument(
+        'wiki',
+        default='',
+        nargs='?',
+        help="URL to wiki (e.g. http://wiki.domain.org).")
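+    # Optional direct URLs; when given they take precedence over the
+    # automatic api.php/index.php discovery attempted later.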
+    groupWiki.add_argument(
+        '--mw-api',
+        help="URL to MediaWiki API (e.g. http://wiki.domain.org/w/api.php).")
+    groupWiki.add_argument(
+        '--mw-index',
+        help="URL to MediaWiki index.php (e.g. http://wiki.domain.org/w/index.php).")
+
+    # Download params
+    groupDownload = parser.add_argument_group(
+        'Data to download',
+        'What info to download from the wiki')
+    groupDownload.add_argument(
+        '--pages',
+        action='store_true',
+        help="Generates a dump of pages (--pages --curonly for current revisions only).")
+    groupDownload.add_argument('--curonly', action='store_true',
+                               help='Store only the current version of pages.')
+    groupDownload.add_argument(
+        '--images', action='store_true', help="Generates an image dump.")
+    groupDownload.add_argument(
+        '--namespaces',
+        metavar="1,2,3",
+        help='Comma-separated value of namespaces to include (all by default).')
+    groupDownload.add_argument(
+        '--exnamespaces',
+        metavar="1,2,3",
+        help='Comma-separated value of namespaces to exclude.')
+
+    # Meta info params
+    groupMeta = parser.add_argument_group(
+        'Meta info',
+        'What meta info to retrieve from the wiki')
+    groupMeta.add_argument(
+        '--get-api',
+        action='store_true',
+        help="Returns wiki API when available.")
+    groupMeta.add_argument(
+        '--get-index',
+        action='store_true',
+        help="Returns wiki Index.php when available.")
+    groupMeta.add_argument(
+        '--get-wiki-engine',
+        action='store_true',
+        help="Returns wiki engine.")
+
+    args = parser.parse_args()
+    # print(args)
+
+    # No wiki? Exit
+    if not args.wiki:
+        print('ERROR: Provide a URL to a wiki')
+        parser.print_help()
+        sys.exit(1)
+
+    # Don't mix download params and meta info params
+    if (args.pages or args.images) and \
+            (args.get_api or args.get_index or args.get_wiki_engine):
+        print('ERROR: Don\'t mix download params and meta info params')
+        parser.print_help()
+        sys.exit(1)
+
+    # No download params and no meta info params? Exit
+    if (not args.pages and not args.images) and \
+            (not args.get_api and not args.get_index and not args.get_wiki_engine):
+        print('ERROR: Use at least one download param or meta info param')
+        parser.print_help()
+        sys.exit(1)
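+
+    # Illustrative invocations (hypothetical URL):
+    #   wikiteam.py --get-wiki-engine http://wiki.domain.org
+    #   wikiteam.py --pages --images http://wiki.domain.org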
+
+    # Execute meta info params
+    if args.wiki:
+        if args.get_api:
+            print(getAPI(url=args.wiki))
+            sys.exit()
+        if args.get_index:
+            print(getIndex(url=args.wiki))
+            sys.exit()
+        if args.get_wiki_engine:
+            print(getWikiEngine(url=args.wiki))
+            sys.exit()
+
+    # Create session
+    cj = cookielib.MozillaCookieJar()
+    if args.cookies:
+        cj.load(args.cookies)
+        print('Using cookies from %s' % args.cookies)
+
+    session = requests.Session()
+    session.cookies = cj
+    session.headers.update({'User-Agent': getUserAgent()})
+    if args.user and args.password:
+        session.auth = (args.user, args.password)
+    # session.mount(args.mw_api.split('/api.php')[0], HTTPAdapter(max_retries=max_ret))
+
+    # check URLs
+    for url in [args.mw_api, args.mw_index, args.wiki]:
+        if url and (not url.startswith('http://') and not url.startswith('https://')):
+            print(url)
+            print('ERROR: URLs must start with http:// or https://\n')
+            parser.print_help()
+            sys.exit(1)
+
+    wikiengine = getWikiEngine(args.wiki)
+    if wikiengine == 'wikispaces':
+        pass
+    else:  # presume it is a MediaWiki
+        if not args.mw_api:
+            api = mediawiki.mwGetAPI(url=args.wiki)
+            if not api:
+                print('ERROR: Provide a URL to API')
+        if not args.mw_index:
+            index = mediawiki.mwGetIndex(url=args.wiki)
+            if not index:
+                print('ERROR: Provide a URL to Index.php')
+
+    # check user and pass (one requires both)
+    if (args.user and not args.password) or (args.password and not args.user):
+        print('ERROR: Both --user and --pass are required for authentication.')
+        parser.print_help()
+        sys.exit(1)
+
+    namespaces = ['all']
+    exnamespaces = []
+    # Process namespace inclusions
+    if args.namespaces:
+        # fix, why - ? and... --namespaces= all with a space works?
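+        # Only digits, commas, spaces and hyphens are accepted here, plus the literal 'all'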
+        if re.search(
+                r'[^\d, \-]',
+                args.namespaces) and args.namespaces.lower() != 'all':
+            print("Invalid namespace values.\nValid format is integer(s) separated by commas")
+            sys.exit()
+        else:
+            ns = re.sub(' ', '', args.namespaces)
+            if ns.lower() == 'all':
+                namespaces = ['all']
+            else:
+                namespaces = [int(i) for i in ns.split(',')]
+
+    # Process namespace exclusions
+    if args.exnamespaces:
+        if re.search(r'[^\d, \-]', args.exnamespaces):
+            print("Invalid namespace values.\nValid format is integer(s) separated by commas")
+            sys.exit(1)
+        else:
+            ns = re.sub(' ', '', args.exnamespaces)
+            if ns.lower() == 'all':
+                print('You cannot exclude all namespaces.')
+                sys.exit(1)
+            else:
+                exnamespaces = [int(i) for i in ns.split(',')]
+
+    # --curonly requires --pages
+    if args.curonly and not args.pages:
+        print("--curonly requires --pages\n")
+        parser.print_help()
+        sys.exit(1)
+
+    config = {
+        'wiki': args.wiki,
+        'wikiengine': wikiengine,
+        'curonly': args.curonly,
+        'date': datetime.datetime.now().strftime('%Y%m%d'),
+        'images': args.images,
+        'pages': args.pages,
+        'logs': False,
+        'namespaces': namespaces,
+        'exnamespaces': exnamespaces,
+        'path': args.path and os.path.normpath(args.path) or '',
+        'cookies': args.cookies or '',
+        'delay': args.delay,
+        'retries': int(args.retries),
+        'other': {
+            'resume': args.resume,
+            'filenamelimit': 100,  # do not change
+            'force': args.force,
+            'session': session,
+        }
+    }
+
+    # calculating path, if not defined by user with --path=
+    if not config['path']:
+        config['path'] = './%s-%s-wikidump' % (domain2prefix(config=config), config['date'])
+
+    return config
+
+def getURL(url=''):
+    html = ''
+    try:
+        req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
+        html = urllib.request.urlopen(req).read().decode().strip()
+    except Exception:
+        print("Error while retrieving URL", url)
+        sys.exit()
+    return html
+
+def getUserAgent():
+    """ Return a cool user-agent to hide Python user-agent """
+
+    useragents = [
+        'Mozilla/5.0',
+    ]
+    return random.choice(useragents)
+
+def getVersion():
+    return __version__
+
+def getWikiEngine(url=''):
+    """ Returns wiki engine of a URL, if known """
+
+    wikiengine = 'unknown'
+    if url:
+        session = requests.Session()
+        session.headers.update({'User-Agent': getUserAgent()})
+        r = session.post(url=url)
+        if r.status_code == 405 or r.text == '':
+            r = session.get(url=url)
+        result = r.text
+    else:
+        return wikiengine.lower()
+
+    # Engine detection: look for engine-specific fingerprints in the HTML
+    if re.search(
+            r'(?im)(>MoinMoin Powered</a>|<option value="LocalSiteMap">)', result):
+        wikiengine = 'moinmoin'
+    elif re.search(r'(?im)(twikiCurrentTopicLink|twikiCurrentWebHomeLink|twikiLink)', result):
+        wikiengine = 'twiki'
+    elif re.search(r'(?im)(<!--PageHeaderFmt-->)', result):
+        wikiengine = 'pmwiki'
+    elif re.search(r'(?im)(<a class="external-link" href="http://www\.wagn\.org">|<body id="wagn">)', result):
+        wikiengine = 'wagn'
+    elif re.search(r'(?im)(<div class="wikiversion">\s*(<p>)?JSPWiki|xmlns:jspwiki="http://www\.jspwiki\.org")', result):
+        wikiengine = 'jspwiki'
+    elif re.search(r'(?im)(Powered by:?\s*(<br\s*/?>)?\s*<a href="http://www\.kwiki\.org"|\bKwikiNavigation\b)', result):
+        wikiengine = 'kwiki'
+    elif re.search(r'(?im)(Powered by <a href="http://zwiki\.org/">)', result):
+        wikiengine = 'zwiki'
+    # WakkaWiki forks
+    elif re.search(r'(?im)(<meta name="generator" content="WikkaWiki)', result):
+        wikiengine = 'wikkawiki'  # formerly WikkaWakkaWiki
+    elif re.search(r'(?im)(CitiWiki)', result):
+        wikiengine = 'citiwiki'
+    elif re.search(r'(?im)(Powered by <a href="http://www\.wikidot\.com">|wikidot-privacy-button-hovertip|javascript:WIKIDOT\.page)', result):
+        wikiengine = 'wikidot'
+    elif re.search(r'(?im)(IS_WETPAINT_USER|wetpaintLoad|WPC-bodyContentContainer)', result):
+        wikiengine = 'wetpaint'
+    elif re.search(r'(?im)(