+
+# Documentation for users: https://github.com/WikiTeam/wikiteam/wiki
+# Documentation for developers: http://wikiteam.readthedocs.com
+
+import argparse
+import datetime
+import os
+import random
+import re
+import requests
+import sys
+import urllib.error
+import urllib.request
+
+if sys.version_info < (3, 0):
+ import cookielib
+else:
+ import http.cookiejar as cookielib
+
+import mediawiki
+
+__version__ = "0.3.0"
+
+def avoidWikimediaProjects(config={}):
+    """ Skip Wikimedia projects and redirect to the dumps website """
+
+    # notice about Wikimedia dumps
+    if re.findall(r'(?i)(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews|wikidata|wikivoyage)\.org', config['wiki']):
+        print('PLEASE, DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!')
+        print('Download Wikimedia dumps from https://dumps.wikimedia.org')
+        if not config['other']['force']:
+            print('Thanks!')
+            sys.exit()
+
+def bye():
+ """ Print closing message """
+
+ message = """
+---> Congratulations! Your dump is complete <---
+If you found any bug, report a new issue here: https://github.com/WikiTeam/wikiteam/issues
+If this is a public wiki, please consider publishing this dump. Do it yourself as explained in https://github.com/WikiTeam/wikiteam/wiki/Tutorial#Publishing_the_dump or contact us at https://github.com/WikiTeam/wikiteam
+Good luck! Bye!"""
+ print(message)
+
+def domain2prefix(config={}):
+ """ Convert domain name to a valid prefix filename. """
+
+ domain = ''
+ if config['wiki']:
+ domain = config['wiki']
+ domain = domain.lower()
+ domain = re.sub(r'(https?://|www\.|/index\.php|/api\.php)', '', domain)
+ domain = re.sub(r'/', '_', domain)
+ domain = re.sub(r'\.', '', domain)
+ domain = re.sub(r'[^A-Za-z0-9]', '_', domain)
+ domain = domain.strip('_')
+ return domain
+
+def getAPI(url=''):
+ """ Returns API for a wiki, if available """
+
+ wikiengine = getWikiEngine(url=url)
+ api = ''
+ if wikiengine == 'mediawiki':
+ api = mediawiki.mwGetAPI(url=url)
+
+ return api
+
+def getIndex(url=''):
+ """ Returns Index.php for a wiki, if available """
+
+ wikiengine = getWikiEngine(url=url)
+ index = ''
+ if wikiengine == 'mediawiki':
+ index = mediawiki.mwGetIndex(url=url)
+
+ return index
+
+def getParameters(params=[]):
+    """ Parse command-line parameters into a config dict """
+
+    if not params:
+        params = sys.argv[1:]
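+    # e.g. getParameters(params=['--pages', 'http://wiki.example.org'])
+    # (hypothetical URL); defaults to the command-line arguments.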
+
+ config = {}
+ parser = argparse.ArgumentParser(description='Tools for downloading and preserving wikis.')
+
+ # General params
+ parser.add_argument(
+ '-v', '--version', action='version', version=getVersion())
+ parser.add_argument(
+ '--cookies', metavar="cookies.txt", help="Path to a cookies.txt file.")
+    parser.add_argument(
+        '--delay',
+        metavar='5',
+        default=0,
+        type=float,
+        help="Adds a delay (in seconds) between requests.")
+    parser.add_argument(
+        '--retries',
+        metavar='5',
+        default=5,
+        type=int,
+        help="Maximum number of retries.")
+ parser.add_argument('--path', help='Path to store wiki dump at.')
+ parser.add_argument(
+ '--resume',
+ action='store_true',
+ help='Resumes previous incomplete dump (requires --path).')
+    parser.add_argument(
+        '--force',
+        action='store_true',
+        help='Skip the Wikimedia-project check and other warnings.')
+ parser.add_argument(
+ '--user', help='Username if authentication is required.')
+ parser.add_argument(
+ '--pass',
+ dest='password',
+ help='Password if authentication is required.')
+
+ # URL params
+ groupWiki = parser.add_argument_group()
+ groupWiki.add_argument(
+ 'wiki',
+ default='',
+ nargs='?',
+ help="URL to wiki (e.g. http://wiki.domain.org).")
+ groupWiki.add_argument(
+ '--mw-api',
+ help="URL to MediaWiki API (e.g. http://wiki.domain.org/w/api.php).")
+ groupWiki.add_argument(
+ '--mw-index',
+ help="URL to MediaWiki index.php (e.g. http://wiki.domain.org/w/index.php).")
+
+ # Download params
+    groupDownload = parser.add_argument_group(
+        'Data to download',
+        'What info to download from the wiki')
+ groupDownload.add_argument(
+ '--pages',
+ action='store_true',
+ help="Generates a dump of pages (--pages --curonly for current revisions only).")
+ groupDownload.add_argument('--curonly', action='store_true',
+ help='Store only the current version of pages.')
+ groupDownload.add_argument(
+ '--images', action='store_true', help="Generates an image dump.")
+ groupDownload.add_argument(
+ '--namespaces',
+ metavar="1,2,3",
+ help='Comma-separated value of namespaces to include (all by default).')
+ groupDownload.add_argument(
+ '--exnamespaces',
+ metavar="1,2,3",
+ help='Comma-separated value of namespaces to exclude.')
+
+ # Meta info params
+ groupMeta = parser.add_argument_group(
+ 'Meta info',
+ 'What meta info to retrieve from the wiki')
+ groupMeta.add_argument(
+ '--get-api',
+ action='store_true',
+ help="Returns wiki API when available.")
+ groupMeta.add_argument(
+ '--get-index',
+ action='store_true',
+ help="Returns wiki Index.php when available.")
+ groupMeta.add_argument(
+ '--get-wiki-engine',
+ action='store_true',
+ help="Returns wiki engine.")
+
+    args = parser.parse_args(params)
+
+    # No wiki URL? Exit
+    if not args.wiki:
+        print('ERROR: Provide a URL to a wiki')
+        parser.print_help()
+        sys.exit(1)
+
+    # Don't mix download params and meta info params
+    if (args.pages or args.images) and \
+            (args.get_api or args.get_index or args.get_wiki_engine):
+        print('ERROR: Don\'t mix download params and meta info params')
+        parser.print_help()
+        sys.exit(1)
+
+    # No download params and no meta info params? Exit
+    if (not args.pages and not args.images) and \
+            (not args.get_api and not args.get_index and not args.get_wiki_engine):
+        print('ERROR: Use at least one download param or meta info param')
+        parser.print_help()
+        sys.exit(1)
+
+    # Execute meta info params
+    if args.wiki:
+        if args.get_api:
+            print(getAPI(url=args.wiki))
+            sys.exit()
+        if args.get_index:
+            print(getIndex(url=args.wiki))
+            sys.exit()
+        if args.get_wiki_engine:
+            print(getWikiEngine(url=args.wiki))
+            sys.exit()
+
+ # Create session
+ cj = cookielib.MozillaCookieJar()
+ if args.cookies:
+ cj.load(args.cookies)
+ print('Using cookies from %s' % args.cookies)
+
+ session = requests.Session()
+ session.cookies = cj
+ session.headers.update({'User-Agent': getUserAgent()})
+ if args.user and args.password:
+ session.auth = (args.user, args.password)
+ # session.mount(args.mw_api.split('/api.php')[0], HTTPAdapter(max_retries=max_ret))
+
+ # check URLs
+ for url in [args.mw_api, args.mw_index, args.wiki]:
+ if url and (not url.startswith('http://') and not url.startswith('https://')):
+ print(url)
+ print('ERROR: URLs must start with http:// or https://\n')
+ parser.print_help()
+ sys.exit(1)
+
+    wikiengine = getWikiEngine(args.wiki)
+    if wikiengine == 'wikispaces':
+        pass
+    else:  # presume it is a MediaWiki
+        if not args.mw_api:
+            args.mw_api = mediawiki.mwGetAPI(url=args.wiki)
+            if not args.mw_api:
+                print('ERROR: Provide a URL to the API with --mw-api')
+                parser.print_help()
+                sys.exit(1)
+        if not args.mw_index:
+            args.mw_index = mediawiki.mwGetIndex(url=args.wiki)
+            if not args.mw_index:
+                print('ERROR: Provide a URL to index.php with --mw-index')
+                parser.print_help()
+                sys.exit(1)
+
+ # check user and pass (one requires both)
+ if (args.user and not args.password) or (args.password and not args.user):
+ print('ERROR: Both --user and --pass are required for authentication.')
+ parser.print_help()
+ sys.exit(1)
+
+ namespaces = ['all']
+ exnamespaces = []
+    # Process namespace inclusions
+    if args.namespaces:
+        # accept digits, commas and spaces; '-' is kept for negative namespace IDs
+        if re.search(
+                r'[^\d, \-]',
+                args.namespaces) and args.namespaces.lower() != 'all':
+            print("Invalid namespace values.\nValid format is integer(s) separated by commas")
+            sys.exit(1)
+        else:
+            ns = re.sub(' ', '', args.namespaces)
+            if ns.lower() == 'all':
+                namespaces = ['all']
+            else:
+                namespaces = [int(i) for i in ns.split(',')]
+
+ # Process namespace exclusions
+ if args.exnamespaces:
+ if re.search(r'[^\d, \-]', args.exnamespaces):
+ print("Invalid namespace values.\nValid format is integer(s) separated by commas")
+ sys.exit(1)
+ else:
+ ns = re.sub(' ', '', args.exnamespaces)
+ if ns.lower() == 'all':
+ print('You cannot exclude all namespaces.')
+ sys.exit(1)
+ else:
+ exnamespaces = [int(i) for i in ns.split(',')]
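+    # e.g. --namespaces '0, 2,4' yields namespaces = [0, 2, 4];
+    # --exnamespaces '8,9' yields exnamespaces = [8, 9]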
+
+    # --curonly requires --pages
+    if args.curonly and not args.pages:
+        print("--curonly requires --pages\n")
+        parser.print_help()
+        sys.exit(1)
+
+    config = {
+        'wiki': args.wiki,
+        'wikiengine': wikiengine,
+        'api': args.mw_api or '',
+        'index': args.mw_index or '',
+        'curonly': args.curonly,
+        'date': datetime.datetime.now().strftime('%Y%m%d'),
+        'images': args.images,
+        'pages': args.pages,
+        'logs': False,
+        'namespaces': namespaces,
+        'exnamespaces': exnamespaces,
+        'path': os.path.normpath(args.path) if args.path else '',
+        'cookies': args.cookies or '',
+        'delay': args.delay,
+        'retries': args.retries,
+        'other': {
+            'resume': args.resume,
+            'filenamelimit': 100,  # do not change
+            'force': args.force,
+            'session': session,
+        }
+    }
+
+ # calculating path, if not defined by user with --path=
+ if not config['path']:
+ config['path'] = './%s-%s-wikidump' % (domain2prefix(config=config), config['date'])
+
+ return config
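+
+# A minimal driving sketch (hypothetical URL; the functions are the ones
+# defined in this module):
+#   config = getParameters(params=['--pages', '--images', 'http://wiki.example.org'])
+#   avoidWikimediaProjects(config=config)
+#   # ... perform the dump with the chosen engine module ...
+#   bye()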
+
+def getURL(url=''):
+    """ Fetch a URL and return its body as text """
+
+    html = ''
+    try:
+        req = urllib.request.Request(url, headers={'User-Agent': getUserAgent()})
+        html = urllib.request.urlopen(req).read().decode().strip()
+    except urllib.error.URLError:
+        print('Error while retrieving URL', url)
+        sys.exit(1)
+    return html
+
+def getUserAgent():
+    """ Return a browser-like user-agent to avoid announcing the default Python one """
+
+ useragents = [
+ 'Mozilla/5.0',
+ ]
+ return random.choice(useragents)
+
+def getVersion():
+ return __version__
+
+def getWikiEngine(url=''):
+ """ Returns wiki engine of a URL, if known """
+
+ wikiengine = 'unknown'
+ if url:
+ session = requests.Session()
+ session.headers.update({'User-Agent': getUserAgent()})
+ r = session.post(url=url)
+ if r.status_code == 405 or r.text == '':
+ r = session.get(url=url)
+ result = r.text
+ else:
+ return wikiengine.lower()
+
+    if re.search(
+            r'(?im)(<meta name="generator" content="DokuWiki)', result):
+        wikiengine = 'dokuwiki'
+    elif re.search(r'(?im)(alt="Powered by MediaWiki"|<meta name="generator" content="MediaWiki)', result):
+        wikiengine = 'mediawiki'
+    elif re.search(r'(?im)(>MoinMoin Powered</a>|<option value="LocalSiteMap">)', result):
+        wikiengine = 'moinmoin'
+    elif re.search(r'(?im)(twikiCurrentTopicLink|twikiCurrentWebHomeLink|twikiLink)', result):
+        wikiengine = 'twiki'
+    elif re.search(r'(?im)(<!--PageHeaderFmt-->)', result):
+        wikiengine = 'pmwiki'
+    elif re.search(r'(?im)(Wheeled by <a class="external-link" href="http://www\.wagn\.org">|<body id="wagn">)', result):
+        wikiengine = 'wagn'
+    elif re.search(r'(?im)(<div class="wikiversion">\s*(<p>)?JSPWiki|xmlns:jspwiki="http://www\.jspwiki\.org")', result):
+        wikiengine = 'jspwiki'
+    elif re.search(r'(?im)(Powered by:?\s*(<br ?/>)?\s*<a href="http://kwiki\.org">|\bKwikiNavigation\b)', result):
+        wikiengine = 'kwiki'
+    elif re.search(r'(?im)(powered by <a href="[^"]*\bzwiki.org(/[^"]*)?">)', result):
+        wikiengine = 'zwiki'
+    # WakkaWiki forks
+    elif re.search(r'(?im)(<meta name="generator" content="WikkaWiki|<a class="ext" href="(http://wikka\.jsnx\.com/|http://wikkawiki\.org/)">)', result):
+        wikiengine = 'wikkawiki'  # formerly WakkaWiki
+    elif re.search(r'(?im)(Powered by <a href="[^"]*CitiWiki">CitiWiki)', result):
+        wikiengine = 'citiwiki'
+    elif re.search(r'(?im)(Powered by <a href="http://www\.wikidot\.com">|wikidot-privacy-button-hovertip|javascript:WIKIDOT\.page)', result):
+        wikiengine = 'wikidot'
+    elif re.search(r'(?im)(IS_WETPAINT_USER|wetpaintLoad|WPC-bodyContentContainer)', result):
+        wikiengine = 'wetpaint'
+    elif re.search(r'(?im)(<div id="footer-pbwiki">