xmlrevisions: actually allow index to be undefined, don't POST data

* http://biografias.bcn.cl/api.php does not like the data to be POSTed.
  Just use URL parameters. Some wikis had anti-spam protections which
  made us POST everything, but for most wikis this should be fine.
* If the index is not defined, don't fail.
* Use only the base api.php URL, not parameters, in domain2prefix.

https://github.com/WikiTeam/wikiteam/issues/314
pull/319/head
Federico Leva 6 years ago
parent 3a56037279
commit 754027de42

@ -51,6 +51,10 @@ except ImportError:
print "Please install the wikitools 1.3+ module if you want to use --xmlrevisions."
import time
import urllib
try:
from urlparse import urlparse, urlunparse
except ImportError:
from urllib.parse import urlparse, urlunparse
UTF8Writer = getwriter('utf8')
sys.stdout = UTF8Writer(sys.stdout)
@ -192,7 +196,7 @@ def getNamespacesAPI(config={}, session=None):
if namespaces:
r = session.post(
url=config['api'],
data={
params={
'action': 'query',
'meta': 'siteinfo',
'siprop': 'namespaces',
@ -255,7 +259,7 @@ def getPageTitlesAPI(config={}, session=None):
retryCount = 0
while retryCount < config["retries"]:
try:
r = session.post(url=config['api'], data=params, timeout=30)
r = session.post(url=config['api'], params=params, timeout=30)
break
except ConnectionError as err:
print "Connection error: %s" % (str(err),)
@ -392,7 +396,7 @@ def getPageTitles(config={}, session=None):
titles = []
if 'api' in config and config['api']:
r = session.post(config['api'], {'action': 'query', 'list': 'allpages', 'format': 'json'}, timeout=30)
r = session.post(config['api'], params={'action': 'query', 'list': 'allpages', 'format': 'json'}, timeout=30)
test = getJSON(r)
if ('warnings' in test and 'allpages' in test['warnings'] and '*' in test['warnings']['allpages']
and test['warnings']['allpages']['*'] == 'The "allpages" module has been disabled.'):
@ -442,9 +446,11 @@ def getXMLHeader(config={}, session=None):
# similar to: <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/"
# xmlns:x....
randomtitle = 'Main_Page' # previously AMF5LKE43MNFGHKSDMRTJ
print config['api']
if config['xmlrevisions'] and config['api'] and config['api'].endswith("api.php"):
xml = None
try:
print 'Getting the XML header from the API'
r = session.get(config['api'] + '?action=query&revids=1&export&exportnowrap', timeout=10)
xml = r.text
except requests.exceptions.RetryError:
@ -1254,7 +1260,7 @@ def domain2prefix(config={}, session=None):
domain = config['index']
domain = domain.lower()
domain = re.sub(r'(https?://|www\.|/index\.php|/api\.php)', '', domain)
domain = re.sub(r'(https?://|www\.|/index\.php.+|/api\.php.+)', '', domain)
domain = re.sub(r'/', '_', domain)
domain = re.sub(r'\.', '', domain)
domain = re.sub(r'[^A-Za-z0-9]', '_', domain)
@ -1530,15 +1536,20 @@ def getParameters(params=[]):
session=session):
print 'index.php is OK'
else:
index = '/'.join(index.split('/')[:-1])
try:
index = '/'.join(index.split('/')[:-1])
except AttributeError:
index = None
if index and checkIndex(
index=index,
cookies=args.cookies,
session=session):
print 'index.php is OK'
else:
print 'Error in index.php, please, provide a correct path to index.php'
sys.exit(1)
print 'Error in index.php.'
if not args.xmlrevisions:
print 'Please, provide a correct path to index.php or use --xmlrevisions. Terminating.'
sys.exit(1)
# check user and pass (one requires both)
if (args.user and not args.password) or (args.password and not args.user):
@ -1622,16 +1633,17 @@ def checkAPI(api=None, session=None):
print 'Checking API...', api
r = session.post(
url=api,
data={
params={
'action': 'query',
'meta': 'siteinfo',
'format': 'json'},
timeout=30
)
if r.url == api:
if r.status_code == 200:
break
else:
api = r.url
p = r.url
api = urlunparse([p.scheme, p.netloc, p.path, '', '', ''])
if "MediaWiki API is not enabled for this site." in r.text:
return False
try:
@ -1988,10 +2000,14 @@ def avoidWikimediaProjects(config={}, other={}):
""" Skip Wikimedia projects and redirect to the dumps website """
# notice about wikipedia dumps
url = ''
if config['api']:
url = url + config['api']
if config['index']:
url = url + config['index']
if re.findall(
r'(?i)(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews|wikidata|wikivoyage)\.org',
config['api'] +
config['index']):
url):
print 'PLEASE, DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!'
print 'Download the dumps from http://dumps.wikimedia.org'
if not other['force']:

Loading…
Cancel
Save