commit 273f1b33cb by emijrp (pull/319/head), 6 years ago

@@ -20,7 +20,7 @@
# https://github.com/WikiTeam/wikiteam/wiki
try:
from kitchen.text.converters import getwriter
from kitchen.text.converters import getwriter, to_unicode
except ImportError:
print "Please install the kitchen module."
import cookielib
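A standalone sketch of what the new to_unicode import buys over str() in the later makeXmlFromPage changes; the assertions below are illustrative assumptions, not code from the patch (Python 2, kitchen installed):

from kitchen.text.converters import to_unicode

assert to_unicode(42) == u'42'                  # numbers stringify like str()
assert to_unicode(u'\u042f') == u'\u042f'       # unicode passes through untouched
assert to_unicode('caf\xc3\xa9') == u'caf\xe9'  # UTF-8 byte str gets decoded
# str(u'\u042f') raises UnicodeEncodeError, which is what the old
# E.username(str(rev['user'])) call hits on non-ASCII usernames.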
@@ -210,25 +210,32 @@ def getNamespacesAPI(config={}, session=None):
)
result = getJSON(r)
delay(config=config, session=session)
try:
nsquery = result['query']['namespaces']
except KeyError:
print "Error: could not get namespaces from the API request"
print "HTTP %d" % r.status_code
print r.text
return None
if 'all' in namespaces:
namespaces = []
for i in result['query']['namespaces'].keys():
for i in nsquery.keys():
if int(i) < 0: # -1: Special, -2: Media, excluding
continue
namespaces.append(int(i))
namespacenames[int(i)] = result['query']['namespaces'][i]['*']
namespacenames[int(i)] = nsquery[i]['*']
else:
# check if those namespaces really exist in this wiki
namespaces2 = []
for i in result['query']['namespaces'].keys():
for i in nsquery.keys():
bi = i
i = int(i)
if i < 0: # -1: Special, -2: Media, excluding
continue
if i in namespaces:
namespaces2.append(i)
namespacenames[i] = result['query']['namespaces'][bi]['*']
namespacenames[i] = nsquery[bi]['*']
namespaces = namespaces2
else:
namespaces = [0]
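The guard around result['query']['namespaces'] is the generic "validate the API answer before using it" pattern; a self-contained sketch against the same siteinfo module (the requests session and function name here are assumptions, not the patch's code):

import requests

def fetch_namespaces(api_url):
    r = requests.get(api_url, params={
        'action': 'query', 'meta': 'siteinfo',
        'siprop': 'namespaces', 'format': 'json'})
    try:
        nsquery = r.json()['query']['namespaces']
    except KeyError:
        # The server answered, but not with the expected structure
        # (error page, closed wiki, maintenance mode, ...).
        print "Error: could not get namespaces from the API request"
        return None
    # Drop the virtual namespaces (-1 Special, -2 Media), as the patch does.
    return sorted(int(i) for i in nsquery if int(i) >= 0)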
@@ -289,18 +296,24 @@ def getPageTitlesAPI(config={}, session=None):
# print apfrom
# print jsontitles
allpages = jsontitles['query']['allpages']
try:
allpages = jsontitles['query']['allpages']
except KeyError:
print "The allpages API returned nothing. Exit."
sys.exit(1)
# Hack for old versions of MediaWiki API where result is dict
if isinstance(allpages, dict):
allpages = allpages.values()
for page in allpages:
yield page['title']
title = page['title']
titles.append(title)
yield title
c += len(allpages)
if len(titles) != len(set(titles)):
# probably we are in a loop, server returning dupe titles, stop
# it
print 'Probably a loop, finishing'
print 'Probably a loop, switching to next namespace. Duplicate title:'
print title
titles = list(set(titles))
apfrom = ''
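The loop detection added here reduces to comparing len(titles) with len(set(titles)) after each batch; a tiny illustration with made-up batches:

titles = []
for batch in (['A', 'B'], ['C', 'D'], ['C', 'D']):   # third batch repeats
    titles.extend(batch)
    if len(titles) != len(set(titles)):
        print 'Probably a loop, switching to next namespace. Duplicate title:'
        print batch[-1]
        titles = list(set(titles))
        break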
@@ -858,7 +871,15 @@ def getXMLRevisions(config={}, session=None, allpages=False):
for result in results:
pages = result['query']['pages']
for page in pages:
yield makeXmlFromPage(pages[page])
try:
xml = makeXmlFromPage(pages[page])
except PageMissingError:
logerror(
config=config,
text=u'Error: empty revision from API. Could not export page: %s' % (pages[page]['title'])
)
continue
yield xml
except wikitools.api.APIError:
print "This wikitools version seems not to work for us. Exiting."
@@ -866,26 +887,31 @@ def getXMLRevisions(config={}, session=None, allpages=False):
def makeXmlFromPage(page):
""" Output an XML document as a string from a page as in the API JSON """
p = E.page(
E.title(page['title']),
E.ns(str(page['ns'])),
E.id(str(page['pageid'])),
)
for rev in page['revisions']:
revision = E.revision(
E.id(str(rev['revid'])),
E.timestamp(rev['timestamp']),
E.contributor(
E.id(str(rev['userid'])),
E.username(str(rev['user'])),
),
E.comment(rev['comment']),
E.text(rev['*'], space="preserve", bytes=str(rev['size'])),
E.sha1(rev['sha1']),
try:
p = E.page(
E.title(page['title']),
E.ns(to_unicode(page['ns'])),
E.id(to_unicode(page['pageid'])),
)
if 'contentmodel' in rev:
revision.append(E.model)
p.append(revision)
for rev in page['revisions']:
revision = E.revision(
E.id(to_unicode(rev['revid'])),
E.timestamp(rev['timestamp']),
E.contributor(
E.id(to_unicode(rev['userid'])),
E.username(to_unicode(rev['user'])),
),
E.comment(rev['comment']),
E.text(rev['*'], space="preserve", bytes=to_unicode(rev['size'])),
)
if 'contentmodel' in rev:
revision.append(E.model(rev['contentmodel']))
# The sha1 may not have been backfilled on older wikis or may be missing for other reasons (Wikia).
if 'sha1' in rev:
revision.append(E.sha1(rev['sha1']))
p.append(revision)
except KeyError:
raise PageMissingError(page['title'], '')
return etree.tostring(p, pretty_print=True)
def readTitles(config={}, start=None):
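For reference, the optional-field handling of the new makeXmlFromPage boiled down to a runnable snippet; it assumes lxml and kitchen (both already required by dumpgenerator.py), and the sample revision dict is invented:

from lxml import etree
from lxml.builder import E
from kitchen.text.converters import to_unicode

rev = {'revid': 1, 'timestamp': '2015-01-01T00:00:00Z',
       '*': 'some wikitext', 'size': 13}

revision = E.revision(
    E.id(to_unicode(rev['revid'])),
    E.timestamp(rev['timestamp']),
    E.text(rev['*'], space="preserve", bytes=to_unicode(rev['size'])),
)
if 'contentmodel' in rev:      # absent on MediaWiki older than 1.21
    revision.append(E.model(rev['contentmodel']))
if 'sha1' in rev:              # may be missing on old wikis or Wikia
    revision.append(E.sha1(rev['sha1']))
print etree.tostring(revision, pretty_print=True)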
@@ -1575,6 +1601,7 @@ def getParameters(params=[]):
retry = 0
maxretries = args.retries
retrydelay = 20
check = None
while retry < maxretries:
try:
check = checkAPI(api=api, session=session)
@@ -1832,6 +1859,8 @@ def checkXMLIntegrity(config={}, titles=[], session=None):
else:
print 'XML dump seems to be corrupted.'
reply = ''
if config['failfast']:
reply = 'yes'
while reply.lower() not in ['yes', 'y', 'no', 'n']:
reply = raw_input('Regenerate a new dump ([yes, y], [no, n])? ')
if reply.lower() in ['yes', 'y']:
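The new failfast branch just pre-seeds the answer so the prompt loop never blocks; condensed into a standalone helper (the function name and the dict-style config are stand-ins):

def ask_regenerate(config):
    reply = ''
    if config.get('failfast'):
        reply = 'yes'          # unattended runs: always regenerate
    while reply.lower() not in ['yes', 'y', 'no', 'n']:
        reply = raw_input('Regenerate a new dump ([yes, y], [no, n])? ')
    return reply.lower() in ['yes', 'y']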

@@ -6,12 +6,12 @@
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
@@ -30,11 +30,11 @@ def main():
if len(sys.argv) < 2:
print 'python script.py file-with-apis.txt'
sys.exit()
print 'Reading list of APIs from', sys.argv[1]
wikis = open(sys.argv[1], 'r').read().splitlines()
print '%d APIs found' % (len(wikis))
for wiki in wikis:
print "#"*73
print "# Downloading", wiki
@@ -42,17 +42,15 @@ def main():
wiki = wiki.lower()
# Make the prefix in the standard way; api and index must be defined, it does not matter which is which
prefix = dumpgenerator.domain2prefix(config={'api': wiki, 'index': wiki})
# check if compressed; in that case the dump was finished previously
compressed = False
for dirname, dirnames, filenames in os.walk('.'):
if dirname == '.':
for f in filenames:
if f.startswith(prefix) and f.endswith('.7z'):
compressed = True
zipfilename = f
for f in os.listdir('.'):
if f.startswith(prefix) and f.endswith('.7z'):
compressed = True
zipfilename = f
break #stop searching, do not explore subdirectories
if compressed:
print 'Skipping... This wiki was downloaded and compressed before in', zipfilename
# Get the archive's file list.
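All three os.walk loops replaced in this file reduce to the same one-level scan of the current directory; a hedged helper showing the idea (the helper name and the sample prefix are invented):

import os

def find_by_prefix(prefix, suffix):
    """Return the first top-level entry matching prefix*suffix, else None."""
    for f in os.listdir('.'):
        if f.startswith(prefix) and f.endswith(suffix):
            return f           # no need to descend into subdirectories
    return None

zipfilename = find_by_prefix('wikifoo_org-', '.7z')       # finished dump?
wikidir = find_by_prefix('wikifoo_org-', 'wikidump')      # resumable dump?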
@@ -67,18 +65,17 @@ def main():
print "WARNING: Content of the archive not checked, we need python 2.7+ or 3.1+."
# TODO: Find a way like grep -q below without doing a 7z l multiple times?
continue
#download
started = False # was this wiki download started before? if so, resume
wikidir = ''
for dirname, dirnames, filenames in os.walk('.'):
if dirname == '.':
for d in dirnames:
if d.startswith(prefix):
wikidir = d
started = True
for f in os.listdir('.'):
# Does not find numbered wikidumps, nor does it verify directories
if f.startswith(prefix) and f.endswith('wikidump'):
wikidir = f
started = True
break #stop searching, do not explore subdirectories
# time.sleep(60)
# Uncomment the line above and add --delay=60 in the dumpgenerator.py calls below for broken wiki farms
# such as editthis.info, wiki-site.com, wikkii (adjust the value as needed;
@@ -90,15 +87,14 @@ def main():
subprocess.call('./dumpgenerator.py --api=%s --xml --images' % wiki, shell=True)
started = True
#save wikidir now
for dirname, dirnames, filenames in os.walk('.'):
if dirname == '.':
for d in dirnames:
if d.startswith(prefix):
wikidir = d
for f in os.listdir('.'):
# Does not find numbered wikidumps, nor does it verify directories
if f.startswith(prefix) and f.endswith('wikidump'):
wikidir = f
break #stop searching, do not explore subdirectories
prefix = wikidir.split('-wikidump')[0]
finished = False
if started and wikidir and prefix:
if (subprocess.call (['tail -n 1 %s/%s-history.xml | grep -q "</mediawiki>"' % (wikidir, prefix)], shell=True) ):
@@ -107,7 +103,7 @@ def main():
finished = True
# You can also issue this on your working directory to find all incomplete dumps:
# tail -n 1 */*-history.xml | grep -Ev -B 1 "</page>|</mediawiki>|==|^$"
#compress
if finished:
time.sleep(1)
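The finished check hinges on the tail | grep pipeline returning 0 only when the history XML ends with the closing </mediawiki> tag; a standalone version (the path is a placeholder):

import subprocess

def history_is_complete(xml_path):
    # grep -q exits 0 on a match, so a zero return code means the dump closed cleanly.
    cmd = 'tail -n 1 %s | grep -q "</mediawiki>"' % xml_path
    return subprocess.call(cmd, shell=True) == 0

print history_is_complete('wikifoo_org-wikidump/wikifoo_org-history.xml')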

File diff suppressed because it is too large.

@@ -23,7 +23,7 @@ import subprocess
import re
from wikitools import wiki, api
def getlist(wikia, wkfrom = 1, wkto = 1000):
def getlist(wikia, wkfrom = 1, wkto = 100):
params = {'action': 'query', 'list': 'wkdomains', 'wkactive': '1', 'wkfrom': wkfrom, 'wkto': wkto,}
request = api.APIRequest(wikia, params)
return request.query()['query']['wkdomains']
@@ -31,8 +31,9 @@ def getlist(wikia, wkfrom = 1, wkto = 1000):
def getall():
wikia = wiki.Wiki('http://community.wikia.com/api.php')
offset = 0
limit = 1000
limit = 100
domains = {}
empty = 0
# This API module has no query continuation facility
print 'Getting list of active domains...'
while True:
@@ -40,13 +41,21 @@ def getall():
if list:
print offset
domains = dict(domains.items() + list.items() )
offset += 1000
empty = 0
else:
empty += 1
offset += limit
if empty > 100:
# Hopefully we don't have more than 10k wikis deleted in a row
break
return domains
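getall() works around the missing query continuation by paging manually and counting empty batches before giving up; the same loop with a stubbed fetcher (fetch here is an illustration, not the wikitools api.APIRequest call, and the offset arithmetic is an assumption):

def fetch_all(fetch, limit=100, max_empty=100):
    offset, empty, domains = 0, 0, {}
    while True:
        batch = fetch(offset + 1, offset + limit)   # assumed wkfrom, wkto range
        if batch:
            domains.update(batch)
            empty = 0
        else:
            empty += 1          # gaps where wikis were deleted
        offset += limit
        if empty > max_empty:   # ~10k consecutive missing ids: assume the end
            break
    return domains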
def main():
domains = getall()
with open('wikia.com', 'w') as out:
out.write('\n'.join(str(domains[i]['domain']) for i in domains))
undumped = []
# Or we could iterate over each sublist while we get it?
for i in domains:
@@ -69,7 +78,7 @@ def main():
try:
#subprocess.check_call(['wget', '-e', 'robots=off', '--fail', '-nc', '-a', 'wikia.log', full])
# Use this instead, and comment out the next try, to only list.
subprocess.check_call(['curl', '-I', '--fail', full])
subprocess.call(['curl', '-I', '--fail', full])
except subprocess.CalledProcessError as e:
# We added --fail for this https://superuser.com/a/854102/283120
if e.returncode == 22:
@@ -81,7 +90,9 @@ def main():
# subprocess.check_call(['wget', '-e', 'robots=off', '-nc', '-a', 'wikia.log', images])
#except:
# pass
print '\n'.join(str(dump) for dump in undumped)
with open('wikia.com-unarchived', 'w+') as out:
out.write('\n'.join(str(domain) for domain in undumped))
if __name__ == '__main__':
main()
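The --fail flag is what makes a missing dump visible in curl's exit status; a minimal sketch of reading it with subprocess.call (the URL is a placeholder):

import subprocess

url = 'http://example.wikia.com/path/to/dump.7z'   # placeholder
# With --fail, curl exits with code 22 on HTTP errors such as 404
# (see https://superuser.com/a/854102/283120, cited in the script).
rc = subprocess.call(['curl', '-I', '--fail', url])
if rc == 22:
    print 'No dump available at', url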
