commit 273f1b33cb by emijrp (pull/319/head), 6 years ago

@@ -20,7 +20,7 @@
# https://github.com/WikiTeam/wikiteam/wiki
try:
from kitchen.text.converters import getwriter
from kitchen.text.converters import getwriter, to_unicode
except ImportError:
print "Please install the kitchen module."
import cookielib
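A standalone sketch of what the new to_unicode import buys over str() in the later makeXmlFromPage changes; the assertions below are illustrative assumptions, not code from the patch (Python 2, kitchen installed):

from kitchen.text.converters import to_unicode

assert to_unicode(42) == u'42'                  # numbers stringify like str()
assert to_unicode(u'\u042f') == u'\u042f'       # unicode passes through untouched
assert to_unicode('caf\xc3\xa9') == u'caf\xe9'  # UTF-8 byte str gets decoded
# str(u'\u042f') raises UnicodeEncodeError, which is what the old
# E.username(str(rev['user'])) call hits on non-ASCII usernames.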
@@ -210,25 +210,32 @@ def getNamespacesAPI(config={}, session=None):
)
result = getJSON(r)
delay(config=config, session=session)
try:
nsquery = result['query']['namespaces']
except KeyError:
print "Error: could not get namespaces from the API request"
print "HTTP %d" % r.status_code
print r.text
return None
if 'all' in namespaces:
namespaces = []
for i in result['query']['namespaces'].keys():
for i in nsquery.keys():
if int(i) < 0: # -1: Special, -2: Media, excluding
continue
namespaces.append(int(i))
namespacenames[int(i)] = result['query']['namespaces'][i]['*']
namespacenames[int(i)] = nsquery[i]['*']
else:
# check if those namespaces really exist in this wiki
namespaces2 = []
for i in result['query']['namespaces'].keys():
for i in nsquery.keys():
bi = i
i = int(i)
if i < 0: # -1: Special, -2: Media, excluding
continue
if i in namespaces:
namespaces2.append(i)
namespacenames[i] = result['query']['namespaces'][bi]['*']
namespacenames[i] = nsquery[bi]['*']
namespaces = namespaces2
else:
namespaces = [0]
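The guard around result['query']['namespaces'] is the generic "validate the API answer before using it" pattern; a self-contained sketch against the same siteinfo module (the requests session and function name here are assumptions, not the patch's code):

import requests

def fetch_namespaces(api_url):
    r = requests.get(api_url, params={
        'action': 'query', 'meta': 'siteinfo',
        'siprop': 'namespaces', 'format': 'json'})
    try:
        nsquery = r.json()['query']['namespaces']
    except KeyError:
        # The server answered, but not with the expected structure
        # (error page, closed wiki, maintenance mode, ...).
        print "Error: could not get namespaces from the API request"
        return None
    # Drop the virtual namespaces (-1 Special, -2 Media), as the patch does.
    return sorted(int(i) for i in nsquery if int(i) >= 0)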
@@ -289,18 +296,24 @@ def getPageTitlesAPI(config={}, session=None):
# print apfrom
# print jsontitles
allpages = jsontitles['query']['allpages']
try:
allpages = jsontitles['query']['allpages']
except KeyError:
print "The allpages API returned nothing. Exit."
sys.exit(1)
# Hack for old versions of MediaWiki API where result is dict
if isinstance(allpages, dict):
allpages = allpages.values()
for page in allpages:
yield page['title']
title = page['title']
titles.append(title)
yield title
c += len(allpages)
if len(titles) != len(set(titles)):
# probably we are in a loop, server returning dupe titles, stop
# it
print 'Probably a loop, finishing'
print 'Probably a loop, switching to next namespace. Duplicate title:'
print title
titles = list(set(titles))
apfrom = ''
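The loop detection added here reduces to comparing len(titles) with len(set(titles)) after each batch; a tiny illustration with made-up batches:

titles = []
for batch in (['A', 'B'], ['C', 'D'], ['C', 'D']):   # third batch repeats
    titles.extend(batch)
    if len(titles) != len(set(titles)):
        print 'Probably a loop, switching to next namespace. Duplicate title:'
        print batch[-1]
        titles = list(set(titles))
        break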
@@ -858,7 +871,15 @@ def getXMLRevisions(config={}, session=None, allpages=False):
for result in results:
pages = result['query']['pages']
for page in pages:
yield makeXmlFromPage(pages[page])
try:
xml = makeXmlFromPage(pages[page])
except PageMissingError:
logerror(
config=config,
text=u'Error: empty revision from API. Could not export page: %s' % (pages[page]['title'])
)
continue
yield xml
except wikitools.api.APIError:
print "This wikitools version seems not to work for us. Exiting."
@@ -866,26 +887,31 @@ def getXMLRevisions(config={}, session=None, allpages=False):
def makeXmlFromPage(page):
""" Output an XML document as a string from a page as in the API JSON """
p = E.page(
E.title(page['title']),
E.ns(str(page['ns'])),
E.id(str(page['pageid'])),
)
for rev in page['revisions']:
revision = E.revision(
E.id(str(rev['revid'])),
E.timestamp(rev['timestamp']),
E.contributor(
E.id(str(rev['userid'])),
E.username(str(rev['user'])),
),
E.comment(rev['comment']),
E.text(rev['*'], space="preserve", bytes=str(rev['size'])),
E.sha1(rev['sha1']),
try:
p = E.page(
E.title(page['title']),
E.ns(to_unicode(page['ns'])),
E.id(to_unicode(page['pageid'])),
)
if 'contentmodel' in rev:
revision.append(E.model)
p.append(revision)
for rev in page['revisions']:
revision = E.revision(
E.id(to_unicode(rev['revid'])),
E.timestamp(rev['timestamp']),
E.contributor(
E.id(to_unicode(rev['userid'])),
E.username(to_unicode(rev['user'])),
),
E.comment(rev['comment']),
E.text(rev['*'], space="preserve", bytes=to_unicode(rev['size'])),
)
if 'contentmodel' in rev:
revision.append(E.model(rev['contentmodel']))
# The sha1 may not have been backfilled on older wikis or may be missing for other reasons (Wikia).
if 'sha1' in rev:
revision.append(E.sha1(rev['sha1']))
p.append(revision)
except KeyError:
raise PageMissingError(page['title'], '')
return etree.tostring(p, pretty_print=True)
def readTitles(config={}, start=None):
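For reference, the optional-field handling of the new makeXmlFromPage boiled down to a runnable snippet; it assumes lxml and kitchen (both already required by dumpgenerator.py), and the sample revision dict is invented:

from lxml import etree
from lxml.builder import E
from kitchen.text.converters import to_unicode

rev = {'revid': 1, 'timestamp': '2015-01-01T00:00:00Z',
       '*': 'some wikitext', 'size': 13}

revision = E.revision(
    E.id(to_unicode(rev['revid'])),
    E.timestamp(rev['timestamp']),
    E.text(rev['*'], space="preserve", bytes=to_unicode(rev['size'])),
)
if 'contentmodel' in rev:      # absent on MediaWiki older than 1.21
    revision.append(E.model(rev['contentmodel']))
if 'sha1' in rev:              # may be missing on old wikis or Wikia
    revision.append(E.sha1(rev['sha1']))
print etree.tostring(revision, pretty_print=True)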
@@ -1575,6 +1601,7 @@ def getParameters(params=[]):
retry = 0
maxretries = args.retries
retrydelay = 20
check = None
while retry < maxretries:
try:
check = checkAPI(api=api, session=session)
@@ -1832,6 +1859,8 @@ def checkXMLIntegrity(config={}, titles=[], session=None):
else:
print 'XML dump seems to be corrupted.'
reply = ''
if config['failfast']:
reply = 'yes'
while reply.lower() not in ['yes', 'y', 'no', 'n']:
reply = raw_input('Regenerate a new dump ([yes, y], [no, n])? ')
if reply.lower() in ['yes', 'y']:
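The new failfast branch just pre-seeds the answer so the prompt loop never blocks; condensed into a standalone helper (the function name and the dict-style config are stand-ins):

def ask_regenerate(config):
    reply = ''
    if config.get('failfast'):
        reply = 'yes'          # unattended runs: always regenerate
    while reply.lower() not in ['yes', 'y', 'no', 'n']:
        reply = raw_input('Regenerate a new dump ([yes, y], [no, n])? ')
    return reply.lower() in ['yes', 'y']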

@@ -6,12 +6,12 @@
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
@@ -30,11 +30,11 @@ def main():
if len(sys.argv) < 2:
print 'python script.py file-with-apis.txt'
sys.exit()
print 'Reading list of APIs from', sys.argv[1]
wikis = open(sys.argv[1], 'r').read().splitlines()
print '%d APIs found' % (len(wikis))
for wiki in wikis:
print "#"*73
print "# Downloading", wiki
@@ -42,17 +42,15 @@ def main():
wiki = wiki.lower()
# Make the prefix in the standard way; api and index must be defined, it does not matter which is which
prefix = dumpgenerator.domain2prefix(config={'api': wiki, 'index': wiki})
# check if compressed; in that case the dump was finished previously
compressed = False
for dirname, dirnames, filenames in os.walk('.'):
if dirname == '.':
for f in filenames:
if f.startswith(prefix) and f.endswith('.7z'):
compressed = True
zipfilename = f
for f in os.listdir('.'):
if f.startswith(prefix) and f.endswith('.7z'):
compressed = True
zipfilename = f
break #stop searching, do not explore subdirectories
if compressed:
print 'Skipping... This wiki was downloaded and compressed before in', zipfilename
# Get the archive's file list.
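All three os.walk loops replaced in this file reduce to the same one-level scan of the current directory; a hedged helper showing the idea (the helper name and the sample prefix are invented):

import os

def find_by_prefix(prefix, suffix):
    """Return the first top-level entry matching prefix*suffix, else None."""
    for f in os.listdir('.'):
        if f.startswith(prefix) and f.endswith(suffix):
            return f           # no need to descend into subdirectories
    return None

zipfilename = find_by_prefix('wikifoo_org-', '.7z')       # finished dump?
wikidir = find_by_prefix('wikifoo_org-', 'wikidump')      # resumable dump?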
@@ -67,18 +65,17 @@ def main():
print "WARNING: Content of the archive not checked, we need python 2.7+ or 3.1+."
# TODO: Find a way like grep -q below without doing a 7z l multiple times?
continue
#download
started = False # was this wiki download started before? if so, resume
wikidir = ''
for dirname, dirnames, filenames in os.walk('.'):
if dirname == '.':
for d in dirnames:
if d.startswith(prefix):
wikidir = d
started = True
for f in os.listdir('.'):
# Does not find numbered wikidumps, nor does it verify directories
if f.startswith(prefix) and f.endswith('wikidump'):
wikidir = f
started = True
break #stop searching, do not explore subdirectories
# time.sleep(60)
# Uncomment the line above and add --delay=60 in the dumpgenerator.py calls below for broken wiki farms
# such as editthis.info, wiki-site.com, wikkii (adjust the value as needed;
@@ -90,15 +87,14 @@ def main():
subprocess.call('./dumpgenerator.py --api=%s --xml --images' % wiki, shell=True)
started = True
#save wikidir now
for dirname, dirnames, filenames in os.walk('.'):
if dirname == '.':
for d in dirnames:
if d.startswith(prefix):
wikidir = d
for f in os.listdir('.'):
# Does not find numbered wikidumps, nor does it verify directories
if f.startswith(prefix) and f.endswith('wikidump'):
wikidir = f
break #stop searching, do not explore subdirectories
prefix = wikidir.split('-wikidump')[0]
finished = False
if started and wikidir and prefix:
if (subprocess.call (['tail -n 1 %s/%s-history.xml | grep -q "</mediawiki>"' % (wikidir, prefix)], shell=True) ):
@@ -107,7 +103,7 @@ def main():
finished = True
# You can also issue this on your working directory to find all incomplete dumps:
# tail -n 1 */*-history.xml | grep -Ev -B 1 "</page>|</mediawiki>|==|^$"
#compress
if finished:
time.sleep(1)
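The finished check hinges on the tail | grep pipeline returning 0 only when the history XML ends with the closing </mediawiki> tag; a standalone version (the path is a placeholder):

import subprocess

def history_is_complete(xml_path):
    # grep -q exits 0 on a match, so a zero return code means the dump closed cleanly.
    cmd = 'tail -n 1 %s | grep -q "</mediawiki>"' % xml_path
    return subprocess.call(cmd, shell=True) == 0

print history_is_complete('wikifoo_org-wikidump/wikifoo_org-history.xml')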

File diff suppressed because it is too large.

@@ -23,7 +23,7 @@ import subprocess
import re
from wikitools import wiki, api
def getlist(wikia, wkfrom = 1, wkto = 1000):
def getlist(wikia, wkfrom = 1, wkto = 100):
params = {'action': 'query', 'list': 'wkdomains', 'wkactive': '1', 'wkfrom': wkfrom, 'wkto': wkto,}
request = api.APIRequest(wikia, params)
return request.query()['query']['wkdomains']
@@ -31,8 +31,9 @@ def getlist(wikia, wkfrom = 1, wkto = 1000):
def getall():
wikia = wiki.Wiki('http://community.wikia.com/api.php')
offset = 0
limit = 1000
limit = 100
domains = {}
empty = 0
# This API module has no query continuation facility
print 'Getting list of active domains...'
while True:
@@ -40,13 +41,21 @@ def getall():
if list:
print offset
domains = dict(domains.items() + list.items() )
offset += 1000
empty = 0
else:
empty += 1
offset += limit
if empty > 100:
# Hopefully we don't have more than 10k wikis deleted in a row
break
return domains
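getall() works around the missing query continuation by paging manually and counting empty batches before giving up; the same loop with a stubbed fetcher (fetch here is an illustration, not the wikitools api.APIRequest call, and the offset arithmetic is an assumption):

def fetch_all(fetch, limit=100, max_empty=100):
    offset, empty, domains = 0, 0, {}
    while True:
        batch = fetch(offset + 1, offset + limit)   # assumed wkfrom, wkto range
        if batch:
            domains.update(batch)
            empty = 0
        else:
            empty += 1          # gaps where wikis were deleted
        offset += limit
        if empty > max_empty:   # ~10k consecutive missing ids: assume the end
            break
    return domains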
def main():
domains = getall()
with open('wikia.com', 'w') as out:
out.write('\n'.join(str(domains[i]['domain']) for i in domains))
undumped = []
# Or we could iterate over each sublist while we get it?
for i in domains:
@@ -69,7 +78,7 @@ def main():
try:
#subprocess.check_call(['wget', '-e', 'robots=off', '--fail', '-nc', '-a', 'wikia.log', full])
# Use this instead, and comment out the next try, to only list.
subprocess.check_call(['curl', '-I', '--fail', full])
subprocess.call(['curl', '-I', '--fail', full])
except subprocess.CalledProcessError as e:
# We added --fail for this https://superuser.com/a/854102/283120
if e.returncode == 22:
@@ -81,7 +90,9 @@ def main():
# subprocess.check_call(['wget', '-e', 'robots=off', '-nc', '-a', 'wikia.log', images])
#except:
# pass
print '\n'.join(str(dump) for dump in undumped)
with open('wikia.com-unarchived', 'w+') as out:
out.write('\n'.join(str(domain) for domain in undumped))
if __name__ == '__main__':
main()
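The --fail flag is what makes a missing dump visible in curl's exit status; a minimal sketch of reading it with subprocess.call (the URL is a placeholder):

import subprocess

url = 'http://example.wikia.com/path/to/dump.7z'   # placeholder
# With --fail, curl exits with code 22 on HTTP errors such as 404
# (see https://superuser.com/a/854102/283120, cited in the script).
rc = subprocess.call(['curl', '-I', '--fail', url])
if rc == 22:
    print 'No dump available at', url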
