uploader included

6 years ago · 2fe1c0b6b2
parent 254486af06
commit 2fe1c0b6b2
1 changed files with 82 additions and 6 deletions
--- a/wikispaces.py
+++ b/wikispaces.py
@ -19,12 +19,18 @@
 # Documentation for developers: http://wikiteam.readthedocs.com

 import csv
+import datetime
 import os
 import re
+import subprocess
 import sys
 import time
 import urllib.request

+# Requirements (external commands, only needed when running with --upload):
+# zip command (apt-get install zip)
+# ia command (pip install internetarchive)
+
 def saveURL(wikidomain='', url='', filename='', path=''):
    filename2 = '%s/%s' % (wikidomain, filename)
    if path:
@ -142,12 +148,29 @@ def downloadPagesAndFiles(wikidomain='', wikiurl=''):
    print('Downloaded %d files' % (filesc))

 def downloadSitemap(wikidomain='', wikiurl=''):
+    """Announce the download and hand wikiurl to saveURL() as 'sitemap.xml'.
+
+    wikidomain: forwarded to saveURL(), which prefixes it to the filename.
+    wikiurl: URL to fetch; main() passes the wiki's sitemap.xml URL here.
+    """
+    print('Downloading sitemap.xml')
     saveURL(wikidomain=wikidomain, url=wikiurl, filename='sitemap.xml', path='')

 def downloadMainPage(wikidomain='', wikiurl=''):
+    """Announce the download and hand wikiurl to saveURL() as 'index.html'.
+
+    wikidomain: forwarded to saveURL(), which prefixes it to the filename.
+    wikiurl: URL to fetch; main() passes the wiki's base URL here.
+    """
+    print('Downloading index.html')
     saveURL(wikidomain=wikidomain, url=wikiurl, filename='index.html', path='')

+def downloadLogo(wikidomain='', wikiurl=''):
+    """Download the wiki logo referenced by the saved main page.
+
+    Reads <wikidomain>/index.html, looks for the <img> that directly
+    follows the "WikiLogo WikiElement" class attribute and passes its
+    URL to saveURL().
+
+    wikidomain: directory prefix of the dump; index.html is read from it.
+    wikiurl: base URL used to resolve a relative or protocol-relative
+        logo src. An absolute src is used unchanged, and so is any src
+        when wikiurl is empty.
+
+    Returns the logo filename, or '' when index.html is missing, no
+    logo is found in it, or the logo URL has no usable filename.
+    """
+    from urllib.parse import urljoin, urlsplit
+
+    index = '%s/index.html' % (wikidomain)
+    if not os.path.exists(index):
+        return ''
+    # Only an ASCII pattern is searched, so undecodable bytes are
+    # replaced instead of aborting the whole dump.
+    with open(index, 'r', encoding='utf-8', errors='replace') as f:
+        m = re.search(r'class="WikiLogo WikiElement"><img src="([^<> "]+?)"', f.read())
+    if not m:
+        return ''
+    logourl = urljoin(wikiurl, m.group(1))
+    # Name the file after the URL path only, so a query string or
+    # fragment never ends up in the filename handed back to the caller.
+    logofilename = urlsplit(logourl).path.split('/')[-1]
+    if not logofilename:
+        return ''
+    print('Downloading logo')
+    saveURL(wikidomain=wikidomain, url=logourl, filename=logofilename, path='')
+    return logofilename
+
 def main():
+    upload = False
+    isadmin = False
    if len(sys.argv) < 2:
        print('Please, introduce a wikispaces wiki url or filename.\nExample: https://yourwiki.wikispaces.com or mylistofwikis.txt')
        sys.exit()
@ -155,6 +178,11 @@ def main():
    if not param:
        print('Please, introduce a wikispaces wiki url or filename.\nExample: https://yourwiki.wikispaces.com or mylistofwikis.txt')
        sys.exit()
+    if len(sys.argv) > 2:
+        if '--upload' in sys.argv:
+            upload = True
+        if '--admin' in sys.argv:
+            isadmin = True
    
    wikilist = []
    if '://' in param:
@ -169,17 +197,65 @@ def main():
    
    for wikiurl in wikilist:
        wikidomain = wikiurl.split('://')[1].split('/')[0]
-        print('#'*40,'\n Analyzing:', wikiurl)
+        print('#'*40,'\n Downloading:', wikiurl)
        print('#'*40,'\n')
-        print('Creating directories for %s' % (wikidomain))
-        if not os.path.exists('%s/files' % (wikidomain)):
-            os.makedirs('%s/files' % (wikidomain))
-        if not os.path.exists('%s/pages' % (wikidomain)):
-            os.makedirs('%s/pages' % (wikidomain))
+        dirfiles = '%s/files' % (wikidomain)
+        if not os.path.exists(dirfiles):
+            print('Creating directory %s' % (dirfiles))
+            os.makedirs(dirfiles)
+        dirpages = '%s/pages' % (wikidomain)
+        if not os.path.exists(dirpages):
+            print('Creating directory %s' % (dirpages))
+            os.makedirs(dirpages)
        downloadPagesAndFiles(wikidomain=wikidomain, wikiurl=wikiurl)
        sitemapurl = 'https://%s/sitemap.xml' % (wikidomain)
        downloadSitemap(wikidomain=wikidomain, wikiurl=sitemapurl)
        downloadMainPage(wikidomain=wikidomain, wikiurl=wikiurl)
+        logofilename = downloadLogo(wikidomain=wikidomain, wikiurl=wikiurl)
+        
+        if upload:
+            print('\nCompressing dump...')
+            wikidir = wikidomain
+            os.chdir(wikidir)
+            print('Changed directory to', os.getcwd())
+            wikizip = '%s.zip' % (wikidomain)
+            subprocess.call('zip' + ' -r ../%s files/ pages/ index.html pages-and-files.csv sitemap.xml %s' % (wikizip, logofilename), shell=True)
+            os.chdir('..')
+            print('Changed directory to', os.getcwd())
+            
+            print('\nUploading to Internet Archive...')
+            indexfilename = '%s/index.html' % (wikidir)
+            if not os.path.exists(indexfilename):
+                print('\nError dump incomplete, skipping upload\n')
+                continue
+            f = open(indexfilename, 'r')
+            indexhtml = f.read()
+            f.close()
+            itemid = 'wiki-%s' % (wikidomain)
+            wikititle = ''
+            try:
+                wikititle = indexhtml.split('wiki: {')[1].split('}')[0].split("text: '")[1].split("',")[0].strip()
+            except:
+                wikititle = wikidomain
+            if not wikititle:
+                wikititle = wikidomain
+            itemtitle = 'Wiki - %s' % wikititle
+            itemdesc = '<a href=\"%s\">%s</a> dumped with <a href=\"https://github.com/WikiTeam/wikiteam\" rel=\"nofollow\">WikiTeam</a> tools.' % (wikiurl, wikititle)
+            itemtags = ['wiki', 'wikiteam', 'wikispaces', wikititle, wikidomain.split('.wikispaces.com')[0], wikidomain]
+            itemoriginalurl = wikiurl
+            itemlicenseurl = ''
+            m = re.findall(r'<a rel="license" href="([^<>]+?)">', indexhtml.split('<div class="WikiLicense')[1].split('</div>')[0])
+            if m:
+                itemlicenseurl = m[0]
+            if not itemlicenseurl:
+                itemtags.append('unknowncopyright')
+            itemtags_ = ' '.join(["--metadata='subject:%s'" % (tag) for tag in itemtags])
+            itemcollection = isadmin and 'wikiteam' or 'opensource'
+            itemlang = 'Unknown'
+            itemdate = datetime.datetime.now().strftime("%Y-%m-%d")
+            itemlogo = logofilename and '%s/%s' % (wikidir, logofilename) or ''                
+            subprocess.call('ia' + ' upload %s %s %s --metadata="mediatype:web" --metadata="collection:%s" --metadata="title:%s" --metadata="description:%s" --metadata="language:%s" --metadata="last-updated-date:%s" %s %s' % (itemid, wikizip, itemlogo and itemlogo or '', itemcollection, itemtitle, itemdesc, itemlang, itemdate, itemlicenseurl and '--metadata="licenseurl:%s"' % (itemlicenseurl) or '', itemtags_), shell=True)
+            print('You can find it in https://archive.org/details/%s' % (itemid))

 if __name__ == "__main__":
    main()