uploader included

pull/287/merge
emijrp 6 years ago
parent 254486af06
commit 2fe1c0b6b2

@ -19,12 +19,18 @@
# Documentation for developers: http://wikiteam.readthedocs.com
import csv
import datetime
import os
import re
import subprocess
import sys
import time
import urllib.request
# Requirements:
# zip command (apt-get install zip)
# ia command (pip install internetarchive)
def saveURL(wikidomain='', url='', filename='', path=''):
filename2 = '%s/%s' % (wikidomain, filename)
if path:
@ -142,12 +148,29 @@ def downloadPagesAndFiles(wikidomain='', wikiurl=''):
print('Downloaded %d files' % (filesc))
def downloadSitemap(wikidomain='', wikiurl=''):
    """Fetch the sitemap and store it as ``sitemap.xml``.

    The download itself is delegated to ``saveURL``, which receives
    ``wikidomain``, the URL given in ``wikiurl`` and the fixed target
    file name. Nothing is returned.
    """
    target = 'sitemap.xml'
    print('Downloading sitemap.xml')
    saveURL(wikidomain=wikidomain, url=wikiurl, filename=target, path='')
def downloadMainPage(wikidomain='', wikiurl=''):
    """Fetch the wiki front page and store it as ``index.html``.

    The download itself is delegated to ``saveURL``, which receives
    ``wikidomain``, the URL given in ``wikiurl`` and the fixed target
    file name. Nothing is returned.
    """
    target = 'index.html'
    print('Downloading index.html')
    saveURL(wikidomain=wikidomain, url=wikiurl, filename=target, path='')
def downloadLogo(wikidomain='', wikiurl=''):
    """Download the wiki logo referenced by the saved main page.

    Reads ``<wikidomain>/index.html``, looks for the first ``<img>`` that
    directly follows the ``class="WikiLogo WikiElement"`` marker and passes
    its ``src`` URL to ``saveURL``.

    Args:
        wikidomain: Directory of the dump; ``index.html`` is read from it.
        wikiurl: Not used by this function; the parameter mirrors the
            signature of the other ``download*`` helpers.

    Returns:
        The logo file name (last ``/``-separated segment of the logo URL),
        or ``''`` when ``index.html`` does not exist or has no logo markup.
    """
    index = '%s/index.html' % (wikidomain)
    if not os.path.exists(index):
        return ''

    # Decode explicitly instead of relying on the platform default encoding,
    # and replace undecodable bytes: one bad byte in the saved page must not
    # abort the whole dump with UnicodeDecodeError.
    with open(index, 'r', encoding='utf-8', errors='replace') as f:
        html = f.read()

    # Only the first logo occurrence is relevant.
    m = re.search(r'class="WikiLogo WikiElement"><img src="([^<> "]+?)"', html)
    if m is None:
        return ''

    logourl = m.group(1)
    logofilename = logourl.split('/')[-1]
    print('Downloading logo')
    saveURL(wikidomain=wikidomain, url=logourl, filename=logofilename, path='')
    return logofilename
def main():
upload = False
isadmin = False
if len(sys.argv) < 2:
print('Please, introduce a wikispaces wiki url or filename.\nExample: https://yourwiki.wikispaces.com or mylistofwikis.txt')
sys.exit()
@ -155,6 +178,11 @@ def main():
if not param:
print('Please, introduce a wikispaces wiki url or filename.\nExample: https://yourwiki.wikispaces.com or mylistofwikis.txt')
sys.exit()
if len(sys.argv) > 2:
if '--upload' in sys.argv:
upload = True
if '--admin' in sys.argv:
isadmin = True
wikilist = []
if '://' in param:
@ -169,17 +197,65 @@ def main():
for wikiurl in wikilist:
wikidomain = wikiurl.split('://')[1].split('/')[0]
print('#'*40,'\n Analyzing:', wikiurl)
print('#'*40,'\n Downloading:', wikiurl)
print('#'*40,'\n')
print('Creating directories for %s' % (wikidomain))
if not os.path.exists('%s/files' % (wikidomain)):
os.makedirs('%s/files' % (wikidomain))
if not os.path.exists('%s/pages' % (wikidomain)):
os.makedirs('%s/pages' % (wikidomain))
dirfiles = '%s/files' % (wikidomain)
if not os.path.exists(dirfiles):
print('Creating directory %s' % (dirfiles))
os.makedirs(dirfiles)
dirpages = '%s/pages' % (wikidomain)
if not os.path.exists(dirpages):
print('Creating directory %s' % (dirpages))
os.makedirs(dirpages)
downloadPagesAndFiles(wikidomain=wikidomain, wikiurl=wikiurl)
sitemapurl = 'https://%s/sitemap.xml' % (wikidomain)
downloadSitemap(wikidomain=wikidomain, wikiurl=sitemapurl)
downloadMainPage(wikidomain=wikidomain, wikiurl=wikiurl)
logofilename = downloadLogo(wikidomain=wikidomain, wikiurl=wikiurl)
if upload:
print('\nCompressing dump...')
wikidir = wikidomain
os.chdir(wikidir)
print('Changed directory to', os.getcwd())
wikizip = '%s.zip' % (wikidomain)
subprocess.call('zip' + ' -r ../%s files/ pages/ index.html pages-and-files.csv sitemap.xml %s' % (wikizip, logofilename), shell=True)
os.chdir('..')
print('Changed directory to', os.getcwd())
print('\nUploading to Internet Archive...')
indexfilename = '%s/index.html' % (wikidir)
if not os.path.exists(indexfilename):
print('\nError dump incomplete, skipping upload\n')
continue
f = open(indexfilename, 'r')
indexhtml = f.read()
f.close()
itemid = 'wiki-%s' % (wikidomain)
wikititle = ''
try:
wikititle = indexhtml.split('wiki: {')[1].split('}')[0].split("text: '")[1].split("',")[0].strip()
except:
wikititle = wikidomain
if not wikititle:
wikititle = wikidomain
itemtitle = 'Wiki - %s' % wikititle
itemdesc = '<a href=\"%s\">%s</a> dumped with <a href=\"https://github.com/WikiTeam/wikiteam\" rel=\"nofollow\">WikiTeam</a> tools.' % (wikiurl, wikititle)
itemtags = ['wiki', 'wikiteam', 'wikispaces', wikititle, wikidomain.split('.wikispaces.com')[0], wikidomain]
itemoriginalurl = wikiurl
itemlicenseurl = ''
m = re.findall(r'<a rel="license" href="([^<>]+?)">', indexhtml.split('<div class="WikiLicense')[1].split('</div>')[0])
if m:
itemlicenseurl = m[0]
if not itemlicenseurl:
itemtags.append('unknowncopyright')
itemtags_ = ' '.join(["--metadata='subject:%s'" % (tag) for tag in itemtags])
itemcollection = isadmin and 'wikiteam' or 'opensource'
itemlang = 'Unknown'
itemdate = datetime.datetime.now().strftime("%Y-%m-%d")
itemlogo = logofilename and '%s/%s' % (wikidir, logofilename) or ''
subprocess.call('ia' + ' upload %s %s %s --metadata="mediatype:web" --metadata="collection:%s" --metadata="title:%s" --metadata="description:%s" --metadata="language:%s" --metadata="last-updated-date:%s" %s %s' % (itemid, wikizip, itemlogo and itemlogo or '', itemcollection, itemtitle, itemdesc, itemlang, itemdate, itemlicenseurl and '--metadata="licenseurl:%s"' % (itemlicenseurl) or '', itemtags_), shell=True)
print('You can find it in https://archive.org/details/%s' % (itemid))
if __name__ == "__main__":
main()

Loading…
Cancel
Save