bug wikitext

pull/287/merge
emijrp 6 years ago
parent ffff6cf568
commit e01b2fb0c3

@ -71,9 +71,7 @@ def undoHTMLEntities(text=''):
return text
def convertHTML2Wikitext(wikidomain='', filename='', path='', overwrite=False):
if not overwrite:
return
def convertHTML2Wikitext(wikidomain='', filename='', path=''):
wikitext = ''
wikitextfile = '%s/%s/%s' % (wikidomain, path, filename)
if not os.path.exists(wikitextfile):
@ -88,11 +86,7 @@ def convertHTML2Wikitext(wikidomain='', filename='', path='', overwrite=False):
wikitext = wikitext.split(m[0])[1].split('</pre>')[0].strip()
wikitext = undoHTMLEntities(text=wikitext)
except:
wikitext = ''
print('Error extracting wikitext.')
else:
wikitext = ''
print('Error extracting wikitext.')
pass
f.write(wikitext)
def downloadPage(wikidomain='', wikiurl='', pagename='', overwrite=False):
@ -108,7 +102,7 @@ def downloadPage(wikidomain='', wikiurl='', pagename='', overwrite=False):
filename2 = '%s.wikitext' % (pagenameplus)
print('Downloading page: %s' % (filename2))
saveURL(wikidomain=wikidomain, url=pageurl2, filename=filename2, path='pages', overwrite=overwrite)
convertHTML2Wikitext(wikidomain=wikidomain, filename=filename2, path='pages', overwrite=overwrite)
convertHTML2Wikitext(wikidomain=wikidomain, filename=filename2, path='pages')
#csv with page history
csvurl = '%s/page/history/%s?utable=WikiTablePageHistoryList&ut_csv=1' % (wikiurl, pagename_)
@ -245,6 +239,18 @@ def main():
print('\n')
print('#'*40,'\n Downloading:', wikiurl)
print('#'*40,'\n')
if upload and not overwriteia:
itemid = 'wiki-%s' % (wikidomain)
try:
iahtml = urllib.request.urlopen('https://archive.org/details/%s' % (itemid)).read().decode('utf-8')
if not re.findall(r'Item cannot be found', iahtml):
if not overwriteia:
print('Warning: item exists on Internet Archive. Skipping wiki. Force with parameter --overwrite-ia')
continue
except:
pass
dirfiles = '%s/files' % (wikidomain)
if not os.path.exists(dirfiles):
print('Creating directory %s' % (dirfiles))
@ -261,14 +267,6 @@ def main():
if upload:
itemid = 'wiki-%s' % (wikidomain)
try:
iahtml = urllib.request.urlopen('https://archive.org/details/%s' % (itemid)).read().decode('utf-8')
if not re.findall(r'Item cannot be found', iahtml):
if not overwriteia:
print('Warning: item exists on Internet Archive. Skipping upload. Force upload with parameter --overwrite-ia')
continue
except:
pass
print('\nCompressing dump...')
wikidir = wikidomain
os.chdir(wikidir)

Loading…
Cancel
Save