From 6dabef598049b7d823dc69b2d1d51187fdef2ab8 Mon Sep 17 00:00:00 2001
From: emijrp
Date: Fri, 29 Jan 2016 15:00:33 +0100
Subject: [PATCH] wikiapiary bots

---
 wikiapiary/wikiapiary-update-ia-params.py | 84 +++++++++++++++++++++++
 wikiapiary/wikiapiary_family.py           | 61 ++++++++++++++++
 2 files changed, 145 insertions(+)
 create mode 100644 wikiapiary/wikiapiary-update-ia-params.py
 create mode 100644 wikiapiary/wikiapiary_family.py

diff --git a/wikiapiary/wikiapiary-update-ia-params.py b/wikiapiary/wikiapiary-update-ia-params.py
new file mode 100644
index 0000000..a22e8d0
--- /dev/null
+++ b/wikiapiary/wikiapiary-update-ia-params.py
@@ -0,0 +1,84 @@
+# -*- coding: utf-8 -*-
+
+# Copyright (C) 2016 WikiTeam
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import re
+import urllib.request
+
+import pywikibot
+from pywikibot import pagegenerators
+
+def main():
+    site = pywikibot.Site('wikiapiary', 'wikiapiary')
+    catname = 'Category:Website'
+    cat = pywikibot.Category(site, catname)
+    gen = pagegenerators.CategorizedPageGenerator(cat)
+    pre = pagegenerators.PreloadingGenerator(gen)
+
+    for page in pre:
+        if page.isRedirectPage():
+            continue
+
+        wtitle = page.title()
+        wtext = page.text
+
+        if re.search('Internet Archive', wtext):
+            #print('It has IA parameter')
+            pass
+        else:
+            print('\n', '#' * 50, '\n', wtitle, '\n', '#' * 50)
+            print('https://wikiapiary.com/wiki/%s' % (re.sub(' ', '_', wtitle)))
+            print('Missing IA parameter')
+
+            if re.search(r'(?i)API URL=http', wtext):
+                apiurl = re.findall(r'(?i)API URL=(http[^\n]+?)\n', wtext)[0]
+                print('API:', apiurl)
+            else:
+                print('No API found in WikiApiary, skipping')
+                continue
+
+            # Search the Internet Archive for items whose originalurl is this API URL
+            urliasearch = 'https://archive.org/search.php?query=originalurl:"%s"' % (apiurl)
+            f = urllib.request.urlopen(urliasearch)
+            raw = f.read().decode('utf-8')
+            if re.search(r'(?i)Your search did not match any items', raw):
+                print('No dumps found at Internet Archive')
+            else:
+                # Assumes the first search result is the dump item and that its
+                # file list (<identifier>_files.xml) names dumps with a YYYYMMDD date
+                itemidentifier = re.findall(r'<a href="/details/([^"]+?)"', raw)[0]
+                itemurl = 'https://archive.org/details/%s' % (itemidentifier)
+                print('Item found:', itemurl)
+                metaurl = 'https://archive.org/download/%s/%s_files.xml' % (itemidentifier, itemidentifier)
+                g = urllib.request.urlopen(metaurl)
+                raw2 = g.read().decode('utf-8')
+                itemfiles = re.findall(r'(?is)<file name="[^"]*?-(\d{8})-[^"]*?" source="original">.*?<size>(\d+)</size>', raw2)
+                itemfiles = [[int(x), int(y)] for x, y in itemfiles]
+                itemfiles.sort(reverse=True)
+                print(itemfiles)
+                # Newest dump: date as YYYY/MM/DD and size in bytes
+                itemdate = str(itemfiles[0][0])[0:4] + '/' + str(itemfiles[0][0])[4:6] + '/' + str(itemfiles[0][0])[6:8]
+                itemsize = itemfiles[0][1]
+
+                # Insert the Internet Archive parameters before a closing }} at end of line
+                iaparams = """|Internet Archive identifier=%s
+|Internet Archive URL=%s
+|Internet Archive added date=%s 00:00:00
+|Internet Archive file size=%s""" % (itemidentifier, itemurl, itemdate, itemsize)
+                newtext = page.text
+                newtext = re.sub(r'(?im)\}\}\n', '%s\n}}\n' % (iaparams), newtext)
+                pywikibot.showDiff(page.text, newtext)
+                page.text = newtext
+                page.save('BOT - Adding dump details: %s, %s, %s bytes' % (itemidentifier, itemdate, itemsize))
+
+if __name__ == "__main__":
+    main()
+
diff --git a/wikiapiary/wikiapiary_family.py b/wikiapiary/wikiapiary_family.py
new file mode 100644
index 0000000..daf381f
--- /dev/null
+++ b/wikiapiary/wikiapiary_family.py
@@ -0,0 +1,61 @@
+# -*- coding: utf-8 -*-
+"""Family module for WikiApiary wiki."""
+from __future__ import unicode_literals
+
+__version__ = '$Id: 8c9856dd7c0af8d400d0d95b00bf406002729008 $'
+
+from pywikibot import family
+
+# The MediaWiki family
+# user-config.py: usernames['wikiapiary']['wikiapiary'] = 'User name'
+class Family(family.WikimediaFamily):
+
+    """Family module for WikiApiary wiki."""
+
+    def __init__(self):
+        """Constructor."""
+        super(Family, self).__init__()
+        self.name = 'wikiapiary'
+
+        self.langs = {
+            'wikiapiary': 'wikiapiary.com',
+        }
+
+        # Wikimedia wikis all use "bodyContent" as the id of the <div>
+        # element that contains the actual page content; change this for
+        # wikis that use something else (e.g., mozilla family)
+        self.content_id = "bodyContent"
+
+    def scriptpath(self, code):
+        """The prefix used to locate scripts on this wiki.
+
+        This is the value displayed when you enter {{SCRIPTPATH}} on a
+        wiki page (often displayed at [[Help:Variables]] if the wiki has
+        copied the master help page correctly).
+
+        The default value is the one used on Wikimedia Foundation wikis,
+        but needs to be overridden in the family file for any wiki that
+        uses a different value.
+
+        """
+        return '/w'
+
+    # Which version of MediaWiki is used? REQUIRED
+    def version(self, code):
+        # Replace with the actual version being run on your wiki
+        return '1.25.3'
+
+    def code2encoding(self, code):
+        """Return the encoding for a specific language wiki."""
+        # Most wikis nowadays use UTF-8, but change this if yours uses
+        # a different encoding
+        return 'utf-8'
+
+    def path(self, code):
+        """Return the path to index.php."""
+        return '/w/index.php'
+
+    def apipath(self, code):
+        """Return the path to api.php."""
+        return '/w/api.php'
+
+    def protocol(self, code):
+        """Return the protocol used to connect to this wiki."""
+        return 'https'
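
A minimal usage sketch, assuming a 2016-era pywikibot-core checkout: pywikibot discovers third-party families from <name>_family.py files, so wikiapiary_family.py needs to be placed in (or registered with) pywikibot's families/ directory, and user-config.py needs to select the family. The values below are illustrative; only the usernames line comes from the comment in wikiapiary_family.py:

    # user-config.py (minimal sketch; adjust the account name)
    family = 'wikiapiary'
    mylang = 'wikiapiary'
    usernames['wikiapiary']['wikiapiary'] = 'User name'

With that in place, the update bot can be started through pywikibot's wrapper, for example "python pwb.py wikiapiary/wikiapiary-update-ia-params.py". It walks Category:Website on wikiapiary.com and, for each wiki page that lacks the Internet Archive parameters, looks up a dump item on archive.org and appends the identifier, URL, date and size to the page's template.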