pull/327/head
emijrp 6 years ago
parent 4e8c92b6d2
commit 7c72c27f2a

@ -0,0 +1,61 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (C) 2018 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import random
import re
import sys
import time
import urllib.request
def _fetch_html(url):
    """Download *url* and return its body decoded as UTF-8 text."""
    # Close the response deterministically instead of leaking the socket.
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8')


def _extract_wikis(html, scheme):
    """Return ``scheme + host`` for every ``*.wikidot.com`` host linked in *html*.

    Hosts are returned in order of appearance and may repeat.
    """
    return [scheme + host
            for host in re.findall(r'://([^/]+?\.wikidot\.com)', html)]


def _normalize_wikis(wikis):
    """Return *wikis* sorted and de-duplicated, with ``www.`` prefixes removed."""
    return sorted({re.sub(r'https?://www\.', 'http://', wiki) for wiki in wikis})


def main(iterations=99999, output_path='wikidot-duckduckgo.txt', fetch=None,
         pause=time.sleep):
    """Discover wikidot.com wikis through random DuckDuckGo searches.

    Each round searches DuckDuckGo for two random numbers restricted to
    ``site:wikidot.com``, collects every ``*.wikidot.com`` host found in the
    result page, and rewrites *output_path* with the sorted list of wikis
    found so far (one URL per line).

    Args:
        iterations: Number of search rounds to run.
        output_path: File that receives the list of wikis after each round.
        fetch: Callable taking a URL and returning the page as text. When
            ``None``, pages are downloaded with ``urllib.request`` using a
            browser-like User-agent header.
        pause: Callable taking a number of seconds; used between rounds and
            after a failed search.
    """
    # Imported explicitly: the file only imports ``urllib.request`` and would
    # otherwise rely on that module pulling in ``urllib.parse`` as a side effect.
    from urllib.parse import unquote

    if fetch is None:
        opener = urllib.request.build_opener()
        opener.addheaders = [('User-agent', 'Mozilla/5.1')]
        urllib.request.install_opener(opener)
        fetch = _fetch_html

    # Must exist before the first round: it is both read and rebound below.
    wikis = set()
    for _ in range(iterations):
        url = 'https://duckduckgo.com/html/?q=%s%%20%s%%20site:wikidot.com' % (
            random.randint(100, 5000), random.randint(1000, 9999))
        print('URL search', url)
        try:
            html = fetch(url)
        except Exception:
            # Best-effort crawl: any fetch/decode failure just delays the next
            # round. ``Exception`` (not a bare ``except``) keeps Ctrl-C working.
            print('Search error')
            pause(30)
            continue

        for wiki in _extract_wikis(unquote(html), 'https://'):
            if wiki not in wikis:
                wikis.add(wiki)
                print(wiki)

        # NOTE(review): hosts are stored with an https:// prefix, but the
        # www-stripping rewrite emits http://, so the output can mix schemes
        # for www/non-www variants of one host - confirm which one is wanted.
        ordered = _normalize_wikis(wikis)
        wikis = set(ordered)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(ordered))
        print('%d wikis found' % (len(ordered)))

        delay = random.randint(5, 20)
        print('Sleeping %d seconds' % (delay))
        pause(delay)


if __name__ == '__main__':
    main()

@ -0,0 +1,70 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (C) 2018 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import random
import re
import sys
import time
import urllib.request
def _fetch_html(url):
    """Download *url* and return its body decoded as UTF-8 text."""
    # Close the response deterministically instead of leaking the socket.
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8')


def _extract_wikis(html, scheme):
    """Return ``scheme + host`` for every ``*.wikidot.com`` host linked in *html*.

    Hosts are returned in order of appearance and may repeat.
    """
    return [scheme + host
            for host in re.findall(r'://([^/]+?\.wikidot\.com)', html)]


def _normalize_wikis(wikis):
    """Return *wikis* sorted and de-duplicated, with ``www.`` prefixes removed."""
    return sorted({re.sub(r'https?://www\.', 'http://', wiki) for wiki in wikis})


def main(iterations=999999, output_path='wikidot-spider.txt', fetch=None,
         pause=time.sleep):
    """Discover wikidot.com wikis by crawling links between known wikis.

    Starting from a small built-in seed list, each round downloads the front
    page of one randomly chosen known wiki, collects every ``*.wikidot.com``
    host found in it, and rewrites *output_path* with the sorted list of
    wikis known so far (one URL per line).

    Args:
        iterations: Number of crawl rounds to run.
        output_path: File that receives the list of wikis after each round.
        fetch: Callable taking a URL and returning the page as text. When
            ``None``, pages are downloaded with ``urllib.request`` using a
            browser-like User-agent header.
        pause: Callable taking a number of seconds; used between rounds and
            after a failed download.
    """
    # Imported explicitly: the file only imports ``urllib.request`` and would
    # otherwise rely on that module pulling in ``urllib.parse`` as a side effect.
    from urllib.parse import unquote

    if fetch is None:
        opener = urllib.request.build_opener()
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        urllib.request.install_opener(opener)
        fetch = _fetch_html

    # Kept as a list because random.choice() needs a sequence.
    wikis = [
        'http://hackersderede.wikidot.com',
        'http://ds2009a.wikidot.com',
        'http://retrowiki.wikidot.com',
        'http://heroesofalvena.wikidot.com',
        'http://solariapedia.wikidot.com',
        'http://denver.wikidot.com',
    ]
    for _ in range(iterations):
        url = random.choice(wikis)
        print('URL search', url)
        try:
            html = fetch(url)
        except Exception:
            # Best-effort crawl: any fetch/decode failure just delays the next
            # round. ``Exception`` (not a bare ``except``) keeps Ctrl-C working.
            print('Search error')
            pause(30)
            continue

        # A set gives O(1) membership tests while scanning the page.
        known = set(wikis)
        for wiki in _extract_wikis(unquote(html), 'http://'):
            if wiki not in known:
                known.add(wiki)
                print(wiki)

        wikis = _normalize_wikis(known)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(wikis))
        print('%d wikis found' % (len(wikis)))

        delay = random.randint(5, 20)
        print('Sleeping %d seconds' % (delay))
        pause(delay)


if __name__ == '__main__':
    main()

@ -0,0 +1,108 @@
http://abarrelfull.wikidot.com
http://airchairbuild.wikidot.com
http://albums-template.wikidot.com
http://amiii.wikidot.com
http://angarmegia.wikidot.com
http://aqwwiki.wikidot.com
http://arcana.wikidot.com
http://aviationknowledge.wikidot.com
http://biol252-biol319.wikidot.com
http://blmodding.wikidot.com
http://bozic-nation.wikidot.com
http://brmehta12.wikidot.com
http://bzhlab.wikidot.com
http://caosinsurgente.wikidot.com
http://ccyms.wikidot.com
http://chavezbraintrust.wikidot.com
http://ci-sandbox.wikidot.com
http://ci-wiki.wikidot.com
http://cityofangels.wikidot.com
http://community.wikidot.com
http://corvidcollege.wikidot.com
http://css.wikidot.com
http://defunct-elitequestworlds.wikidot.com
http://denver.wikidot.com
http://downsfolk.wikidot.com
http://ds09.wikidot.com
http://ds2009a.wikidot.com
http://ds2010a.wikidot.com
http://educ400-401.wikidot.com
http://eime.wikidot.com
http://eng270.wikidot.com
http://epitome.wikidot.com
http://eventidemush.wikidot.com
http://falchionvalley.wikidot.com
http://fed20.wikidot.com
http://fretsonfire.wikidot.com
http://gdt2009.wikidot.com
http://gear-sandbox.wikidot.com
http://geararc.wikidot.com
http://greatestfilipino.wikidot.com
http://hackersderede.wikidot.com
http://handbook.wikidot.com
http://harvey-capital-lectures.wikidot.com
http://heroesofalvena.wikidot.com
http://hswiki.wikidot.com
http://insurrection-du-chaos-sandbox.wikidot.com
http://insurrection-du-chaos.wikidot.com
http://internetior.wikidot.com
http://ipr11.wikidot.com
http://irongiant.wikidot.com
http://itb322uap.wikidot.com
http://kingsway.wikidot.com
http://kingswayeap.wikidot.com
http://kingswayelem.wikidot.com
http://kingswayielts.wikidot.com
http://kingswayint.wikidot.com
http://kingswaypreint.wikidot.com
http://kingswayupper.wikidot.com
http://knightswrite.wikidot.com
http://ksemoudania.wikidot.com
http://latmari.wikidot.com
http://m5snapoli.wikidot.com
http://mathaerobics4samvedna.wikidot.com
http://me1065.wikidot.com
http://measurementcamp.wikidot.com
http://morningsidemicro.wikidot.com
http://morphopedics.wikidot.com
http://mybookworld.wikidot.com
http://neo-dimension.wikidot.com
http://neozone.wikidot.com
http://oblivionshard.wikidot.com
http://oneeleventwentyten.wikidot.com
http://oversoulgame.wikidot.com
http://packages.wikidot.com
http://porsche.wikidot.com
http://retrowiki.wikidot.com
http://roboticspedia.wikidot.com
http://rock-xproject.wikidot.com
http://schrijven.wikidot.com
http://scp-jp-archive.wikidot.com
http://scp-wiki-de.wikidot.com
http://scpexplained.wikidot.com
http://scpsandbox2.wikidot.com
http://scratch4samvedna.wikidot.com
http://sfi.wikidot.com
http://sharecokecodes.wikidot.com
http://sicurezzapubblica.wikidot.com
http://siluria.wikidot.com
http://simtrackipedia.wikidot.com
http://sliscomps.wikidot.com
http://smsalgebra.wikidot.com
http://snippets.wikidot.com
http://solariapedia.wikidot.com
http://soymilkls.wikidot.com
http://srm.wikidot.com
http://stallmanism.wikidot.com
http://themes.wikidot.com
http://try.wikidot.com
http://ucsdgrads.wikidot.com
http://uscta.wikidot.com
http://virtualwargamer.wikidot.com
http://vyprmedia.wikidot.com
http://w24.wikidot.com
http://warsztatywww.wikidot.com
http://webcomicauthority.wikidot.com
http://wikidot.com
http://wikim5s.wikidot.com
http://wiwimush.wikidot.com

@ -0,0 +1,214 @@
arte
cine
lengua
literatura
matematicas
ingles
frances
aleman
ruso
idiomas
geografia
historia
secundaria
bachillerato
examen
examenes
profesor
educacion
profesores
historias
extremadura
andalucia
iberia
oceano
cultura
periodico
television
radio
italiano
polaco
chino
japones
coreano
musica
mozart
beethoven
asimov
newton
kilogramo
teoria
fisica
deporte
cancion
futbol
astronomia
telescopio
cuaderno
libro
texto
pizarra
descartes
galileo
fosiles
paisaje
fosil
paisajes
mar
oceano
espacio
meteorologia
nubes
religion
bandera
lengua
politica
biologia
quimica
medicina
tecnologia
diagrama
mapa
mapas
dibujos
pronunciacion
arquitectura
compositor
pintor
pintura
escultura
museo
biblioteca
museos
bibliotecas
enciclopedia
diccionario
filosofia
filosofos
feminismo
sociologia
leyes
coche
barco
avion
transporte
teatro
europa
america
africa
asia
oceania
australia
atlantico
mediterraneo
fenicios
griegos
cartagineses
palabras
numeros
escritura
isla
java
python
programacion
piramide
cuadrado
geometria
rectangulo
circulo
ciencia
marx
engels
platon
socrates
continente
tormenta
terremoto
proyecto
glosario
vocabulario
aprender
recursos
lectura
comunicacion
salud
bienestar
europeo
africano
asiatico
americano
wiki
wikis
documental
documentales
bibliografia
documentacion
ciencias
naturales
sociales
inteligencia
investigacion
cientifico
tecnico
cientifica
enlaces
antropologia
arqueologia
arqueologo
filologia
arduino
software
hardware
computador
ordenador
siglo xx
siglo xix
siglo xviii
siglo xvii
siglo xvi
siglo xv
libros
marte
tierra
mercurio
jupiter
saturno
urano
neptuno
pluton
cometa
asteroide
luna
pajaro
ave
aves
reptil
reptiles
flores
arboles
flor
dictadura
democracia
parlamento
universidad
universidades
empresa
comida
alimento
equipo
lampara
luz
bombilla
electricidad
frigorifico
lavadora
mueble
fregona
espacio
sol
estrella
fenomeno
hispanico
hispanica
biodiversidad
guerra fria
Loading…
Cancel
Save