initial commit
commit 5b3f050fd6
@@ -0,0 +1,13 @@
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/topics/items.html

from scrapy.item import Item, Field

class PyGSItem(Item):
    # define the fields for your item here like:
    # name = Field()
    title = Field()
    link = Field()
    desc = Field()
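As a quick aside, a Scrapy Item behaves like a dict restricted to its declared fields. A minimal sketch of using PyGSItem outside the spider (the example values are made up):

from PyGoogleSearch.items import PyGSItem

item = PyGSItem(title=[u'Example result'], link=[u'http://example.com'])
item['desc'] = [u'An example snippet']   # allowed: 'desc' is a declared field
# item['rank'] = 1                       # would raise KeyError: undeclared field
print item['title']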
@@ -0,0 +1,8 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/topics/item-pipeline.html

class PygooglesearchPipeline(object):
    def process_item(self, item, spider):
        return item
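The generated pipeline is a pass-through stub. A minimal sketch of what a working pipeline for this project could do, assuming a Scrapy version that ships scrapy.exceptions.DropItem; the duplicate-filtering logic is illustrative, not part of this commit:

from scrapy.exceptions import DropItem

class PygooglesearchPipeline(object):
    def __init__(self):
        self.seen_links = set()

    def process_item(self, item, spider):
        # drop results with no link, and de-duplicate by URL
        if not item.get('link'):
            raise DropItem('result has no link')
        link = item['link'][0]           # extract() returns a list
        if link in self.seen_links:
            raise DropItem('duplicate result: %s' % link)
        self.seen_links.add(link)
        return item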
@@ -0,0 +1,16 @@
# Scrapy settings for PyGoogleSearch project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/topics/settings.html
#

BOT_NAME = 'PyGoogleSearch'
BOT_VERSION = '1.0'

SPIDER_MODULES = ['PyGoogleSearch.spiders']
NEWSPIDER_MODULE = 'PyGoogleSearch.spiders'
# the item class defined in items.py is PyGSItem, not PygooglesearchItem
DEFAULT_ITEM_CLASS = 'PyGoogleSearch.items.PyGSItem'
USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)
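Nothing in the commit actually registers the pipeline, even though pipelines.py reminds you to. With the list-style setting used by Scrapy of this generation (an assumption about the exact version), the registration would be one line in settings.py:

ITEM_PIPELINES = ['PyGoogleSearch.pipelines.PygooglesearchPipeline']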
@@ -0,0 +1,8 @@
# This package will contain the spiders of your Scrapy project
#
# To create the first spider for your project use this command:
#
# scrapy genspider myspider myspider-domain.com
#
# For more info see:
# http://doc.scrapy.org/topics/spiders.html
@@ -0,0 +1,40 @@
import time

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector

from PyGoogleSearch.items import PyGSItem


class PyGSSpider(CrawlSpider):
    '''
    Crawl Google result pages, following the "next page" link (the
    anchor with id "pnnext") and extracting the title, link and
    description of every result.
    '''
    name = 'google.com'
    allowed_domains = ['google.com']
    #start_urls = [
    #    'http://www.google.fr/search?sclient=psy&hl=fr&source=hp&q=sexe&btnG=Rechercher'
    #    ]
    rules = (
        Rule(SgmlLinkExtractor(restrict_xpaths='//a[@id="pnnext"]'),
             callback='parse_item',
             follow=True),
    )

    def parse_item(self, response):
        time.sleep(3)  # crude rate limiting between result pages
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@id="ires"]/ol/li')
        items = []
        for site in sites:
            item = PyGSItem()
            item['title'] = site.select('h3[@class="r"]/a/text() | '
                                        'h3[@class="r"]/a/em/text()').extract()
            item['desc'] = site.select('div/text()').extract()
            item['link'] = site.select('h3[@class="r"]/a/@href').extract()
            items.append(item)
        return items

    # CrawlSpider rules only fire on links extracted from responses, so
    # without this the first results page would never reach parse_item
    def parse_start_url(self, response):
        return self.parse_item(response)
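The XPaths above are easy to sanity-check without crawling. A standalone sketch against a hand-written fragment of a result page, using the same old-style HtmlXPathSelector API as the spider (the HTML fragment is made up):

from scrapy.http import HtmlResponse
from scrapy.selector import HtmlXPathSelector

html = '''<div id="ires"><ol><li>
<h3 class="r"><a href="http://example.com">Example <em>hit</em></a></h3>
<div>An example snippet</div>
</li></ol></div>'''
response = HtmlResponse('http://www.google.com/search', body=html, encoding='utf-8')
site = HtmlXPathSelector(response).select('//div[@id="ires"]/ol/li')[0]
print site.select('h3[@class="r"]/a/text() | h3[@class="r"]/a/em/text()').extract()
print site.select('h3[@class="r"]/a/@href').extract()   # ['http://example.com']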
@@ -0,0 +1,8 @@
This project is released into the public domain.
It is based on Scrapy, an open source screen-scraping framework.
contact: spike@boxls.com

PyGoogleSearch is a simple Google data crawler. I created a simple command
line interface for making queries; it exports the results in JSON format to
the file data.json.
@@ -0,0 +1,42 @@
#!/usr/bin/env python

import subprocess
from optparse import OptionParser
from urllib import quote

__VERSION__ = 0.1
SEARCH_URL = '''\
http://www.google.com/search?sclient=psy&hl=en&site=&source=hp&q=foo&btnG=Google+Search\
'''
SCRAPY_PARAMS = ['--set', 'FEED_URI=data.json', '--set', 'FEED_FORMAT=json']


if __name__ == '__main__':
    usage = 'usage: %prog query [OPTION...]'
    op = OptionParser(usage, version='%%prog %s' % __VERSION__)
    opts, args = op.parse_args()
    if len(args) != 1:
        print 'Bad arguments:'
        op.print_usage()
    else:
        scrapy = 'scrapy'
        scrapy_cmd = 'crawl'
        query = args[0]
        url_query = quote(query)
        # substitute the user's query for the 'foo' placeholder in the URL
        search = SEARCH_URL.replace('foo', url_query)
        print url_query
        print 'Launching scrapy with parameters <%s> <%s>' % (scrapy, search)
        # reuse SCRAPY_PARAMS so the feed format stays the lowercase 'json'
        # that Scrapy's feed exporters expect
        p = subprocess.Popen([scrapy, scrapy_cmd, search] + SCRAPY_PARAMS,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        output = p.communicate()[0]
        print output
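Once a crawl finishes, the exported feed is plain JSON: a list of objects with the three PyGSItem fields. A minimal sketch of reading it back (assumes a completed run has written data.json):

import json

with open('data.json') as f:
    results = json.load(f)
for r in results:
    print r['title'], r['link']   # lists, as produced by extract()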
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/topics/scrapyd.html

[settings]
default = PyGoogleSearch.settings

[deploy]
#url = http://localhost:6800/
project = PyGoogleSearch