DD
Size: a a a
DD
DD
B
DD
B
class MySpider(CrawlSpider):
    """Example CrawlSpider: follows 'category.php' listing pages and
    scrapes 'item.php' pages with :meth:`parse_item`."""

    name = 'example.com'
    allowed_domains = ['example.com']
    start_urls = ['http://www.example.com']

    rules = (
        # Extract links matching 'category.php' (but not matching 'subsection.php')
        # and follow links from them (since no callback means follow=True by default).
        # Raw strings are required: '\.' is an invalid escape sequence in a
        # plain string literal (SyntaxWarning on modern Python).
        Rule(LinkExtractor(allow=(r'category\.php',), deny=(r'subsection\.php',))),
        # Extract links matching 'item.php' and parse them with the spider's
        # method parse_item.
        Rule(LinkExtractor(allow=(r'item\.php',)), callback='parse_item'),
    )
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class MySpider(CrawlSpider):
    """Crawl doc.scrapy.org, building the deny pattern of the first rule
    from the spider argument ``rule1`` (passed as ``-a rule1=...``).

    Raises:
        KeyError: if the spider is started without the ``rule1`` argument.
    """

    name = 'example'
    allowed_domains = ['doc.scrapy.org']
    start_urls = ['https://doc.scrapy.org']
    custom_settings = None

    def __init__(self, *a, **kw):
        # The rules must exist before they are compiled, so build them first
        # from the runtime argument in ``kw``.
        self.rules = (
            Rule(LinkExtractor(allow=('en/latest/',), deny=(kw['rule1'],))),
            Rule(LinkExtractor(allow=('en/latest/topics/',)), callback='parse_item'),
        )
        # BUG FIX: the original ``super(CrawlSpider, self).__init__`` named the
        # parent class, dispatching straight to Spider.__init__ and skipping
        # CrawlSpider.__init__ — which is why _compile_rules() then had to be
        # called by hand. Zero-argument super() runs CrawlSpider.__init__,
        # which compiles self.rules itself.
        super().__init__(*a, **kw)

    def parse_item(self, response):
        # Log every page matched by the second rule; extraction would go here.
        self.logger.info('Response %s', response.url)
NK
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class MySpider(CrawlSpider):
    """Spider whose first rule's deny pattern is supplied at crawl time
    via the ``rule1`` spider argument (``-a rule1=<pattern>``)."""

    name = 'example'
    allowed_domains = ['doc.scrapy.org']
    start_urls = ['https://doc.scrapy.org']
    custom_settings = None

    def __init__(self, *args, **kwargs):
        # Build both rules from the runtime argument before any compilation.
        denied = kwargs['rule1']
        follow_rule = Rule(LinkExtractor(allow=('en/latest/',), deny=(denied,)))
        item_rule = Rule(LinkExtractor(allow=('en/latest/topics/',)), callback='parse_item')
        self.rules = (follow_rule, item_rule)
        # Dispatch past CrawlSpider straight to Spider.__init__, then compile
        # the rules by hand (CrawlSpider.__init__ would normally do this).
        super(CrawlSpider, self).__init__(*args, **kwargs)
        self._compile_rules()

    def parse_item(self, response):
        # Record each page matched by the item rule.
        self.logger.info('Response %s', response.url)
B
scrapy crawl example -a rule1=topics
B
B
B
B
B
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class MySpider(CrawlSpider):
    # Spider arguments (scrapy crawl example -a rule1=...) feed the crawl rules.
    custom_settings = None

    def __init__(self, *a, **kw):
        # Build the rules at runtime: the deny pattern of the first rule comes
        # from the required spider argument 'rule1' (KeyError if it is missing).
        self.rules = (
            Rule(LinkExtractor(allow=('en/latest/',), deny=(kw['rule1'],))),
            Rule(LinkExtractor(allow=('en/latest/topics/',)), callback='parse_item'),
        )
        # NOTE(review): super(CrawlSpider, self) skips CrawlSpider.__init__ and
        # dispatches straight to Spider.__init__, which is why the rules are
        # compiled manually below — presumably deliberate; confirm against the
        # scrapy version in use.
        super(CrawlSpider, self).__init__(*a, **kw)
        self._compile_rules()

    name = 'example'
    allowed_domains = ['doc.scrapy.org']
    start_urls = ['https://doc.scrapy.org']

    def parse_item(self, response):
        # Log each response matched by the second rule.
        self.logger.info('Response %s', response.url)
B
Note: because super(CrawlSpider, self) dispatches straight to Spider.__init__ (skipping CrawlSpider.__init__), the spider has to call self._compile_rules() manually.
NK
NK