Changeset 1841:59d784dfbf9a for scrapy/contrib/spidermiddleware/offsite.py
- Timestamp: 11/12/09 10:17:21 (10 months ago)
- Branch: default
- Files: 1 modified
scrapy/contrib/spidermiddleware/offsite.py (modified) (3 diffs)
Legend:
- Unmodified
- Added
- Removed
-
scrapy/contrib/spidermiddleware/offsite.py
--- scrapy/contrib/spidermiddleware/offsite.py (r1822)
+++ scrapy/contrib/spidermiddleware/offsite.py (r1841)
@@ -11,4 +11,5 @@
 from scrapy.http import Request
 from scrapy.utils.httpobj import urlparse_cached
+from scrapy import log
 
 class OffsiteMiddleware(object):
@@ -16,10 +17,21 @@
     def __init__(self):
         self.host_regexes = {}
+        self.domains_seen = {}
         dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
         dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
 
     def process_spider_output(self, response, result, spider):
-        return (x for x in result if not isinstance(x, Request) or \
-            self.should_follow(x, spider))
+        for x in result:
+            if isinstance(x, Request):
+                if self.should_follow(x, spider):
+                    yield x
+                else:
+                    domain = urlparse_cached(x).hostname
+                    if domain and domain not in self.domains_seen[spider]:
+                        log.msg("Filtered offsite request to %r: %s" % (domain, x),
+                            level=log.DEBUG, spider=spider)
+                        self.domains_seen[spider].add(domain)
+            else:
+                yield x
 
     def should_follow(self, request, spider):
@@ -38,5 +50,7 @@
         domains = [spider.domain_name] + spider.extra_domain_names
         self.host_regexes[spider] = self.get_host_regex(domains)
+        self.domains_seen[spider] = set()
 
     def spider_closed(self, spider):
         del self.host_regexes[spider]
+        del self.domains_seen[spider]
