Show
Ignore:
Timestamp:
11/12/09 10:17:21 (10 months ago)
Author:
Pablo Hoffman <pablo@…>
Branch:
default
Message:

made offsite middleware log messages when filtering out requests

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • scrapy/contrib/spidermiddleware/offsite.py

    --- scrapy/contrib/spidermiddleware/offsite.py (r1822)
    +++ scrapy/contrib/spidermiddleware/offsite.py (r1841)
    @@ -11,4 +11,5 @@
     from scrapy.http import Request
     from scrapy.utils.httpobj import urlparse_cached
    +from scrapy import log
     
     class OffsiteMiddleware(object):
    @@ -16,10 +17,21 @@
         def __init__(self):
             self.host_regexes = {}
    +        self.domains_seen = {}
             dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
             dispatcher.connect(self.spider_closed, signal=signals.spider_closed)
     
         def process_spider_output(self, response, result, spider):
    -        return (x for x in result if not isinstance(x, Request) or \
    -            self.should_follow(x, spider))
    +        for x in result:
    +            if isinstance(x, Request):
    +                if self.should_follow(x, spider):
    +                    yield x
    +                else:
    +                    domain = urlparse_cached(x).hostname
    +                    if domain and domain not in self.domains_seen[spider]:
    +                        log.msg("Filtered offsite request to %r: %s" % (domain, x),
    +                            level=log.DEBUG, spider=spider)
    +                        self.domains_seen[spider].add(domain)
    +            else:
    +                yield x
     
         def should_follow(self, request, spider):
    @@ -38,5 +50,7 @@
             domains = [spider.domain_name] + spider.extra_domain_names
             self.host_regexes[spider] = self.get_host_regex(domains)
    +        self.domains_seen[spider] = set()
     
         def spider_closed(self, spider):
             del self.host_regexes[spider]
    +        del self.domains_seen[spider]