Changeset 1822:af6645fd14ed

Show
Ignore:
Timestamp:
11/03/09 00:39:02 (9 months ago)
Author:
Pablo Hoffman <pablo@…>
Branch:
default
Message:

* Renamed domain_{opened,closed,idle} signals to spider_{opened,closed,idle}
* Changed them to pass spider instances only (no domains) (refs #105)

Files:
26 modified

Legend:

Unmodified
Added
Removed
  • docs/topics/exporters.rst

    r1729 r1822  
    4747 
    4848       def __init__(self): 
    49            dispatcher.connect(self.domain_opened, signals.domain_opened)  
    50            dispatcher.connect(self.domain_closed, signals.domain_closed) 
     49           dispatcher.connect(self.spider_opened, signals.spider_opened)  
     50           dispatcher.connect(self.spider_closed, signals.spider_closed) 
    5151           self.files = {} 
    5252 
    53        def domain_opened(self, domain): 
     53       def spider_opened(self, spider): 
     54           domain = spider.domain_name 
    5455           file = open('%s_products.xml' % domain, 'w+b') 
    5556           self.files[domain] = file 
     
    5758           self.exporter.start_exporting() 
    5859 
    59        def domain_closed(self, domain): 
     60       def spider_closed(self, spider): 
     61           domain = spider.domain_name 
    6062           self.exporter.finish_exporting() 
    6163           file = self.files.pop(domain) 
  • docs/topics/extensions.rst

    r1788 r1822  
    102102 
    103103        def __init__(self): 
    104             dispatcher.connect(self.domain_opened, signal=signals.domain_opened) 
    105             dispatcher.connect(self.domain_closed, signal=signals.domain_closed) 
    106  
    107         def domain_opened(self, domain, spider): 
    108             log.msg("opened domain %s" % domain) 
    109  
    110         def domain_closed(self, domain, spider): 
    111             log.msg("closed domain %s" % domain) 
     104            dispatcher.connect(self.spider_opened, signal=signals.spider_opened) 
     105            dispatcher.connect(self.spider_closed, signal=signals.spider_closed) 
     106 
     107        def spider_opened(self, spider): 
     108            log.msg("opened spider %s" % spider.domain_name) 
     109 
     110        def spider_closed(self, spider): 
     111            log.msg("closed spider %s" % spider.domain_name) 
    112112 
    113113 
  • docs/topics/item-pipeline.rst

    r1736 r1822  
    8686    class DuplicatesPipeline(object): 
    8787        def __init__(self): 
    88             self.domaininfo = {} 
    89             dispatcher.connect(self.domain_opened, signals.domain_opened) 
    90             dispatcher.connect(self.domain_closed, signals.domain_closed) 
    91  
    92         def domain_opened(self, domain): 
    93             self.duplicates[domain] = set() 
    94  
    95         def domain_closed(self, domain): 
    96             del self.duplicates[domain] 
     88            self.duplicates = {} 
     89            dispatcher.connect(self.spider_opened, signals.spider_opened) 
     90            dispatcher.connect(self.spider_closed, signals.spider_closed) 
     91 
     92        def spider_opened(self, spider): 
     93            self.duplicates[spider.domain_name] = set() 
     94 
     95        def spider_closed(self, spider): 
     96            del self.duplicates[spider.domain_name] 
    9797 
    9898        def process_item(self, domain, item): 
  • docs/topics/leaks.rst

    r1551 r1822  
    4646you have written, if you are not releasing the (previously allocated) resources 
    4747properly. For example, if you're allocating resources on 
    48 :signal:`domain_opened` but not releasing them on :signal:`domain_closed`. 
     48:signal:`spider_opened` but not releasing them on :signal:`spider_closed`. 
    4949 
    5050.. _topics-leaks-trackrefs: 
  • docs/topics/signals.rst

    r1564 r1822  
    3030order. 
    3131 
    32 domain_closed 
     32spider_closed 
    3333------------- 
    3434 
    35 .. signal:: domain_closed 
    36 .. function:: domain_closed(domain, spider, reason) 
    37  
    38     Sent after a spider/domain has been closed. This can be used to release 
    39     per-spider resources reserved on :signal:`domain_opened`. 
    40  
    41     :param domain: a string which contains the domain of the spider which has 
    42         been closed 
    43     :type domain: str 
     35.. signal:: spider_closed 
     36.. function:: spider_closed(spider, reason) 
     37 
     38    Sent after a spider has been closed. This can be used to release per-spider 
     39    resources reserved on :signal:`spider_opened`. 
    4440 
    4541    :param spider: the spider which has been closed 
    4642    :type spider: :class:`~scrapy.spider.BaseSpider` object 
    4743 
    48     :param reason: a string which describes the reason why the domain was closed. If 
    49         it was closed because the domain has completed scraping, it the reason 
    50         is ``'finished'``. Otherwise, if the domain was manually closed by 
    51         calling the ``close_domain`` engine method, then the reason is the one 
     44    :param reason: a string which describes the reason why the spider was closed. If 
     45        it was closed because the spider has completed scraping, the reason 
     46        is ``'finished'``. Otherwise, if the spider was manually closed by 
     47        calling the ``close_spider`` engine method, then the reason is the one 
    5248        passed in the ``reason`` argument of that method (which defaults to 
    5349        ``'cancelled'``). If the engine was shutdown (for example, by hitting 
     
    5551    :type reason: str 
    5652 
    57 domain_opened 
     53spider_opened 
    5854------------- 
    5955 
    60 .. signal:: domain_opened 
    61 .. function:: domain_opened(domain, spider) 
    62  
    63     Sent after a spider/domain has been opened for crawling. This is typically 
    64     used to reserve per-spider resources, but can be used for any task that 
    65     needs to be performed when a spider/domain is opened. 
    66  
    67     :param domain: a string with the domain of the spider which has been opened 
    68     :type domain: str 
     56.. signal:: spider_opened 
     57.. function:: spider_opened(spider) 
     58 
     59    Sent after a spider has been opened for crawling. This is typically used to 
     60    reserve per-spider resources, but can be used for any task that needs to be 
     61    performed when a spider is opened. 
    6962 
    7063    :param spider: the spider which has been opened 
    7164    :type spider: :class:`~scrapy.spider.BaseSpider` object 
    7265 
    73 domain_idle 
     66spider_idle 
    7467----------- 
    7568 
    76 .. signal:: domain_idle 
    77 .. function:: domain_idle(domain, spider) 
    78  
    79     Sent when a domain has gone idle, which means the spider has no further: 
     69.. signal:: spider_idle 
     70.. function:: spider_idle(spider) 
     71 
     72    Sent when a spider has gone idle, which means the spider has no further: 
    8073 
    8174        * requests waiting to be downloaded 
     
    8477 
    8578    If the idle state persists after all handlers of this signal have finished, 
    86     the engine starts closing the domain. After the domain has finished 
    87     closing, the :signal:`domain_closed` signal is sent. 
    88  
    89     You can, for example, schedule some requests in your :signal:`domain_idle` 
    90     handler to prevent the domain from being closed. 
    91  
    92     :param domain: is a string with the domain of the spider which has gone idle 
    93     :type domain: str 
     79    the engine starts closing the spider. After the spider has finished 
     80    closing, the :signal:`spider_closed` signal is sent. 
     81 
     82    You can, for example, schedule some requests in your :signal:`spider_idle` 
     83    handler to prevent the spider from being closed. 
    9484 
    9585    :param spider: the spider which has gone idle 
  • docs/topics/stats.rst

    r1613 r1822  
    177177        Close the given domain. After this is called, no more specific stats 
    178178        for this domain can be accessed. This method is called automatically on 
    179         the :signal:`domain_closed` signal. 
     179        the :signal:`spider_closed` signal. 
    180180 
    181181Available Stats Collectors 
     
    303303 
    304304    :param reason: the reason why the domain is being closed. See 
    305         :signal:`domain_closed` signal for more info. 
     305        :signal:`spider_closed` signal for more info. 
    306306    :type reason: str 
    307307 
     
    317317 
    318318    :param reason: the reason why the domain was closed. See 
    319         :signal:`domain_closed` signal for more info. 
     319        :signal:`spider_closed` signal for more info. 
    320320    :type reason: str 
    321321 
  • scrapy/contrib/closedomain.py

    r1713 r1822  
    2424 
    2525        if self.timeout: 
    26             dispatcher.connect(self.domain_opened, signal=signals.domain_opened) 
     26            dispatcher.connect(self.spider_opened, signal=signals.spider_opened) 
    2727        if self.itempassed: 
    2828            dispatcher.connect(self.item_passed, signal=signals.item_passed) 
    29         dispatcher.connect(self.domain_closed, signal=signals.domain_closed) 
     29        dispatcher.connect(self.spider_closed, signal=signals.spider_closed) 
    3030 
    31     def domain_opened(self, spider): 
     31    def spider_opened(self, spider): 
    3232        self.tasks[spider] = reactor.callLater(self.timeout, scrapyengine.close_spider, \ 
    3333            spider=spider, reason='closedomain_timeout') 
     
    3838            scrapyengine.close_spider(spider, 'closedomain_itempassed') 
    3939 
    40     def domain_closed(self, spider): 
     40    def spider_closed(self, spider): 
    4141        self.counts.pop(spider, None) 
    4242        tsk = self.tasks.pop(spider, None) 
  • scrapy/contrib/delayedclosedomain.py

    r1344 r1822  
    2222 
    2323        self.opened_at = defaultdict(time) 
    24         dispatcher.connect(self.domain_idle, signal=signals.domain_idle) 
    25         dispatcher.connect(self.domain_closed, signal=signals.domain_closed) 
     24        dispatcher.connect(self.spider_idle, signal=signals.spider_idle) 
     25        dispatcher.connect(self.spider_closed, signal=signals.spider_closed) 
    2626 
    27     def domain_idle(self, domain): 
     27    def spider_idle(self, spider): 
    2828        try: 
    29             lastseen = scrapyengine.downloader.sites[domain].lastseen 
     29            lastseen = scrapyengine.downloader.sites[spider].lastseen 
    3030        except KeyError: 
    3131            lastseen = None 
    3232        if not lastseen: 
    33             lastseen = self.opened_at[domain] 
     33            lastseen = self.opened_at[spider] 
    3434 
    3535        if time() < lastseen + self.delay: 
    3636            raise DontCloseDomain 
    3737 
    38     def domain_closed(self, domain): 
    39         self.opened_at.pop(domain, None) 
     38    def spider_closed(self, spider): 
     39        self.opened_at.pop(spider, None) 
  • scrapy/contrib/downloadermiddleware/cookies.py

    r1257 r1822  
    1717    def __init__(self): 
    1818        self.jars = defaultdict(CookieJar) 
    19         dispatcher.connect(self.domain_closed, signals.domain_closed) 
     19        dispatcher.connect(self.spider_closed, signals.spider_closed) 
    2020 
    2121    def process_request(self, request, spider): 
     
    2323            return 
    2424 
    25         jar = self.jars[spider.domain_name] 
     25        jar = self.jars[spider] 
    2626        cookies = self._get_request_cookies(jar, request) 
    2727        for cookie in cookies: 
     
    3838 
    3939        # extract cookies from Set-Cookie and drop invalid/expired cookies 
    40         jar = self.jars[spider.domain_name] 
     40        jar = self.jars[spider] 
    4141        jar.extract_cookies(response, request) 
    4242        self._debug_set_cookie(response) 
     
    4444        return response 
    4545 
    46     def domain_closed(self, domain): 
    47         self.jars.pop(domain, None) 
     46    def spider_closed(self, spider): 
     47        self.jars.pop(spider, None) 
    4848 
    4949    def _debug_cookie(self, request): 
  • scrapy/contrib/downloadermiddleware/httpcache.py

    r1518 r1822  
    2525        self.cache = Cache(settings['HTTPCACHE_DIR'], sectorize=settings.getbool('HTTPCACHE_SECTORIZE')) 
    2626        self.ignore_missing = settings.getbool('HTTPCACHE_IGNORE_MISSING') 
    27         dispatcher.connect(self.open_domain, signal=signals.domain_opened) 
     27        dispatcher.connect(self.open_domain, signal=signals.spider_opened) 
    2828 
    29     def open_domain(self, domain): 
    30         self.cache.open_domain(domain) 
     29    def open_domain(self, spider): 
     30        self.cache.open_domain(spider.domain_name) 
    3131 
    3232    def process_request(self, request, spider): 
  • scrapy/contrib/downloadermiddleware/robotstxt.py

    r1670 r1822  
    2727        self._useragents = {} 
    2828        self._pending = {} 
    29         dispatcher.connect(self.domain_opened, signals.domain_opened) 
    30         dispatcher.connect(self.domain_closed, signals.domain_closed) 
     29        dispatcher.connect(self.spider_opened, signals.spider_opened) 
     30        dispatcher.connect(self.spider_closed, signals.spider_closed) 
    3131 
    3232    def process_request(self, request, spider): 
     
    5353        self._parsers[urlparse_cached(response).netloc] = rp 
    5454 
    55     def domain_opened(self, spider): 
     55    def spider_opened(self, spider): 
    5656        self._spider_netlocs[spider] = set() 
    5757        self._useragents[spider] = getattr(spider, 'user_agent', None) \ 
    5858            or settings['USER_AGENT'] 
    5959 
    60     def domain_closed(self, domain, spider): 
     60    def spider_closed(self, spider): 
    6161        for netloc in self._spider_netlocs[domain]: 
    6262            del self._parsers[netloc] 
  • scrapy/contrib/itemsampler.py

    r1713 r1822  
    5050        self.domains_count = 0 
    5151        self.empty_domains = set() 
    52         dispatcher.connect(self.domain_closed, signal=signals.domain_closed) 
     52        dispatcher.connect(self.spider_closed, signal=signals.spider_closed) 
    5353        dispatcher.connect(self.engine_stopped, signal=signals.engine_stopped) 
    5454 
     
    7171            log.msg("No products sampled for: %s" % " ".join(self.empty_domains), level=log.WARNING) 
    7272 
    73     def domain_closed(self, domain, spider, reason): 
     73    def spider_closed(self, spider, reason): 
     74        domain = spider.domain_name 
    7475        if reason == 'finished' and not stats.get_value("items_sampled", domain=domain): 
    7576            self.empty_domains.add(domain) 
  • scrapy/contrib/pipeline/images.py

    r1787 r1822  
    4545        self._mkdir(self.basedir) 
    4646        self.created_directories = defaultdict(set) 
    47         dispatcher.connect(self.domain_closed, signals.domain_closed) 
    48  
    49     def domain_closed(self, domain): 
    50         self.created_directories.pop(domain, None) 
     47        dispatcher.connect(self.spider_closed, signals.spider_closed) 
     48 
     49    def spider_closed(self, spider): 
     50        self.created_directories.pop(spider.domain_name, None) 
    5151 
    5252    def persist_image(self, key, image, buf, info): 
  • scrapy/contrib/pipeline/media.py

    r1758 r1822  
    2323    def __init__(self): 
    2424        self.domaininfo = {} 
    25         dispatcher.connect(self.domain_opened, signals.domain_opened) 
    26         dispatcher.connect(self.domain_closed, signals.domain_closed) 
     25        dispatcher.connect(self.spider_opened, signals.spider_opened) 
     26        dispatcher.connect(self.spider_closed, signals.spider_closed) 
    2727 
    28     def domain_opened(self, spider): 
     28    def spider_opened(self, spider): 
    2929        self.domaininfo[spider.domain_name] = self.DomainInfo(spider) 
    3030 
    31     def domain_closed(self, domain): 
    32         del self.domaininfo[domain] 
     31    def spider_closed(self, spider): 
     32        del self.domaininfo[spider.domain_name] 
    3333 
    3434    def process_item(self, domain, item): 
  • scrapy/contrib/resolver.py

    r1529 r1822  
    1717        reactor.installResolver(self.resolver) 
    1818        dispatcher.connect(self.request_received, signals.request_received) 
    19         dispatcher.connect(self.domain_closed, signal=signals.domain_closed) 
     19        dispatcher.connect(self.spider_closed, signal=signals.spider_closed) 
    2020 
    2121    def request_received(self, request, spider): 
    2222        url_hostname = urlparse_cached(request).hostname 
    23         self.spider_hostnames[spider.domain_name].add(url_hostname) 
     23        self.spider_hostnames[spider].add(url_hostname) 
    2424 
    25     def domain_closed(self, spider): 
    26         for hostname in self.spider_hostnames: 
     25    def spider_closed(self, spider): 
     26        for hostname in self.spider_hostnames[spider]: 
    2727            self.resolver._cache.pop(hostname, None) 
    2828 
  • scrapy/contrib/spidermiddleware/offsite.py

    r1516 r1822  
    1616    def __init__(self): 
    1717        self.host_regexes = {} 
    18         dispatcher.connect(self.domain_opened, signal=signals.domain_opened) 
    19         dispatcher.connect(self.domain_closed, signal=signals.domain_closed) 
     18        dispatcher.connect(self.spider_opened, signal=signals.spider_opened) 
     19        dispatcher.connect(self.spider_closed, signal=signals.spider_closed) 
    2020 
    2121    def process_spider_output(self, response, result, spider): 
     
    3535        return re.compile(regex) 
    3636 
    37     def domain_opened(self, spider): 
     37    def spider_opened(self, spider): 
    3838        domains = [spider.domain_name] + spider.extra_domain_names 
    3939        self.host_regexes[spider] = self.get_host_regex(domains) 
    4040 
    41     def domain_closed(self, spider): 
     41    def spider_closed(self, spider): 
    4242        del self.host_regexes[spider] 
  • scrapy/contrib/spidermiddleware/requestlimit.py

    r1516 r1822  
    2424        self.dropped_count = {} 
    2525 
    26         dispatcher.connect(self.domain_opened, signal=signals.domain_opened) 
    27         dispatcher.connect(self.domain_closed, signal=signals.domain_closed) 
     26        dispatcher.connect(self.spider_opened, signal=signals.spider_opened) 
     27        dispatcher.connect(self.spider_closed, signal=signals.spider_closed) 
    2828 
    29     def domain_opened(self, domain, spider): 
    30         self.max_pending[domain] = getattr(spider, 'requests_queue_size', self.max_queue_size) 
    31         self.dropped_count[domain] = 0 
     29    def spider_opened(self, spider): 
     30        self.max_pending[spider] = getattr(spider, 'requests_queue_size', self.max_queue_size) 
     31        self.dropped_count[spider] = 0 
    3232 
    33     def domain_closed(self, domain): 
    34         dropped_count = self.dropped_count[domain] 
     33    def spider_closed(self, spider): 
     34        dropped_count = self.dropped_count[spider] 
    3535        if dropped_count: 
    36             max_pending = self.max_pending[domain] 
     36            max_pending = self.max_pending[spider] 
    3737            log.msg('Dropped %d request(s) because the scheduler queue size limit (%d requests) was exceeded' % \ 
    38                     (dropped_count, max_pending), level=log.DEBUG, domain=domain) 
    39         del self.dropped_count[domain] 
    40         del self.max_pending[domain] 
     38                    (dropped_count, max_pending), level=log.DEBUG, spider=spider) 
     39        del self.dropped_count[spider] 
     40        del self.max_pending[spider] 
    4141 
    4242    def process_spider_output(self, response, result, spider): 
    43         domain = spider.domain_name 
    44         max_pending = self.max_pending.get(domain, 0) 
     43        max_pending = self.max_pending.get(spider, 0) 
    4544        if max_pending: 
    46             return imap(lambda v: self._limit_requests(v, domain, max_pending), result) 
     45            return imap(lambda v: self._limit_requests(v, spider, max_pending), result) 
    4746        else: 
    4847            return result 
    4948 
    50     def _limit_requests(self, request_or_other, domain, max_pending): 
     49    def _limit_requests(self, request_or_other, spider, max_pending): 
    5150        if isinstance(request_or_other, Request): 
    52             free_slots = max_pending - self._pending_count(domain) 
     51            free_slots = max_pending - self._pending_count(spider) 
    5352            if free_slots > 0: 
    5453                # Scheduler isn't saturated and it is fine to schedule more requests. 
     
    5655            else: 
    5756                # Skip the request and give engine time to handle other tasks. 
    58                 self.dropped_count[domain] += 1 
     57                self.dropped_count[spider] += 1 
    5958                return None 
    6059        else: 
     
    6261            return request_or_other 
    6362 
    64     def _pending_count(self, domain): 
    65         pending = scrapyengine.scheduler.pending_requests.get(domain, []) 
     63    def _pending_count(self, spider): 
     64        pending = scrapyengine.scheduler.pending_requests.get(spider, []) 
    6665        return len(pending) 
  • scrapy/contrib/webconsole/livestats.py

    r1518 r1822  
    2121    def __init__(self): 
    2222        self.domains = {} 
    23         dispatcher.connect(self.domain_opened, signal=signals.domain_opened) 
    24         dispatcher.connect(self.domain_closed, signal=signals.domain_closed) 
     23        dispatcher.connect(self.spider_opened, signal=signals.spider_opened) 
     24        dispatcher.connect(self.spider_closed, signal=signals.spider_closed) 
    2525        dispatcher.connect(self.item_scraped, signal=signals.item_scraped) 
    2626        dispatcher.connect(self.response_downloaded, signal=signals.response_downloaded) 
     
    2828        dispatcher.connect(self.webconsole_discover_module, signal=webconsole_discover_module) 
    2929 
    30     def domain_opened(self, domain, spider): 
     30    def spider_opened(self, spider): 
    3131        pstats = SpiderStats() 
    32         self.domains[spider.domain_name] = pstats 
     32        self.domains[spider] = pstats 
    3333        pstats.started = datetime.now().replace(microsecond=0) 
    3434        pstats.finished = None 
    3535 
    36     def domain_closed(self, domain, spider): 
    37         self.domains[spider.domain_name].finished = datetime.now().replace(microsecond=0) 
     36    def spider_closed(self, spider): 
     37        self.domains[spider].finished = datetime.now().replace(microsecond=0) 
    3838 
    3939    def item_scraped(self, item, spider): 
    40         self.domains[spider.domain_name].scraped += 1 
     40        self.domains[spider].scraped += 1 
    4141 
    4242    def response_downloaded(self, response, spider): 
    4343        # sometimes we download responses without opening/closing domains, 
    4444        # for example from scrapy shell 
    45         if self.domains.get(spider.domain_name): 
    46             self.domains[spider.domain_name].crawled += 1 
     45        if self.domains.get(spider): 
     46            self.domains[spider].crawled += 1 
    4747             
    4848    def webconsole_render(self, wc_request): 
     
    5050        dwl = scrapyengine.downloader 
    5151 
    52         totdomains = totscraped = totcrawled = totscheduled = totactive = totpending = totdqueued = tottransf = 0 
     52        totdomains = totscraped = totcrawled = totscheduled = totactive = totdqueued = tottransf = 0 
    5353        s = banner(self) 
    5454        s += "<table border='1'>\n" 
    5555        s += "<tr><th>Domain</th><th>Items<br>Scraped</th><th>Pages<br>Crawled</th><th>Scheduler<br>Pending</th><th>Downloader<br/>Queued</th><th>Downloader<br/>Active</th><th>Downloader<br/>Transferring</th><th>Start time</th><th>Finish time</th><th>Run time</th></tr>\n" 
    56         for d in sorted(self.domains.keys()): 
    57             scheduled = len(sch.pending_requests[d]) if d in sch.pending_requests else 0 
    58             active = len(dwl.sites[d].active) if d in dwl.sites else 0 
    59             dqueued = len(dwl.sites[d].queue) if d in dwl.sites else 0 
    60             transf = len(dwl.sites[d].transferring) if d in dwl.sites else 0 
    61             stats = self.domains[d] 
     56        for spider in sorted(self.domains.keys()): 
     57            scheduled = len(sch.pending_requests[spider]) if spider in sch.pending_requests else 0 
     58            active = len(dwl.sites[spider].active) if spider in dwl.sites else 0 
     59            dqueued = len(dwl.sites[spider].queue) if spider in dwl.sites else 0 
     60            transf = len(dwl.sites[spider].transferring) if spider in dwl.sites else 0 
     61            stats = self.domains[spider] 
    6262            runtime = stats.finished - stats.started if stats.finished else datetime.now() - stats.started 
    6363 
    6464            s += '<tr><td>%s</td><td align="right">%d</td><td align="right">%d</td><td align="right">%d</td><td align="right">%d</td><td align="right">%d</td><td align="right">%d</td><td>%s</td><td>%s</td><td>%s</td></tr>\n' % \ 
    65                  (d, stats.scraped, stats.crawled, scheduled, dqueued, active, transf, str(stats.started), str(stats.finished), str(runtime)) 
     65                 (spider.domain_name, stats.scraped, stats.crawled, scheduled, dqueued, active, transf, str(stats.started), str(stats.finished), str(runtime)) 
    6666 
    6767            totdomains += 1 
  • scrapy/contrib/webconsole/spiderctl.py

    r1806 r1822  
    1919        self.running = {} 
    2020        self.finished = set() 
    21         dispatcher.connect(self.domain_opened, signal=signals.domain_opened) 
    22         dispatcher.connect(self.domain_closed, signal=signals.domain_closed) 
     21        dispatcher.connect(self.spider_opened, signal=signals.spider_opened) 
     22        dispatcher.connect(self.spider_closed, signal=signals.spider_closed) 
    2323 
    2424        from scrapy.management.web import webconsole_discover_module 
    2525        dispatcher.connect(self.webconsole_discover_module, signal=webconsole_discover_module) 
    2626 
    27     def domain_opened(self, spider): 
     27    def spider_opened(self, spider): 
    2828        self.running[spider.domain_name] = spider 
    2929 
    30     def domain_closed(self, spider): 
     30    def spider_closed(self, spider): 
    3131        del self.running[spider.domain_name] 
    3232        self.finished.add(spider.domain_name) 
  • scrapy/contrib_exp/pipeline/shoveitem.py

    r1518 r1822  
    2525        self.stores = {} 
    2626 
    27         dispatcher.connect(self.domain_opened, signal=signals.domain_opened) 
    28         dispatcher.connect(self.domain_closed, signal=signals.domain_closed) 
     27        dispatcher.connect(self.spider_opened, signal=signals.spider_opened) 
     28        dispatcher.connect(self.spider_closed, signal=signals.spider_closed) 
    2929 
    3030    def process_item(self, domain, item): 
     
    4444        return item 
    4545 
    46     def domain_opened(self, domain): 
     46    def spider_opened(self, spider): 
     47        domain = spider.domain_name 
    4748        uri = Template(self.uritpl).substitute(domain=domain) 
    4849        self.stores[domain] = Shove(uri, **self.opts) 
    4950 
    50     def domain_closed(self, domain): 
    51         self.stores[domain].sync() 
     51    def spider_closed(self, spider): 
     52        self.stores[spider.domain_name].sync() 
    5253 
    5354    def log(self, domain, item, status): 
  • scrapy/core/engine.py

    r1782 r1822  
    100100 
    101101    def next_request(self, spider, now=False): 
    102         """Scrape the next request for the domain passed. 
     102        """Scrape the next request for the spider passed. 
    103103 
    104104        The next request to be scraped is retrieved from the scheduler and 
     
    165165        if not request.deferred.callbacks: 
    166166            log.msg("Unable to crawl Request with no callback: %s" % request, 
    167                 level=log.ERROR, domain=spider.domain_name) 
     167                level=log.ERROR, spider=spider) 
    168168            return 
    169169        schd = mustbe_deferred(self.schedule, request, spider) 
     
    187187 
    188188    def _mainloop(self): 
    189         """Add more domains to be scraped if the downloader has the capacity. 
     189        """Add more spiders to be scraped if the downloader has the capacity. 
    190190 
    191191        If there is nothing else scheduled then stop the execution engine. 
     
    199199 
    200200    def download(self, request, spider): 
    201         domain = spider.domain_name 
    202  
    203201        def _on_success(response): 
    204202            """handle the result of a page download""" 
     
    207205                response.request = request # tie request to response received 
    208206                log.msg(self._crawled_logline(request, response), \ 
    209                     level=log.DEBUG, domain=spider.domain_name) 
     207                    level=log.DEBUG, spider=spider) 
    210208                return response 
    211209            elif isinstance(response, Request): 
     
    226224            if errmsg: 
    227225                log.msg("Crawling <%s>: %s" % (request.url, errmsg), \ 
    228                     level=level, domain=domain) 
     226                    level=level, spider=spider) 
    229227            return Failure(IgnoreRequest(str(exc))) 
    230228 
     
    242240 
    243241    def open_spider(self, spider): 
    244         domain = spider.domain_name 
    245         log.msg("Domain opened", domain=domain) 
     242        log.msg("Spider opened", spider=spider) 
    246243        self.next_request(spider) 
    247244 
    248245        self.downloader.open_spider(spider) 
    249246        self.scraper.open_spider(spider) 
    250         stats.open_domain(domain) 
    251  
    252         # XXX: sent for backwards compatibility (will be removed in Scrapy 0.8) 
    253         send_catch_log(signals.domain_open, sender=self.__class__, \ 
    254             domain=domain, spider=spider) 
    255  
    256         send_catch_log(signals.domain_opened, sender=self.__class__, \ 
    257             domain=domain, spider=spider) 
     247        stats.open_domain(spider.domain_name) 
     248 
     249        send_catch_log(signals.spider_opened, sender=self.__class__, spider=spider) 
    258250 
    259251    def _spider_idle(self, spider): 
    260         """Called when a domain gets idle. This function is called when there 
     252        """Called when a spider gets idle. This function is called when there 
    261253        are no remaining pages to download or schedule. It can be called 
    262254        multiple times. If some extension raises a DontCloseDomain exception 
    263         (in the domain_idle signal handler) the domain is not closed until the 
     255        (in the spider_idle signal handler) the spider is not closed until the 
    264256        next loop and this function is guaranteed to be called (at least) once 
    265         again for this domain. 
    266         """ 
    267         domain = spider.domain_name 
     257        again for this spider. 
     258        """ 
    268259        try: 
    269             dispatcher.send(signal=signals.domain_idle, sender=self.__class__, \ 
    270                 domain=domain, spider=spider) 
     260            dispatcher.send(signal=signals.spider_idle, sender=self.__class__, \ 
     261                spider=spider) 
    271262        except DontCloseDomain: 
    272263            self.next_request(spider) 
    273264            return 
    274265        except: 
    275             log.err("Exception catched on domain_idle signal dispatch") 
     266            log.err("Exception catched on spider_idle signal dispatch") 
    276267        if self.spider_is_idle(spider): 
    277268            self.close_spider(spider, reason='finished') 
     
    284275    def close_spider(self, spider, reason='cancelled'): 
    285276        """Close (cancel) spider and clear all its outstanding requests""" 
    286         domain = spider.domain_name 
    287277        if spider not in self.closing: 
    288             log.msg("Closing domain (%s)" % reason, domain=domain) 
     278            log.msg("Closing spider (%s)" % reason, spider=spider) 
    289279            self.closing[spider] = reason 
    290280            self.downloader.close_spider(spider) 
     
    299289 
    300290    def _finish_closing_spider_if_idle(self, spider): 
    301         """Call _finish_closing_spider if domain is idle""" 
     291        """Call _finish_closing_spider if spider is idle""" 
    302292        if self.spider_is_idle(spider) or self.killed: 
    303293            return self._finish_closing_spider(spider) 
     
    def _finish_closing_spider(self, spider):
        """This function is called after the spider has been closed.

        Closes the spider in the scheduler and the scraper, sends the
        spider_closed signal with the closing reason ('finished' when no
        reason was recorded in self.closing), closes the stats for the
        spider's domain name and calls spiders.close_spider.

        Returns the deferred wrapping spiders.close_spider. A
        "Spider closed" message is logged when that deferred fires, and
        the main loop is scheduled to run on the next reactor iteration.
        """
        self.scheduler.close_spider(spider)
        self.scraper.close_spider(spider)
        reason = self.closing.pop(spider, 'finished')
        send_catch_log(signal=signals.spider_closed, sender=self.__class__, \
            spider=spider, reason=reason)
        stats.close_domain(spider.domain_name, reason=reason)
        dfd = defer.maybeDeferred(spiders.close_spider, spider)
        # addBoth passes the deferred's result as the first positional
        # argument to the callback. Handing it log.msg directly would make
        # the result the `message` and the text the `level`, so the callback
        # is wrapped to discard the result and log the intended message.
        dfd.addBoth(lambda _: log.msg("Spider closed (%s)" % reason, spider=spider))
        reactor.callLater(0, self._mainloop)
        return dfd
  • scrapy/core/signals.py

    r1527 r1822  
    88engine_started = object() 
    99engine_stopped = object() 
    10 domain_opened = object() 
    11 domain_idle = object() 
    12 domain_closed = object() 
     10spider_opened = object() 
     11spider_idle = object() 
     12spider_closed = object() 
    1313request_received = object() 
    1414request_uploaded = object() 
     
    1818item_passed = object() 
    1919item_dropped = object() 
    20  
    21 # XXX: deprecated signals (will be removed in Scrapy 0.8) 
    22 domain_open = object() 
    23  
  • scrapy/log.py

    r1818 r1822  
    6464        log.startLogging(file, setStdout=logstdout) 
    6565 
    def msg(message, level=INFO, component=BOT_NAME, domain=None, spider=None):
        """Log message according to the level.

        Messages whose level is greater than the configured log_level are
        discarded. Otherwise the logmessage_received signal is sent and the
        message is written to the log, prefixed with the level name.

        The log "system" is chosen in this order: the explicit `domain` if
        given, else the `domain_name` of `spider` if a spider is given,
        else `component`.
        """
        if level > log_level:
            return
        dispatcher.send(signal=logmessage_received, message=message, level=level, \
            domain=domain, spider=spider)
        # Spelled out as a chain on purpose: the one-liner
        # `domain or spider.domain_name if spider else component` parses as
        # `(domain or spider.domain_name) if spider else component`, which
        # drops an explicit domain whenever no spider is passed.
        if domain:
            system = domain
        elif spider:
            system = spider.domain_name
        else:
            system = component
        msg_txt = unicode_to_str("%s: %s" % (level_names[level], message))
        log.msg(msg_txt, system=system)
    7575 
    def exc(message, level=ERROR, component=BOT_NAME, domain=None, spider=None):
        """Log `message` followed by the formatted current traceback.

        The traceback text from format_exc() is appended on a new line and
        the result is handed to msg() with the same level, component,
        domain and spider.
        """
        header = message + '\n'
        full_message = header + format_exc()
        msg(full_message, level=level, component=component, domain=domain,
            spider=spider)
    7979 
    8080def err(_stuff=None, _why=None, **kwargs): 
     
    8282        return 
    8383    domain = kwargs.pop('domain', None) 
     84    spider = kwargs.pop('spider', None) 
    8485    component = kwargs.pop('component', BOT_NAME) 
    85     kwargs['system'] = domain if domain else component 
     86    kwargs['system'] = domain or spider.domain_name if spider else component 
    8687    if _why: 
    8788        _why = unicode_to_str("ERROR: %s" % _why) 
  • scrapy/tests/test_downloadermiddleware_cookies.py

    r1569 r1822  
    1515 
    1616    def tearDown(self): 
    17         self.mw.domain_closed('scrapytest.org') 
     17        self.mw.spider_closed('scrapytest.org') 
    1818        del self.mw 
    1919 
  • scrapy/tests/test_engine.py

    r1713 r1822  
    9090            dispatcher.connect(self.record_signal, signals.engine_started) 
    9191            dispatcher.connect(self.record_signal, signals.engine_stopped) 
    92             dispatcher.connect(self.record_signal, signals.domain_opened) 
    93             dispatcher.connect(self.record_signal, signals.domain_idle) 
    94             dispatcher.connect(self.record_signal, signals.domain_closed) 
     92            dispatcher.connect(self.record_signal, signals.spider_opened) 
     93            dispatcher.connect(self.record_signal, signals.spider_idle) 
     94            dispatcher.connect(self.record_signal, signals.spider_closed) 
    9595            dispatcher.connect(self.item_scraped, signals.item_scraped) 
    9696            dispatcher.connect(self.request_received, signals.request_received) 
     
    202202        assert signals.engine_started in session.signals_catched 
    203203        assert signals.engine_stopped in session.signals_catched 
    204         assert signals.domain_opened in session.signals_catched 
    205         assert signals.domain_idle in session.signals_catched 
    206         assert signals.domain_closed in session.signals_catched 
    207  
    208         self.assertEqual({'domain': session.domain, 'spider': session.spider}, 
    209                          session.signals_catched[signals.domain_opened]) 
    210         self.assertEqual({'domain': session.domain, 'spider': session.spider}, 
    211                          session.signals_catched[signals.domain_idle]) 
    212         self.assertEqual({'domain': session.domain, 'spider': session.spider, 'reason': 'finished'}, 
    213                          session.signals_catched[signals.domain_closed]) 
     204        assert signals.spider_opened in session.signals_catched 
     205        assert signals.spider_idle in session.signals_catched 
     206        assert signals.spider_closed in session.signals_catched 
     207 
     208        self.assertEqual({'spider': session.spider}, 
     209                         session.signals_catched[signals.spider_opened]) 
     210        self.assertEqual({'spider': session.spider}, 
     211                         session.signals_catched[signals.spider_idle]) 
     212        self.assertEqual({'spider': session.spider, 'reason': 'finished'}, 
     213                         session.signals_catched[signals.spider_closed]) 
    214214 
    215215if __name__ == "__main__": 
  • scrapy/tests/test_spidermiddleware_offsite.py

    r1686 r1822  
    1414 
    1515        self.mw = OffsiteMiddleware() 
    16         self.mw.domain_opened(self.spider) 
     16        self.mw.spider_opened(self.spider) 
    1717 
    1818    def test_process_spider_output(self): 
     
    2929 
    3030    def tearDown(self): 
    31         self.mw.domain_closed(self.spider) 
     31        self.mw.spider_closed(self.spider) 
    3232