Changeset 1822:af6645fd14ed
- Timestamp:
- 11/03/09 00:39:02 (9 months ago)
- Author:
- Pablo Hoffman <pablo@…>
- Branch:
- default
- Message:
-
* Renamed domain_{opened,closed,idle} signals to spider_{opened,closed,idle}
* Changed them to pass spider instances only (no domains) (refs #105)
- Files:
-
Legend:
- Unmodified
- Added
- Removed
-
|
r1729
|
r1822
|
|
| 47 | 47 | |
| 48 | 48 | def __init__(self): |
| 49 | | dispatcher.connect(self.domain_opened, signals.domain_opened) |
| 50 | | dispatcher.connect(self.domain_closed, signals.domain_closed) |
| | 49 | dispatcher.connect(self.spider_opened, signals.spider_opened) |
| | 50 | dispatcher.connect(self.spider_closed, signals.spider_closed) |
| 51 | 51 | self.files = {} |
| 52 | 52 | |
| 53 | | def domain_opened(self, domain): |
| | 53 | def spider_opened(self, spider): |
| | 54 | domain = spider.domain_name |
| 54 | 55 | file = open('%s_products.xml' % domain, 'w+b') |
| 55 | 56 | self.files[domain] = file |
| … |
… |
|
| 57 | 58 | self.exporter.start_exporting() |
| 58 | 59 | |
| 59 | | def domain_closed(self, domain): |
| | 60 | def spider_closed(self, spider): |
| | 61 | domain = spider.domain_name |
| 60 | 62 | self.exporter.finish_exporting() |
| 61 | 63 | file = self.files.pop(domain) |
-
|
r1788
|
r1822
|
|
| 102 | 102 | |
| 103 | 103 | def __init__(self): |
| 104 | | dispatcher.connect(self.domain_opened, signal=signals.domain_opened) |
| 105 | | dispatcher.connect(self.domain_closed, signal=signals.domain_closed) |
| 106 | | |
| 107 | | def domain_opened(self, domain, spider): |
| 108 | | log.msg("opened domain %s" % domain) |
| 109 | | |
| 110 | | def domain_closed(self, domain, spider): |
| 111 | | log.msg("closed domain %s" % domain) |
| | 104 | dispatcher.connect(self.spider_opened, signal=signals.spider_opened) |
| | 105 | dispatcher.connect(self.spider_closed, signal=signals.spider_closed) |
| | 106 | |
| | 107 | def spider_opened(self, spider): |
| | 108 | log.msg("opened spider %s" % spider.domain_name) |
| | 109 | |
| | 110 | def spider_closed(self, spider): |
| | 111 | log.msg("closed spider %s" % spider.domain_name) |
| 112 | 112 | |
| 113 | 113 | |
-
|
r1736
|
r1822
|
|
| 86 | 86 | class DuplicatesPipeline(object): |
| 87 | 87 | def __init__(self): |
| 88 | | self.domaininfo = {} |
| 89 | | dispatcher.connect(self.domain_opened, signals.domain_opened) |
| 90 | | dispatcher.connect(self.domain_closed, signals.domain_closed) |
| 91 | | |
| 92 | | def domain_opened(self, domain): |
| 93 | | self.duplicates[domain] = set() |
| 94 | | |
| 95 | | def domain_closed(self, domain): |
| 96 | | del self.duplicates[domain] |
| | 88 | self.duplicates = {} |
| | 89 | dispatcher.connect(self.spider_opened, signals.spider_opened) |
| | 90 | dispatcher.connect(self.spider_closed, signals.spider_closed) |
| | 91 | |
| | 92 | def spider_opened(self, spider): |
| | 93 | self.duplicates[spider.domain_name] = set() |
| | 94 | |
| | 95 | def spider_closed(self, spider): |
| | 96 | del self.duplicates[spider.domain_name] |
| 97 | 97 | |
| 98 | 98 | def process_item(self, domain, item): |
-
|
r1551
|
r1822
|
|
| 46 | 46 | you have written, if you are not releasing the (previously allocated) resources |
| 47 | 47 | properly. For example, if you're allocating resources on |
| 48 | | :signal:`domain_opened` but not releasing them on :signal:`domain_closed`. |
| | 48 | :signal:`spider_opened` but not releasing them on :signal:`spider_closed`. |
| 49 | 49 | |
| 50 | 50 | .. _topics-leaks-trackrefs: |
-
|
r1564
|
r1822
|
|
| 30 | 30 | order. |
| 31 | 31 | |
| 32 | | domain_closed |
| | 32 | spider_closed |
| 33 | 33 | ------------- |
| 34 | 34 | |
| 35 | | .. signal:: domain_closed |
| 36 | | .. function:: domain_closed(domain, spider, reason) |
| 37 | | |
| 38 | | Sent after a spider/domain has been closed. This can be used to release |
| 39 | | per-spider resources reserved on :signal:`domain_opened`. |
| 40 | | |
| 41 | | :param domain: a string which contains the domain of the spider which has |
| 42 | | been closed |
| 43 | | :type domain: str |
| | 35 | .. signal:: spider_closed |
| | 36 | .. function:: spider_closed(spider, reason) |
| | 37 | |
| | 38 | Sent after a spider has been closed. This can be used to release per-spider |
| | 39 | resources reserved on :signal:`spider_opened`. |
| 44 | 40 | |
| 45 | 41 | :param spider: the spider which has been closed |
| 46 | 42 | :type spider: :class:`~scrapy.spider.BaseSpider` object |
| 47 | 43 | |
| 48 | | :param reason: a string which describes the reason why the domain was closed. If |
| 49 | | it was closed because the domain has completed scraping, it the reason |
| 50 | | is ``'finished'``. Otherwise, if the domain was manually closed by |
| 51 | | calling the ``close_domain`` engine method, then the reason is the one |
| | 44 | :param reason: a string which describes the reason why the spider was closed. If |
| | 45 | it was closed because the spider has completed scraping, it the reason |
| | 46 | is ``'finished'``. Otherwise, if the spider was manually closed by |
| | 47 | calling the ``close_spider`` engine method, then the reason is the one |
| 52 | 48 | passed in the ``reason`` argument of that method (which defaults to |
| 53 | 49 | ``'cancelled'``). If the engine was shutdown (for example, by hitting |
| … |
… |
|
| 55 | 51 | :type reason: str |
| 56 | 52 | |
| 57 | | domain_opened |
| | 53 | spider_opened |
| 58 | 54 | ------------- |
| 59 | 55 | |
| 60 | | .. signal:: domain_opened |
| 61 | | .. function:: domain_opened(domain, spider) |
| 62 | | |
| 63 | | Sent after a spider/domain has been opened for crawling. This is typically |
| 64 | | used to reserve per-spider resources, but can be used for any task that |
| 65 | | needs to be performed when a spider/domain is opened. |
| 66 | | |
| 67 | | :param domain: a string with the domain of the spider which has been opened |
| 68 | | :type domain: str |
| | 56 | .. signal:: spider_opened |
| | 57 | .. function:: spider_opened(spider) |
| | 58 | |
| | 59 | Sent after a spider has been opened for crawling. This is typically used to |
| | 60 | reserve per-spider resources, but can be used for any task that needs to be |
| | 61 | performed when a spider is opened. |
| 69 | 62 | |
| 70 | 63 | :param spider: the spider which has been opened |
| 71 | 64 | :type spider: :class:`~scrapy.spider.BaseSpider` object |
| 72 | 65 | |
| 73 | | domain_idle |
| | 66 | spider_idle |
| 74 | 67 | ----------- |
| 75 | 68 | |
| 76 | | .. signal:: domain_idle |
| 77 | | .. function:: domain_idle(domain, spider) |
| 78 | | |
| 79 | | Sent when a domain has gone idle, which means the spider has no further: |
| | 69 | .. signal:: spider_idle |
| | 70 | .. function:: spider_idle(spider) |
| | 71 | |
| | 72 | Sent when a spider has gone idle, which means the spider has no further: |
| 80 | 73 | |
| 81 | 74 | * requests waiting to be downloaded |
| … |
… |
|
| 84 | 77 | |
| 85 | 78 | If the idle state persists after all handlers of this signal have finished, |
| 86 | | the engine starts closing the domain. After the domain has finished |
| 87 | | closing, the :signal:`domain_closed` signal is sent. |
| 88 | | |
| 89 | | You can, for example, schedule some requests in your :signal:`domain_idle` |
| 90 | | handler to prevent the domain from being closed. |
| 91 | | |
| 92 | | :param domain: is a string with the domain of the spider which has gone idle |
| 93 | | :type domain: str |
| | 79 | the engine starts closing the spider. After the spider has finished |
| | 80 | closing, the :signal:`spider_closed` signal is sent. |
| | 81 | |
| | 82 | You can, for example, schedule some requests in your :signal:`spider_idle` |
| | 83 | handler to prevent the spider from being closed. |
| 94 | 84 | |
| 95 | 85 | :param spider: the spider which has gone idle |
-
|
r1613
|
r1822
|
|
| 177 | 177 | Close the given domain. After this is called, no more specific stats |
| 178 | 178 | for this domain can be accessed. This method is called automatically on |
| 179 | | the :signal:`domain_closed` signal. |
| | 179 | the :signal:`spider_closed` signal. |
| 180 | 180 | |
| 181 | 181 | Available Stats Collectors |
| … |
… |
|
| 303 | 303 | |
| 304 | 304 | :param reason: the reason why the domain is being closed. See |
| 305 | | :signal:`domain_closed` signal for more info. |
| | 305 | :signal:`spider_closed` signal for more info. |
| 306 | 306 | :type reason: str |
| 307 | 307 | |
| … |
… |
|
| 317 | 317 | |
| 318 | 318 | :param reason: the reason why the domain was closed. See |
| 319 | | :signal:`domain_closed` signal for more info. |
| | 319 | :signal:`spider_closed` signal for more info. |
| 320 | 320 | :type reason: str |
| 321 | 321 | |
-
|
r1713
|
r1822
|
|
| 24 | 24 | |
| 25 | 25 | if self.timeout: |
| 26 | | dispatcher.connect(self.domain_opened, signal=signals.domain_opened) |
| | 26 | dispatcher.connect(self.spider_opened, signal=signals.spider_opened) |
| 27 | 27 | if self.itempassed: |
| 28 | 28 | dispatcher.connect(self.item_passed, signal=signals.item_passed) |
| 29 | | dispatcher.connect(self.domain_closed, signal=signals.domain_closed) |
| | 29 | dispatcher.connect(self.spider_closed, signal=signals.spider_closed) |
| 30 | 30 | |
| 31 | | def domain_opened(self, spider): |
| | 31 | def spider_opened(self, spider): |
| 32 | 32 | self.tasks[spider] = reactor.callLater(self.timeout, scrapyengine.close_spider, \ |
| 33 | 33 | spider=spider, reason='closedomain_timeout') |
| … |
… |
|
| 38 | 38 | scrapyengine.close_spider(spider, 'closedomain_itempassed') |
| 39 | 39 | |
| 40 | | def domain_closed(self, spider): |
| | 40 | def spider_closed(self, spider): |
| 41 | 41 | self.counts.pop(spider, None) |
| 42 | 42 | tsk = self.tasks.pop(spider, None) |
-
|
r1344
|
r1822
|
|
| 22 | 22 | |
| 23 | 23 | self.opened_at = defaultdict(time) |
| 24 | | dispatcher.connect(self.domain_idle, signal=signals.domain_idle) |
| 25 | | dispatcher.connect(self.domain_closed, signal=signals.domain_closed) |
| | 24 | dispatcher.connect(self.spider_idle, signal=signals.spider_idle) |
| | 25 | dispatcher.connect(self.spider_closed, signal=signals.spider_closed) |
| 26 | 26 | |
| 27 | | def domain_idle(self, domain): |
| | 27 | def spider_idle(self, spider): |
| 28 | 28 | try: |
| 29 | | lastseen = scrapyengine.downloader.sites[domain].lastseen |
| | 29 | lastseen = scrapyengine.downloader.sites[spider].lastseen |
| 30 | 30 | except KeyError: |
| 31 | 31 | lastseen = None |
| 32 | 32 | if not lastseen: |
| 33 | | lastseen = self.opened_at[domain] |
| | 33 | lastseen = self.opened_at[spider] |
| 34 | 34 | |
| 35 | 35 | if time() < lastseen + self.delay: |
| 36 | 36 | raise DontCloseDomain |
| 37 | 37 | |
| 38 | | def domain_closed(self, domain): |
| 39 | | self.opened_at.pop(domain, None) |
| | 38 | def spider_closed(self, spider): |
| | 39 | self.opened_at.pop(spider, None) |
-
|
r1257
|
r1822
|
|
| 17 | 17 | def __init__(self): |
| 18 | 18 | self.jars = defaultdict(CookieJar) |
| 19 | | dispatcher.connect(self.domain_closed, signals.domain_closed) |
| | 19 | dispatcher.connect(self.spider_closed, signals.spider_closed) |
| 20 | 20 | |
| 21 | 21 | def process_request(self, request, spider): |
| … |
… |
|
| 23 | 23 | return |
| 24 | 24 | |
| 25 | | jar = self.jars[spider.domain_name] |
| | 25 | jar = self.jars[spider] |
| 26 | 26 | cookies = self._get_request_cookies(jar, request) |
| 27 | 27 | for cookie in cookies: |
| … |
… |
|
| 38 | 38 | |
| 39 | 39 | # extract cookies from Set-Cookie and drop invalid/expired cookies |
| 40 | | jar = self.jars[spider.domain_name] |
| | 40 | jar = self.jars[spider] |
| 41 | 41 | jar.extract_cookies(response, request) |
| 42 | 42 | self._debug_set_cookie(response) |
| … |
… |
|
| 44 | 44 | return response |
| 45 | 45 | |
| 46 | | def domain_closed(self, domain): |
| 47 | | self.jars.pop(domain, None) |
| | 46 | def spider_closed(self, spider): |
| | 47 | self.jars.pop(spider, None) |
| 48 | 48 | |
| 49 | 49 | def _debug_cookie(self, request): |
-
|
r1518
|
r1822
|
|
| 25 | 25 | self.cache = Cache(settings['HTTPCACHE_DIR'], sectorize=settings.getbool('HTTPCACHE_SECTORIZE')) |
| 26 | 26 | self.ignore_missing = settings.getbool('HTTPCACHE_IGNORE_MISSING') |
| 27 | | dispatcher.connect(self.open_domain, signal=signals.domain_opened) |
| | 27 | dispatcher.connect(self.open_domain, signal=signals.spider_opened) |
| 28 | 28 | |
| 29 | | def open_domain(self, domain): |
| 30 | | self.cache.open_domain(domain) |
| | 29 | def open_domain(self, spider): |
| | 30 | self.cache.open_domain(spider.domain_name) |
| 31 | 31 | |
| 32 | 32 | def process_request(self, request, spider): |
-
|
r1670
|
r1822
|
|
| 27 | 27 | self._useragents = {} |
| 28 | 28 | self._pending = {} |
| 29 | | dispatcher.connect(self.domain_opened, signals.domain_opened) |
| 30 | | dispatcher.connect(self.domain_closed, signals.domain_closed) |
| | 29 | dispatcher.connect(self.spider_opened, signals.spider_opened) |
| | 30 | dispatcher.connect(self.spider_closed, signals.spider_closed) |
| 31 | 31 | |
| 32 | 32 | def process_request(self, request, spider): |
| … |
… |
|
| 53 | 53 | self._parsers[urlparse_cached(response).netloc] = rp |
| 54 | 54 | |
| 55 | | def domain_opened(self, spider): |
| | 55 | def spider_opened(self, spider): |
| 56 | 56 | self._spider_netlocs[spider] = set() |
| 57 | 57 | self._useragents[spider] = getattr(spider, 'user_agent', None) \ |
| 58 | 58 | or settings['USER_AGENT'] |
| 59 | 59 | |
| 60 | | def domain_closed(self, domain, spider): |
| | 60 | def spider_closed(self, spider): |
| 61 | 61 | for netloc in self._spider_netlocs[domain]: |
| 62 | 62 | del self._parsers[netloc] |
-
|
r1713
|
r1822
|
|
| 50 | 50 | self.domains_count = 0 |
| 51 | 51 | self.empty_domains = set() |
| 52 | | dispatcher.connect(self.domain_closed, signal=signals.domain_closed) |
| | 52 | dispatcher.connect(self.spider_closed, signal=signals.spider_closed) |
| 53 | 53 | dispatcher.connect(self.engine_stopped, signal=signals.engine_stopped) |
| 54 | 54 | |
| … |
… |
|
| 71 | 71 | log.msg("No products sampled for: %s" % " ".join(self.empty_domains), level=log.WARNING) |
| 72 | 72 | |
| 73 | | def domain_closed(self, domain, spider, reason): |
| | 73 | def spider_closed(self, spider, reason): |
| | 74 | domain = spider.domain_name |
| 74 | 75 | if reason == 'finished' and not stats.get_value("items_sampled", domain=domain): |
| 75 | 76 | self.empty_domains.add(domain) |
-
|
r1787
|
r1822
|
|
| 45 | 45 | self._mkdir(self.basedir) |
| 46 | 46 | self.created_directories = defaultdict(set) |
| 47 | | dispatcher.connect(self.domain_closed, signals.domain_closed) |
| 48 | | |
| 49 | | def domain_closed(self, domain): |
| 50 | | self.created_directories.pop(domain, None) |
| | 47 | dispatcher.connect(self.spider_closed, signals.spider_closed) |
| | 48 | |
| | 49 | def spider_closed(self, spider): |
| | 50 | self.created_directories.pop(spider.domain_name, None) |
| 51 | 51 | |
| 52 | 52 | def persist_image(self, key, image, buf, info): |
-
|
r1758
|
r1822
|
|
| 23 | 23 | def __init__(self): |
| 24 | 24 | self.domaininfo = {} |
| 25 | | dispatcher.connect(self.domain_opened, signals.domain_opened) |
| 26 | | dispatcher.connect(self.domain_closed, signals.domain_closed) |
| | 25 | dispatcher.connect(self.spider_opened, signals.spider_opened) |
| | 26 | dispatcher.connect(self.spider_closed, signals.spider_closed) |
| 27 | 27 | |
| 28 | | def domain_opened(self, spider): |
| | 28 | def spider_opened(self, spider): |
| 29 | 29 | self.domaininfo[spider.domain_name] = self.DomainInfo(spider) |
| 30 | 30 | |
| 31 | | def domain_closed(self, domain): |
| 32 | | del self.domaininfo[domain] |
| | 31 | def spider_closed(self, spider): |
| | 32 | del self.domaininfo[spider.domain_name] |
| 33 | 33 | |
| 34 | 34 | def process_item(self, domain, item): |
-
|
r1529
|
r1822
|
|
| 17 | 17 | reactor.installResolver(self.resolver) |
| 18 | 18 | dispatcher.connect(self.request_received, signals.request_received) |
| 19 | | dispatcher.connect(self.domain_closed, signal=signals.domain_closed) |
| | 19 | dispatcher.connect(self.spider_closed, signal=signals.spider_closed) |
| 20 | 20 | |
| 21 | 21 | def request_received(self, request, spider): |
| 22 | 22 | url_hostname = urlparse_cached(request).hostname |
| 23 | | self.spider_hostnames[spider.domain_name].add(url_hostname) |
| | 23 | self.spider_hostnames[spider].add(url_hostname) |
| 24 | 24 | |
| 25 | | def domain_closed(self, spider): |
| 26 | | for hostname in self.spider_hostnames: |
| | 25 | def spider_closed(self, spider): |
| | 26 | for hostname in self.spider_hostnames[spider]: |
| 27 | 27 | self.resolver._cache.pop(hostname, None) |
| 28 | 28 | |
-
|
r1516
|
r1822
|
|
| 16 | 16 | def __init__(self): |
| 17 | 17 | self.host_regexes = {} |
| 18 | | dispatcher.connect(self.domain_opened, signal=signals.domain_opened) |
| 19 | | dispatcher.connect(self.domain_closed, signal=signals.domain_closed) |
| | 18 | dispatcher.connect(self.spider_opened, signal=signals.spider_opened) |
| | 19 | dispatcher.connect(self.spider_closed, signal=signals.spider_closed) |
| 20 | 20 | |
| 21 | 21 | def process_spider_output(self, response, result, spider): |
| … |
… |
|
| 35 | 35 | return re.compile(regex) |
| 36 | 36 | |
| 37 | | def domain_opened(self, spider): |
| | 37 | def spider_opened(self, spider): |
| 38 | 38 | domains = [spider.domain_name] + spider.extra_domain_names |
| 39 | 39 | self.host_regexes[spider] = self.get_host_regex(domains) |
| 40 | 40 | |
| 41 | | def domain_closed(self, spider): |
| | 41 | def spider_closed(self, spider): |
| 42 | 42 | del self.host_regexes[spider] |
-
|
r1516
|
r1822
|
|
| 24 | 24 | self.dropped_count = {} |
| 25 | 25 | |
| 26 | | dispatcher.connect(self.domain_opened, signal=signals.domain_opened) |
| 27 | | dispatcher.connect(self.domain_closed, signal=signals.domain_closed) |
| | 26 | dispatcher.connect(self.spider_opened, signal=signals.spider_opened) |
| | 27 | dispatcher.connect(self.spider_closed, signal=signals.spider_closed) |
| 28 | 28 | |
| 29 | | def domain_opened(self, domain, spider): |
| 30 | | self.max_pending[domain] = getattr(spider, 'requests_queue_size', self.max_queue_size) |
| 31 | | self.dropped_count[domain] = 0 |
| | 29 | def spider_opened(self, spider): |
| | 30 | self.max_pending[spider] = getattr(spider, 'requests_queue_size', self.max_queue_size) |
| | 31 | self.dropped_count[spider] = 0 |
| 32 | 32 | |
| 33 | | def domain_closed(self, domain): |
| 34 | | dropped_count = self.dropped_count[domain] |
| | 33 | def spider_closed(self, spider): |
| | 34 | dropped_count = self.dropped_count[spider] |
| 35 | 35 | if dropped_count: |
| 36 | | max_pending = self.max_pending[domain] |
| | 36 | max_pending = self.max_pending[spider] |
| 37 | 37 | log.msg('Dropped %d request(s) because the scheduler queue size limit (%d requests) was exceeded' % \ |
| 38 | | (dropped_count, max_pending), level=log.DEBUG, domain=domain) |
| 39 | | del self.dropped_count[domain] |
| 40 | | del self.max_pending[domain] |
| | 38 | (dropped_count, max_pending), level=log.DEBUG, spider=spider) |
| | 39 | del self.dropped_count[spider] |
| | 40 | del self.max_pending[spider] |
| 41 | 41 | |
| 42 | 42 | def process_spider_output(self, response, result, spider): |
| 43 | | domain = spider.domain_name |
| 44 | | max_pending = self.max_pending.get(domain, 0) |
| | 43 | max_pending = self.max_pending.get(spider, 0) |
| 45 | 44 | if max_pending: |
| 46 | | return imap(lambda v: self._limit_requests(v, domain, max_pending), result) |
| | 45 | return imap(lambda v: self._limit_requests(v, spider, max_pending), result) |
| 47 | 46 | else: |
| 48 | 47 | return result |
| 49 | 48 | |
| 50 | | def _limit_requests(self, request_or_other, domain, max_pending): |
| | 49 | def _limit_requests(self, request_or_other, spider, max_pending): |
| 51 | 50 | if isinstance(request_or_other, Request): |
| 52 | | free_slots = max_pending - self._pending_count(domain) |
| | 51 | free_slots = max_pending - self._pending_count(spider) |
| 53 | 52 | if free_slots > 0: |
| 54 | 53 | # Scheduler isn't saturated and it is fine to schedule more requests. |
| … |
… |
|
| 56 | 55 | else: |
| 57 | 56 | # Skip the request and give engine time to handle other tasks. |
| 58 | | self.dropped_count[domain] += 1 |
| | 57 | self.dropped_count[spider] += 1 |
| 59 | 58 | return None |
| 60 | 59 | else: |
| … |
… |
|
| 62 | 61 | return request_or_other |
| 63 | 62 | |
| 64 | | def _pending_count(self, domain): |
| 65 | | pending = scrapyengine.scheduler.pending_requests.get(domain, []) |
| | 63 | def _pending_count(self, spider): |
| | 64 | pending = scrapyengine.scheduler.pending_requests.get(spider, []) |
| 66 | 65 | return len(pending) |
-
|
r1518
|
r1822
|
|
| 21 | 21 | def __init__(self): |
| 22 | 22 | self.domains = {} |
| 23 | | dispatcher.connect(self.domain_opened, signal=signals.domain_opened) |
| 24 | | dispatcher.connect(self.domain_closed, signal=signals.domain_closed) |
| | 23 | dispatcher.connect(self.spider_opened, signal=signals.spider_opened) |
| | 24 | dispatcher.connect(self.spider_closed, signal=signals.spider_closed) |
| 25 | 25 | dispatcher.connect(self.item_scraped, signal=signals.item_scraped) |
| 26 | 26 | dispatcher.connect(self.response_downloaded, signal=signals.response_downloaded) |
| … |
… |
|
| 28 | 28 | dispatcher.connect(self.webconsole_discover_module, signal=webconsole_discover_module) |
| 29 | 29 | |
| 30 | | def domain_opened(self, domain, spider): |
| | 30 | def spider_opened(self, spider): |
| 31 | 31 | pstats = SpiderStats() |
| 32 | | self.domains[spider.domain_name] = pstats |
| | 32 | self.domains[spider] = pstats |
| 33 | 33 | pstats.started = datetime.now().replace(microsecond=0) |
| 34 | 34 | pstats.finished = None |
| 35 | 35 | |
| 36 | | def domain_closed(self, domain, spider): |
| 37 | | self.domains[spider.domain_name].finished = datetime.now().replace(microsecond=0) |
| | 36 | def spider_closed(self, spider): |
| | 37 | self.domains[spider].finished = datetime.now().replace(microsecond=0) |
| 38 | 38 | |
| 39 | 39 | def item_scraped(self, item, spider): |
| 40 | | self.domains[spider.domain_name].scraped += 1 |
| | 40 | self.domains[spider].scraped += 1 |
| 41 | 41 | |
| 42 | 42 | def response_downloaded(self, response, spider): |
| 43 | 43 | # sometimes we download responses without opening/closing domains, |
| 44 | 44 | # for example from scrapy shell |
| 45 | | if self.domains.get(spider.domain_name): |
| 46 | | self.domains[spider.domain_name].crawled += 1 |
| | 45 | if self.domains.get(spider): |
| | 46 | self.domains[spider].crawled += 1 |
| 47 | 47 | |
| 48 | 48 | def webconsole_render(self, wc_request): |
| … |
… |
|
| 50 | 50 | dwl = scrapyengine.downloader |
| 51 | 51 | |
| 52 | | totdomains = totscraped = totcrawled = totscheduled = totactive = totpending = totdqueued = tottransf = 0 |
| | 52 | totdomains = totscraped = totcrawled = totscheduled = totactive = totdqueued = tottransf = 0 |
| 53 | 53 | s = banner(self) |
| 54 | 54 | s += "<table border='1'>\n" |
| 55 | 55 | s += "<tr><th>Domain</th><th>Items<br>Scraped</th><th>Pages<br>Crawled</th><th>Scheduler<br>Pending</th><th>Downloader<br/>Queued</th><th>Downloader<br/>Active</th><th>Downloader<br/>Transferring</th><th>Start time</th><th>Finish time</th><th>Run time</th></tr>\n" |
| 56 | | for d in sorted(self.domains.keys()): |
| 57 | | scheduled = len(sch.pending_requests[d]) if d in sch.pending_requests else 0 |
| 58 | | active = len(dwl.sites[d].active) if d in dwl.sites else 0 |
| 59 | | dqueued = len(dwl.sites[d].queue) if d in dwl.sites else 0 |
| 60 | | transf = len(dwl.sites[d].transferring) if d in dwl.sites else 0 |
| 61 | | stats = self.domains[d] |
| | 56 | for spider in sorted(self.domains.keys()): |
| | 57 | scheduled = len(sch.pending_requests[spider]) if spider in sch.pending_requests else 0 |
| | 58 | active = len(dwl.sites[spider].active) if spider in dwl.sites else 0 |
| | 59 | dqueued = len(dwl.sites[spider].queue) if spider in dwl.sites else 0 |
| | 60 | transf = len(dwl.sites[spider].transferring) if spider in dwl.sites else 0 |
| | 61 | stats = self.domains[spider] |
| 62 | 62 | runtime = stats.finished - stats.started if stats.finished else datetime.now() - stats.started |
| 63 | 63 | |
| 64 | 64 | s += '<tr><td>%s</td><td align="right">%d</td><td align="right">%d</td><td align="right">%d</td><td align="right">%d</td><td align="right">%d</td><td align="right">%d</td><td>%s</td><td>%s</td><td>%s</td></tr>\n' % \ |
| 65 | | (d, stats.scraped, stats.crawled, scheduled, dqueued, active, transf, str(stats.started), str(stats.finished), str(runtime)) |
| | 65 | (spider.domain_name, stats.scraped, stats.crawled, scheduled, dqueued, active, transf, str(stats.started), str(stats.finished), str(runtime)) |
| 66 | 66 | |
| 67 | 67 | totdomains += 1 |
-
|
r1806
|
r1822
|
|
| 19 | 19 | self.running = {} |
| 20 | 20 | self.finished = set() |
| 21 | | dispatcher.connect(self.domain_opened, signal=signals.domain_opened) |
| 22 | | dispatcher.connect(self.domain_closed, signal=signals.domain_closed) |
| | 21 | dispatcher.connect(self.spider_opened, signal=signals.spider_opened) |
| | 22 | dispatcher.connect(self.spider_closed, signal=signals.spider_closed) |
| 23 | 23 | |
| 24 | 24 | from scrapy.management.web import webconsole_discover_module |
| 25 | 25 | dispatcher.connect(self.webconsole_discover_module, signal=webconsole_discover_module) |
| 26 | 26 | |
| 27 | | def domain_opened(self, spider): |
| | 27 | def spider_opened(self, spider): |
| 28 | 28 | self.running[spider.domain_name] = spider |
| 29 | 29 | |
| 30 | | def domain_closed(self, spider): |
| | 30 | def spider_closed(self, spider): |
| 31 | 31 | del self.running[spider.domain_name] |
| 32 | 32 | self.finished.add(spider.domain_name) |
-
|
r1518
|
r1822
|
|
| 25 | 25 | self.stores = {} |
| 26 | 26 | |
| 27 | | dispatcher.connect(self.domain_opened, signal=signals.domain_opened) |
| 28 | | dispatcher.connect(self.domain_closed, signal=signals.domain_closed) |
| | 27 | dispatcher.connect(self.spider_opened, signal=signals.spider_opened) |
| | 28 | dispatcher.connect(self.spider_closed, signal=signals.spider_closed) |
| 29 | 29 | |
| 30 | 30 | def process_item(self, domain, item): |
| … |
… |
|
| 44 | 44 | return item |
| 45 | 45 | |
| 46 | | def domain_opened(self, domain): |
| | 46 | def spider_opened(self, spider): |
| | 47 | domain = spider.domain_name |
| 47 | 48 | uri = Template(self.uritpl).substitute(domain=domain) |
| 48 | 49 | self.stores[domain] = Shove(uri, **self.opts) |
| 49 | 50 | |
| 50 | | def domain_closed(self, domain): |
| 51 | | self.stores[domain].sync() |
| | 51 | def spider_closed(self, spider): |
| | 52 | self.stores[spider.domain_name].sync() |
| 52 | 53 | |
| 53 | 54 | def log(self, domain, item, status): |
-
|
r1782
|
r1822
|
|
| 100 | 100 | |
| 101 | 101 | def next_request(self, spider, now=False): |
| 102 | | """Scrape the next request for the domain passed. |
| | 102 | """Scrape the next request for the spider passed. |
| 103 | 103 | |
| 104 | 104 | The next request to be scraped is retrieved from the scheduler and |
| … |
… |
|
| 165 | 165 | if not request.deferred.callbacks: |
| 166 | 166 | log.msg("Unable to crawl Request with no callback: %s" % request, |
| 167 | | level=log.ERROR, domain=spider.domain_name) |
| | 167 | level=log.ERROR, spider=spider) |
| 168 | 168 | return |
| 169 | 169 | schd = mustbe_deferred(self.schedule, request, spider) |
| … |
… |
|
| 187 | 187 | |
| 188 | 188 | def _mainloop(self): |
| 189 | | """Add more domains to be scraped if the downloader has the capacity. |
| | 189 | """Add more spiders to be scraped if the downloader has the capacity. |
| 190 | 190 | |
| 191 | 191 | If there is nothing else scheduled then stop the execution engine. |
| … |
… |
|
| 199 | 199 | |
| 200 | 200 | def download(self, request, spider): |
| 201 | | domain = spider.domain_name |
| 202 | | |
| 203 | 201 | def _on_success(response): |
| 204 | 202 | """handle the result of a page download""" |
| … |
… |
|
| 207 | 205 | response.request = request # tie request to response received |
| 208 | 206 | log.msg(self._crawled_logline(request, response), \ |
| 209 | | level=log.DEBUG, domain=spider.domain_name) |
| | 207 | level=log.DEBUG, spider=spider) |
| 210 | 208 | return response |
| 211 | 209 | elif isinstance(response, Request): |
| … |
… |
|
| 226 | 224 | if errmsg: |
| 227 | 225 | log.msg("Crawling <%s>: %s" % (request.url, errmsg), \ |
| 228 | | level=level, domain=domain) |
| | 226 | level=level, spider=spider) |
| 229 | 227 | return Failure(IgnoreRequest(str(exc))) |
| 230 | 228 | |
| … |
… |
|
| 242 | 240 | |
| 243 | 241 | def open_spider(self, spider): |
| 244 | | domain = spider.domain_name |
| 245 | | log.msg("Domain opened", domain=domain) |
| | 242 | log.msg("Spider opened", spider=spider) |
| 246 | 243 | self.next_request(spider) |
| 247 | 244 | |
| 248 | 245 | self.downloader.open_spider(spider) |
| 249 | 246 | self.scraper.open_spider(spider) |
| 250 | | stats.open_domain(domain) |
| 251 | | |
| 252 | | # XXX: sent for backwards compatibility (will be removed in Scrapy 0.8) |
| 253 | | send_catch_log(signals.domain_open, sender=self.__class__, \ |
| 254 | | domain=domain, spider=spider) |
| 255 | | |
| 256 | | send_catch_log(signals.domain_opened, sender=self.__class__, \ |
| 257 | | domain=domain, spider=spider) |
| | 247 | stats.open_domain(spider.domain_name) |
| | 248 | |
| | 249 | send_catch_log(signals.spider_opened, sender=self.__class__, spider=spider) |
| 258 | 250 | |
| 259 | 251 | def _spider_idle(self, spider): |
| 260 | | """Called when a domain gets idle. This function is called when there |
| | 252 | """Called when a spider gets idle. This function is called when there |
| 261 | 253 | are no remaining pages to download or schedule. It can be called |
| 262 | 254 | multiple times. If some extension raises a DontCloseDomain exception |
| 263 | | (in the domain_idle signal handler) the domain is not closed until the |
| | 255 | (in the spider_idle signal handler) the spider is not closed until the |
| 264 | 256 | next loop and this function is guaranteed to be called (at least) once |
| 265 | | again for this domain. |
| 266 | | """ |
| 267 | | domain = spider.domain_name |
| | 257 | again for this spider. |
| | 258 | """ |
| 268 | 259 | try: |
| 269 | | dispatcher.send(signal=signals.domain_idle, sender=self.__class__, \ |
| 270 | | domain=domain, spider=spider) |
| | 260 | dispatcher.send(signal=signals.spider_idle, sender=self.__class__, \ |
| | 261 | spider=spider) |
| 271 | 262 | except DontCloseDomain: |
| 272 | 263 | self.next_request(spider) |
| 273 | 264 | return |
| 274 | 265 | except: |
| 275 | | log.err("Exception catched on domain_idle signal dispatch") |
| | 266 | log.err("Exception catched on spider_idle signal dispatch") |
| 276 | 267 | if self.spider_is_idle(spider): |
| 277 | 268 | self.close_spider(spider, reason='finished') |
| … |
… |
|
| 284 | 275 | def close_spider(self, spider, reason='cancelled'): |
| 285 | 276 | """Close (cancel) spider and clear all its outstanding requests""" |
| 286 | | domain = spider.domain_name |
| 287 | 277 | if spider not in self.closing: |
| 288 | | log.msg("Closing domain (%s)" % reason, domain=domain) |
| | 278 | log.msg("Closing spider (%s)" % reason, spider=spider) |
| 289 | 279 | self.closing[spider] = reason |
| 290 | 280 | self.downloader.close_spider(spider) |
| … |
… |
|
| 299 | 289 | |
| 300 | 290 | def _finish_closing_spider_if_idle(self, spider): |
| 301 | | """Call _finish_closing_spider if domain is idle""" |
| | 291 | """Call _finish_closing_spider if spider is idle""" |
| 302 | 292 | if self.spider_is_idle(spider) or self.killed: |
| 303 | 293 | return self._finish_closing_spider(spider) |
| … |
… |
|
| 311 | 301 | def _finish_closing_spider(self, spider): |
| 312 | 302 | """This function is called after the spider has been closed""" |
| 313 | | domain = spider.domain_name |
| 314 | 303 | self.scheduler.close_spider(spider) |
| 315 | 304 | self.scraper.close_spider(spider) |
| 316 | 305 | reason = self.closing.pop(spider, 'finished') |
| 317 | | send_catch_log(signal=signals.domain_closed, sender=self.__class__, \ |
| 318 | | domain=domain, spider=spider, reason=reason) |
| 319 | | stats.close_domain(domain, reason=reason) |
| | 306 | send_catch_log(signal=signals.spider_closed, sender=self.__class__, \ |
| | 307 | spider=spider, reason=reason) |
| | 308 | stats.close_domain(spider.domain_name, reason=reason) |
| 320 | 309 | dfd = defer.maybeDeferred(spiders.close_spider, spider) |
| 321 | | dfd.addBoth(log.msg, "Domain closed (%s)" % reason, domain=domain) |
| | 310 | dfd.addBoth(log.msg, "Spider closed (%s)" % reason, spider=spider) |
| 322 | 311 | reactor.callLater(0, self._mainloop) |
| 323 | 312 | return dfd |
-
|
r1527
|
r1822
|
|
| 8 | 8 | engine_started = object() |
| 9 | 9 | engine_stopped = object() |
| 10 | | domain_opened = object() |
| 11 | | domain_idle = object() |
| 12 | | domain_closed = object() |
| | 10 | spider_opened = object() |
| | 11 | spider_idle = object() |
| | 12 | spider_closed = object() |
| 13 | 13 | request_received = object() |
| 14 | 14 | request_uploaded = object() |
| … |
… |
|
| 18 | 18 | item_passed = object() |
| 19 | 19 | item_dropped = object() |
| 20 | | |
| 21 | | # XXX: deprecated signals (will be removed in Scrapy 0.8) |
| 22 | | domain_open = object() |
| 23 | | |
-
|
r1818
|
r1822
|
|
| 64 | 64 | log.startLogging(file, setStdout=logstdout) |
| 65 | 65 | |
| 66 | | def msg(message, level=INFO, component=BOT_NAME, domain=None): |
| | 66 | def msg(message, level=INFO, component=BOT_NAME, domain=None, spider=None): |
| 67 | 67 | """Log message according to the level""" |
| 68 | 68 | if level > log_level: |
| 69 | 69 | return |
| 70 | 70 | dispatcher.send(signal=logmessage_received, message=message, level=level, \ |
| 71 | | domain=domain) |
| 72 | | system = domain if domain else component |
| | 71 | domain=domain, spider=spider) |
| | 72 | system = domain or spider.domain_name if spider else component |
| 73 | 73 | msg_txt = unicode_to_str("%s: %s" % (level_names[level], message)) |
| 74 | 74 | log.msg(msg_txt, system=system) |
| 75 | 75 | |
| 76 | | def exc(message, level=ERROR, component=BOT_NAME, domain=None): |
| | 76 | def exc(message, level=ERROR, component=BOT_NAME, domain=None, spider=None): |
| 77 | 77 | message = message + '\n' + format_exc() |
| 78 | | msg(message, level, component, domain) |
| | 78 | msg(message, level, component, domain, spider) |
| 79 | 79 | |
| 80 | 80 | def err(_stuff=None, _why=None, **kwargs): |
| … |
… |
|
| 82 | 82 | return |
| 83 | 83 | domain = kwargs.pop('domain', None) |
| | 84 | spider = kwargs.pop('spider', None) |
| 84 | 85 | component = kwargs.pop('component', BOT_NAME) |
| 85 | | kwargs['system'] = domain if domain else component |
| | 86 | kwargs['system'] = domain or spider.domain_name if spider else component |
| 86 | 87 | if _why: |
| 87 | 88 | _why = unicode_to_str("ERROR: %s" % _why) |
-
|
r1569
|
r1822
|
|
| 15 | 15 | |
| 16 | 16 | def tearDown(self): |
| 17 | | self.mw.domain_closed('scrapytest.org') |
| | 17 | self.mw.spider_closed('scrapytest.org') |
| 18 | 18 | del self.mw |
| 19 | 19 | |
-
|
r1713
|
r1822
|
|
| 90 | 90 | dispatcher.connect(self.record_signal, signals.engine_started) |
| 91 | 91 | dispatcher.connect(self.record_signal, signals.engine_stopped) |
| 92 | | dispatcher.connect(self.record_signal, signals.domain_opened) |
| 93 | | dispatcher.connect(self.record_signal, signals.domain_idle) |
| 94 | | dispatcher.connect(self.record_signal, signals.domain_closed) |
| | 92 | dispatcher.connect(self.record_signal, signals.spider_opened) |
| | 93 | dispatcher.connect(self.record_signal, signals.spider_idle) |
| | 94 | dispatcher.connect(self.record_signal, signals.spider_closed) |
| 95 | 95 | dispatcher.connect(self.item_scraped, signals.item_scraped) |
| 96 | 96 | dispatcher.connect(self.request_received, signals.request_received) |
| … |
… |
|
| 202 | 202 | assert signals.engine_started in session.signals_catched |
| 203 | 203 | assert signals.engine_stopped in session.signals_catched |
| 204 | | assert signals.domain_opened in session.signals_catched |
| 205 | | assert signals.domain_idle in session.signals_catched |
| 206 | | assert signals.domain_closed in session.signals_catched |
| 207 | | |
| 208 | | self.assertEqual({'domain': session.domain, 'spider': session.spider}, |
| 209 | | session.signals_catched[signals.domain_opened]) |
| 210 | | self.assertEqual({'domain': session.domain, 'spider': session.spider}, |
| 211 | | session.signals_catched[signals.domain_idle]) |
| 212 | | self.assertEqual({'domain': session.domain, 'spider': session.spider, 'reason': 'finished'}, |
| 213 | | session.signals_catched[signals.domain_closed]) |
| | 204 | assert signals.spider_opened in session.signals_catched |
| | 205 | assert signals.spider_idle in session.signals_catched |
| | 206 | assert signals.spider_closed in session.signals_catched |
| | 207 | |
| | 208 | self.assertEqual({'spider': session.spider}, |
| | 209 | session.signals_catched[signals.spider_opened]) |
| | 210 | self.assertEqual({'spider': session.spider}, |
| | 211 | session.signals_catched[signals.spider_idle]) |
| | 212 | self.assertEqual({'spider': session.spider, 'reason': 'finished'}, |
| | 213 | session.signals_catched[signals.spider_closed]) |
| 214 | 214 | |
| 215 | 215 | if __name__ == "__main__": |
-
|
r1686
|
r1822
|
|
| 14 | 14 | |
| 15 | 15 | self.mw = OffsiteMiddleware() |
| 16 | | self.mw.domain_opened(self.spider) |
| | 16 | self.mw.spider_opened(self.spider) |
| 17 | 17 | |
| 18 | 18 | def test_process_spider_output(self): |
| … |
… |
|
| 29 | 29 | |
| 30 | 30 | def tearDown(self): |
| 31 | | self.mw.domain_closed(self.spider) |
| | 31 | self.mw.spider_closed(self.spider) |
| 32 | 32 | |