| 1 | """CloseDomain is an extension that forces spiders to be closed after certain |
|---|
| 2 | conditions are met. |
|---|
| 3 | |
|---|
| 4 | See documentation in docs/topics/extensions.rst |
|---|
| 5 | """ |
|---|
| 6 | |
|---|
| 7 | from collections import defaultdict |
|---|
| 8 | |
|---|
| 9 | from twisted.internet import reactor |
|---|
| 10 | from scrapy.xlib.pydispatch import dispatcher |
|---|
| 11 | |
|---|
| 12 | from scrapy.core import signals |
|---|
| 13 | from scrapy.core.engine import scrapyengine |
|---|
| 14 | from scrapy.conf import settings |
|---|
| 15 | |
|---|
class CloseDomain(object):
    """Extension that closes spiders once a configured limit is reached.

    Two independent limits are read from the settings at construction time:

    * ``CLOSEDOMAIN_TIMEOUT``: delay passed to ``reactor.callLater`` when a
      spider is opened; when it elapses the spider is closed with reason
      ``closedomain_timeout``.
    * ``CLOSEDOMAIN_ITEMPASSED``: number of passed items at which the spider
      is closed with reason ``closedomain_itempassed``.

    A limit whose setting is falsy (e.g. 0) is disabled: its signal handler
    is never connected.
    """

    def __init__(self):
        """Read the limits from the settings and connect the signal handlers."""
        self.timeout = settings.getint('CLOSEDOMAIN_TIMEOUT')
        self.itempassed = settings.getint('CLOSEDOMAIN_ITEMPASSED')

        # Number of items passed so far, keyed by spider.
        self.counts = defaultdict(int)
        # Delayed calls returned by reactor.callLater, keyed by spider.
        self.tasks = {}

        if self.timeout:
            dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        if self.itempassed:
            dispatcher.connect(self.item_passed, signal=signals.item_passed)
        # Connected unconditionally: it releases the per-spider state kept
        # for either limit.
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

    def spider_opened(self, spider):
        """Schedule the timeout-triggered close of ``spider``.

        The delayed call is remembered in ``self.tasks`` so that
        ``spider_closed`` can cancel it if the spider closes earlier.
        """
        self.tasks[spider] = reactor.callLater(
            self.timeout, scrapyengine.close_spider,
            spider=spider, reason='closedomain_timeout')

    def item_passed(self, item, spider):
        """Count one passed item and close ``spider`` when the limit is hit.

        ``item`` is accepted because the signal sends it; it is not used.
        """
        self.counts[spider] += 1
        # Equality rather than >=: the close is requested only at the moment
        # the counter reaches the limit, not again for later items.
        if self.counts[spider] == self.itempassed:
            scrapyengine.close_spider(spider, 'closedomain_itempassed')

    def spider_closed(self, spider):
        """Drop the state kept for ``spider`` and cancel its pending timeout.

        Safe to call for a spider that has no counter or scheduled task.
        """
        self.counts.pop(spider, None)
        task = self.tasks.pop(spider, None)
        # active() is false once the call has either fired or been cancelled;
        # cancel() raises in both of those cases, so only cancel live calls.
        if task is not None and task.active():
            task.cancel()