Changeset 1827:61f26fb8b3b0

Show
Ignore:
Timestamp:
11/06/09 13:46:36 (9 months ago)
Author:
Pablo Hoffman <pablo@…>
Branch:
default
Message:

Changed item pipeline API to pass spider references (instead of domain names) to process_item() method

Files:
9 modified

Legend:

Unmodified
Added
Removed
  • docs/intro/overview.rst

    r1751 r1827  
    154154 
    155155    class StoreItemPipeline(object): 
    156         def process_item(self, domain, response, item): 
     156        def process_item(self, spider, item): 
    157157            torrent_id = item['url'].split('/')[-1] 
    158158            f = open("torrent-%s.pickle" % torrent_id, "w") 
  • docs/intro/tutorial.rst

    r1783 r1827  
    157157   [dmoz] INFO: Enabled spider middlewares: ... 
    158158   [dmoz] INFO: Enabled item pipelines: ... 
    159    [dmoz.org] INFO: Domain opened 
     159   [dmoz.org] INFO: Spider opened 
    160160   [dmoz.org] DEBUG: Crawled <http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/> from <None> 
    161161   [dmoz.org] DEBUG: Crawled <http://www.dmoz.org/Computers/Programming/Languages/Python/Books/> from <None> 
    162    [dmoz.org] INFO: Domain closed (finished) 
     162   [dmoz.org] INFO: Spider closed (finished) 
    163163   [-] Main loop terminated. 
    164164 
    165165Pay attention to the lines containing ``[dmoz.org]``, which corresponds to 
    166 our spider (identified by the domain "dmoz.org"). You can see a log line for each 
    167 URL defined in ``start_urls``. Because these URLs are the starting ones, they 
    168 have no referrers, which is shown at the end of the log line, where it says 
    169 ``from <None>``. 
     166our spider (identified by the domain ``"dmoz.org"``). You can see a log line 
     167for each URL defined in ``start_urls``. Because these URLs are the starting 
     168ones, they have no referrers, which is shown at the end of the log line, 
     169where it says ``from <None>``. 
    170170 
    171171But more interesting, as our ``parse`` method instructs, two files have been 
     
    446446 
    447447   class DmozPipeline(object): 
    448        def process_item(self, domain, item): 
     448       def process_item(self, spider, item): 
    449449           return item 
    450450 
     
    462462           self.csvwriter = csv.writer(open('items.csv', 'wb')) 
    463463         
    464        def process_item(self, domain, item): 
     464       def process_item(self, spider, item): 
    465465           self.csvwriter.writerow([item['title'][0], item['link'][0], item['desc'][0]]) 
    466466           return item 
  • docs/topics/exporters.rst

    r1822 r1827  
    5252 
    5353       def spider_opened(self, spider): 
    54            domain = spider.domain_name 
    55            file = open('%s_products.xml' % domain, 'w+b') 
    56            self.files[domain] = file 
     54           file = open('%s_products.xml' % spider.domain_name, 'w+b') 
     55           self.files[spider] = file 
    5756           self.exporter = XmlItemExporter(file) 
    5857           self.exporter.start_exporting() 
    5958 
    6059       def spider_closed(self, spider): 
    61            domain = spider.domain_name 
    6260           self.exporter.finish_exporting() 
    63            file = self.files.pop(domain) 
     61           file = self.files.pop(spider) 
    6462           file.close() 
    6563 
    66        def process_item(self, domain, item): 
     64       def process_item(self, spider, item): 
    6765           self.exporter.export_item(item) 
    6866           return item 
  • docs/topics/item-pipeline.rst

    r1822 r1827  
    2525single Python class that must define the following method: 
    2626 
    27 .. method:: process_item(domain, item) 
    28  
    29 ``domain`` is a string with the domain of the spider which scraped the item 
    30  
    31 ``item`` is a :class:`~scrapy.item.Item` with the item scraped 
     27.. method:: process_item(spider, item) 
     28 
     29:param spider: the spider which scraped the item 
     30:type spider: :class:`~scrapy.spider.BaseSpider` object 
     31 
     32:param item: the item scraped 
     33:type item: :class:`~scrapy.item.Item` object 
    3234 
    3335This method is called for every item pipeline component and must either return 
     
    5052        vat_factor = 1.15 
    5153 
    52         def process_item(self, domain, item): 
     54        def process_item(self, spider, item): 
    5355            if item['price']: 
    5456                if item['price_excludes_vat']: 
     
    6971   ] 
    7072 
    71 Item pipeline example with resources per domain 
     73Item pipeline example with resources per spider 
    7274=============================================== 
    7375 
    7476Sometimes you need to keep resources about the items processed grouped per 
    75 domain, and delete those resource when a domain finish. 
     77spider, and delete those resources when a spider finishes. 
    7678 
    7779An example is a filter that looks for duplicate items, and drops those items 
     
    9193 
    9294        def spider_opened(self, spider): 
    93             self.duplicates[spider.domain_name] = set() 
     95            self.duplicates[spider] = set() 
    9496 
    9597        def spider_closed(self, spider): 
    96             del self.duplicates[spider.domain_name] 
    97  
    98         def process_item(self, domain, item): 
    99             if item.id in self.duplicates[domain]: 
     98            del self.duplicates[spider] 
     99 
     100        def process_item(self, spider, item): 
     101            if item.id in self.duplicates[spider]: 
    100102                raise DropItem("Duplicate item found: %s" % item) 
    101103            else: 
    102                 self.duplicates[domain].add(item.id) 
     104                self.duplicates[spider].add(item.id) 
    103105                return item 
    104106 
  • scrapy/contrib/itemsampler.py

    r1822 r1827  
    3737from scrapy.conf import settings 
    3838 
    39 items_per_domain = settings.getint('ITEMSAMPLER_COUNT', 1) 
    40 close_domain = settings.getbool('ITEMSAMPLER_CLOSE_DOMAIN', False) 
     39items_per_spider = settings.getint('ITEMSAMPLER_COUNT', 1) 
     40close_spider = settings.getbool('ITEMSAMPLER_CLOSE_SPIDER', False) 
    4141max_response_size = settings.getbool('ITEMSAMPLER_MAX_RESPONSE_SIZE', ) 
    4242 
     
    4848            raise NotConfigured 
    4949        self.items = {} 
    50         self.domains_count = 0 
     50        self.spiders_count = 0 
    5151        self.empty_domains = set() 
    5252        dispatcher.connect(self.spider_closed, signal=signals.spider_closed) 
    5353        dispatcher.connect(self.engine_stopped, signal=signals.engine_stopped) 
    5454 
    55     def process_item(self, item, spider): 
    56         domain = spider.domain_name 
    57         sampled = stats.get_value("items_sampled", 0, domain=domain) 
    58         if sampled < items_per_domain: 
     55    def process_item(self, spider, item): 
     56        sampled = stats.get_value("items_sampled", 0, domain=spider.domain_name) 
     57        if sampled < items_per_spider: 
    5958            self.items[item.guid] = item 
    6059            sampled += 1 
    61             stats.set_value("items_sampled", sampled, domain=domain) 
    62             log.msg("Sampled %s" % item, domain=domain, level=log.INFO) 
    63             if close_domain and sampled == items_per_domain: 
     60            stats.set_value("items_sampled", sampled, domain=spider.domain_name) 
     61            log.msg("Sampled %s" % item, spider=spider, level=log.INFO) 
     62            if close_spider and sampled == items_per_spider: 
    6463                scrapyengine.close_spider(spider) 
    6564        return item 
     
    6968            pickle.dump(self.items, f) 
    7069        if self.empty_domains: 
    71             log.msg("No products sampled for: %s" % " ".join(self.empty_domains), level=log.WARNING) 
     70            log.msg("No products sampled for: %s" % " ".join(self.empty_domains), \ 
     71                level=log.WARNING) 
    7272 
    7373    def spider_closed(self, spider, reason): 
    74         domain = spider.domain_name 
    75         if reason == 'finished' and not stats.get_value("items_sampled", domain=domain): 
    76             self.empty_domains.add(domain) 
    77         self.domains_count += 1 
    78         log.msg("Sampled %d domains so far (%d empty)" % (self.domains_count, len(self.empty_domains)), level=log.INFO) 
     74        if reason == 'finished' and not stats.get_value("items_sampled", domain=spider.domain_name): 
     75            self.empty_domains.add(spider.domain_name) 
     76        self.spiders_count += 1 
     77        log.msg("Sampled %d domains so far (%d empty)" % (self.spiders_count, \ 
     78            len(self.empty_domains)), level=log.INFO) 
    7979 
    8080 
     
    8888 
    8989    def process_spider_input(self, response, spider): 
    90         if stats.get_value("items_sampled", domain=spider.domain_name) >= items_per_domain: 
     90        if stats.get_value("items_sampled", domain=spider.domain_name) >= items_per_spider: 
    9191            return [] 
    9292        elif max_response_size and max_response_size > len(response_httprepr(response)):   
     
    101101                items.append(r) 
    102102 
    103         if stats.get_value("items_sampled", domain=spider.domain_name) >= items_per_domain: 
     103        if stats.get_value("items_sampled", domain=spider.domain_name) >= items_per_spider: 
    104104            return [] 
    105105        else: 
  • scrapy/contrib/pipeline/__init__.py

    r1713 r1827  
    5858                return item 
    5959            current_stage = stages_left.pop(0) 
    60             d = mustbe_deferred(current_stage.process_item, spider.domain_name, item) 
     60            d = mustbe_deferred(current_stage.process_item, spider, item) 
    6161            d.addCallback(next_stage, stages_left) 
    6262            return d 
  • scrapy/contrib/pipeline/fileexport.py

    r1632 r1827  
    1818        dispatcher.connect(self.engine_stopped, signals.engine_stopped) 
    1919 
    20     def process_item(self, domain, item): 
     20    def process_item(self, spider, item): 
    2121        self.exporter.export_item(item) 
    2222        return item 
  • scrapy/contrib/pipeline/media.py

    r1822 r1827  
    1313    DOWNLOAD_PRIORITY = 1000 
    1414 
    15     class DomainInfo(object): 
     15    class SpiderInfo(object): 
    1616        def __init__(self, spider): 
    17             self.domain = spider.domain_name 
    1817            self.spider = spider 
    1918            self.downloading = {} 
     
    2221 
    2322    def __init__(self): 
    24         self.domaininfo = {} 
     23        self.spiderinfo = {} 
    2524        dispatcher.connect(self.spider_opened, signals.spider_opened) 
    2625        dispatcher.connect(self.spider_closed, signals.spider_closed) 
    2726 
    2827    def spider_opened(self, spider): 
    29         self.domaininfo[spider.domain_name] = self.DomainInfo(spider) 
     28        self.spiderinfo[spider] = self.SpiderInfo(spider) 
    3029 
    3130    def spider_closed(self, spider): 
    32         del self.domaininfo[spider.domain_name] 
     31        del self.spiderinfo[spider] 
    3332 
    34     def process_item(self, domain, item): 
    35         info = self.domaininfo[domain] 
     33    def process_item(self, spider, item): 
     34        info = self.spiderinfo[spider] 
    3635        requests = arg_to_iter(self.get_media_requests(item, info)) 
    3736        dlist = [] 
     
    8483            info.downloading[fp] = (request, dwld) # fill downloading state data 
    8584            dwld.addBoth(_downloaded) # append post-download hook 
    86             dwld.addErrback(log.err, domain=info.domain) 
     85            dwld.addErrback(log.err, spider=info.spider) 
    8786 
    8887        # declare request in downloading state (None is used as place holder) 
  • scrapy/contrib_exp/pipeline/shoveitem.py

    r1822 r1827  
    2828        dispatcher.connect(self.spider_closed, signal=signals.spider_closed) 
    2929 
    30     def process_item(self, domain, item): 
     30    def process_item(self, spider, item): 
    3131        guid = str(item.guid) 
    3232 
    33         if guid in self.stores[domain]: 
    34             if self.stores[domain][guid] == item: 
     33        if guid in self.stores[spider]: 
     34            if self.stores[spider][guid] == item: 
    3535                status = 'old' 
    3636            else: 
     
    4040 
    4141        if not status == 'old': 
    42             self.stores[domain][guid] = item 
    43         self.log(domain, item, status) 
     42            self.stores[spider][guid] = item 
     43        self.log(spider, item, status) 
    4444        return item 
    4545 
    4646    def spider_opened(self, spider): 
    47         domain = spider.domain_name 
    48         uri = Template(self.uritpl).substitute(domain=domain) 
    49         self.stores[domain] = Shove(uri, **self.opts) 
     47        uri = Template(self.uritpl).substitute(domain=spider.domain_name) 
     48        self.stores[spider] = Shove(uri, **self.opts) 
    5049 
    5150    def spider_closed(self, spider): 
    52         self.stores[spider.domain_name].sync() 
     51        self.stores[spider].sync() 
    5352 
    54     def log(self, domain, item, status): 
    55         log.msg("Shove (%s): Item guid=%s" % (status, item.guid), level=log.DEBUG, domain=domain) 
     53    def log(self, spider, item, status): 
     54        log.msg("Shove (%s): Item guid=%s" % (status, item.guid), level=log.DEBUG, \ 
     55            spider=spider)