Changeset 1975:9df3e845c0d7

Show
Ignore:
Timestamp:
04/01/10 18:27:22 (5 months ago)
Author:
Rolando Espinoza La fuente <darkrho@…>
Branch:
default
Message:

SEP12 implementation

This patch implements the domain_name to name change in the BaseSpider class
and changes all spider instantiations to use the new attribute.

  • Add allowed_domains to spider

This patch implements the merging of spider.domain_name and
spider.extra_domain_names in spider.allowed_domains for offsite checking
purposes.

Note that spider.domain_name is left untouched by this patch; it is simply no longer used.

  • Remove spider.domain_name references from scrapy.stats
  • Refactor genspider command

The new signature for genspider is: genspider [options] <domain_name>.

Genspider uses domain_name for spider name and for the module name.

  • Remove spider.domain_name references
  • Update crawl command signature <spider|url>
  • docs: updated references to domain_name
  • examples/experimental: use spider.name
  • genspider: require <name> <domain>
  • spidermanager: renamed crawl_domain to crawl_spider_name
  • spiderctl: updated references of *domain* to spider
  • added backward compatibility with legacy spider's attributes
    'domain_name' and 'extra_domain_names'
Files:
1 added
40 modified

Legend:

Unmodified
Added
Removed
  • docs/intro/overview.rst

    r1953 r1975  
    129129    class MininovaSpider(CrawlSpider): 
    130130 
    131         domain_name = 'mininova.org' 
     131        name = 'mininova.org' 
     132        allowed_domains = ['mininova.org'] 
    132133        start_urls = ['http://www.mininova.org/today'] 
    133134        rules = [Rule(SgmlLinkExtractor(allow=['/tor/\d+']), 'parse_torrent')] 
  • docs/intro/tutorial.rst

    r1952 r1975  
    103103define the three main, mandatory, attributes: 
    104104 
    105 * :attr:`~scrapy.spider.BaseSpider.domain_name`: identifies the Spider. It must 
    106   be unique, that is, you can't set the same domain name for different Spiders. 
     105* :attr:`~scrapy.spider.BaseSpider.name`: identifies the Spider. It must be 
     106  unique, that is, you can't set the same name for different Spiders. 
    107107 
    108108* :attr:`~scrapy.spider.BaseSpider.start_urls`: is a list of URLs where the 
     
    129129 
    130130   class DmozSpider(BaseSpider): 
    131        domain_name = "dmoz.org" 
     131       name = "dmoz.org" 
     132       allowed_domains = ["dmoz.org"] 
    132133       start_urls = [ 
    133134           "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/", 
     
    355356 
    356357   class DmozSpider(BaseSpider): 
    357       domain_name = "dmoz.org" 
     358      name = "dmoz.org" 
     359      allowed_domains = ["dmoz.org"] 
    358360      start_urls = [ 
    359361          "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/", 
     
    399401 
    400402   class DmozSpider(BaseSpider): 
    401       domain_name = "dmoz.org" 
     403      name = "dmoz.org" 
     404      allowed_domains = ["dmoz.org"] 
    402405      start_urls = [ 
    403406          "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/", 
  • docs/topics/downloader-middleware.rst

    r1893 r1975  
    200200            http_user = 'someuser' 
    201201            http_pass = 'somepass' 
    202             domain_name = 'intranet.example.com' 
     202            name = 'intranet.example.com' 
    203203 
    204204            # .. rest of the spider code omitted ... 
  • docs/topics/exporters.rst

    r1951 r1975  
    5353 
    5454       def spider_opened(self, spider): 
    55            file = open('%s_products.xml' % spider.domain_name, 'w+b') 
     55           file = open('%s_products.xml' % spider.name, 'w+b') 
    5656           self.files[spider] = file 
    5757           self.exporter = XmlItemExporter(file) 
  • docs/topics/extensions.rst

    r1963 r1975  
    106106 
    107107        def spider_opened(self, spider): 
    108             log.msg("opened spider %s" % spider.domain_name) 
     108            log.msg("opened spider %s" % spider.name) 
    109109 
    110110        def spider_closed(self, spider): 
    111             log.msg("closed spider %s" % spider.domain_name) 
     111            log.msg("closed spider %s" % spider.name) 
    112112 
    113113 
  • docs/topics/firebug.rst

    r1513 r1975  
    8080 
    8181   class GoogleDirectorySpider(CrawlSpider): 
    82        domain_name = 'directory.google.com' 
     82       name = 'directory.google.com' 
     83       allowed_domains = ['directory.google.com'] 
    8384       start_urls = ['http://directory.google.com/'] 
    8485 
  • docs/topics/request-response.rst

    r1961 r1975  
    322322 
    323323    class LoginSpider(BaseSpider): 
    324         domain_name = 'example.com' 
     324        name = 'example.com' 
    325325        start_urls = ['http://www.example.com/users/login.php'] 
    326326 
  • docs/topics/shell.rst

    r1591 r1975  
    164164 
    165165    class MySpider(BaseSpider): 
    166         domain_name = 'example.com' 
     166        ... 
    167167 
    168168        def parse(self, response): 
  • docs/topics/spider-middleware.rst

    r1841 r1975  
    211211   Filters out Requests for URLs outside the domains covered by the spider. 
    212212 
    213    This middleware filters out every request whose host names don't match 
    214    :attr:`~scrapy.spider.BaseSpider.domain_name`, or the spider 
    215    :attr:`~scrapy.spider.BaseSpider.domain_name` prefixed by "www.".   
    216    Spider can add more domains to exclude using  
    217    :attr:`~scrapy.spider.BaseSpider.extra_domain_names` attribute. 
     213   This middleware filters out every request whose host names aren't in the 
     214   spider's :attr:`~scrapy.spider.BaseSpider.allowed_domains` attribute. 
    218215 
    219216   When your spider returns a request for a domain not belonging to those 
  • docs/topics/spiders.rst

    r1917 r1975  
    7171   method ``parse`` for each of the resulting responses. 
    7272 
    73    .. attribute:: domain_name 
     73   .. attribute:: name 
    7474       
    75        A string which defines the domain name for this spider, which will also be 
    76        the unique identifier for this spider (which means you can't have two 
    77        spider with the same ``domain_name``). This is the most important spider 
    78        attribute and it's required, and it's the name by which Scrapy will known 
    79        the spider.  
    80  
    81    .. attribute:: extra_domain_names 
    82  
    83        An optional list of strings containing additional domains that this 
    84        spider is allowed to crawl. Requests for URLs not belonging to the 
    85        domain name specified in :attr:`domain_name` or this list won't be 
    86        followed. 
     75       A string which defines the name for this spider. The spider name is how 
     76       the spider is located (and instantiated) by Scrapy, so it must be 
     77       unique. However, nothing prevents you from instantiating more than one 
     78       instance of the same spider. This is the most important spider attribute 
     79       and it's required. 
     80 
     81       Is recommended to name your spiders after the domain that their crawl. 
     82 
     83   .. attribute:: allowed_domains 
     84 
     85       An optional list of strings containing domains that this spider is 
     86       allowed to crawl. Requests for URLs not belonging to the domain names 
     87       specified in this list won't be followed if 
     88       :class:`~scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware` is enabled. 
    8789 
    8890   .. attribute:: start_urls 
     
    145147 
    146148       Log a message using the :func:`scrapy.log.msg` function, automatically 
    147        populating the domain argument with the :attr:`domain_name` of this 
     149       populating the spider argument with the :attr:`name` of this 
    148150       spider. For more information see :ref:`topics-logging`. 
    149151 
     
    158160 
    159161    class MySpider(BaseSpider): 
    160         domain_name = 'http://www.example.com' 
     162        name = 'example.com' 
     163        allowed_domains = ['example.com'] 
    161164        start_urls = [ 
    162165            'http://www.example.com/1.html', 
     
    178181 
    179182    class MySpider(BaseSpider): 
    180         domain_name = 'http://www.example.com' 
     183        name = 'example.com' 
     184        allowed_domains = ['example.com'] 
    181185        start_urls = [ 
    182186            'http://www.example.com/1.html', 
     
    255259 
    256260    class MySpider(CrawlSpider): 
    257         domain_name = 'example.com' 
     261        name = 'example.com' 
     262        allowed_domains = ['example.com'] 
    258263        start_urls = ['http://www.example.com'] 
    259264         
     
    379384 
    380385    class MySpider(XMLFeedSpider): 
    381         domain_name = 'example.com' 
     386        name = 'example.com' 
     387        allowed_domains = ['example.com'] 
    382388        start_urls = ['http://www.example.com/feed.xml'] 
    383389        iterator = 'iternodes' # This is actually unnecesary, since it's the default value 
     
    436442 
    437443    class MySpider(CSVFeedSpider): 
    438         domain_name = 'example.com' 
     444        name = 'example.com' 
     445        allowed_domains = ['example.com'] 
    439446        start_urls = ['http://www.example.com/feed.csv'] 
    440447        delimiter = ';' 
  • docs/topics/stats.rst

    r1849 r1975  
    205205    A simple stats collector that keeps the stats of the last scraping run (for 
    206206    each spider) in memory, after they're closed. The stats can be accessed 
    207     through the :attr:`domain_stats` attribute, which is a dict keyed by spider 
     207    through the :attr:`spider_stats` attribute, which is a dict keyed by spider 
    208208    domain name. 
    209209 
    210210    This is the default Stats Collector used in Scrapy. 
    211211 
    212     .. attribute:: domain_stats 
    213  
    214        A dict of dicts (keyed by spider domain name) containing the stats of 
    215        the last scraping run for each domain. 
     212    .. attribute:: spider_stats 
     213 
     214       A dict of dicts (keyed by spider name) containing the stats of the last 
     215       scraping run for each spider. 
    216216 
    217217DummyStatsCollector 
     
    241241    persitance time: 
    242242 
    243         * ``domain``: the spider domain (so you can use it later for querying stats 
    244           for that domain)  
     243        * ``spider``: the spider name (so you can use it later for querying stats 
     244          for that spider) 
    245245        * ``timestamp``: the timestamp when the stats were persisited 
    246246 
    247     Both the ``domain`` and ``timestamp`` are used for generating the SimpleDB 
     247    Both the ``spider`` and ``timestamp`` are used for generating the SimpleDB 
    248248    item name in order to avoid overwriting stats of previous scraping runs. 
    249249 
  • examples/experimental/googledir/googledir/spiders/google_directory.py

    r1933 r1975  
    77class GoogleDirectorySpider(CrawlSpider): 
    88 
    9     domain_name = 'directory.google.com' 
     9    name = 'google_directory' 
     10    allowed_domains = ['directory.google.com'] 
    1011    start_urls = ['http://directory.google.com/'] 
    1112 
  • examples/experimental/imdb/imdb/spiders/imdb_site.py

    r1934 r1975  
    3030 
    3131class ImdbSiteSpider(CrawlSpider): 
    32     domain_name = 'imdb.com' 
     32    name = 'imdb.com' 
     33    allowed_domains = ['imdb.com'] 
    3334    start_urls = ['http://www.imdb.com/'] 
    3435 
  • examples/googledir/googledir/spiders/google_directory.py

    r1517 r1975  
    77class GoogleDirectorySpider(CrawlSpider): 
    88 
    9     domain_name = 'directory.google.com' 
     9    name = 'directory.google.com' 
     10    allow_domains = ['directory.google.com'] 
    1011    start_urls = ['http://directory.google.com/'] 
    1112 
  • scrapy/command/commands/crawl.py

    r1974 r1975  
    1414 
    1515    def syntax(self): 
    16         return "[options] <domain|url> ..." 
     16        return "[options] <spider|url> ..." 
    1717 
    1818    def short_desc(self): 
    19         return "Start crawling a domain or URL" 
     19        return "Start crawling from a spider or URL" 
    2020 
    2121    def add_options(self, parser): 
     
    3232 
    3333    def run(self, args, opts): 
    34  
    35         urls, domains = self._split_urls_and_domains(args) 
    36         for dom in domains: 
    37             scrapymanager.crawl_domain(dom) 
     34        urls, names = self._split_urls_and_names(args) 
     35        for name in names: 
     36            scrapymanager.crawl_spider_name(name) 
    3837 
    3938        if opts.spider: 
     
    6665        return spider_urls.items() 
    6766 
    68     def _split_urls_and_domains(self, args): 
     67    def _split_urls_and_names(self, args): 
    6968        urls = [] 
    70         domains = [] 
     69        names = [] 
    7170        for arg in args: 
    7271            if is_url(arg): 
    7372                urls.append(arg) 
    7473            else: 
    75                 domains.append(arg) 
    76         return urls, domains 
     74                names.append(arg) 
     75        return urls, names 
  • scrapy/command/commands/genspider.py

    r1973 r1975  
    1616 
    1717def sanitize_module_name(module_name): 
    18     """Sanitize the given module name, by replacing dashes with underscores and 
    19     prefixing it with a letter if it doesn't start with one 
     18    """Sanitize the given module name, by replacing dashes and points 
     19    with underscores and prefixing it with a letter if it doesn't start 
     20    with one 
    2021    """ 
    21     module_name = module_name.replace('-', '_') 
     22    module_name = module_name.replace('-', '_').replace('.', '_') 
    2223    if module_name[0] not in string.ascii_letters: 
    2324        module_name = "a" + module_name 
     
    2930 
    3031    def syntax(self): 
    31         return "[options] <spider_module_name> <spider_domain_name>" 
     32        return "[options] <name> <domain>" 
    3233 
    3334    def short_desc(self): 
     
    5556            return 
    5657 
    57         if len(args) < 2: 
     58        if len(args) != 2: 
    5859            return False 
    5960 
    60         module = sanitize_module_name(args[0]) 
     61        name = args[0] 
    6162        domain = args[1] 
     63 
     64        module = sanitize_module_name(name) 
    6265 
    6366        # if spider already exists and not force option then halt 
    6467        try: 
    65             spider = spiders.create(domain) 
     68            spider = spiders.create(name) 
    6669        except KeyError: 
    6770            pass 
    6871        else: 
    6972            if not opts.force: 
    70                 print "Spider '%s' already exists in module:" % domain 
     73                print "Spider '%s' already exists in module:" % name 
    7174                print "  %s" % spider.__module__ 
    7275                sys.exit(1) 
     
    7477        template_file = self._find_template(opts.template) 
    7578        if template_file: 
    76             self._genspider(module, domain, opts.template, template_file) 
     79            self._genspider(module, name, domain, opts.template, template_file) 
    7780 
    78     def _genspider(self, module, domain, template_name, template_file): 
     81    def _genspider(self, module, name, domain, template_name, template_file): 
    7982        """Generate the spider module, based on the given template""" 
    8083        tvars = { 
     
    8285            'ProjectName': string_camelcase(settings.get('BOT_NAME')), 
    8386            'module': module, 
    84             'site': domain, 
     87            'name': name, 
     88            'domain': domain, 
    8589            'classname': '%sSpider' % ''.join([s.capitalize() \ 
    8690                for s in module.split('_')]) 
     
    9397        shutil.copyfile(template_file, spider_file) 
    9498        render_templatefile(spider_file, **tvars) 
    95         print "Created spider %r using template %r in module:" % (domain, \ 
     99        print "Created spider %r using template %r in module:" % (name, \ 
    96100            template_name) 
    97101        print "  %s.%s" % (spiders_module.__name__, module) 
  • scrapy/command/commands/parse.py

    r1974 r1975  
    4747            callback_fcn = callback if callable(callback) else getattr(spider, callback, None) 
    4848            if not callback_fcn: 
    49                 log.msg('Cannot find callback %s in %s spider' % (callback, spider.domain_name)) 
     49                log.msg('Cannot find callback %s in %s spider' % (callback, spider.name)) 
    5050                return (), () 
    5151 
     
    131131                log.msg('No rules found for spider "%s", ' \ 
    132132                        'please specify a callback for parsing' \ 
    133                         % spider.domain_name, log.ERROR) 
     133                        % spider.name, log.ERROR) 
    134134        else: 
    135135            # default callback 'parse' 
  • scrapy/contrib/aws.py

    r1681 r1975  
    2121    def process_request(self, request, spider): 
    2222        hostname = urlparse_cached(request).hostname 
    23         if spider.domain_name == 's3.amazonaws.com' \ 
     23        if spider.name == 's3.amazonaws.com' \ 
    2424                or (hostname and hostname.endswith('s3.amazonaws.com')): 
    2525            request.headers['Date'] = time.strftime("%a, %d %b %Y %H:%M:%S GMT", \ 
  • scrapy/contrib/downloadermiddleware/httpcache.py

    r1843 r1975  
    109109    def _get_request_path(self, spider, request): 
    110110        key = request_fingerprint(request) 
    111         return join(self.cachedir, spider.domain_name, key[0:2], key) 
     111        return join(self.cachedir, spider.name, key[0:2], key) 
    112112 
    113113    def _read_meta(self, spider, request): 
  • scrapy/contrib/itemsampler.py

    r1849 r1975  
    11""" 
    22This module provides a mechanism for collecting one (or more) sample items per 
    3 domain. 
     3spider. 
    44 
    55The items are collected in a dict of guid->item and persisted by pickling that 
     
    99code that affects several spiders. 
    1010 
    11 It uses the scrapy stats service to keep track of which domains are already 
     11It uses the scrapy stats service to keep track of which spiders are already 
    1212sampled. 
    1313 
     
    4949        self.items = {} 
    5050        self.spiders_count = 0 
    51         self.empty_domains = set() 
     51        self.empty_spiders = set() 
    5252        dispatcher.connect(self.spider_closed, signal=signals.spider_closed) 
    5353        dispatcher.connect(self.engine_stopped, signal=signals.engine_stopped) 
     
    6767        with open(self.filename, 'w') as f: 
    6868            pickle.dump(self.items, f) 
    69         if self.empty_domains: 
    70             log.msg("No products sampled for: %s" % " ".join(self.empty_domains), \ 
     69        if self.empty_spiders: 
     70            log.msg("No products sampled for: %s" % " ".join(self.empty_spiders), \ 
    7171                level=log.WARNING) 
    7272 
    7373    def spider_closed(self, spider, reason): 
    7474        if reason == 'finished' and not stats.get_value("items_sampled", spider=spider): 
    75             self.empty_domains.add(spider.domain_name) 
     75            self.empty_spiders.add(spider.name) 
    7676        self.spiders_count += 1 
    77         log.msg("Sampled %d domains so far (%d empty)" % (self.spiders_count, \ 
    78             len(self.empty_domains)), level=log.INFO) 
     77        log.msg("Sampled %d spiders so far (%d empty)" % (self.spiders_count, \ 
     78            len(self.empty_spiders)), level=log.INFO) 
    7979 
    8080 
    8181class ItemSamplerMiddleware(object): 
    82     """This middleware drops items and requests (when domain sampling has been 
    83     completed) to accelerate the processing of remaining domains""" 
     82    """This middleware drops items and requests (when spider sampling has been 
     83    completed) to accelerate the processing of remaining spiders""" 
    8484 
    8585    def __init__(self): 
  • scrapy/contrib/pipeline/images.py

    r1849 r1975  
    4848 
    4949    def spider_closed(self, spider): 
    50         self.created_directories.pop(spider.domain_name, None) 
     50        self.created_directories.pop(spider.name, None) 
    5151 
    5252    def persist_image(self, key, image, buf, info): 
     
    9393    delayed even more because it is uploading images to s3. 
    9494    """ 
    95     domain_name = "s3.amazonaws.com" 
     95    name = "s3.amazonaws.com" 
    9696    start_urls = ['http://s3.amazonaws.com/'] 
    9797    max_concurrent_requests = 100 
  • scrapy/contrib/spidermanager.py

    r1973 r1975  
    5454            for spider in self._getspiders(ISpider, module): 
    5555                ISpider.validateInvariants(spider) 
    56                 self._spiders[spider.domain_name] = spider 
     56                self._spiders[spider.name] = spider 
    5757        self.loaded = True 
    5858 
     
    7777        spider 
    7878        """ 
    79         domain = spider.domain_name 
    80         if domain not in self._spiders: 
     79        name = spider.name 
     80        if name not in self._spiders: 
    8181            return 
    82         spider = self._spiders[domain] 
     82        spider = self._spiders[name] 
    8383        module_name = spider.__module__ 
    8484        module = sys.modules[module_name] 
     
    8787                level=log.DEBUG) 
    8888            new_module = rebuild(module, doLog=0) 
    89             self._spiders[domain] = new_module.SPIDER 
     89            self._spiders[name] = new_module.SPIDER 
  • scrapy/contrib/spidermiddleware/offsite.py

    r1841 r1975  
    4848 
    4949    def spider_opened(self, spider): 
    50         domains = [spider.domain_name] + spider.extra_domain_names 
    51         self.host_regexes[spider] = self.get_host_regex(domains) 
     50        self.host_regexes[spider] = self.get_host_regex(spider.allowed_domains) 
    5251        self.domains_seen[spider] = set() 
    5352 
  • scrapy/contrib/statsmailer.py

    r1849 r1975  
    2424        body = "Global stats\n\n" 
    2525        body += "\n".join("%-50s : %s" % i for i in stats.get_stats().items()) 
    26         body += "\n\n%s stats\n\n" % spider.domain_name 
     26        body += "\n\n%s stats\n\n" % spider.name 
    2727        body += "\n".join("%-50s : %s" % i for i in spider_stats.items()) 
    28         mail.send(self.recipients, "Scrapy stats for: %s" % spider.domain_name, body) 
     28        mail.send(self.recipients, "Scrapy stats for: %s" % spider.name, body) 
  • scrapy/contrib/webconsole/livestats.py

    r1867 r1975  
    6161 
    6262            s += '<tr><td>%s</td><td align="right">%d</td><td align="right">%d</td><td align="right">%d</td><td align="right">%d</td><td align="right">%d</td><td align="right">%d</td><td>%s</td><td>%s</td></tr>\n' % \ 
    63                  (spider.domain_name, stats.scraped, stats.crawled, scheduled, dqueued, active, transf, str(stats.started), str(runtime)) 
     63                 (spider.name, stats.scraped, stats.crawled, scheduled, dqueued, active, transf, str(stats.started), str(runtime)) 
    6464 
    6565            totdomains += 1 
  • scrapy/contrib/webconsole/spiderctl.py

    r1972 r1975  
    2626 
    2727    def spider_opened(self, spider): 
    28         self.running[spider.domain_name] = spider 
     28        self.running[spider.name] = spider 
    2929 
    3030    def spider_closed(self, spider): 
    31         del self.running[spider.domain_name] 
    32         self.finished.add(spider.domain_name) 
     31        del self.running[spider.name] 
     32        self.finished.add(spider.name) 
    3333 
    3434    def webconsole_render(self, wc_request): 
     
    3636            changes = self.webconsole_control(wc_request) 
    3737 
    38         self.scheduled = [s.domain_name for s in scrapyengine.spider_scheduler._pending_spiders] 
    39         self.idle = [d for d in self.enabled_domains if d not in self.scheduled 
     38        self.scheduled = [s.name for s in scrapyengine.spider_scheduler._pending_spiders] 
     39        self.idle = [d for d in self.enabled_spiders if d not in self.scheduled 
    4040                                                        and d not in self.running 
    4141                                                        and d not in self.finished] 
     
    5454        s += "<td valign='top'>\n" 
    5555        s += '<form method="post" action=".">\n' 
    56         s += '<select name="add_pending_domains" multiple="multiple">\n' 
    57         for domain in sorted(self.idle): 
    58             s += "<option>%s</option>\n" % domain 
     56        s += '<select name="add_pending_spiders" multiple="multiple">\n' 
     57        for name in sorted(self.idle): 
     58            s += "<option>%s</option>\n" % name 
    5959        s += '</select><br>\n' 
    6060        s += '<br />' 
     
    6666        s += "<td valign='top'>\n" 
    6767        s += '<form method="post" action=".">\n' 
    68         s += '<select name="remove_pending_domains" multiple="multiple">\n' 
    69         for domain in self.scheduled: 
    70             s += "<option>%s</option>\n" % domain 
     68        s += '<select name="remove_pending_spiders" multiple="multiple">\n' 
     69        for name in self.scheduled: 
     70            s += "<option>%s</option>\n" % name 
    7171        s += '</select><br>\n' 
    7272        s += '<br />' 
     
    7979        s += "<td valign='top'>\n" 
    8080        s += '<form method="post" action=".">\n' 
    81         s += '<select name="stop_running_domains" multiple="multiple">\n' 
    82         for domain in sorted(self.running): 
    83             s += "<option>%s</option>\n" % domain 
     81        s += '<select name="stop_running_spiders" multiple="multiple">\n' 
     82        for name in sorted(self.running): 
     83            s += "<option>%s</option>\n" % name  
    8484        s += '</select><br>\n' 
    8585        s += '<br />' 
     
    9191        s += "<td valign='top'>\n" 
    9292        s += '<form method="post" action=".">\n' 
    93         s += '<select name="rerun_finished_domains" multiple="multiple">\n' 
    94         for domain in sorted(self.finished): 
    95             s += "<option>%s</option>\n" % domain 
     93        s += '<select name="rerun_finished_spiders" multiple="multiple">\n' 
     94        for name in sorted(self.finished): 
     95            s += "<option>%s</option>\n" % name 
    9696        s += '</select><br>\n' 
    9797        s += '<br />' 
     
    115115        s = "<hr />\n" 
    116116 
    117         if "stop_running_domains" in args: 
     117        if "stop_running_spiders" in args: 
    118118            s += "<p>" 
    119             stopped_domains = [] 
    120             for domain in args["stop_running_domains"]: 
    121                 if domain in self.running: 
    122                     scrapyengine.close_spider(self.running[domain]) 
    123                     stopped_domains.append(domain) 
    124             s += "Stopped spiders: <ul><li>%s</li></ul>" % "</li><li>".join(stopped_domains) 
     119            stopped_spiders = [] 
     120            for name in args["stop_running_spiders"]: 
     121                if name in self.running: 
     122                    scrapyengine.close_spider(self.running[name]) 
     123                    stopped_spiders.append(name) 
     124            s += "Stopped spiders: <ul><li>%s</li></ul>" % "</li><li>".join(stopped_spiders) 
    125125            s += "</p>" 
    126         if "remove_pending_domains" in args: 
     126        if "remove_pending_spiders" in args: 
    127127            removed = [] 
    128             for domain in args["remove_pending_domains"]: 
    129                 if scrapyengine.spider_scheduler.remove_pending_domain(domain): 
    130                     removed.append(domain) 
     128            for name in args["remove_pending_spiders"]: 
     129                if scrapyengine.spider_scheduler.remove_pending_spider(name): 
     130                    removed.append(name) 
    131131            if removed: 
    132132                s += "<p>" 
    133                 s += "Removed scheduled spiders: <ul><li>%s</li></ul>" % "</li><li>".join(args["remove_pending_domains"]) 
     133                s += "Removed scheduled spiders: <ul><li>%s</li></ul>" % "</li><li>".join(args["remove_pending_spiders"]) 
    134134                s += "</p>" 
    135         if "add_pending_domains" in args: 
    136             for domain in args["add_pending_domains"]: 
    137                 if domain not in scrapyengine.scheduler.pending_requests: 
    138                     scrapymanager.crawl_domain(domain) 
     135        if "add_pending_spiders" in args: 
     136            for name in args["add_pending_spiders"]: 
     137                if name not in scrapyengine.scheduler.pending_requests: 
     138                    scrapymanager.crawl_spider_name(name) 
    139139            s += "<p>" 
    140             s += "Scheduled spiders: <ul><li>%s</li></ul>" % "</li><li>".join(args["add_pending_domains"]) 
     140            s += "Scheduled spiders: <ul><li>%s</li></ul>" % "</li><li>".join(args["add_pending_spiders"]) 
    141141            s += "</p>" 
    142         if "rerun_finished_domains" in args: 
    143             for domain in args["rerun_finished_domains"]: 
    144                 if domain not in scrapyengine.scheduler.pending_requests: 
    145                     scrapymanager.crawl_domain(domain) 
    146                 self.finished.remove(domain) 
     142        if "rerun_finished_spiders" in args: 
     143            for name in args["rerun_finished_spiders"]: 
     144                if name not in scrapyengine.scheduler.pending_requests: 
     145                    scrapymanager.crawl_spider_name(name) 
     146                self.finished.remove(name) 
    147147            s += "<p>" 
    148             s += "Re-scheduled finished spiders: <ul><li>%s</li></ul>" % "</li><li>".join(args["rerun_finished_domains"]) 
     148            s += "Re-scheduled finished spiders: <ul><li>%s</li></ul>" % "</li><li>".join(args["rerun_finished_spiders"]) 
    149149            s += "</p>" 
    150150 
     
    152152         
    153153    def webconsole_discover_module(self): 
    154         self.enabled_domains = spiders.list() 
     154        self.enabled_spiders = spiders.list() 
    155155        return self 
  • scrapy/contrib/webconsole/stats.py

    r1849 r1975  
    2424        s += stats_html_table(stats.get_stats()) 
    2525        for spider, spider_stats in stats.iter_spider_stats(): 
    26             s += "<h3>%s</h3>\n" % spider.domain_name 
     26            s += "<h3>%s</h3>\n" % spider.name 
    2727            s += stats_html_table(spider_stats) 
    2828        s += "</body>\n" 
  • scrapy/core/manager.py

    r1973 r1975  
    5555            scrapyengine.crawl(request, spider) 
    5656 
    57     def crawl_domain(self, domain): 
    58         """Schedule given domain for crawling.""" 
     57    def crawl_spider_name(self, name): 
     58        """Schedule given spider by name for crawling.""" 
    5959        try: 
    60             spider = spiders.create(domain) 
     60            spider = spiders.create(name) 
    6161        except KeyError: 
    62             log.msg('Could not find spider for domain: %s' % domain, log.ERROR) 
     62            log.msg('Could not find spider: %s' % name, log.ERROR) 
    6363        else: 
    6464            self.crawl_spider(spider) 
  • scrapy/log.py

    r1956 r1975  
    7676    dispatcher.send(signal=logmessage_received, message=message, level=level, \ 
    7777        spider=spider) 
    78     system = domain or (spider.domain_name if spider else component) 
     78    system = domain or (spider.name if spider else component) 
    7979    msg_txt = unicode_to_str("%s: %s" % (level_names[level], message), log_encoding) 
    8080    log.msg(msg_txt, system=system) 
     
    9494        warnings.warn("'domain' argument of scrapy.log.err() is deprecated, " \ 
    9595            "use 'spider' argument instead", DeprecationWarning, stacklevel=2) 
    96     kwargs['system'] = domain or (spider.domain_name if spider else component) 
     96    kwargs['system'] = domain or (spider.name if spider else component) 
    9797    if _why: 
    9898        _why = unicode_to_str("ERROR: %s" % _why, log_encoding) 
  • scrapy/spider/models.py

    r1886 r1975  
    44See documentation in docs/topics/spiders.rst 
    55""" 
     6 
     7import warnings 
     8 
    69from zope.interface import Interface, Attribute, invariant, implements 
    710from twisted.plugin import IPlugin 
     
    1215from scrapy.utils.trackref import object_ref 
    1316 
    14 def _valid_domain_name(obj): 
    15     """Check the domain name specified is valid""" 
    16     if not obj.domain_name: 
    17         raise ValueError("Spider 'domain_name' attribute is required") 
    18  
    1917class ISpider(Interface, IPlugin) : 
    20     """Interface to be implemented by site-specific web spiders""" 
    21  
    22     domain_name = Attribute("The domain name of the site to be scraped.") 
    23  
    24     invariant(_valid_domain_name) 
     18    """Interface used by TwistedPluginSpiderManager to discover spiders""" 
     19    pass 
    2520 
    2621class BaseSpider(object_ref): 
     
    3227 
    3328    # XXX: class attributes kept for backwards compatibility 
    34     domain_name = None 
     29    name = None 
    3530    start_urls = [] 
    36     extra_domain_names = [] 
     31    allowed_domains = [] 
    3732 
    38     def __init__(self, domain_name=None): 
    39         if domain_name is not None: 
    40             self.domain_name = domain_name 
     33    def __init__(self, name=None): 
     34        # XXX: SEP-12 backward compatibility (remove for 0.10) 
     35        if hasattr(self, 'domain_name'): 
     36            warnings.warn("Spider.domain_name attribute is deprecated, use Spider.name instead", \ 
     37                DeprecationWarning, stacklevel=4) 
     38            self.name = self.domain_name 
     39        if hasattr(self, 'extra_domain_names'): 
     40            warnings.warn("Spider.extra_domain_names attribute is deprecated - use Spider.allowed_domains instead", \ 
     41                DeprecationWarning, stacklevel=4) 
     42            self.allowed_domains = [self.name] + list(self.extra_domain_names) 
     43 
     44        if name is not None: 
     45            self.name = name 
    4146        # XXX: create instance attributes (class attributes were kept for 
    4247        # backwards compatibility) 
    4348        if not self.start_urls: 
    4449            self.start_urls = [] 
    45         if not self.extra_domain_names: 
    46             self.extra_domain_names = [] 
     50        if not self.allowed_domains: 
     51            self.allowed_domains = [] 
     52        if not getattr(self, 'domain_name', None): 
     53            self.domain_name = self.name 
     54        if not getattr(self, 'extra_domain_names', None): 
     55            self.extra_domain_names = self.allowed_domains 
    4756 
    4857    def log(self, message, level=log.DEBUG): 
     
    6877 
    6978    def __str__(self): 
    70         return "<%s %r>" % (type(self).__name__, self.domain_name) 
     79        return "<%s %r>" % (type(self).__name__, self.name) 
    7180 
    7281    __repr__ = __str__ 
  • scrapy/stats/collector/__init__.py

    r1849 r1975  
    7777    def __init__(self): 
    7878        super(MemoryStatsCollector, self).__init__() 
    79         self.domain_stats = {} 
     79        self.spider_stats = {} 
    8080 
    8181    def _persist_stats(self, stats, spider=None): 
    8282        if spider is not None: 
    83             self.domain_stats[spider.domain_name] = stats 
     83            self.spider_stats[spider.name] = stats 
    8484 
    8585 
  • scrapy/stats/collector/simpledb.py

    r1849 r1975  
    3737    def _persist_to_sdb(self, spider, stats): 
    3838        ts = self._get_timestamp(spider).isoformat() 
    39         sdb_item_id = "%s_%s" % (spider.domain_name, ts) 
     39        sdb_item_id = "%s_%s" % (spider.name, ts) 
    4040        sdb_item = dict((k, self._to_sdb_value(v, k)) for k, v in stats.iteritems()) 
    41         sdb_item['domain'] = spider.domain_name 
     41        sdb_item['spider'] = spider.name 
    4242        sdb_item['timestamp'] = self._to_sdb_value(ts) 
    4343        connect_sdb().put_attributes(self._sdbdomain, sdb_item_id, sdb_item) 
  • scrapy/templates/spiders/basic.tmpl

    r1752 r1975  
    22 
    33class $classname(BaseSpider): 
    4     domain_name = "$site" 
     4    name = "$name" 
     5    allowed_domains = ["$domain"] 
    56    start_urls = ( 
    6         'http://www.$site/', 
     7        'http://www.$domain/', 
    78        ) 
    89 
  • scrapy/templates/spiders/crawl.tmpl

    r1927 r1975  
    77 
    88class $classname(CrawlSpider): 
    9     domain_name = '$site' 
    10     start_urls = ['http://www.$site/'] 
     9    name = '$name' 
     10    allowed_domains = ['$domain'] 
     11    start_urls = ['http://www.$domain/'] 
    1112 
    1213    rules = ( 
     
    1718        hxs = HtmlXPathSelector(response) 
    1819        i = ${ProjectName}Item() 
    19         #i['site_id'] = hxs.select('//input[@id="sid"]/@value').extract() 
     20        #i['domain_id'] = hxs.select('//input[@id="sid"]/@value').extract() 
    2021        #i['name'] = hxs.select('//div[@id="name"]').extract() 
    2122        #i['description'] = hxs.select('//div[@id="description"]').extract() 
  • scrapy/templates/spiders/csvfeed.tmpl

    r1718 r1975  
    33 
    44class $classname(CSVFeedSpider): 
    5     domain_name = '$site' 
    6     start_urls = ['http://www.$site/feed.csv'] 
     5    name = '$name' 
     6    allowed_domains = ['$domain'] 
     7    start_urls = ['http://www.$domain/feed.csv'] 
    78    # headers = ['id', 'name', 'description', 'image_link'] 
    89    # delimiter = '\t' 
  • scrapy/templates/spiders/xmlfeed.tmpl

    r1718 r1975  
    33 
    44class $classname(XMLFeedSpider): 
    5     domain_name = '$site' 
    6     start_urls = ['http://www.$site/feed.xml'] 
     5    name = '$name' 
     6    allowed_domains = ['$domain'] 
     7    start_urls = ['http://www.$domain/feed.xml'] 
    78 
    89    def parse_item(self, response, selector): 
  • scrapy/tests/test_commands.py

    r1737 r1975  
    6060class GenspiderCommandTest(CommandTest): 
    6161 
     62    def test_arguments(self): 
     63        # only pass one argument. spider script shouldn't be created 
     64        self.assertEqual(0, self.call('genspider', 'test_name')) 
     65        assert not exists(join(self.proj_mod_path, 'spiders', 'test_name.py')) 
     66        # pass two arguments <name> <domain>. spider script should be created 
     67        self.assertEqual(0, self.call('genspider', 'test_name', 'test.com')) 
     68        assert exists(join(self.proj_mod_path, 'spiders', 'test_name.py')) 
     69 
    6270    def test_template_default(self, *args): 
    63         self.assertEqual(0, self.call('genspider', 'testspider', 'test.com', *args)) 
    64         assert exists(join(self.proj_mod_path, 'spiders', 'testspider.py')) 
    65         self.assertEqual(1, self.call('genspider', 'otherspider', 'test.com')) 
     71        self.assertEqual(0, self.call('genspider', 'test_spider', 'test.com', *args)) 
     72        assert exists(join(self.proj_mod_path, 'spiders', 'test_spider.py')) 
     73        self.assertEqual(1, self.call('genspider', 'test_spider', 'test.com')) 
    6674 
    6775    def test_template_basic(self): 
  • scrapy/tests/test_engine.py

    r1972 r1975  
    2323 
    2424class TestSpider(BaseSpider): 
    25     domain_name = "scrapytest.org" 
    26     extra_domain_names = ["localhost"] 
     25    name = "scrapytest.org" 
     26    allowed_domains = ["scrapytest.org", "localhost"] 
    2727    start_urls = ['http://localhost'] 
    2828 
     
    6969 
    7070    def __init__(self): 
    71         self.domain = 'scrapytest.org' 
     71        self.name = 'scrapytest.org' 
    7272        self.spider = None 
    7373        self.respplug = [] 
     
    140140        """ 
    141141        assert session.spider is not None 
    142         self.assertEqual(session.spider.domain_name, session.domain) 
     142        self.assertEqual(session.spider.name, session.name) 
    143143 
    144144    def test_visited_urls(self): 
  • scrapy/tests/test_spidermiddleware_offsite.py

    r1822 r1975  
    1010    def setUp(self): 
    1111        self.spider = BaseSpider() 
    12         self.spider.domain_name = 'scrapytest.org' 
    13         self.spider.extra_domain_names = ['scrapy.org'] 
     12        self.spider.name = 'scrapytest.org' 
     13        self.spider.allowed_domains = ['scrapytest.org', 'scrapy.org'] 
    1414 
    1515        self.mw = OffsiteMiddleware() 
  • scrapy/utils/url.py

    r1957 r1975  
    2323def url_is_from_spider(url, spider): 
    2424    """Return True if the url belongs to the given spider""" 
    25     domains = [spider.domain_name] 
    26     domains.extend(spider.extra_domain_names) 
    27     return url_is_from_any_domain(url, domains) 
     25    return url_is_from_any_domain(url, spider.allowed_domains) 
    2826 
    2927def urljoin_rfc(base, ref, encoding='utf-8'):