Changeset 1975:9df3e845c0d7
- Timestamp:
- 04/01/10 18:27:22 (5 months ago)
- Author:
- Rolando Espinoza La fuente <darkrho@…>
- Branch:
- default
- Message:
-
SEP12 implementation
This patch implements the domain_name to name change in the BaseSpider class and
changes all spider instantiations to use the new attribute.
- Add allowed_domains to spider
This patch implements the merging of spider.domain_name and
spider.extra_domain_names in spider.allowed_domains for offsite checking
purposes.
Note that spider.domain_name is left untouched by this patch; it is simply no longer used.
- Remove spider.domain_name references from scrapy.stats
- Refactor genspider command
The new signature for genspider is: genspider [options] <domain_name>.
Genspider uses domain_name for spider name and for the module name.
- Remove spider.domain_name references
- Update crawl command signature <spider|url>
- docs: updated references to domain_name
- examples/experimental: use spider.name
- genspider: require <name> <domain>
- spidermanager: renamed crawl_domain to crawl_spider_name
- spiderctl: updated references of *domain* to spider
- added backward compatibility with the legacy spider attributes
'domain_name' and 'extra_domain_names'
- Files:
-
Legend:
- Unmodified
- Added
- Removed
-
|
r1953
|
r1975
|
|
| 129 | 129 | class MininovaSpider(CrawlSpider): |
| 130 | 130 | |
| 131 | | domain_name = 'mininova.org' |
| | 131 | name = 'mininova.org' |
| | 132 | allowed_domains = ['mininova.org'] |
| 132 | 133 | start_urls = ['http://www.mininova.org/today'] |
| 133 | 134 | rules = [Rule(SgmlLinkExtractor(allow=['/tor/\d+']), 'parse_torrent')] |
-
|
r1952
|
r1975
|
|
| 103 | 103 | define the three main, mandatory, attributes: |
| 104 | 104 | |
| 105 | | * :attr:`~scrapy.spider.BaseSpider.domain_name`: identifies the Spider. It must |
| 106 | | be unique, that is, you can't set the same domain name for different Spiders. |
| | 105 | * :attr:`~scrapy.spider.BaseSpider.name`: identifies the Spider. It must be |
| | 106 | unique, that is, you can't set the same name for different Spiders. |
| 107 | 107 | |
| 108 | 108 | * :attr:`~scrapy.spider.BaseSpider.start_urls`: is a list of URLs where the |
| … |
… |
|
| 129 | 129 | |
| 130 | 130 | class DmozSpider(BaseSpider): |
| 131 | | domain_name = "dmoz.org" |
| | 131 | name = "dmoz.org" |
| | 132 | allowed_domains = ["dmoz.org"] |
| 132 | 133 | start_urls = [ |
| 133 | 134 | "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/", |
| … |
… |
|
| 355 | 356 | |
| 356 | 357 | class DmozSpider(BaseSpider): |
| 357 | | domain_name = "dmoz.org" |
| | 358 | name = "dmoz.org" |
| | 359 | allowed_domains = ["dmoz.org"] |
| 358 | 360 | start_urls = [ |
| 359 | 361 | "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/", |
| … |
… |
|
| 399 | 401 | |
| 400 | 402 | class DmozSpider(BaseSpider): |
| 401 | | domain_name = "dmoz.org" |
| | 403 | name = "dmoz.org" |
| | 404 | allowed_domains = ["dmoz.org"] |
| 402 | 405 | start_urls = [ |
| 403 | 406 | "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/", |
-
|
r1893
|
r1975
|
|
| 200 | 200 | http_user = 'someuser' |
| 201 | 201 | http_pass = 'somepass' |
| 202 | | domain_name = 'intranet.example.com' |
| | 202 | name = 'intranet.example.com' |
| 203 | 203 | |
| 204 | 204 | # .. rest of the spider code omitted ... |
-
|
r1951
|
r1975
|
|
| 53 | 53 | |
| 54 | 54 | def spider_opened(self, spider): |
| 55 | | file = open('%s_products.xml' % spider.domain_name, 'w+b') |
| | 55 | file = open('%s_products.xml' % spider.name, 'w+b') |
| 56 | 56 | self.files[spider] = file |
| 57 | 57 | self.exporter = XmlItemExporter(file) |
-
|
r1963
|
r1975
|
|
| 106 | 106 | |
| 107 | 107 | def spider_opened(self, spider): |
| 108 | | log.msg("opened spider %s" % spider.domain_name) |
| | 108 | log.msg("opened spider %s" % spider.name) |
| 109 | 109 | |
| 110 | 110 | def spider_closed(self, spider): |
| 111 | | log.msg("closed spider %s" % spider.domain_name) |
| | 111 | log.msg("closed spider %s" % spider.name) |
| 112 | 112 | |
| 113 | 113 | |
-
|
r1513
|
r1975
|
|
| 80 | 80 | |
| 81 | 81 | class GoogleDirectorySpider(CrawlSpider): |
| 82 | | domain_name = 'directory.google.com' |
| | 82 | name = 'directory.google.com' |
| | 83 | allowed_domains = ['directory.google.com'] |
| 83 | 84 | start_urls = ['http://directory.google.com/'] |
| 84 | 85 | |
-
|
r1961
|
r1975
|
|
| 322 | 322 | |
| 323 | 323 | class LoginSpider(BaseSpider): |
| 324 | | domain_name = 'example.com' |
| | 324 | name = 'example.com' |
| 325 | 325 | start_urls = ['http://www.example.com/users/login.php'] |
| 326 | 326 | |
-
|
r1591
|
r1975
|
|
| 164 | 164 | |
| 165 | 165 | class MySpider(BaseSpider): |
| 166 | | domain_name = 'example.com' |
| | 166 | ... |
| 167 | 167 | |
| 168 | 168 | def parse(self, response): |
-
|
r1841
|
r1975
|
|
| 211 | 211 | Filters out Requests for URLs outside the domains covered by the spider. |
| 212 | 212 | |
| 213 | | This middleware filters out every request whose host names don't match |
| 214 | | :attr:`~scrapy.spider.BaseSpider.domain_name`, or the spider |
| 215 | | :attr:`~scrapy.spider.BaseSpider.domain_name` prefixed by "www.". |
| 216 | | Spider can add more domains to exclude using |
| 217 | | :attr:`~scrapy.spider.BaseSpider.extra_domain_names` attribute. |
| | 213 | This middleware filters out every request whose host names aren't in the |
| | 214 | spider's :attr:`~scrapy.spider.BaseSpider.allowed_domains` attribute. |
| 218 | 215 | |
| 219 | 216 | When your spider returns a request for a domain not belonging to those |
-
|
r1917
|
r1975
|
|
| 71 | 71 | method ``parse`` for each of the resulting responses. |
| 72 | 72 | |
| 73 | | .. attribute:: domain_name |
| | 73 | .. attribute:: name |
| 74 | 74 | |
| 75 | | A string which defines the domain name for this spider, which will also be |
| 76 | | the unique identifier for this spider (which means you can't have two |
| 77 | | spider with the same ``domain_name``). This is the most important spider |
| 78 | | attribute and it's required, and it's the name by which Scrapy will known |
| 79 | | the spider. |
| 80 | | |
| 81 | | .. attribute:: extra_domain_names |
| 82 | | |
| 83 | | An optional list of strings containing additional domains that this |
| 84 | | spider is allowed to crawl. Requests for URLs not belonging to the |
| 85 | | domain name specified in :attr:`domain_name` or this list won't be |
| 86 | | followed. |
| | 75 | A string which defines the name for this spider. The spider name is how |
| | 76 | the spider is located (and instantiated) by Scrapy, so it must be |
| | 77 | unique. However, nothing prevents you from instantiating more than one |
| | 78 | instance of the same spider. This is the most important spider attribute |
| | 79 | and it's required. |
| | 80 | |
| | 81 | It is recommended to name your spiders after the domain that they crawl. |
| | 82 | |
| | 83 | .. attribute:: allowed_domains |
| | 84 | |
| | 85 | An optional list of strings containing domains that this spider is |
| | 86 | allowed to crawl. Requests for URLs not belonging to the domain names |
| | 87 | specified in this list won't be followed if |
| | 88 | :class:`~scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware` is enabled. |
| 87 | 89 | |
| 88 | 90 | .. attribute:: start_urls |
| … |
… |
|
| 145 | 147 | |
| 146 | 148 | Log a message using the :func:`scrapy.log.msg` function, automatically |
| 147 | | populating the domain argument with the :attr:`domain_name` of this |
| | 149 | populating the spider argument with the :attr:`name` of this |
| 148 | 150 | spider. For more information see :ref:`topics-logging`. |
| 149 | 151 | |
| … |
… |
|
| 158 | 160 | |
| 159 | 161 | class MySpider(BaseSpider): |
| 160 | | domain_name = 'http://www.example.com' |
| | 162 | name = 'example.com' |
| | 163 | allowed_domains = ['example.com'] |
| 161 | 164 | start_urls = [ |
| 162 | 165 | 'http://www.example.com/1.html', |
| … |
… |
|
| 178 | 181 | |
| 179 | 182 | class MySpider(BaseSpider): |
| 180 | | domain_name = 'http://www.example.com' |
| | 183 | name = 'example.com' |
| | 184 | allowed_domains = ['example.com'] |
| 181 | 185 | start_urls = [ |
| 182 | 186 | 'http://www.example.com/1.html', |
| … |
… |
|
| 255 | 259 | |
| 256 | 260 | class MySpider(CrawlSpider): |
| 257 | | domain_name = 'example.com' |
| | 261 | name = 'example.com' |
| | 262 | allowed_domains = ['example.com'] |
| 258 | 263 | start_urls = ['http://www.example.com'] |
| 259 | 264 | |
| … |
… |
|
| 379 | 384 | |
| 380 | 385 | class MySpider(XMLFeedSpider): |
| 381 | | domain_name = 'example.com' |
| | 386 | name = 'example.com' |
| | 387 | allowed_domains = ['example.com'] |
| 382 | 388 | start_urls = ['http://www.example.com/feed.xml'] |
| 383 | 389 | iterator = 'iternodes' # This is actually unnecesary, since it's the default value |
| … |
… |
|
| 436 | 442 | |
| 437 | 443 | class MySpider(CSVFeedSpider): |
| 438 | | domain_name = 'example.com' |
| | 444 | name = 'example.com' |
| | 445 | allowed_domains = ['example.com'] |
| 439 | 446 | start_urls = ['http://www.example.com/feed.csv'] |
| 440 | 447 | delimiter = ';' |
-
|
r1849
|
r1975
|
|
| 205 | 205 | A simple stats collector that keeps the stats of the last scraping run (for |
| 206 | 206 | each spider) in memory, after they're closed. The stats can be accessed |
| 207 | | through the :attr:`domain_stats` attribute, which is a dict keyed by spider |
| | 207 | through the :attr:`spider_stats` attribute, which is a dict keyed by spider |
| 208 | 208 | domain name. |
| 209 | 209 | |
| 210 | 210 | This is the default Stats Collector used in Scrapy. |
| 211 | 211 | |
| 212 | | .. attribute:: domain_stats |
| 213 | | |
| 214 | | A dict of dicts (keyed by spider domain name) containing the stats of |
| 215 | | the last scraping run for each domain. |
| | 212 | .. attribute:: spider_stats |
| | 213 | |
| | 214 | A dict of dicts (keyed by spider name) containing the stats of the last |
| | 215 | scraping run for each spider. |
| 216 | 216 | |
| 217 | 217 | DummyStatsCollector |
| … |
… |
|
| 241 | 241 | persistence time: |
| 242 | 242 | |
| 243 | | * ``domain``: the spider domain (so you can use it later for querying stats |
| 244 | | for that domain) |
| | 243 | * ``spider``: the spider name (so you can use it later for querying stats |
| | 244 | for that spider) |
| 245 | 245 | * ``timestamp``: the timestamp when the stats were persisted |
| 246 | 246 | |
| 247 | | Both the ``domain`` and ``timestamp`` are used for generating the SimpleDB |
| | 247 | Both the ``spider`` and ``timestamp`` are used for generating the SimpleDB |
| 248 | 248 | item name in order to avoid overwriting stats of previous scraping runs. |
| 249 | 249 | |
-
|
r1933
|
r1975
|
|
| 7 | 7 | class GoogleDirectorySpider(CrawlSpider): |
| 8 | 8 | |
| 9 | | domain_name = 'directory.google.com' |
| | 9 | name = 'google_directory' |
| | 10 | allowed_domains = ['directory.google.com'] |
| 10 | 11 | start_urls = ['http://directory.google.com/'] |
| 11 | 12 | |
-
|
r1934
|
r1975
|
|
| 30 | 30 | |
| 31 | 31 | class ImdbSiteSpider(CrawlSpider): |
| 32 | | domain_name = 'imdb.com' |
| | 32 | name = 'imdb.com' |
| | 33 | allowed_domains = ['imdb.com'] |
| 33 | 34 | start_urls = ['http://www.imdb.com/'] |
| 34 | 35 | |
-
|
r1517
|
r1975
|
|
| 7 | 7 | class GoogleDirectorySpider(CrawlSpider): |
| 8 | 8 | |
| 9 | | domain_name = 'directory.google.com' |
| | 9 | name = 'directory.google.com' |
| | 10 | allow_domains = ['directory.google.com'] |
| 10 | 11 | start_urls = ['http://directory.google.com/'] |
| 11 | 12 | |
-
|
r1974
|
r1975
|
|
| 14 | 14 | |
| 15 | 15 | def syntax(self): |
| 16 | | return "[options] <domain|url> ..." |
| | 16 | return "[options] <spider|url> ..." |
| 17 | 17 | |
| 18 | 18 | def short_desc(self): |
| 19 | | return "Start crawling a domain or URL" |
| | 19 | return "Start crawling from a spider or URL" |
| 20 | 20 | |
| 21 | 21 | def add_options(self, parser): |
| … |
… |
|
| 32 | 32 | |
| 33 | 33 | def run(self, args, opts): |
| 34 | | |
| 35 | | urls, domains = self._split_urls_and_domains(args) |
| 36 | | for dom in domains: |
| 37 | | scrapymanager.crawl_domain(dom) |
| | 34 | urls, names = self._split_urls_and_names(args) |
| | 35 | for name in names: |
| | 36 | scrapymanager.crawl_spider_name(name) |
| 38 | 37 | |
| 39 | 38 | if opts.spider: |
| … |
… |
|
| 66 | 65 | return spider_urls.items() |
| 67 | 66 | |
| 68 | | def _split_urls_and_domains(self, args): |
| | 67 | def _split_urls_and_names(self, args): |
| 69 | 68 | urls = [] |
| 70 | | domains = [] |
| | 69 | names = [] |
| 71 | 70 | for arg in args: |
| 72 | 71 | if is_url(arg): |
| 73 | 72 | urls.append(arg) |
| 74 | 73 | else: |
| 75 | | domains.append(arg) |
| 76 | | return urls, domains |
| | 74 | names.append(arg) |
| | 75 | return urls, names |
-
|
r1973
|
r1975
|
|
| 16 | 16 | |
| 17 | 17 | def sanitize_module_name(module_name): |
| 18 | | """Sanitize the given module name, by replacing dashes with underscores and |
| 19 | | prefixing it with a letter if it doesn't start with one |
| | 18 | """Sanitize the given module name, by replacing dashes and points |
| | 19 | with underscores and prefixing it with a letter if it doesn't start |
| | 20 | with one |
| 20 | 21 | """ |
| 21 | | module_name = module_name.replace('-', '_') |
| | 22 | module_name = module_name.replace('-', '_').replace('.', '_') |
| 22 | 23 | if module_name[0] not in string.ascii_letters: |
| 23 | 24 | module_name = "a" + module_name |
| … |
… |
|
| 29 | 30 | |
| 30 | 31 | def syntax(self): |
| 31 | | return "[options] <spider_module_name> <spider_domain_name>" |
| | 32 | return "[options] <name> <domain>" |
| 32 | 33 | |
| 33 | 34 | def short_desc(self): |
| … |
… |
|
| 55 | 56 | return |
| 56 | 57 | |
| 57 | | if len(args) < 2: |
| | 58 | if len(args) != 2: |
| 58 | 59 | return False |
| 59 | 60 | |
| 60 | | module = sanitize_module_name(args[0]) |
| | 61 | name = args[0] |
| 61 | 62 | domain = args[1] |
| | 63 | |
| | 64 | module = sanitize_module_name(name) |
| 62 | 65 | |
| 63 | 66 | # if spider already exists and not force option then halt |
| 64 | 67 | try: |
| 65 | | spider = spiders.create(domain) |
| | 68 | spider = spiders.create(name) |
| 66 | 69 | except KeyError: |
| 67 | 70 | pass |
| 68 | 71 | else: |
| 69 | 72 | if not opts.force: |
| 70 | | print "Spider '%s' already exists in module:" % domain |
| | 73 | print "Spider '%s' already exists in module:" % name |
| 71 | 74 | print " %s" % spider.__module__ |
| 72 | 75 | sys.exit(1) |
| … |
… |
|
| 74 | 77 | template_file = self._find_template(opts.template) |
| 75 | 78 | if template_file: |
| 76 | | self._genspider(module, domain, opts.template, template_file) |
| | 79 | self._genspider(module, name, domain, opts.template, template_file) |
| 77 | 80 | |
| 78 | | def _genspider(self, module, domain, template_name, template_file): |
| | 81 | def _genspider(self, module, name, domain, template_name, template_file): |
| 79 | 82 | """Generate the spider module, based on the given template""" |
| 80 | 83 | tvars = { |
| … |
… |
|
| 82 | 85 | 'ProjectName': string_camelcase(settings.get('BOT_NAME')), |
| 83 | 86 | 'module': module, |
| 84 | | 'site': domain, |
| | 87 | 'name': name, |
| | 88 | 'domain': domain, |
| 85 | 89 | 'classname': '%sSpider' % ''.join([s.capitalize() \ |
| 86 | 90 | for s in module.split('_')]) |
| … |
… |
|
| 93 | 97 | shutil.copyfile(template_file, spider_file) |
| 94 | 98 | render_templatefile(spider_file, **tvars) |
| 95 | | print "Created spider %r using template %r in module:" % (domain, \ |
| | 99 | print "Created spider %r using template %r in module:" % (name, \ |
| 96 | 100 | template_name) |
| 97 | 101 | print " %s.%s" % (spiders_module.__name__, module) |
-
|
r1974
|
r1975
|
|
| 47 | 47 | callback_fcn = callback if callable(callback) else getattr(spider, callback, None) |
| 48 | 48 | if not callback_fcn: |
| 49 | | log.msg('Cannot find callback %s in %s spider' % (callback, spider.domain_name)) |
| | 49 | log.msg('Cannot find callback %s in %s spider' % (callback, spider.name)) |
| 50 | 50 | return (), () |
| 51 | 51 | |
| … |
… |
|
| 131 | 131 | log.msg('No rules found for spider "%s", ' \ |
| 132 | 132 | 'please specify a callback for parsing' \ |
| 133 | | % spider.domain_name, log.ERROR) |
| | 133 | % spider.name, log.ERROR) |
| 134 | 134 | else: |
| 135 | 135 | # default callback 'parse' |
-
|
r1681
|
r1975
|
|
| 21 | 21 | def process_request(self, request, spider): |
| 22 | 22 | hostname = urlparse_cached(request).hostname |
| 23 | | if spider.domain_name == 's3.amazonaws.com' \ |
| | 23 | if spider.name == 's3.amazonaws.com' \ |
| 24 | 24 | or (hostname and hostname.endswith('s3.amazonaws.com')): |
| 25 | 25 | request.headers['Date'] = time.strftime("%a, %d %b %Y %H:%M:%S GMT", \ |
-
|
r1843
|
r1975
|
|
| 109 | 109 | def _get_request_path(self, spider, request): |
| 110 | 110 | key = request_fingerprint(request) |
| 111 | | return join(self.cachedir, spider.domain_name, key[0:2], key) |
| | 111 | return join(self.cachedir, spider.name, key[0:2], key) |
| 112 | 112 | |
| 113 | 113 | def _read_meta(self, spider, request): |
-
|
r1849
|
r1975
|
|
| 1 | 1 | """ |
| 2 | 2 | This module provides a mechanism for collecting one (or more) sample items per |
| 3 | | domain. |
| | 3 | spider. |
| 4 | 4 | |
| 5 | 5 | The items are collected in a dict of guid->item and persisted by pickling that |
| … |
… |
|
| 9 | 9 | code that affects several spiders. |
| 10 | 10 | |
| 11 | | It uses the scrapy stats service to keep track of which domains are already |
| | 11 | It uses the scrapy stats service to keep track of which spiders are already |
| 12 | 12 | sampled. |
| 13 | 13 | |
| … |
… |
|
| 49 | 49 | self.items = {} |
| 50 | 50 | self.spiders_count = 0 |
| 51 | | self.empty_domains = set() |
| | 51 | self.empty_spiders = set() |
| 52 | 52 | dispatcher.connect(self.spider_closed, signal=signals.spider_closed) |
| 53 | 53 | dispatcher.connect(self.engine_stopped, signal=signals.engine_stopped) |
| … |
… |
|
| 67 | 67 | with open(self.filename, 'w') as f: |
| 68 | 68 | pickle.dump(self.items, f) |
| 69 | | if self.empty_domains: |
| 70 | | log.msg("No products sampled for: %s" % " ".join(self.empty_domains), \ |
| | 69 | if self.empty_spiders: |
| | 70 | log.msg("No products sampled for: %s" % " ".join(self.empty_spiders), \ |
| 71 | 71 | level=log.WARNING) |
| 72 | 72 | |
| 73 | 73 | def spider_closed(self, spider, reason): |
| 74 | 74 | if reason == 'finished' and not stats.get_value("items_sampled", spider=spider): |
| 75 | | self.empty_domains.add(spider.domain_name) |
| | 75 | self.empty_spiders.add(spider.name) |
| 76 | 76 | self.spiders_count += 1 |
| 77 | | log.msg("Sampled %d domains so far (%d empty)" % (self.spiders_count, \ |
| 78 | | len(self.empty_domains)), level=log.INFO) |
| | 77 | log.msg("Sampled %d spiders so far (%d empty)" % (self.spiders_count, \ |
| | 78 | len(self.empty_spiders)), level=log.INFO) |
| 79 | 79 | |
| 80 | 80 | |
| 81 | 81 | class ItemSamplerMiddleware(object): |
| 82 | | """This middleware drops items and requests (when domain sampling has been |
| 83 | | completed) to accelerate the processing of remaining domains""" |
| | 82 | """This middleware drops items and requests (when spider sampling has been |
| | 83 | completed) to accelerate the processing of remaining spiders""" |
| 84 | 84 | |
| 85 | 85 | def __init__(self): |
-
|
r1849
|
r1975
|
|
| 48 | 48 | |
| 49 | 49 | def spider_closed(self, spider): |
| 50 | | self.created_directories.pop(spider.domain_name, None) |
| | 50 | self.created_directories.pop(spider.name, None) |
| 51 | 51 | |
| 52 | 52 | def persist_image(self, key, image, buf, info): |
| … |
… |
|
| 93 | 93 | delayed even more because it is uploading images to s3. |
| 94 | 94 | """ |
| 95 | | domain_name = "s3.amazonaws.com" |
| | 95 | name = "s3.amazonaws.com" |
| 96 | 96 | start_urls = ['http://s3.amazonaws.com/'] |
| 97 | 97 | max_concurrent_requests = 100 |
-
|
r1973
|
r1975
|
|
| 54 | 54 | for spider in self._getspiders(ISpider, module): |
| 55 | 55 | ISpider.validateInvariants(spider) |
| 56 | | self._spiders[spider.domain_name] = spider |
| | 56 | self._spiders[spider.name] = spider |
| 57 | 57 | self.loaded = True |
| 58 | 58 | |
| … |
… |
|
| 77 | 77 | spider |
| 78 | 78 | """ |
| 79 | | domain = spider.domain_name |
| 80 | | if domain not in self._spiders: |
| | 79 | name = spider.name |
| | 80 | if name not in self._spiders: |
| 81 | 81 | return |
| 82 | | spider = self._spiders[domain] |
| | 82 | spider = self._spiders[name] |
| 83 | 83 | module_name = spider.__module__ |
| 84 | 84 | module = sys.modules[module_name] |
| … |
… |
|
| 87 | 87 | level=log.DEBUG) |
| 88 | 88 | new_module = rebuild(module, doLog=0) |
| 89 | | self._spiders[domain] = new_module.SPIDER |
| | 89 | self._spiders[name] = new_module.SPIDER |
-
|
r1841
|
r1975
|
|
| 48 | 48 | |
| 49 | 49 | def spider_opened(self, spider): |
| 50 | | domains = [spider.domain_name] + spider.extra_domain_names |
| 51 | | self.host_regexes[spider] = self.get_host_regex(domains) |
| | 50 | self.host_regexes[spider] = self.get_host_regex(spider.allowed_domains) |
| 52 | 51 | self.domains_seen[spider] = set() |
| 53 | 52 | |
-
|
r1849
|
r1975
|
|
| 24 | 24 | body = "Global stats\n\n" |
| 25 | 25 | body += "\n".join("%-50s : %s" % i for i in stats.get_stats().items()) |
| 26 | | body += "\n\n%s stats\n\n" % spider.domain_name |
| | 26 | body += "\n\n%s stats\n\n" % spider.name |
| 27 | 27 | body += "\n".join("%-50s : %s" % i for i in spider_stats.items()) |
| 28 | | mail.send(self.recipients, "Scrapy stats for: %s" % spider.domain_name, body) |
| | 28 | mail.send(self.recipients, "Scrapy stats for: %s" % spider.name, body) |
-
|
r1867
|
r1975
|
|
| 61 | 61 | |
| 62 | 62 | s += '<tr><td>%s</td><td align="right">%d</td><td align="right">%d</td><td align="right">%d</td><td align="right">%d</td><td align="right">%d</td><td align="right">%d</td><td>%s</td><td>%s</td></tr>\n' % \ |
| 63 | | (spider.domain_name, stats.scraped, stats.crawled, scheduled, dqueued, active, transf, str(stats.started), str(runtime)) |
| | 63 | (spider.name, stats.scraped, stats.crawled, scheduled, dqueued, active, transf, str(stats.started), str(runtime)) |
| 64 | 64 | |
| 65 | 65 | totdomains += 1 |
-
|
r1972
|
r1975
|
|
| 26 | 26 | |
| 27 | 27 | def spider_opened(self, spider): |
| 28 | | self.running[spider.domain_name] = spider |
| | 28 | self.running[spider.name] = spider |
| 29 | 29 | |
| 30 | 30 | def spider_closed(self, spider): |
| 31 | | del self.running[spider.domain_name] |
| 32 | | self.finished.add(spider.domain_name) |
| | 31 | del self.running[spider.name] |
| | 32 | self.finished.add(spider.name) |
| 33 | 33 | |
| 34 | 34 | def webconsole_render(self, wc_request): |
| … |
… |
|
| 36 | 36 | changes = self.webconsole_control(wc_request) |
| 37 | 37 | |
| 38 | | self.scheduled = [s.domain_name for s in scrapyengine.spider_scheduler._pending_spiders] |
| 39 | | self.idle = [d for d in self.enabled_domains if d not in self.scheduled |
| | 38 | self.scheduled = [s.name for s in scrapyengine.spider_scheduler._pending_spiders] |
| | 39 | self.idle = [d for d in self.enabled_spiders if d not in self.scheduled |
| 40 | 40 | and d not in self.running |
| 41 | 41 | and d not in self.finished] |
| … |
… |
|
| 54 | 54 | s += "<td valign='top'>\n" |
| 55 | 55 | s += '<form method="post" action=".">\n' |
| 56 | | s += '<select name="add_pending_domains" multiple="multiple">\n' |
| 57 | | for domain in sorted(self.idle): |
| 58 | | s += "<option>%s</option>\n" % domain |
| | 56 | s += '<select name="add_pending_spiders" multiple="multiple">\n' |
| | 57 | for name in sorted(self.idle): |
| | 58 | s += "<option>%s</option>\n" % name |
| 59 | 59 | s += '</select><br>\n' |
| 60 | 60 | s += '<br />' |
| … |
… |
|
| 66 | 66 | s += "<td valign='top'>\n" |
| 67 | 67 | s += '<form method="post" action=".">\n' |
| 68 | | s += '<select name="remove_pending_domains" multiple="multiple">\n' |
| 69 | | for domain in self.scheduled: |
| 70 | | s += "<option>%s</option>\n" % domain |
| | 68 | s += '<select name="remove_pending_spiders" multiple="multiple">\n' |
| | 69 | for name in self.scheduled: |
| | 70 | s += "<option>%s</option>\n" % name |
| 71 | 71 | s += '</select><br>\n' |
| 72 | 72 | s += '<br />' |
| … |
… |
|
| 79 | 79 | s += "<td valign='top'>\n" |
| 80 | 80 | s += '<form method="post" action=".">\n' |
| 81 | | s += '<select name="stop_running_domains" multiple="multiple">\n' |
| 82 | | for domain in sorted(self.running): |
| 83 | | s += "<option>%s</option>\n" % domain |
| | 81 | s += '<select name="stop_running_spiders" multiple="multiple">\n' |
| | 82 | for name in sorted(self.running): |
| | 83 | s += "<option>%s</option>\n" % name |
| 84 | 84 | s += '</select><br>\n' |
| 85 | 85 | s += '<br />' |
| … |
… |
|
| 91 | 91 | s += "<td valign='top'>\n" |
| 92 | 92 | s += '<form method="post" action=".">\n' |
| 93 | | s += '<select name="rerun_finished_domains" multiple="multiple">\n' |
| 94 | | for domain in sorted(self.finished): |
| 95 | | s += "<option>%s</option>\n" % domain |
| | 93 | s += '<select name="rerun_finished_spiders" multiple="multiple">\n' |
| | 94 | for name in sorted(self.finished): |
| | 95 | s += "<option>%s</option>\n" % name |
| 96 | 96 | s += '</select><br>\n' |
| 97 | 97 | s += '<br />' |
| … |
… |
|
| 115 | 115 | s = "<hr />\n" |
| 116 | 116 | |
| 117 | | if "stop_running_domains" in args: |
| | 117 | if "stop_running_spiders" in args: |
| 118 | 118 | s += "<p>" |
| 119 | | stopped_domains = [] |
| 120 | | for domain in args["stop_running_domains"]: |
| 121 | | if domain in self.running: |
| 122 | | scrapyengine.close_spider(self.running[domain]) |
| 123 | | stopped_domains.append(domain) |
| 124 | | s += "Stopped spiders: <ul><li>%s</li></ul>" % "</li><li>".join(stopped_domains) |
| | 119 | stopped_spiders = [] |
| | 120 | for name in args["stop_running_spiders"]: |
| | 121 | if name in self.running: |
| | 122 | scrapyengine.close_spider(self.running[name]) |
| | 123 | stopped_spiders.append(name) |
| | 124 | s += "Stopped spiders: <ul><li>%s</li></ul>" % "</li><li>".join(stopped_spiders) |
| 125 | 125 | s += "</p>" |
| 126 | | if "remove_pending_domains" in args: |
| | 126 | if "remove_pending_spiders" in args: |
| 127 | 127 | removed = [] |
| 128 | | for domain in args["remove_pending_domains"]: |
| 129 | | if scrapyengine.spider_scheduler.remove_pending_domain(domain): |
| 130 | | removed.append(domain) |
| | 128 | for name in args["remove_pending_spiders"]: |
| | 129 | if scrapyengine.spider_scheduler.remove_pending_spider(name): |
| | 130 | removed.append(name) |
| 131 | 131 | if removed: |
| 132 | 132 | s += "<p>" |
| 133 | | s += "Removed scheduled spiders: <ul><li>%s</li></ul>" % "</li><li>".join(args["remove_pending_domains"]) |
| | 133 | s += "Removed scheduled spiders: <ul><li>%s</li></ul>" % "</li><li>".join(args["remove_pending_spiders"]) |
| 134 | 134 | s += "</p>" |
| 135 | | if "add_pending_domains" in args: |
| 136 | | for domain in args["add_pending_domains"]: |
| 137 | | if domain not in scrapyengine.scheduler.pending_requests: |
| 138 | | scrapymanager.crawl_domain(domain) |
| | 135 | if "add_pending_spiders" in args: |
| | 136 | for name in args["add_pending_spiders"]: |
| | 137 | if name not in scrapyengine.scheduler.pending_requests: |
| | 138 | scrapymanager.crawl_spider_name(name) |
| 139 | 139 | s += "<p>" |
| 140 | | s += "Scheduled spiders: <ul><li>%s</li></ul>" % "</li><li>".join(args["add_pending_domains"]) |
| | 140 | s += "Scheduled spiders: <ul><li>%s</li></ul>" % "</li><li>".join(args["add_pending_spiders"]) |
| 141 | 141 | s += "</p>" |
| 142 | | if "rerun_finished_domains" in args: |
| 143 | | for domain in args["rerun_finished_domains"]: |
| 144 | | if domain not in scrapyengine.scheduler.pending_requests: |
| 145 | | scrapymanager.crawl_domain(domain) |
| 146 | | self.finished.remove(domain) |
| | 142 | if "rerun_finished_spiders" in args: |
| | 143 | for name in args["rerun_finished_spiders"]: |
| | 144 | if name not in scrapyengine.scheduler.pending_requests: |
| | 145 | scrapymanager.crawl_spider_name(name) |
| | 146 | self.finished.remove(name) |
| 147 | 147 | s += "<p>" |
| 148 | | s += "Re-scheduled finished spiders: <ul><li>%s</li></ul>" % "</li><li>".join(args["rerun_finished_domains"]) |
| | 148 | s += "Re-scheduled finished spiders: <ul><li>%s</li></ul>" % "</li><li>".join(args["rerun_finished_spiders"]) |
| 149 | 149 | s += "</p>" |
| 150 | 150 | |
| … |
… |
|
| 152 | 152 | |
| 153 | 153 | def webconsole_discover_module(self): |
| 154 | | self.enabled_domains = spiders.list() |
| | 154 | self.enabled_spiders = spiders.list() |
| 155 | 155 | return self |
-
|
r1849
|
r1975
|
|
| 24 | 24 | s += stats_html_table(stats.get_stats()) |
| 25 | 25 | for spider, spider_stats in stats.iter_spider_stats(): |
| 26 | | s += "<h3>%s</h3>\n" % spider.domain_name |
| | 26 | s += "<h3>%s</h3>\n" % spider.name |
| 27 | 27 | s += stats_html_table(spider_stats) |
| 28 | 28 | s += "</body>\n" |
-
|
r1973
|
r1975
|
|
| 55 | 55 | scrapyengine.crawl(request, spider) |
| 56 | 56 | |
| 57 | | def crawl_domain(self, domain): |
| 58 | | """Schedule given domain for crawling.""" |
| | 57 | def crawl_spider_name(self, name): |
| | 58 | """Schedule given spider by name for crawling.""" |
| 59 | 59 | try: |
| 60 | | spider = spiders.create(domain) |
| | 60 | spider = spiders.create(name) |
| 61 | 61 | except KeyError: |
| 62 | | log.msg('Could not find spider for domain: %s' % domain, log.ERROR) |
| | 62 | log.msg('Could not find spider: %s' % name, log.ERROR) |
| 63 | 63 | else: |
| 64 | 64 | self.crawl_spider(spider) |
-
|
r1956
|
r1975
|
|
| 76 | 76 | dispatcher.send(signal=logmessage_received, message=message, level=level, \ |
| 77 | 77 | spider=spider) |
| 78 | | system = domain or (spider.domain_name if spider else component) |
| | 78 | system = domain or (spider.name if spider else component) |
| 79 | 79 | msg_txt = unicode_to_str("%s: %s" % (level_names[level], message), log_encoding) |
| 80 | 80 | log.msg(msg_txt, system=system) |
| … |
… |
|
| 94 | 94 | warnings.warn("'domain' argument of scrapy.log.err() is deprecated, " \ |
| 95 | 95 | "use 'spider' argument instead", DeprecationWarning, stacklevel=2) |
| 96 | | kwargs['system'] = domain or (spider.domain_name if spider else component) |
| | 96 | kwargs['system'] = domain or (spider.name if spider else component) |
| 97 | 97 | if _why: |
| 98 | 98 | _why = unicode_to_str("ERROR: %s" % _why, log_encoding) |
-
|
r1886
|
r1975
|
|
| 4 | 4 | See documentation in docs/topics/spiders.rst |
| 5 | 5 | """ |
| | 6 | |
| | 7 | import warnings |
| | 8 | |
| 6 | 9 | from zope.interface import Interface, Attribute, invariant, implements |
| 7 | 10 | from twisted.plugin import IPlugin |
| … |
… |
|
| 12 | 15 | from scrapy.utils.trackref import object_ref |
| 13 | 16 | |
| 14 | | def _valid_domain_name(obj): |
| 15 | | """Check the domain name specified is valid""" |
| 16 | | if not obj.domain_name: |
| 17 | | raise ValueError("Spider 'domain_name' attribute is required") |
| 18 | | |
| 19 | 17 | class ISpider(Interface, IPlugin) : |
| 20 | | """Interface to be implemented by site-specific web spiders""" |
| 21 | | |
| 22 | | domain_name = Attribute("The domain name of the site to be scraped.") |
| 23 | | |
| 24 | | invariant(_valid_domain_name) |
| | 18 | """Interface used by TwistedPluginSpiderManager to discover spiders""" |
| | 19 | pass |
| 25 | 20 | |
| 26 | 21 | class BaseSpider(object_ref): |
| … |
… |
|
| 32 | 27 | |
| 33 | 28 | # XXX: class attributes kept for backwards compatibility |
| 34 | | domain_name = None |
| | 29 | name = None |
| 35 | 30 | start_urls = [] |
| 36 | | extra_domain_names = [] |
| | 31 | allowed_domains = [] |
| 37 | 32 | |
| 38 | | def __init__(self, domain_name=None): |
| 39 | | if domain_name is not None: |
| 40 | | self.domain_name = domain_name |
| | 33 | def __init__(self, name=None): |
| | 34 | # XXX: SEP-12 backward compatibility (remove for 0.10) |
| | 35 | if hasattr(self, 'domain_name'): |
| | 36 | warnings.warn("Spider.domain_name attribute is deprecated, use Spider.name instead", \ |
| | 37 | DeprecationWarning, stacklevel=4) |
| | 38 | self.name = self.domain_name |
| | 39 | if hasattr(self, 'extra_domain_names'): |
| | 40 | warnings.warn("Spider.extra_domain_names attribute is deprecated - use Spider.allowed_domains instead", \ |
| | 41 | DeprecationWarning, stacklevel=4) |
| | 42 | self.allowed_domains = [self.name] + list(self.extra_domain_names) |
| | 43 | |
| | 44 | if name is not None: |
| | 45 | self.name = name |
| 41 | 46 | # XXX: create instance attributes (class attributes were kept for |
| 42 | 47 | # backwards compatibility) |
| 43 | 48 | if not self.start_urls: |
| 44 | 49 | self.start_urls = [] |
| 45 | | if not self.extra_domain_names: |
| 46 | | self.extra_domain_names = [] |
| | 50 | if not self.allowed_domains: |
| | 51 | self.allowed_domains = [] |
| | 52 | if not getattr(self, 'domain_name', None): |
| | 53 | self.domain_name = self.name |
| | 54 | if not getattr(self, 'extra_domain_names', None): |
| | 55 | self.extra_domain_names = self.allowed_domains |
| 47 | 56 | |
| 48 | 57 | def log(self, message, level=log.DEBUG): |
| … |
… |
|
| 68 | 77 | |
| 69 | 78 | def __str__(self): |
| 70 | | return "<%s %r>" % (type(self).__name__, self.domain_name) |
| | 79 | return "<%s %r>" % (type(self).__name__, self.name) |
| 71 | 80 | |
| 72 | 81 | __repr__ = __str__ |
-
|
r1849
|
r1975
|
|
| 77 | 77 | def __init__(self): |
| 78 | 78 | super(MemoryStatsCollector, self).__init__() |
| 79 | | self.domain_stats = {} |
| | 79 | self.spider_stats = {} |
| 80 | 80 | |
| 81 | 81 | def _persist_stats(self, stats, spider=None): |
| 82 | 82 | if spider is not None: |
| 83 | | self.domain_stats[spider.domain_name] = stats |
| | 83 | self.spider_stats[spider.name] = stats |
| 84 | 84 | |
| 85 | 85 | |
-
|
r1849
|
r1975
|
|
| 37 | 37 | def _persist_to_sdb(self, spider, stats): |
| 38 | 38 | ts = self._get_timestamp(spider).isoformat() |
| 39 | | sdb_item_id = "%s_%s" % (spider.domain_name, ts) |
| | 39 | sdb_item_id = "%s_%s" % (spider.name, ts) |
| 40 | 40 | sdb_item = dict((k, self._to_sdb_value(v, k)) for k, v in stats.iteritems()) |
| 41 | | sdb_item['domain'] = spider.domain_name |
| | 41 | sdb_item['spider'] = spider.name |
| 42 | 42 | sdb_item['timestamp'] = self._to_sdb_value(ts) |
| 43 | 43 | connect_sdb().put_attributes(self._sdbdomain, sdb_item_id, sdb_item) |
-
|
r1752
|
r1975
|
|
| 2 | 2 | |
| 3 | 3 | class $classname(BaseSpider): |
| 4 | | domain_name = "$site" |
| | 4 | name = "$name" |
| | 5 | allowed_domains = ["$domain"] |
| 5 | 6 | start_urls = ( |
| 6 | | 'http://www.$site/', |
| | 7 | 'http://www.$domain/', |
| 7 | 8 | ) |
| 8 | 9 | |
-
|
r1927
|
r1975
|
|
| 7 | 7 | |
| 8 | 8 | class $classname(CrawlSpider): |
| 9 | | domain_name = '$site' |
| 10 | | start_urls = ['http://www.$site/'] |
| | 9 | name = '$name' |
| | 10 | allowed_domains = ['$domain'] |
| | 11 | start_urls = ['http://www.$domain/'] |
| 11 | 12 | |
| 12 | 13 | rules = ( |
| … |
… |
|
| 17 | 18 | hxs = HtmlXPathSelector(response) |
| 18 | 19 | i = ${ProjectName}Item() |
| 19 | | #i['site_id'] = hxs.select('//input[@id="sid"]/@value').extract() |
| | 20 | #i['domain_id'] = hxs.select('//input[@id="sid"]/@value').extract() |
| 20 | 21 | #i['name'] = hxs.select('//div[@id="name"]').extract() |
| 21 | 22 | #i['description'] = hxs.select('//div[@id="description"]').extract() |
-
|
r1718
|
r1975
|
|
| 3 | 3 | |
| 4 | 4 | class $classname(CSVFeedSpider): |
| 5 | | domain_name = '$site' |
| 6 | | start_urls = ['http://www.$site/feed.csv'] |
| | 5 | name = '$name' |
| | 6 | allowed_domains = ['$domain'] |
| | 7 | start_urls = ['http://www.$domain/feed.csv'] |
| 7 | 8 | # headers = ['id', 'name', 'description', 'image_link'] |
| 8 | 9 | # delimiter = '\t' |
-
|
r1718
|
r1975
|
|
| 3 | 3 | |
| 4 | 4 | class $classname(XMLFeedSpider): |
| 5 | | domain_name = '$site' |
| 6 | | start_urls = ['http://www.$site/feed.xml'] |
| | 5 | name = '$name' |
| | 6 | allowed_domains = ['$domain'] |
| | 7 | start_urls = ['http://www.$domain/feed.xml'] |
| 7 | 8 | |
| 8 | 9 | def parse_item(self, response, selector): |
-
|
r1737
|
r1975
|
|
| 60 | 60 | class GenspiderCommandTest(CommandTest): |
| 61 | 61 | |
| | 62 | def test_arguments(self): |
| | 63 | # only pass one argument. spider script shouldn't be created |
| | 64 | self.assertEqual(0, self.call('genspider', 'test_name')) |
| | 65 | assert not exists(join(self.proj_mod_path, 'spiders', 'test_name.py')) |
| | 66 | # pass two arguments <name> <domain>. spider script should be created |
| | 67 | self.assertEqual(0, self.call('genspider', 'test_name', 'test.com')) |
| | 68 | assert exists(join(self.proj_mod_path, 'spiders', 'test_name.py')) |
| | 69 | |
| 62 | 70 | def test_template_default(self, *args): |
| 63 | | self.assertEqual(0, self.call('genspider', 'testspider', 'test.com', *args)) |
| 64 | | assert exists(join(self.proj_mod_path, 'spiders', 'testspider.py')) |
| 65 | | self.assertEqual(1, self.call('genspider', 'otherspider', 'test.com')) |
| | 71 | self.assertEqual(0, self.call('genspider', 'test_spider', 'test.com', *args)) |
| | 72 | assert exists(join(self.proj_mod_path, 'spiders', 'test_spider.py')) |
| | 73 | self.assertEqual(1, self.call('genspider', 'test_spider', 'test.com')) |
| 66 | 74 | |
| 67 | 75 | def test_template_basic(self): |
-
|
r1972
|
r1975
|
|
| 23 | 23 | |
| 24 | 24 | class TestSpider(BaseSpider): |
| 25 | | domain_name = "scrapytest.org" |
| 26 | | extra_domain_names = ["localhost"] |
| | 25 | name = "scrapytest.org" |
| | 26 | allowed_domains = ["scrapytest.org", "localhost"] |
| 27 | 27 | start_urls = ['http://localhost'] |
| 28 | 28 | |
| … |
… |
|
| 69 | 69 | |
| 70 | 70 | def __init__(self): |
| 71 | | self.domain = 'scrapytest.org' |
| | 71 | self.name = 'scrapytest.org' |
| 72 | 72 | self.spider = None |
| 73 | 73 | self.respplug = [] |
| … |
… |
|
| 140 | 140 | """ |
| 141 | 141 | assert session.spider is not None |
| 142 | | self.assertEqual(session.spider.domain_name, session.domain) |
| | 142 | self.assertEqual(session.spider.name, session.name) |
| 143 | 143 | |
| 144 | 144 | def test_visited_urls(self): |
-
|
r1822
|
r1975
|
|
| 10 | 10 | def setUp(self): |
| 11 | 11 | self.spider = BaseSpider() |
| 12 | | self.spider.domain_name = 'scrapytest.org' |
| 13 | | self.spider.extra_domain_names = ['scrapy.org'] |
| | 12 | self.spider.name = 'scrapytest.org' |
| | 13 | self.spider.allowed_domains = ['scrapytest.org', 'scrapy.org'] |
| 14 | 14 | |
| 15 | 15 | self.mw = OffsiteMiddleware() |
-
|
r1957
|
r1975
|
|
| 23 | 23 | def url_is_from_spider(url, spider): |
| 24 | 24 | """Return True if the url belongs to the given spider""" |
| 25 | | domains = [spider.domain_name] |
| 26 | | domains.extend(spider.extra_domain_names) |
| 27 | | return url_is_from_any_domain(url, domains) |
| | 25 | return url_is_from_any_domain(url, spider.allowed_domains) |
| 28 | 26 | |
| 29 | 27 | def urljoin_rfc(base, ref, encoding='utf-8'): |