| 1 | from __future__ import with_statement |
|---|
| 2 | |
|---|
| 3 | import os |
|---|
| 4 | from os.path import join, exists |
|---|
| 5 | from time import time |
|---|
| 6 | import cPickle as pickle |
|---|
| 7 | |
|---|
| 8 | from scrapy.xlib.pydispatch import dispatcher |
|---|
| 9 | from scrapy.core import signals |
|---|
| 10 | from scrapy.http import Headers |
|---|
| 11 | from scrapy.core.exceptions import NotConfigured, IgnoreRequest |
|---|
| 12 | from scrapy.core.downloader.responsetypes import responsetypes |
|---|
| 13 | from scrapy.utils.request import request_fingerprint |
|---|
| 14 | from scrapy.utils.http import headers_dict_to_raw, headers_raw_to_dict |
|---|
| 15 | from scrapy.utils.httpobj import urlparse_cached |
|---|
| 16 | from scrapy.utils.misc import load_object |
|---|
| 17 | from scrapy import conf |
|---|
| 18 | |
|---|
| 19 | |
|---|
class HttpCacheMiddleware(object):
    """Downloader middleware that answers requests from, and records
    responses into, a pluggable cache storage.

    The storage class is taken from the ``HTTPCACHE_STORAGE`` setting and is
    instantiated with the settings object. This middleware calls its
    ``open_spider``, ``close_spider``, ``retrieve_response`` and
    ``store_response`` methods.
    """

    def __init__(self, settings=conf.settings):
        """Build the storage backend and hook the spider open/close signals.

        ``HTTPCACHE_IGNORE_MISSING`` controls whether requests that are not
        found in the cache are dropped (see ``process_request``).
        """
        self.storage = load_object(settings['HTTPCACHE_STORAGE'])(settings)
        self.ignore_missing = settings.getbool('HTTPCACHE_IGNORE_MISSING')
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

    def spider_opened(self, spider):
        """Signal handler: let the storage prepare for ``spider``."""
        self.storage.open_spider(spider)

    def spider_closed(self, spider):
        """Signal handler: let the storage release resources for ``spider``."""
        self.storage.close_spider(spider)

    def process_request(self, request, spider):
        """Return the cached response for ``request`` when there is one.

        Returns None when the request is not cacheable, or when nothing is
        cached and ``HTTPCACHE_IGNORE_MISSING`` is off. A response served
        from the cache gets ``'cached'`` appended to its flags.

        Raises IgnoreRequest when nothing is cached and
        ``HTTPCACHE_IGNORE_MISSING`` is on.
        """
        if not self.is_cacheable(request):
            return
        response = self.storage.retrieve_response(spider, request)
        if response:
            response.flags.append('cached')
            return response
        elif self.ignore_missing:
            raise IgnoreRequest("Ignored request not in cache: %s" % request)

    def process_response(self, request, response, spider):
        """Store ``response`` in the cache when appropriate and return it.

        The response is always returned unchanged.
        """
        # A response flagged 'cached' was read from the storage by
        # process_request; writing it straight back would only repeat the
        # I/O without adding anything to the cache.
        if self.is_cacheable(request) and 'cached' not in response.flags:
            self.storage.store_response(spider, request, response)
        return response

    def is_cacheable(self, request):
        """Return True when the request URL scheme is http or https."""
        return urlparse_cached(request).scheme in ['http', 'https']
|---|
| 51 | |
|---|
| 52 | |
|---|
class FilesystemCacheStorage(object):
    """Cache storage that keeps every response in its own directory on disk.

    An entry lives in
    ``<HTTPCACHE_DIR>/<spider.domain_name>/<fingerprint[0:2]>/<fingerprint>``
    and consists of the files ``meta``, ``pickled_meta``,
    ``response_headers``, ``response_body``, ``request_headers`` and
    ``request_body``.
    """

    def __init__(self, settings=conf.settings):
        """Read the cache directory and expiration time from ``settings``.

        Raises NotConfigured when ``HTTPCACHE_DIR`` is empty or unset.
        """
        cachedir = settings['HTTPCACHE_DIR']
        if not cachedir:
            raise NotConfigured
        self.cachedir = cachedir
        # A negative value disables expiration (see _read_meta).
        self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')

    def open_spider(self, spider):
        """Nothing to set up for this storage."""
        pass

    def close_spider(self, spider):
        """Nothing to tear down for this storage."""
        pass

    def retrieve_response(self, spider, request):
        """Return response if present in cache, or None otherwise."""
        metadata = self._read_meta(spider, request)
        if metadata is None:
            return  # not cached
        rpath = self._get_request_path(spider, request)
        with open(join(rpath, 'response_body'), 'rb') as f:
            body = f.read()
        with open(join(rpath, 'response_headers'), 'rb') as f:
            rawheaders = f.read()
        url = metadata['url']
        status = metadata['status']
        headers = Headers(headers_raw_to_dict(rawheaders))
        # Pick the response class from the stored headers and url.
        respcls = responsetypes.from_args(headers=headers, url=url)
        response = respcls(url=url, headers=headers, status=status, body=body)
        return response

    def store_response(self, spider, request, response):
        """Store the given response in the cache.

        Responses flagged ``'cached'`` are ignored: that flag marks a
        response that was served from the cache, and rewriting it would
        refresh the age of the entry without refreshing its content.
        """
        if 'cached' in response.flags:
            return
        rpath = self._get_request_path(spider, request)
        if not exists(rpath):
            os.makedirs(rpath)
        metadata = {
            'url': request.url,
            'method': request.method,
            'status': response.status,
            'timestamp': time(),
        }
        with open(join(rpath, 'response_headers'), 'wb') as f:
            f.write(headers_dict_to_raw(response.headers))
        with open(join(rpath, 'response_body'), 'wb') as f:
            f.write(response.body)
        with open(join(rpath, 'request_headers'), 'wb') as f:
            f.write(headers_dict_to_raw(request.headers))
        with open(join(rpath, 'request_body'), 'wb') as f:
            f.write(request.body)
        with open(join(rpath, 'meta'), 'wb') as f:
            f.write(repr(metadata))
        # pickled_meta is what _read_meta looks for, so it is written last:
        # an interrupted first store then leaves no entry at all instead of
        # metadata that points at missing body/header files.
        with open(join(rpath, 'pickled_meta'), 'wb') as f:
            pickle.dump(metadata, f, protocol=2)

    def _get_request_path(self, spider, request):
        """Return the directory that holds the cache entry for ``request``."""
        key = request_fingerprint(request)
        # The first two characters of the fingerprint form an extra
        # directory level between the spider directory and the entry.
        return join(self.cachedir, spider.domain_name, key[0:2], key)

    def _read_meta(self, spider, request):
        """Return the stored metadata dict, or None if missing or expired."""
        rpath = self._get_request_path(spider, request)
        metapath = join(rpath, 'pickled_meta')
        if not exists(metapath):
            return  # not found
        # Age is measured on the metadata file, which store_response rewrites
        # on every store. The mtime of the entry directory only changes when
        # files are created or removed in it, so it would stay at the time of
        # the first store and an expired entry could never become valid again.
        mtime = os.stat(metapath).st_mtime
        if 0 <= self.expiration_secs < time() - mtime:
            return  # expired
        # NOTE: unpickling trusts the contents of the cache directory; keep
        # HTTPCACHE_DIR somewhere untrusted parties cannot write to.
        with open(metapath, 'rb') as f:
            return pickle.load(f)
|---|