root/scrapy/contrib/downloadermiddleware/httpcache.py @ 1843:133d5e60dded

Revision 1843:133d5e60dded, 4.5 kB (checked in by Pablo Hoffman <pablo@…>, 10 months ago)

Refactored HttpCache middleware:

* simplified code
* performance improvements
* removed awkward/unused domain sectorization
* it can now receive Settings on constructor
* added unittests
* added documentation about filesystem storage structure

Also made scrapy.conf.Settings objects instantiable with a dict, which is used to override the default settings.

Line 
1from __future__ import with_statement
2
3import os
4from os.path import join, exists
5from time import time
6import cPickle as pickle
7
8from scrapy.xlib.pydispatch import dispatcher
9from scrapy.core import signals
10from scrapy.http import Headers
11from scrapy.core.exceptions import NotConfigured, IgnoreRequest
12from scrapy.core.downloader.responsetypes import responsetypes
13from scrapy.utils.request import request_fingerprint
14from scrapy.utils.http import headers_dict_to_raw, headers_raw_to_dict
15from scrapy.utils.httpobj import urlparse_cached
16from scrapy.utils.misc import load_object
17from scrapy import conf
18
19
class HttpCacheMiddleware(object):
    """Downloader middleware that answers requests from a cache storage and
    saves downloaded responses into it.

    The storage class is loaded from the ``HTTPCACHE_STORAGE`` setting and is
    instantiated with the settings object. Only requests whose URL scheme is
    ``http`` or ``https`` are looked up or stored.
    """

    def __init__(self, settings=conf.settings):
        """Build the storage backend and hook into the spider signals.

        ``settings`` must support item access (``HTTPCACHE_STORAGE``) and
        ``getbool`` (``HTTPCACHE_IGNORE_MISSING``).
        """
        self.storage = load_object(settings['HTTPCACHE_STORAGE'])(settings)
        self.ignore_missing = settings.getbool('HTTPCACHE_IGNORE_MISSING')
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

    def spider_opened(self, spider):
        """Forward the spider_opened signal to the storage backend."""
        self.storage.open_spider(spider)

    def spider_closed(self, spider):
        """Forward the spider_closed signal to the storage backend."""
        self.storage.close_spider(spider)

    def process_request(self, request, spider):
        """Return the cached response for ``request`` if the storage has one.

        Returns None (letting the request continue) when the request is not
        cacheable or when it is missing from the cache, unless
        ``HTTPCACHE_IGNORE_MISSING`` is enabled, in which case a cache miss
        raises IgnoreRequest.
        """
        if not self.is_cacheable(request):
            return
        response = self.storage.retrieve_response(spider, request)
        if response:
            # Mark the response so it can be told apart from a fresh download.
            response.flags.append('cached')
            return response
        elif self.ignore_missing:
            raise IgnoreRequest("Ignored request not in cache: %s" % request)

    def process_response(self, request, response, spider):
        """Store ``response`` in the cache and return it unchanged.

        Responses flagged 'cached' came out of the storage in
        process_request; writing them back would only cost disk I/O and
        rewrite an entry that is already there, so they are skipped.
        """
        if self.is_cacheable(request) and 'cached' not in response.flags:
            self.storage.store_response(spider, request, response)
        return response

    def is_cacheable(self, request):
        """Return True if the request URL scheme is http or https."""
        return urlparse_cached(request).scheme in ('http', 'https')
51
52
class FilesystemCacheStorage(object):
    """Cache storage that keeps every request/response pair in its own
    directory on the filesystem.

    An entry lives in::

        <HTTPCACHE_DIR>/<spider.domain_name>/<key[0:2]>/<key>/

    where ``key`` is the value returned by ``request_fingerprint(request)``.
    Each entry directory holds the files ``meta``, ``pickled_meta``,
    ``response_headers``, ``response_body``, ``request_headers`` and
    ``request_body``.
    """

    def __init__(self, settings=conf.settings):
        """Read the cache location and expiration time from ``settings``.

        Raises NotConfigured if ``HTTPCACHE_DIR`` is empty or unset.
        """
        cachedir = settings['HTTPCACHE_DIR']
        if not cachedir:
            raise NotConfigured
        self.cachedir = cachedir
        self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')

    def open_spider(self, spider):
        """Nothing to set up per spider for this storage."""
        pass

    def close_spider(self, spider):
        """Nothing to tear down per spider for this storage."""
        pass

    def retrieve_response(self, spider, request):
        """Return response if present in cache, or None otherwise.

        An entry whose metadata is missing or expired counts as not cached.
        """
        metadata = self._read_meta(spider, request)
        if metadata is None:
            return # not cached
        rpath = self._get_request_path(spider, request)
        with open(join(rpath, 'response_body'), 'rb') as f:
            body = f.read()
        with open(join(rpath, 'response_headers'), 'rb') as f:
            rawheaders = f.read()
        url = metadata['url']
        status = metadata['status']
        headers = Headers(headers_raw_to_dict(rawheaders))
        # Pick the response class from the stored headers and url.
        respcls = responsetypes.from_args(headers=headers, url=url)
        return respcls(url=url, headers=headers, status=status, body=body)

    def store_response(self, spider, request, response):
        """Store the given response in the cache.

        Responses flagged 'cached' are not written: they were produced by a
        cache lookup, and rewriting them would reset the age of the entry so
        that it never expires while it keeps being requested.
        """
        if 'cached' in getattr(response, 'flags', ()):
            return
        rpath = self._get_request_path(spider, request)
        if not exists(rpath):
            os.makedirs(rpath)
        metadata = {
            'url': request.url,
            'method': request.method,
            'status': response.status,
            'timestamp': time(),
        }
        files = (
            ('meta', repr(metadata)),
            ('response_headers', headers_dict_to_raw(response.headers)),
            ('response_body', response.body),
            ('request_headers', headers_dict_to_raw(request.headers)),
            ('request_body', request.body),
        )
        for name, data in files:
            with open(join(rpath, name), 'wb') as f:
                f.write(data)
        # pickled_meta is written last: its presence is what marks an entry
        # as cached and its mtime is what expiration is measured from, so it
        # must only appear (or be refreshed) once the other files are written.
        with open(join(rpath, 'pickled_meta'), 'wb') as f:
            pickle.dump(metadata, f, protocol=2)

    def _get_request_path(self, spider, request):
        """Return the directory that holds the cache entry for ``request``."""
        key = request_fingerprint(request)
        return join(self.cachedir, spider.domain_name, key[0:2], key)

    def _read_meta(self, spider, request):
        """Return the stored metadata dict, or None if missing or expired.

        An entry is expired when ``expiration_secs`` is zero or positive and
        smaller than the age of the entry; a negative value disables
        expiration.
        """
        rpath = self._get_request_path(spider, request)
        metapath = join(rpath, 'pickled_meta')
        if not exists(metapath):
            return # not found
        # Measure age on the metadata file rather than on the entry
        # directory: a directory's mtime only changes when files are added
        # or removed, so re-storing an expired entry (which rewrites the
        # existing files in place) would leave it looking expired forever.
        mtime = os.stat(metapath).st_mtime
        if 0 <= self.expiration_secs < time() - mtime:
            return # expired
        # NOTE(review): pickle.load executes whatever the file encodes; this
        # is only safe while the cache directory is writable by trusted
        # users only.
        with open(metapath, 'rb') as f:
            return pickle.load(f)
Note: See TracBrowser for help on using the browser.