root/scrapy/contrib/downloadermiddleware/redirect.py @ 1421:0f9ee0e39384

Revision 1421:0f9ee0e39384, 2.5 kB (checked in by Daniel Grana <dangra@…>, 13 months ago)

returning None from process_response is not allowed, ignore the request using exception instead

Line 
1from scrapy import log
2from scrapy.utils.url import urljoin_rfc
3from scrapy.utils.response import get_meta_refresh
4from scrapy.core.exceptions import IgnoreRequest
5from scrapy.conf import settings
6
7
class RedirectMiddleware(object):
    """Handle redirection of requests based on response status and meta-refresh html tag

    The limits applied by this middleware are read from the
    REDIRECT_MAX_METAREFRESH_DELAY, REDIRECT_MAX_TIMES and
    REDIRECT_PRIORITY_ADJUST settings when it is instantiated.
    """

    def __init__(self):
        self.max_metarefresh_delay = settings.getint('REDIRECT_MAX_METAREFRESH_DELAY')
        self.max_redirect_times = settings.getint('REDIRECT_MAX_TIMES')
        self.priority_adjust = settings.getint('REDIRECT_PRIORITY_ADJUST')

    def process_response(self, request, response, spider):
        """Return a follow-up request if `response` redirects, else `response`.

        A redirect is recognised from a 301/302/303/307 status carrying a
        Location header, or from a meta-refresh whose interval is below
        ``max_metarefresh_delay``.

        Raises IgnoreRequest (through ``_redirect``) when the redirect limit
        of the request has been reached.
        """
        if response.status in (301, 302, 303, 307) and 'Location' in response.headers:
            redirected_url = urljoin_rfc(request.url, response.headers['location'])
            # 301/307 replay the request unchanged. HEAD is also kept as is:
            # rewriting it to GET would download a body the caller never
            # asked for.
            # NOTE(review): assumes the request exposes its HTTP verb as a
            # `method` string attribute -- confirm against the Request class.
            if response.status in (301, 307) or request.method.upper() == 'HEAD':
                redirected = request.replace(url=redirected_url)
            else:
                # 302/303: follow with a body-less GET and drop the headers
                # that described the discarded body.
                redirected = request.replace(url=redirected_url, method='GET', body='')
                redirected.headers.pop('Content-Type', None)
                redirected.headers.pop('Content-Length', None)
            return self._redirect(redirected, request, spider, response.status)

        interval, url = get_meta_refresh(response)
        # `url` is tested first so that `interval` is only converted when a
        # refresh target was actually found.
        if url and int(interval) < self.max_metarefresh_delay:
            redirected = request.replace(url=urljoin_rfc(request.url, url))
            return self._redirect(redirected, request, spider, 'meta refresh')

        return response

    def _redirect(self, redirected, request, spider, reason):
        """Return `redirected` with redirect bookkeeping applied, or give up.

        `reason` is only used in the log message (a status code or
        'meta refresh').

        The remaining budget is tracked in ``meta['redirect_ttl']`` and the
        number of hops so far in ``meta['redirect_times']``.

        Raises IgnoreRequest when the ttl is exhausted or the hop count would
        exceed ``max_redirect_times``.
        """
        # setdefault also stores the initial ttl on the original request.
        ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
        redirects = request.meta.get('redirect_times', 0) + 1

        if not ttl or redirects > self.max_redirect_times:
            log.msg("Discarding %s: max redirections reached" % request,
                    domain=spider.domain_name, level=log.DEBUG)
            raise IgnoreRequest

        redirected.meta['redirect_times'] = redirects
        redirected.meta['redirect_ttl'] = ttl - 1
        redirected.dont_filter = request.dont_filter
        redirected.priority = request.priority + self.priority_adjust
        log.msg("Redirecting (%s) to %s from %s" % (reason, redirected, request),
                domain=spider.domain_name, level=log.DEBUG)
        return redirected
54
Note: See TracBrowser for help on using the browser.