| 1 | from scrapy import log |
|---|
| 2 | from scrapy.utils.url import urljoin_rfc |
|---|
| 3 | from scrapy.utils.response import get_meta_refresh |
|---|
| 4 | from scrapy.core.exceptions import IgnoreRequest |
|---|
| 5 | from scrapy.conf import settings |
|---|
| 6 | |
|---|
| 7 | |
|---|
class RedirectMiddleware(object):
    """Handle redirection of requests based on response status and meta-refresh html tag.

    Redirects are followed for HTTP statuses 301, 302, 303 and 307 when the
    response carries a ``Location`` header, and for meta-refresh targets whose
    interval is below ``REDIRECT_MAX_METAREFRESH_DELAY``. The number of
    consecutive redirects is bounded by ``REDIRECT_MAX_TIMES``.
    """

    def __init__(self):
        # All limits are read once from the project settings.
        self.max_metarefresh_delay = settings.getint('REDIRECT_MAX_METAREFRESH_DELAY')
        self.max_redirect_times = settings.getint('REDIRECT_MAX_TIMES')
        self.priority_adjust = settings.getint('REDIRECT_PRIORITY_ADJUST')

    def process_response(self, request, response, spider):
        """Return a follow-up request for a redirect response, else the response.

        - 302/303 with a ``Location`` header: the request is re-issued as a
          body-less GET, and its Content-Type / Content-Length headers are
          dropped.
        - 301/307 with a ``Location`` header: the request is re-issued with
          only its URL replaced.
        - Otherwise, a meta-refresh URL reported by ``get_meta_refresh`` is
          followed when its interval is below ``self.max_metarefresh_delay``.

        Raises ``IgnoreRequest`` (via ``_redirect``) when the redirect budget
        of the request is exhausted.
        """
        status = response.status
        # The same header spelling is used for the membership test and the
        # lookup so the two can never disagree on a case-sensitive mapping.
        if status in (301, 302, 303, 307) and 'Location' in response.headers:
            redirected_url = urljoin_rfc(request.url, response.headers['Location'])
            if status in (302, 303):
                # These statuses switch the follow-up request to GET, so the
                # body and the headers describing it are discarded.
                redirected = request.replace(url=redirected_url, method='GET', body='')
                redirected.headers.pop('Content-Type', None)
                redirected.headers.pop('Content-Length', None)
            else:
                # 301/307: keep method, body and headers; only the URL changes.
                redirected = request.replace(url=redirected_url)
            return self._redirect(redirected, request, spider, status)

        interval, url = get_meta_refresh(response)
        # `url` is tested first so `int(interval)` is only evaluated when a
        # refresh target was actually found.
        if url and int(interval) < self.max_metarefresh_delay:
            redirected = request.replace(url=urljoin_rfc(request.url, url))
            return self._redirect(redirected, request, spider, 'meta refresh')

        return response

    def _redirect(self, redirected, request, spider, reason):
        """Finalize and return ``redirected``, or raise once the budget is spent.

        ``request`` is the request that produced the redirect; ``reason`` is
        the HTTP status or the string ``'meta refresh'`` and is only used in
        the log message.

        Bookkeeping lives in ``meta``: ``redirect_ttl`` is the remaining
        budget (seeded on ``request.meta`` from ``self.max_redirect_times``
        when absent) and ``redirect_times`` counts the redirects followed so
        far.

        Raises ``IgnoreRequest`` when ``redirect_ttl`` is exhausted or the
        redirect count would exceed ``self.max_redirect_times``.
        """
        # setdefault also stores the initial ttl on the originating request.
        ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
        redirects = request.meta.get('redirect_times', 0) + 1

        if ttl and redirects <= self.max_redirect_times:
            redirected.meta['redirect_times'] = redirects
            redirected.meta['redirect_ttl'] = ttl - 1
            redirected.dont_filter = request.dont_filter
            redirected.priority = request.priority + self.priority_adjust
            log.msg("Redirecting (%s) to %s from %s" % (reason, redirected, request),
                    domain=spider.domain_name, level=log.DEBUG)
            return redirected
        else:
            log.msg("Discarding %s: max redirections reached" % request,
                    domain=spider.domain_name, level=log.DEBUG)
            raise IgnoreRequest
|---|
| 54 | |
|---|