Changeset 1804:8d03689ef662
- Timestamp:
- 10/21/09 09:37:04 (9 months ago)
- Branch:
- default
- Location:
- scrapy
- Files:
-
- 3 modified
-
contrib/downloadermiddleware/redirect.py (modified) (2 diffs)
-
tests/test_utils_response.py (modified) (3 diffs)
-
utils/response.py (modified) (2 diffs)
Legend:
- Unmodified
- Added
- Removed
-
scrapy/contrib/downloadermiddleware/redirect.py
r1421 r1804 15 15 16 16 def process_response(self, request, response, spider): 17 domain = spider.domain_name 18 19 17 if response.status in [302, 303] and 'Location' in response.headers: 20 18 redirected_url = urljoin_rfc(request.url, response.headers['location']) … … 30 28 31 29 interval, url = get_meta_refresh(response) 32 if url and int(interval) < self.max_metarefresh_delay: 33 redirected = request.replace(url=urljoin_rfc(request.url, url)) 30 if url and interval < self.max_metarefresh_delay: 31 redirected = request.replace(url=url) 34 32 return self._redirect(redirected, request, spider, 'meta refresh') 35 33 -
scrapy/tests/test_utils_response.py
r1437 r1804 43 43 </html>""" 44 44 response = Response(url='http://example.org', body=body) 45 self.assertEqual(get_meta_refresh(response), ( '5', 'http://example.org/newpage'))45 self.assertEqual(get_meta_refresh(response), (5, 'http://example.org/newpage')) 46 46 47 47 # refresh without url should return (None, None) … … 53 53 url=http://example.org/newpage" /></head>""" 54 54 response = Response(url='http://example.org', body=body) 55 self.assertEqual(get_meta_refresh(response), ( '5', 'http://example.org/newpage'))55 self.assertEqual(get_meta_refresh(response), (5, 'http://example.org/newpage')) 56 56 57 57 # meta refresh in multiple lines … … 61 61 CONTENT="1; URL=http://example.org/newpage">""" 62 62 response = Response(url='http://example.org', body=body) 63 self.assertEqual(get_meta_refresh(response), ('1', 'http://example.org/newpage')) 63 self.assertEqual(get_meta_refresh(response), (1, 'http://example.org/newpage')) 64 65 # entities in the redirect url 66 body = """<meta http-equiv="refresh" content="3; url='http://www.example.com/other'">""" 67 response = Response(url='http://example.com', body=body) 68 self.assertEqual(get_meta_refresh(response), (3, 'http://www.example.com/other')) 69 70 # relative redirects 71 body = """<meta http-equiv="refresh" content="3; url=other.html">""" 72 response = Response(url='http://example.com/page/this.html', body=body) 73 self.assertEqual(get_meta_refresh(response), (3, 'http://example.com/page/other.html')) 74 75 # non-standard encodings (utf-16) 76 body = """<meta http-equiv="refresh" content="3; url=http://example.com/redirect">""" 77 body = body.decode('ascii').encode('utf-16') 78 response = TextResponse(url='http://example.com', body=body, encoding='utf-16') 79 self.assertEqual(get_meta_refresh(response), (3, 'http://example.com/redirect')) 80 81 # non-ascii chars in the url (default encoding - utf8) 82 body = """<meta http-equiv="refresh" content="3; url=http://example.com/to\xc2\xa3">""" 83 response = 
Response(url='http://example.com', body=body) 84 self.assertEqual(get_meta_refresh(response), (3, 'http://example.com/to%C2%A3')) 85 86 # non-ascii chars in the url (custom encoding - latin1) 87 body = """<meta http-equiv="refresh" content="3; url=http://example.com/to\xa3">""" 88 response = TextResponse(url='http://example.com', body=body, encoding='latin1') 89 self.assertEqual(get_meta_refresh(response), (3, 'http://example.com/to%C2%A3')) 90 91 # wrong encodings (possibly caused by truncated chunks) 92 body = """<meta http-equiv="refresh" content="3; url=http://example.com/this\xc2_THAT">""" 93 response = Response(url='http://example.com', body=body) 94 self.assertEqual(get_meta_refresh(response), (3, 'http://example.com/thisTHAT')) 64 95 65 96 def test_response_httprepr(self): -
scrapy/utils/response.py
r1592 r1804 14 14 from twisted.web.http import RESPONSES 15 15 16 from scrapy.utils.markup import remove_entities 17 from scrapy.utils.url import safe_url_string, urljoin_rfc 16 18 from scrapy.xlib.BeautifulSoup import BeautifulSoup 17 19 from scrapy.http import Response, HtmlResponse … … 35 37 return _baseurl_cache[response] 36 38 37 META_REFRESH_RE = re.compile( r'<meta[^>]*http-equiv[^>]*refresh[^>].*?(\d+);\s*url=([^"\']+)', re.DOTALL | re.IGNORECASE)39 META_REFRESH_RE = re.compile(ur'<meta[^>]*http-equiv[^>]*refresh[^>]*content\s*=\s*(?P<quote>["\'])(?P<int>\d+)\s*;\s*url=(?P<url>.*?)(?P=quote)', re.DOTALL | re.IGNORECASE) 38 40 _metaref_cache = weakref.WeakKeyDictionary() 39 41 def get_meta_refresh(response): 40 """ Return a tuple of two strings containing the interval and url included 41 in the http-equiv parameter of the HTML meta element. If no url is included 42 (None, None) is returned [instead of (interval, None)] 42 """Parse the http-equiv parameter of the HTML meta element from the given 43 response and return a tuple (interval, url) where interval is an integer 44 containing the delay in seconds (or zero if not present) and url is a 45 string with the absolute url to redirect. 46 47 If no meta redirect is found, (None, None) is returned. 
43 48 """ 44 49 if response not in _metaref_cache: 45 match = META_REFRESH_RE.search(response.body[0:4096]) 46 _metaref_cache[response] = match.groups() if match else (None, None) 50 encoding = getattr(response, 'encoding', 'utf-8') 51 body_chunk = remove_entities(unicode(response.body[0:4096], encoding, \ 52 errors='ignore')) 53 match = META_REFRESH_RE.search(body_chunk) 54 if match: 55 interval = int(match.group('int')) 56 url = safe_url_string(match.group('url').strip(' "\'')) 57 url = urljoin_rfc(response.url, url) 58 _metaref_cache[response] = (interval, url) 59 else: 60 _metaref_cache[response] = (None, None) 61 #_metaref_cache[response] = match.groups() if match else (None, None) 47 62 return _metaref_cache[response] 48 63
