Changeset 1804:8d03689ef662

Show
Ignore:
Timestamp:
10/21/09 09:37:04 (9 months ago)
Author:
Pablo Hoffman <pablo@…>
Branch:
default
Message:

Make get_meta_refresh() more robust and change its interface to return (int, str) as (interval, absolute_url)

Location:
scrapy
Files:
3 modified

Legend:

Unmodified
Added
Removed
  • scrapy/contrib/downloadermiddleware/redirect.py

    r1421 r1804  
    1515 
    1616    def process_response(self, request, response, spider): 
    17         domain = spider.domain_name 
    18  
    1917        if response.status in [302, 303] and 'Location' in response.headers: 
    2018            redirected_url = urljoin_rfc(request.url, response.headers['location']) 
     
    3028 
    3129        interval, url = get_meta_refresh(response) 
    32         if url and int(interval) < self.max_metarefresh_delay: 
    33             redirected = request.replace(url=urljoin_rfc(request.url, url)) 
     30        if url and interval < self.max_metarefresh_delay: 
     31            redirected = request.replace(url=url) 
    3432            return self._redirect(redirected, request, spider, 'meta refresh') 
    3533 
  • scrapy/tests/test_utils_response.py

    r1437 r1804  
    4343            </html>""" 
    4444        response = Response(url='http://example.org', body=body) 
    45         self.assertEqual(get_meta_refresh(response), ('5', 'http://example.org/newpage')) 
     45        self.assertEqual(get_meta_refresh(response), (5, 'http://example.org/newpage')) 
    4646 
    4747        # refresh without url should return (None, None) 
     
    5353            url=http://example.org/newpage" /></head>""" 
    5454        response = Response(url='http://example.org', body=body) 
    55         self.assertEqual(get_meta_refresh(response), ('5', 'http://example.org/newpage')) 
     55        self.assertEqual(get_meta_refresh(response), (5, 'http://example.org/newpage')) 
    5656 
    5757        # meta refresh in multiple lines 
     
    6161               CONTENT="1; URL=http://example.org/newpage">""" 
    6262        response = Response(url='http://example.org', body=body) 
    63         self.assertEqual(get_meta_refresh(response), ('1', 'http://example.org/newpage')) 
     63        self.assertEqual(get_meta_refresh(response), (1, 'http://example.org/newpage')) 
     64 
     65        # entities in the redirect url 
     66        body = """<meta http-equiv="refresh" content="3; url=&#39;http://www.example.com/other&#39;">""" 
     67        response = Response(url='http://example.com', body=body) 
     68        self.assertEqual(get_meta_refresh(response), (3, 'http://www.example.com/other')) 
     69 
     70        # relative redirects 
     71        body = """<meta http-equiv="refresh" content="3; url=other.html">""" 
     72        response = Response(url='http://example.com/page/this.html', body=body) 
     73        self.assertEqual(get_meta_refresh(response), (3, 'http://example.com/page/other.html')) 
     74 
     75        # non-standard encodings (utf-16) 
     76        body = """<meta http-equiv="refresh" content="3; url=http://example.com/redirect">""" 
     77        body = body.decode('ascii').encode('utf-16') 
     78        response = TextResponse(url='http://example.com', body=body, encoding='utf-16') 
     79        self.assertEqual(get_meta_refresh(response), (3, 'http://example.com/redirect')) 
     80 
     81        # non-ascii chars in the url (default encoding - utf8) 
     82        body = """<meta http-equiv="refresh" content="3; url=http://example.com/to\xc2\xa3">""" 
     83        response = Response(url='http://example.com', body=body) 
     84        self.assertEqual(get_meta_refresh(response), (3, 'http://example.com/to%C2%A3')) 
     85 
     86        # non-ascii chars in the url (custom encoding - latin1) 
     87        body = """<meta http-equiv="refresh" content="3; url=http://example.com/to\xa3">""" 
     88        response = TextResponse(url='http://example.com', body=body, encoding='latin1') 
     89        self.assertEqual(get_meta_refresh(response), (3, 'http://example.com/to%C2%A3')) 
     90 
     91        # wrong encodings (possibly caused by truncated chunks) 
     92        body = """<meta http-equiv="refresh" content="3; url=http://example.com/this\xc2_THAT">""" 
     93        response = Response(url='http://example.com', body=body) 
     94        self.assertEqual(get_meta_refresh(response), (3, 'http://example.com/thisTHAT')) 
    6495 
    6596    def test_response_httprepr(self): 
  • scrapy/utils/response.py

    r1592 r1804  
    1414from twisted.web.http import RESPONSES 
    1515 
     16from scrapy.utils.markup import remove_entities 
     17from scrapy.utils.url import safe_url_string, urljoin_rfc 
    1618from scrapy.xlib.BeautifulSoup import BeautifulSoup 
    1719from scrapy.http import Response, HtmlResponse 
     
    3537    return _baseurl_cache[response] 
    3638 
    37 META_REFRESH_RE = re.compile(r'<meta[^>]*http-equiv[^>]*refresh[^>].*?(\d+);\s*url=([^"\']+)', re.DOTALL | re.IGNORECASE) 
     39META_REFRESH_RE = re.compile(ur'<meta[^>]*http-equiv[^>]*refresh[^>]*content\s*=\s*(?P<quote>["\'])(?P<int>\d+)\s*;\s*url=(?P<url>.*?)(?P=quote)', re.DOTALL | re.IGNORECASE) 
    3840_metaref_cache = weakref.WeakKeyDictionary() 
    3941def get_meta_refresh(response): 
    40     """ Return a tuple of two strings containing the interval and url included 
    41     in the http-equiv parameter of the HTML meta element. If no url is included 
    42     (None, None) is returned [instead of (interval, None)] 
     42    """Parse the http-equiv parameter of the HTML meta element from the given 
     43    response and return a tuple (interval, url) where interval is an integer 
     44    containing the delay in seconds (or zero if not present) and url is a 
     45    string with the absolute url to redirect. 
     46 
     47    If no meta redirect is found, (None, None) is returned. 
    4348    """ 
    4449    if response not in _metaref_cache: 
    45         match = META_REFRESH_RE.search(response.body[0:4096]) 
    46         _metaref_cache[response] = match.groups() if match else (None, None) 
     50        encoding = getattr(response, 'encoding', 'utf-8') 
     51        body_chunk = remove_entities(unicode(response.body[0:4096], encoding, \ 
     52            errors='ignore')) 
     53        match = META_REFRESH_RE.search(body_chunk) 
     54        if match: 
     55            interval = int(match.group('int')) 
     56            url = safe_url_string(match.group('url').strip(' "\'')) 
     57            url = urljoin_rfc(response.url, url) 
     58            _metaref_cache[response] = (interval, url) 
     59        else: 
     60            _metaref_cache[response] = (None, None) 
     61        #_metaref_cache[response] = match.groups() if match else (None, None) 
    4762    return _metaref_cache[response] 
    4863