Changeset 1960:9f45d1e28051

Show
Ignore:
Timestamp:
03/25/10 12:38:37 (6 months ago)
Author:
Daniel Grana <dangra@…>
rebase_source:
1bff87c127a7e9d8d12c772b3068feb11eb5d97f
Branch:
default
Message:

Support relative URLs in the <base> tag by resolving them against the response URL. Closes #148.

Location:
scrapy
Files:
8 modified

Legend:

Unmodified
Added
Removed
  • scrapy/contrib/linkextractors/htmlparser.py

    r1264 r1960  
    2727 
    2828        ret = [] 
    29         base_url = self.base_url if self.base_url else response_url 
     29        base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url 
    3030        for link in links: 
    3131            link.url = urljoin_rfc(base_url, link.url, response_encoding) 
  • scrapy/contrib/linkextractors/image.py

    r1957 r1960  
    5252        xs = HtmlXPathSelector(response) 
    5353        base_url = xs.select('//base/@href').extract() 
    54         base_url = unicode_to_str(base_url[0], response.encoding) if base_url \ 
    55             else unicode_to_str(response.url, response.encoding) 
     54        base_url = urljoin_rfc(response.url, base_url[0]) if base_url else response.url 
    5655 
    5756        links = [] 
  • scrapy/contrib/linkextractors/lxmlparser.py

    r1823 r1960  
    3030 
    3131        ret = [] 
    32         base_url = self.base_url if self.base_url else response_url 
     32        base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url 
    3333        for link in links: 
    3434            link.url = urljoin_rfc(base_url, link.url, response_encoding) 
  • scrapy/contrib/linkextractors/regex.py

    r1264 r1960  
    1717class RegexLinkExtractor(SgmlLinkExtractor): 
    1818    """High performant link extractor""" 
     19 
    1920    def _extract_links(self, response_text, response_url, response_encoding): 
    20         base_url = self.base_url if self.base_url else response_url 
     21        base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url 
    2122 
    2223        clean_url = lambda u: urljoin_rfc(base_url, remove_entities(clean_link(u.decode(response_encoding)))) 
  • scrapy/contrib/linkextractors/sgml.py

    r1515 r1960  
    2929 
    3030        ret = [] 
    31         base_url = self.base_url if self.base_url else response_url 
     31        base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url 
    3232        for link in links: 
    3333            link.url = urljoin_rfc(base_url, link.url, response_encoding) 
  • scrapy/contrib_exp/crawlspider/reqext.py

    r1931 r1960  
    3131        self.close() 
    3232 
    33         base_url = self.base_url if self.base_url else response_url 
     33        base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url 
    3434        self._make_absolute_urls(base_url, response_encoding) 
    3535        self._fix_link_text_encoding(response_encoding) 
  • scrapy/tests/test_contrib_exp_crawlspider_reqext.py

    r1931 r1960  
    5151 
    5252    def test_base_url(self): 
     53        reqx = BaseSgmlRequestExtractor() 
     54 
    5355        html = """<html><head><title>Page title<title> 
    5456        <base href="http://otherdomain.com/base/" /> 
    5557        <body><p><a href="item/12.html">Item 12</a></p> 
    5658        </body></html>""" 
    57         response = HtmlResponse("http://example.org/somepage/index.html", 
    58                     body=html) 
    59         reqx = BaseSgmlRequestExtractor() 
     59        response = HtmlResponse("https://example.org/p/index.html", body=html) 
     60        reqs = reqx.extract_requests(response) 
     61        self.failUnless(self._requests_equals( \ 
     62            [Request('http://otherdomain.com/base/item/12.html', \ 
     63                    meta={'link_text': 'Item 12'})], reqs), reqs) 
    6064 
    61         self.failUnless( 
    62             self._requests_equals(reqx.extract_requests(response), 
    63                     [ Request('http://otherdomain.com/base/item/12.html', 
    64                              meta={'link_text': 'Item 12'}) ] 
    65                     ) 
    66             ) 
     65        # base url is a host-relative absolute path; it is resolved against the response url's scheme and host 
     66        html = """<html><head><title>Page title<title> 
     67        <base href="/" /> 
     68        <body><p><a href="item/12.html">Item 12</a></p> 
     69        </body></html>""" 
     70        response = HtmlResponse("https://example.org/p/index.html", body=html) 
     71        reqs = reqx.extract_requests(response) 
     72        self.failUnless(self._requests_equals( \ 
     73            [Request('https://example.org/item/12.html', \ 
     74                    meta={'link_text': 'Item 12'})], reqs), reqs) 
     75 
     76        # base url has no scheme 
     77        html = """<html><head><title>Page title<title> 
     78        <base href="//noscheme.com/base/" /> 
     79        <body><p><a href="item/12.html">Item 12</a></p> 
     80        </body></html>""" 
     81        response = HtmlResponse("https://example.org/p/index.html", body=html) 
     82        reqs = reqx.extract_requests(response) 
     83        self.failUnless(self._requests_equals( \ 
     84            [Request('https://noscheme.com/base/item/12.html', \ 
     85                    meta={'link_text': 'Item 12'})], reqs), reqs) 
    6786 
    6887    def test_extraction_encoding(self): 
  • scrapy/tests/test_contrib_linkextractors.py

    r1123 r1960  
    3535        self.assertEqual(lx.extract_links(response), 
    3636                         [Link(url='http://otherdomain.com/base/item/12.html', text='Item 12')]) 
     37 
     38        # base url is a host-relative absolute path; it is resolved against the response url's scheme and host 
     39        html = """<html><head><title>Page title<title><base href="/" /> 
     40        <body><p><a href="item/12.html">Item 12</a></p></body></html>""" 
     41        response = HtmlResponse("https://example.org/somepage/index.html", body=html) 
     42        self.assertEqual(lx.extract_links(response), 
     43                         [Link(url='https://example.org/item/12.html', text='Item 12')]) 
     44 
     45        # base url has no scheme 
     46        html = """<html><head><title>Page title<title><base href="//noschemedomain.com/path/to/" /> 
     47        <body><p><a href="item/12.html">Item 12</a></p></body></html>""" 
     48        response = HtmlResponse("https://example.org/somepage/index.html", body=html) 
     49        self.assertEqual(lx.extract_links(response), 
     50                         [Link(url='https://noschemedomain.com/path/to/item/12.html', text='Item 12')]) 
    3751 
    3852    def test_extraction_encoding(self):