Changeset 1960:9f45d1e28051
- Timestamp:
- 03/25/10 12:38:37 (6 months ago)
- rebase_source:
- 1bff87c127a7e9d8d12c772b3068feb11eb5d97f
- Branch:
- default
- Location:
- scrapy
- Files:
-
- 8 modified
-
contrib/linkextractors/htmlparser.py (modified) (1 diff)
-
contrib/linkextractors/image.py (modified) (1 diff)
-
contrib/linkextractors/lxmlparser.py (modified) (1 diff)
-
contrib/linkextractors/regex.py (modified) (1 diff)
-
contrib/linkextractors/sgml.py (modified) (1 diff)
-
contrib_exp/crawlspider/reqext.py (modified) (1 diff)
-
tests/test_contrib_exp_crawlspider_reqext.py (modified) (1 diff)
-
tests/test_contrib_linkextractors.py (modified) (1 diff)
Legend:
- Unmodified
- Added
- Removed
-
scrapy/contrib/linkextractors/htmlparser.py
r1264 r1960 27 27 28 28 ret = [] 29 base_url = self.base_url if self.base_url else response_url 29 base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url 30 30 for link in links: 31 31 link.url = urljoin_rfc(base_url, link.url, response_encoding) -
scrapy/contrib/linkextractors/image.py
r1957 r1960 52 52 xs = HtmlXPathSelector(response) 53 53 base_url = xs.select('//base/@href').extract() 54 base_url = unicode_to_str(base_url[0], response.encoding) if base_url \ 55 else unicode_to_str(response.url, response.encoding) 54 base_url = urljoin_rfc(response.url, base_url[0]) if base_url else response.url 56 55 57 56 links = [] -
scrapy/contrib/linkextractors/lxmlparser.py
r1823 r1960 30 30 31 31 ret = [] 32 base_url = self.base_url if self.base_url else response_url 32 base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url 33 33 for link in links: 34 34 link.url = urljoin_rfc(base_url, link.url, response_encoding) -
scrapy/contrib/linkextractors/regex.py
r1264 r1960 17 17 class RegexLinkExtractor(SgmlLinkExtractor): 18 18 """High performant link extractor""" 19 19 20 def _extract_links(self, response_text, response_url, response_encoding): 20 base_url = self.base_url if self.base_url else response_url 21 base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url 21 22 22 23 clean_url = lambda u: urljoin_rfc(base_url, remove_entities(clean_link(u.decode(response_encoding)))) -
scrapy/contrib/linkextractors/sgml.py
r1515 r1960 29 29 30 30 ret = [] 31 base_url = self.base_url if self.base_url else response_url 31 base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url 32 32 for link in links: 33 33 link.url = urljoin_rfc(base_url, link.url, response_encoding) -
scrapy/contrib_exp/crawlspider/reqext.py
r1931 r1960 31 31 self.close() 32 32 33 base_url = self.base_url if self.base_url else response_url 33 base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url 34 34 self._make_absolute_urls(base_url, response_encoding) 35 35 self._fix_link_text_encoding(response_encoding) -
scrapy/tests/test_contrib_exp_crawlspider_reqext.py
r1931 r1960 51 51 52 52 def test_base_url(self): 53 reqx = BaseSgmlRequestExtractor() 54 53 55 html = """<html><head><title>Page title<title> 54 56 <base href="http://otherdomain.com/base/" /> 55 57 <body><p><a href="item/12.html">Item 12</a></p> 56 58 </body></html>""" 57 response = HtmlResponse("http://example.org/somepage/index.html", 58 body=html) 59 reqx = BaseSgmlRequestExtractor() 59 response = HtmlResponse("https://example.org/p/index.html", body=html) 60 reqs = reqx.extract_requests(response) 61 self.failUnless(self._requests_equals( \ 62 [Request('http://otherdomain.com/base/item/12.html', \ 63 meta={'link_text': 'Item 12'})], reqs), reqs) 60 64 61 self.failUnless( 62 self._requests_equals(reqx.extract_requests(response), 63 [ Request('http://otherdomain.com/base/item/12.html', 64 meta={'link_text': 'Item 12'}) ] 65 ) 66 ) 65 # base url is an absolute path and relative to host 66 html = """<html><head><title>Page title<title> 67 <base href="/" /> 68 <body><p><a href="item/12.html">Item 12</a></p> 69 </body></html>""" 70 response = HtmlResponse("https://example.org/p/index.html", body=html) 71 reqs = reqx.extract_requests(response) 72 self.failUnless(self._requests_equals( \ 73 [Request('https://example.org/item/12.html', \ 74 meta={'link_text': 'Item 12'})], reqs), reqs) 75 76 # base url has no scheme 77 html = """<html><head><title>Page title<title> 78 <base href="//noscheme.com/base/" /> 79 <body><p><a href="item/12.html">Item 12</a></p> 80 </body></html>""" 81 response = HtmlResponse("https://example.org/p/index.html", body=html) 82 reqs = reqx.extract_requests(response) 83 self.failUnless(self._requests_equals( \ 84 [Request('https://noscheme.com/base/item/12.html', \ 85 meta={'link_text': 'Item 12'})], reqs), reqs) 67 86 68 87 def test_extraction_encoding(self): -
scrapy/tests/test_contrib_linkextractors.py
r1123 r1960 35 35 self.assertEqual(lx.extract_links(response), 36 36 [Link(url='http://otherdomain.com/base/item/12.html', text='Item 12')]) 37 38 # base url is an absolute path and relative to host 39 html = """<html><head><title>Page title<title><base href="/" /> 40 <body><p><a href="item/12.html">Item 12</a></p></body></html>""" 41 response = HtmlResponse("https://example.org/somepage/index.html", body=html) 42 self.assertEqual(lx.extract_links(response), 43 [Link(url='https://example.org/item/12.html', text='Item 12')]) 44 45 # base url has no scheme 46 html = """<html><head><title>Page title<title><base href="//noschemedomain.com/path/to/" /> 47 <body><p><a href="item/12.html">Item 12</a></p></body></html>""" 48 response = HtmlResponse("https://example.org/somepage/index.html", body=html) 49 self.assertEqual(lx.extract_links(response), 50 [Link(url='https://noschemedomain.com/path/to/item/12.html', text='Item 12')]) 37 51 38 52 def test_extraction_encoding(self):
