Changes in [1926:44738f9e2661:1935:3dd98689e6df]
- Files:
-
- 28 added
- 7 modified
-
docs/experimental/crawlspider-v2.rst (added)
-
docs/experimental/index.rst (modified) (1 diff)
-
docs/intro/tutorial.rst (modified) (1 diff)
-
docs/topics/item-pipeline.rst (modified) (2 diffs)
-
examples/experimental/googledir/googledir/__init__.py (added)
-
examples/experimental/googledir/googledir/items.py (added)
-
examples/experimental/googledir/googledir/pipelines.py (added)
-
examples/experimental/googledir/googledir/settings.py (added)
-
examples/experimental/googledir/googledir/spiders/__init__.py (added)
-
examples/experimental/googledir/googledir/spiders/google_directory.py (added)
-
examples/experimental/googledir/scrapy-ctl.py (added)
-
examples/experimental/imdb/imdb/__init__.py (added)
-
examples/experimental/imdb/imdb/items.py (added)
-
examples/experimental/imdb/imdb/pipelines.py (added)
-
examples/experimental/imdb/imdb/settings.py (added)
-
examples/experimental/imdb/imdb/spiders/__init__.py (added)
-
examples/experimental/imdb/imdb/spiders/imdb_site.py (added)
-
examples/experimental/imdb/scrapy-ctl.py (added)
-
scrapy/contrib_exp/crawlspider/__init__.py (added)
-
scrapy/contrib_exp/crawlspider/matchers.py (added)
-
scrapy/contrib_exp/crawlspider/reqext.py (added)
-
scrapy/contrib_exp/crawlspider/reqgen.py (added)
-
scrapy/contrib_exp/crawlspider/reqproc.py (added)
-
scrapy/contrib_exp/crawlspider/rules.py (added)
-
scrapy/contrib_exp/crawlspider/spider.py (added)
-
scrapy/templates/project/module/pipelines.py.tmpl (modified) (1 diff)
-
scrapy/templates/spiders/crawl.tmpl (modified) (1 diff)
-
scrapy/tests/test_contrib_exp_crawlspider_matchers.py (added)
-
scrapy/tests/test_contrib_exp_crawlspider_reqext.py (added)
-
scrapy/tests/test_contrib_exp_crawlspider_reqgen.py (added)
-
scrapy/tests/test_contrib_exp_crawlspider_reqproc.py (added)
-
scrapy/tests/test_contrib_exp_crawlspider_rules.py (added)
-
scrapy/tests/test_contrib_exp_crawlspider_spider.py (added)
-
scrapy/tests/test_utils_python.py (modified) (2 diffs)
-
scrapy/utils/python.py (modified) (1 diff)
Legend:
- Unmodified
- Added
- Removed
-
docs/experimental/index.rst
r1789 r1932 22 22 djangoitems 23 23 scheduler-middleware 24 crawlspider-v2 -
docs/intro/tutorial.rst
r1827 r1928 421 421 Now doing a crawl on the dmoz.org domain yields ``DmozItem``'s:: 422 422 423 [dmoz.org] DEBUG: Scraped DmozItem({'title': [u'Text Processing in Python'], 'link': [u'http://gnosis.cx/TPiP/'], 'desc': [u' - By David Mertz; Addison Wesley. Book in progress, full text, ASCII format. Asks for feedback. [author website, Gnosis Software, Inc.]\n']}) in <http://www.dmoz.org/Computers/Programming/Languages/Python/Books/> 424 [dmoz.org] DEBUG: Scraped DmozItem({'title': [u'XML Processing with Python'], 'link': [u'http://www.informit.com/store/product.aspx?isbn=0130211192'], 'desc': [u' - By Sean McGrath; Prentice Hall PTR, 2000, ISBN 0130211192, has CD-ROM. Methods to build XML applications fast, Python tutorial, DOM and SAX, new Pyxie open source XML processing library. [Prentice Hall PTR]\n']}) in <http://www.dmoz.org/Computers/Programming/Languages/Python/Books/> 423 [dmoz.org] DEBUG: Scraped DmozItem(desc=[u' - By David Mertz; Addison Wesley. Book in progress, full text, ASCII format. Asks for feedback. [author website, Gnosis Software, Inc.]\n'], link=[u'http://gnosis.cx/TPiP/'], title=[u'Text Processing in Python']) in <http://www.dmoz.org/Computers/Programming/Languages/Python/Books/> 424 [dmoz.org] DEBUG: Scraped DmozItem(desc=[u' - By Sean McGrath; Prentice Hall PTR, 2000, ISBN 0130211192, has CD-ROM. Methods to build XML applications fast, Python tutorial, DOM and SAX, new Pyxie open source XML processing library. [Prentice Hall PTR]\n'], link=[u'http://www.informit.com/store/product.aspx?isbn=0130211192'], title=[u'XML Processing with Python']) in <http://www.dmoz.org/Computers/Programming/Languages/Python/Books/> 425 425 426 426 -
docs/topics/item-pipeline.rst
r1911 r1929 99 99 100 100 def process_item(self, spider, item): 101 if item.id in self.duplicates[spider]: 101 if item['id'] in self.duplicates[spider]: 102 102 raise DropItem("Duplicate item found: %s" % item) 103 103 else: 104 self.duplicates[spider].add(item.id) 104 self.duplicates[spider].add(item['id']) 105 105 return item 106 106 … … 179 179 using the :setting:`EXPORT_FIELDS` setting. 180 180 181 * ``jsonlines``: uses a :class:`~jsonlines.JsonLinesItemExporter` 181 * ``json``: uses a :class:`~jsonlines.JsonLinesItemExporter` 182 182 183 183 * ``pickle``: uses a :class:`PickleItemExporter` -
scrapy/templates/project/module/pipelines.py.tmpl
r1606 r1927 5 5 6 6 class ${ProjectName}Pipeline(object): 7 def process_item(self, domain, item): 7 def process_item(self, spider, item): 8 8 return item -
scrapy/templates/spiders/crawl.tmpl
r1752 r1927 11 11 12 12 rules = ( 13 Rule(SgmlLinkExtractor(allow=(r'Items/', )), 'parse_item', follow=True), 13 Rule(SgmlLinkExtractor(allow=r'Items/'), callback='parse_item', follow=True), 14 14 ) 15 15 16 16 def parse_item(self, response): 17 xs = HtmlXPathSelector(response) 17 hxs = HtmlXPathSelector(response) 18 18 i = ${ProjectName}Item() 19 #i['site_id'] = xs.select('//input[@id="sid"]/@value').extract() 20 #i['name'] = xs.select('//div[@id="name"]').extract() 21 #i['description'] = xs.select('//div[@id="description"]').extract() 19 #i['site_id'] = hxs.select('//input[@id="sid"]/@value').extract() 20 #i['name'] = hxs.select('//div[@id="name"]').extract() 21 #i['description'] = hxs.select('//div[@id="description"]').extract() 22 22 return i 23 23 -
scrapy/tests/test_utils_python.py
r1562 r1930 1 import operator 1 2 import unittest 2 3 3 4 from scrapy.utils.python import str_to_unicode, unicode_to_str, \ 4 memoizemethod_noargs, isbinarytext 5 memoizemethod_noargs, isbinarytext, equal_attributes 5 6 6 7 class UtilsPythonTestCase(unittest.TestCase): … … 62 63 assert isbinarytext("\x02\xa3") 63 64 65 def test_equal_attributes(self): 66 class Obj: 67 pass 68 69 a = Obj() 70 b = Obj() 71 # no attributes given return False 72 self.failIf(equal_attributes(a, b, [])) 73 # non-existent attributes 74 self.failIf(equal_attributes(a, b, ['x', 'y'])) 75 76 a.x = 1 77 b.x = 1 78 # equal attribute 79 self.failUnless(equal_attributes(a, b, ['x'])) 80 81 b.y = 2 82 # obj1 has no attribute y 83 self.failIf(equal_attributes(a, b, ['x', 'y'])) 84 85 a.y = 2 86 # equal attributes 87 self.failUnless(equal_attributes(a, b, ['x', 'y'])) 88 89 a.y = 1 90 # different attributes 91 self.failIf(equal_attributes(a, b, ['x', 'y'])) 92 93 # test callable 94 a.meta = {} 95 b.meta = {} 96 self.failUnless(equal_attributes(a, b, ['meta'])) 97 98 # compare ['meta']['z'] 99 a.meta['z'] = 1 100 b.meta['z'] = 1 101 102 get_z = operator.itemgetter('z') 103 get_meta = operator.attrgetter('meta') 104 compare_z = lambda obj: get_z(get_meta(obj)) 105 106 self.failUnless(equal_attributes(a, b, [compare_z, 'x'])) 107 # fail z equality 108 a.meta['z'] = 2 109 self.failIf(equal_attributes(a, b, [compare_z, 'x'])) 110 111 64 112 if __name__ == "__main__": 65 113 unittest.main() -
scrapy/utils/python.py
r1561 r1930 217 217 raise TypeError('%s is not callable' % type(func)) 218 218 return func_args 219 220 221 def equal_attributes(obj1, obj2, attributes): 222 """Compare two objects attributes""" 223 # no attributes given: return False by default 224 if not attributes: 225 return False 226 227 for attr in attributes: 228 # support callables like itemgetter 229 if callable(attr): 230 if not attr(obj1) == attr(obj2): 231 return False 232 else: 233 # check that both objects have the attribute 234 if not hasattr(obj1, attr): 235 return False 236 if not hasattr(obj2, attr): 237 return False 238 # compare object attributes 239 if not getattr(obj1, attr) == getattr(obj2, attr): 240 return False 241 # all attributes equal 242 return True 243
