Community Spiders
What is this?
This page contains a list of Scrapy spiders contributed by the community. Feel free to upload your spider here, along with the version (and revision) of Scrapy you used to run it, and the license you're releasing it under.
If this collection of spiders grows too big (we hope so!), we'll probably move this page to its own site, but these spiders will always remain free and open for everybody.
List of spiders
YouTube spider
This example spider scrapes videos from the results of a YouTube search query. To run it, you must set the QUERY setting to the desired query, like this:
scrapy-ctl.py runspider --set "QUERY=your query here" youtube.py --output=youtube.xml
"""Example spider that scrapes video details from YouTube search results.

The search terms are read from the ``QUERY`` setting, e.g.
``--set "QUERY=your query here"`` on the command line.
"""
import re

try:
    from urllib import quote_plus  # Python 2
except ImportError:
    from urllib.parse import quote_plus  # Python 3

from scrapy.conf import settings
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.loader import XPathItemLoader
from scrapy.contrib.loader.processor import TakeFirst
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.item import Item, Field


class YoutubeItem(Item):
    """Fields collected for a single video page."""

    url = Field()
    title = Field()
    desc = Field()
    rating = Field()
    embed_url = Field()


class YoutubeItemLoader(XPathItemLoader):
    """Item loader that builds a YoutubeItem and keeps the first match per field."""

    default_item_class = YoutubeItem
    default_output_processor = TakeFirst()


class YoutubeSpider(CrawlSpider):
    """Crawl the result pages of a search query and scrape each linked video."""

    # Raw search terms exactly as given in the QUERY setting.
    query = settings.get('QUERY')

    # The raw terms may contain spaces or reserved characters ('&', '?', ...),
    # so they are URL-encoded before being placed in a query string. An unset
    # QUERY falls back to an empty search instead of searching for "None".
    _encoded_query = quote_plus(query or '')

    domain_name = 'youtube.com'
    start_urls = ['http://www.youtube.com/results?search_query=%s&page=1' % _encoded_query]

    rules = (
        # Follow the pagination links of this search. The encoded query is
        # regex-escaped so characters such as '+' or '.' match literally.
        # NOTE(review): assumes the page's pagination links carry the query in
        # the same encoding quote_plus() produces -- confirm against the site.
        Rule(SgmlLinkExtractor(allow=(r'results\?search_query=%s&page=\d+' % re.escape(_encoded_query),))),
        # Scrape every video linked from the results container.
        Rule(SgmlLinkExtractor(allow=(r'watch\?v=',),
                               restrict_xpaths=['//div[@id="results-main-content"]']),
             'parse_item'),
    )

    def parse_item(self, response):
        """Build a YoutubeItem from a video page response and return it."""
        il = YoutubeItemLoader(response=response)
        il.add_value('url', response.url)
        il.add_xpath('title', '//h1/text()')
        il.add_xpath('desc', '//span[@class="description"]/text()')
        il.add_xpath('rating', '//div[@id="ratingStars"]/button/@title')
        # The embed code is an HTML snippet; keep only the value of its src attribute.
        il.add_xpath('embed_url', '//input[@id="embed_code"]/@value', re='src="(.*?)"')
        return il.load_item()


SPIDER = YoutubeSpider()
Mailman emails extractor spider
This spider can be helpful when you are the administrator of mailing lists under Mailman but have no access to the Mailman deployment itself. How can you get all the members' emails? The Mailman admin web GUI doesn't provide a feature to export all members (or at least I didn't find one), for example to migrate to another mailing list software.
""" * date: 2009-07-06 * author: anibal * license: BSD * scrapy version: tested with revision 1234 """ from scrapy.http import FormRequest from scrapy.xpath import HtmlXPathSelector from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor from scrapy.contrib.spiders import CrawlSpider, Rule from scrapy.item import Item, Field class MailmanItem(Item): domain_name = Field() listname = Field() email = Field() class MailmanSpider(CrawlSpider): """ Spider for scraping email addresses on mailman admin pages. """ # Replace this lines with your values and uncomment or define them in a child spider #domain_name = 'example.com' #mailman_url = 'http://www.example.com/mailman' #passwords = { # 'users': 'pass_for_users_list', # 'customers': 'pass_for_customers_list', } rules = ( # This rule will allow to scrape the start response and get the letter links Rule(SgmlLinkExtractor(allow=(r'/members/list$', )), 'parse_items', follow=True), # This rule allows to go deep into pages by letters and its many chunks Rule(SgmlLinkExtractor(allow=(r'/members\?letter=.', )), 'parse_items', follow=True), ) def start_requests(self): """ Returns the initial login requests, one per list/pass defined in 'passwords' """ return [FormRequest( '%s/admin/%s_%s/members/list' % (self.mailman_url, l, self.domain_name), formdata={'adminpw': p}, method='POST', callback=self.parse) \ for l, p in self.passwords.items()] def parse_items(self, response): """ Returns the items -name of the list and the emails (members)- found """ xs, items = HtmlXPathSelector(response), [] listname = xs("//address/a[1]/text()").extract()[0].lower() for email in xs.x("//a[contains(@href,'--at--')]/text()").extract(): i = MailmanItem() i['domain_name'], i['listname'], i['email'] = self.domain_name, listname, email items.append(i) return items # Uncomment this if not using a child spider #SPIDER = MailmanSpider()
SilverStripe CMS demo spider with login handling
This example spider shows how to submit login forms:
scrapy-ctl.py runspider silverstripe.py
"""Example spider showing how to submit a login form before scraping."""
from scrapy.spider import BaseSpider
from scrapy.http import FormRequest
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item, Field


class TOCSpider(BaseSpider):
    """Log in to the SilverStripe demo admin and scrape its subsite selector."""

    domain_name = "demo.silverstripe.com"
    start_urls = ['http://demo.silverstripe.com/admin/']

    # XPath of the login form; its presence means we are not logged in.
    login_form_xpath = "//form[@id='UsernameLoginForm_LoginForm']"

    def parse(self, response):
        """Submit the login form if the response shows one, otherwise scrape the page."""
        if self._is_login_page(response):
            return self.login(response)
        return self.get_section_links(response)

    def login(self, response):
        """Return a request that posts the credentials to the form found in the response."""
        self.log("Login page... Posting username & password")
        formdata = {'Username': 'admin', 'Password': 'password'}
        # The response is handled by after_login() rather than parse(), so
        # rejected credentials cannot trigger an endless chain of login posts.
        return FormRequest.from_response(response, formdata=formdata,
                                         callback=self.after_login, dont_filter=True)

    def after_login(self, response):
        """Handle the response to the login post.

        Returns the scraped items, or an empty list when the login form is
        still present (i.e. the credentials were not accepted).
        """
        if self._is_login_page(response):
            self.log("Login failed... Login form is still present, giving up")
            return []
        return self.get_section_links(response)

    def get_section_links(self, response):
        """Yield one MyItem per <option> of the 'SubsitesSelect' dropdown."""
        self.log("Logged in... Grabbing links...")
        hxs = HtmlXPathSelector(response)
        for section in hxs.select("//select[@id='SubsitesSelect']//option"):
            value = section.select("@value").extract()[0]
            text = section.select("text()").extract()[0]
            yield MyItem(value=value, text=text)

    def _is_login_page(self, response):
        """Return True when the response contains the login form."""
        hxs = HtmlXPathSelector(response)
        return bool(hxs.select(self.login_form_xpath))


class MyItem(Item):
    """Value/label pair of one dropdown option."""

    value = Field()
    text = Field()


SPIDER = TOCSpider()
