Community Spiders

What is this?

This page contains a list of Scrapy spiders contributed by the community. Feel free to upload your spider here, along with the version (and revision) of Scrapy you used to run it, and the license you're releasing it under.

If this collection of spiders grows too big (we hope so!), we'll probably move this page to its own site, but these spiders will always remain free and open for everybody.

List of spiders

YouTube spider

This example spider scrapes videos from a YouTube search query. To run it, you must set the QUERY setting to the desired query, like this:

scrapy-ctl.py runspider --set "QUERY=your query here" youtube.py --output=youtube.xml
from scrapy.conf import settings
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.loader import XPathItemLoader
from scrapy.contrib.loader.processor import TakeFirst
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.item import Item, Field


class YoutubeItem(Item):
    """Item holding the data YoutubeSpider.parse_item collects for one page."""
    # URL of the scraped page (taken from response.url).
    url = Field()
    # Text of the page's <h1> element.
    title = Field()
    # Text of the <span class="description"> element.
    desc = Field()
    # "title" attribute of the button inside <div id="ratingStars">.
    rating = Field()
    # src="..." value captured from the <input id="embed_code"> value.
    embed_url = Field()


class YoutubeItemLoader(XPathItemLoader):
    """XPath item loader preconfigured to build YoutubeItem objects."""
    # Items produced by load_item() are instances of this class.
    default_item_class = YoutubeItem
    # NOTE(review): TakeFirst presumably reduces each field's list of
    # extracted values to its first entry -- confirm against the Scrapy
    # loader documentation for the version in use.
    default_output_processor = TakeFirst()


class YoutubeSpider(CrawlSpider):
    """Crawl spider that scrapes the videos found by a YouTube search.

    The search terms come from the ``QUERY`` setting, which is read once
    when this class body is executed. The value is interpolated as-is into
    both the start URL and the pagination pattern below.
    """

    # Search terms supplied on the command line via --set "QUERY=...".
    query = settings.get('QUERY')

    domain_name = 'youtube.com'

    # First page of the search results for the configured query.
    start_urls = [
        'http://www.youtube.com/results?search_query=%s&page=1' % query
    ]

    rules = (
        # Links to further result pages of the same search; no callback is
        # attached to this rule.
        Rule(
            SgmlLinkExtractor(
                allow=(r'results\?search_query=%s&page=\d+' % query,)
            )
        ),
        # Video links, looked for only inside the results container, are
        # handed to parse_item.
        Rule(
            SgmlLinkExtractor(
                allow=(r'watch\?v=',),
                restrict_xpaths=['//div[@id="results-main-content"]']
            ),
            'parse_item'
        ),
    )

    def parse_item(self, response):
        """Build a YoutubeItem from a video page response.

        Returns whatever ``YoutubeItemLoader.load_item()`` produces after
        the url, title, desc, rating and embed_url fields were registered.
        """
        loader = YoutubeItemLoader(response=response)
        loader.add_value('url', response.url)

        # Fields that are plain XPath extractions, in registration order.
        plain_fields = (
            ('title', '//h1/text()'),
            ('desc', '//span[@class="description"]/text()'),
            ('rating', '//div[@id="ratingStars"]/button/@title'),
        )
        for field_name, xpath in plain_fields:
            loader.add_xpath(field_name, xpath)

        # The embed code is an HTML snippet; only its src="..." part is kept.
        loader.add_xpath('embed_url', '//input[@id="embed_code"]/@value',
                         re='src="(.*?)"')

        return loader.load_item()

# Module-level spider instance.
# NOTE(review): presumably this is the name the runspider command looks up
# in the module -- confirm against the Scrapy revision in use.
SPIDER = YoutubeSpider()

Mailman emails extractor spider

This spider can be helpful when you are the administrator of mailing lists under Mailman but have no access to the Mailman deployment. How can you get all the members' emails? The Mailman admin web GUI doesn't provide a feature to export all members (or at least I didn't find it), for example to migrate to another mailing list software.

"""
 * date: 2009-07-06
 * author: anibal
 * license: BSD
 * scrapy version: tested with revision 1234
"""
from scrapy.http import FormRequest
from scrapy.xpath import HtmlXPathSelector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.item import Item, Field


class MailmanItem(Item):
    """Item describing one member email address found by MailmanSpider."""
    # The spider's domain_name attribute.
    domain_name = Field()
    # Lower-cased text of the first link inside the page's <address> element.
    listname = Field()
    # Text of a link whose href contains '--at--'.
    email = Field()


class MailmanSpider(CrawlSpider):
    """Spider for scraping email addresses on mailman admin pages.

    The spider reads three attributes that are NOT defined here and must be
    provided by uncommenting the lines below or by a child spider:

    * ``domain_name`` -- used in the list admin URL and stored on each item.
    * ``mailman_url`` -- base URL of the Mailman web interface.
    * ``passwords``   -- dict mapping list name to its admin password.
    """

    # Replace these lines with your values and uncomment them, or define
    # them in a child spider.
    #domain_name = 'example.com'
    #mailman_url = 'http://www.example.com/mailman'
    #passwords = {
    #   'users': 'pass_for_users_list',
    #   'customers': 'pass_for_customers_list', }

    rules = (
        # This rule allows scraping the start response and getting the letter links
        Rule(SgmlLinkExtractor(allow=(r'/members/list$', )), 'parse_items', follow=True),
        # This rule allows going deep into the per-letter pages and their many chunks
        Rule(SgmlLinkExtractor(allow=(r'/members\?letter=.', )), 'parse_items', follow=True),
    )

    def start_requests(self):
        """Return the initial login requests, one per list/password pair in 'passwords'.

        Each request POSTs the admin password to
        ``<mailman_url>/admin/<list>_<domain_name>/members/list`` and is
        handled by ``self.parse``.
        """
        return [
            FormRequest(
                '%s/admin/%s_%s/members/list' % (self.mailman_url, list_name, self.domain_name),
                formdata={'adminpw': password}, method='POST', callback=self.parse)
            for list_name, password in self.passwords.items()
        ]

    def parse_items(self, response):
        """Return one MailmanItem per member email found in the response.

        The list name is the lower-cased text of the first link inside the
        page's <address> element; an IndexError is raised if the page has no
        such link. Emails are the texts of links whose href contains
        '--at--'.
        """
        xs = HtmlXPathSelector(response)
        # Use the .x() query method for both lookups (previously one of them
        # called the selector object directly).
        listname = xs.x("//address/a[1]/text()").extract()[0].lower()

        items = []
        for email in xs.x("//a[contains(@href,'--at--')]/text()").extract():
            item = MailmanItem()
            item['domain_name'] = self.domain_name
            item['listname'] = listname
            item['email'] = email
            items.append(item)
        return items

# Uncomment this if not using a child spider
#SPIDER = MailmanSpider()

SilverStripe CMS demo spider with login handling

This example spider shows how to submit login forms:

scrapy-ctl.py runspider silverstripe.py
from scrapy.spider import BaseSpider
from scrapy.http import FormRequest
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item, Field


class TOCSpider(BaseSpider):
    """Spider for the SilverStripe demo admin that logs in when required.

    Every response handled by ``parse`` is checked for the login form: if
    the form is present the credentials are posted, otherwise the entries
    of the 'SubsitesSelect' drop-down are scraped.
    """

    domain_name = "demo.silverstripe.com"
    start_urls = ['http://demo.silverstripe.com/admin/']

    def parse(self, response):
        """Dispatch to ``login`` or ``get_section_links`` for this response."""
        login_form = HtmlXPathSelector(response).select(
            "//form[@id='UsernameLoginForm_LoginForm']")
        if not login_form:
            # No login form on the page: treat it as the logged-in view.
            return self.get_section_links(response)
        return self.login(response)

    def login(self, response):
        """Return a request that submits the login form found in *response*.

        The resulting response is routed back to ``parse``.
        """
        self.log("Login page... Posting username & password")
        # NOTE(review): dont_filter=True presumably keeps the duplicate
        # filter from dropping this request -- confirm.
        return FormRequest.from_response(
            response,
            formdata={'Username': 'admin', 'Password': 'password'},
            callback=self.parse,
            dont_filter=True)

    def get_section_links(self, response):
        """Yield a MyItem for each <option> under the 'SubsitesSelect' select.

        Each item carries the option's ``value`` attribute and its text.
        """
        self.log("Logged in... Grabbing links...")
        options = HtmlXPathSelector(response).select(
            "//select[@id='SubsitesSelect']//option")
        for option in options:
            yield MyItem(
                value=option.select("@value").extract()[0],
                text=option.select("text()").extract()[0])


class MyItem(Item):
    """Item for one <option> scraped by TOCSpider.get_section_links."""
    # The option's "value" attribute.
    value = Field()
    # The option's text content.
    text = Field()


# Module-level spider instance.
# NOTE(review): presumably this is the name the runspider command looks up
# in the module -- confirm against the Scrapy revision in use.
SPIDER = TOCSpider()