index.html

---
layout: default
title: A Fast and Powerful Scraping and Web Crawling Framework
---
{% comment %}
  Shortcuts into site.data.scrapy for the release entries.
  `stable` is read further down this page (stable.version in the docs link).
  `oldstable` and `devel` are not referenced in this file; presumably they are
  consumed by the included partials - TODO confirm before removing.
{% endcomment %}
{% assign stable = site.data.scrapy.stable %}
{% assign oldstable = site.data.scrapy.oldstable %}
{% assign devel = site.data.scrapy.development %}

{% comment %}
  First row. Left block: the empty #scrapy-logo div, the two tagline
  paragraphs and the badges-bar.html include. Right block: the
  download-button.html include.
{% endcomment %}
<div class="container">
  <div class="first-row">
    <div class="block-left">
      <div id="scrapy-logo"></div>
      <p>An open source and collaborative framework for extracting the data you need from websites.</p>
      <p>In a fast, simple, yet extensible way.</p>
      {% include badges-bar.html %}
    </div>
    <div class="block-right">
      {% include download-button.html %}
    </div>
  </div>
</div>

<div class="second-row">
  {% comment %}
    First terminal mock-up: installs Scrapy, writes myspider.py through a shell
    heredoc and runs it with `scrapy runspider`.
    Everything between <pre> and </pre> is whitespace-sensitive, so those lines
    stay flush left and no comments are placed inside it.
    The dollar-sign prompt icons are purely decorative, so they carry
    aria-hidden="true" to keep screen readers from announcing the icon glyph.
  {% endcomment %}
  <div class="container code-box-line">
    <div class="code-box">
      <div class="box-header">
        <p>Terminal<span class="close-btn">&bull;</span></p>
      </div>
      <div class="box-code tab-page active-page">
        <pre>
<span class="prompt" onselectstart="return false"><i class="fa fa-dollar" aria-hidden="true"></i></span> pip install scrapy
<span class="prompt" onselectstart="return false"><i class="fa fa-dollar" aria-hidden="true"></i></span> cat > myspider.py &lt;&lt;EOF
{% highlight python %}
import scrapy

class BlogSpider(scrapy.Spider):
    name = 'blogspider'
    start_urls = ['https://blog.scrapinghub.com']

    def parse(self, response):
        for url in response.css('ul li a::attr("href")').re('.*/category/.*'):
            yield scrapy.Request(response.urljoin(url), self.parse_titles)

    def parse_titles(self, response):
        for post_title in response.css('div.entries > ul > li a::text').extract():
            yield {'title': post_title}
{% endhighlight %}EOF
<span class="prompt" onselectstart="return false"><i class="fa fa-dollar" aria-hidden="true"></i></span> scrapy runspider myspider.py
</pre>
      </div>
    </div>

    <div class="code-subs"><p>Build and run your<br /><span class="highlight">web spiders</span></p></div>

  </div>

  {% comment %}
    Second terminal mock-up: shub login, deploy, schedule and items, followed by
    a sample of the scraped output.
    Everything between <pre> and </pre> is whitespace-sensitive, so those lines
    stay flush left and no comments are placed inside it.
    The "shub deploy" line used to end with a closing span tag that had no
    matching opener; it has been removed so the markup is balanced.
    The dollar-sign prompt icons are purely decorative, so they carry
    aria-hidden="true" to keep screen readers from announcing the icon glyph.
  {% endcomment %}
  <div class="container code-box-line">
    <div class="code-box">
      <div class="box-header">
        <p>Terminal<span class="close-btn">&bull;</span></p>
      </div>
      <div class="box-code tab-page active-page">
        <pre>
<span class="prompt" onselectstart="return false"><i class="fa fa-dollar" aria-hidden="true"></i></span> shub login
<span class="comments">Insert your Scrapinghub API Key: <span class="placeholder">&lt;API_KEY&gt;</span></span>

<span class="comments"># Deploy the spider to Scrapy Cloud</span>
<span class="prompt" onselectstart="return false"><i class="fa fa-dollar" aria-hidden="true"></i></span> shub deploy

<span class="comments"># Schedule the spider for execution</span>
<span class="prompt" onselectstart="return false"><i class="fa fa-dollar" aria-hidden="true"></i></span> shub schedule blogspider <span class="comments">
Spider blogspider scheduled, watch it running here:
https://app.scrapinghub.com/p/26731/job/1/8</span>

<span class="comments"># Retrieve the scraped data</span>
<span class="prompt" onselectstart="return false"><i class="fa fa-dollar" aria-hidden="true"></i></span> shub items 26731/1/8
{% highlight python %}
{"title": "Black Friday, Cyber Monday: Are They Worth It?"}
{"title": "Tips for Creating a Cohesive Company Culture Remotely"}
...
{% endhighlight %}</pre>
      </div>
    </div>

    <div class="code-subs"><p>Deploy them to<br /><a href="http://scrapinghub.com/scrapy-cloud/"><span class="highlight">Scrapy Cloud</span></a></p>
    <p class="sub-sub">or use <a href="https://github.com/scrapy/scrapyd" title="Scrapyd"><span class="highlight">Scrapyd</span></a> to host the spiders on your own server</p></div>
  </div>

</div>

{% comment %}
  Third row: three feature blocks, each made of an icon, a heading and a
  one-line description. The icons only decorate the headings next to them, so
  they carry aria-hidden="true" and assistive technology reads the text alone.
{% endcomment %}
<div class="container">

  <div class="third-row">
    <div class="block-01">
      <i class="fa fa-flash fa-4x" aria-hidden="true"> </i>
      <h3>Fast and powerful</h3>
      <p>write the rules to extract the data and let Scrapy do the rest</p>
    </div>
    <div class="block-02">
      <i class="fa fa-puzzle-piece fa-4x" aria-hidden="true"> </i>
      <h3>Easily extensible</h3>
      <p>extensible by design, plug new functionality easily without having to touch the core</p>
    </div>
    <div class="block-03">
      <i class="fa fa-cubes fa-4x" aria-hidden="true"> </i>
      <h3>Portable, Python</h3>
      <p>written in Python and runs on Linux, Windows, Mac and BSD</p>
    </div>
  </div>
</div>

{% comment %}
  Fourth row. Left block: community figures; the counts are hard-coded text in
  this template, not generated. Right block: further-reading links.
  The docs link is versioned through stable.version.
  The companies link is built from site.baseurl, like the script URL at the
  bottom of the page, so it keeps working when the site is served from a
  sub-path.
{% endcomment %}
<div class="fourth-row">
  <div class="container">
    <div class="block-left">
      <h2>Healthy community</h2>
      <ul>
        <li>- 13.8k stars, 3.8k forks and 1.1k watchers on <a href="https://github.com/scrapy/scrapy">GitHub</a></li>
        <li>- 2.8k followers on <a href="https://twitter.com/ScrapyProject">Twitter</a></li>
        <li>- 5.3k questions on <a href="https://stackoverflow.com/tags/scrapy/info">StackOverflow</a></li>
        <li>- 2.8k members on <a href="https://groups.google.com/forum/?fromgroups#!aboutgroup/scrapy-users">mailing list</a></li>
      </ul>
    </div>
    <div class="block-right">
      <h2>Want to know more?</h2>
      <ul>
        <li><a href="https://doc.scrapy.org/en/{{ stable.version }}/intro/overview.html">- Discover Scrapy at a glance</a></li>
        <li><a href="{{ "/companies/" | prepend: site.baseurl }}">- Meet the companies using Scrapy</a></li>
      </ul>

    </div>
  </div>
</div>

<script src="{{ "/js/simple-tabs-pure-js.js" | prepend: site.baseurl }}"></script>

<script>
// Set up the tab widget once the page has finished loading.
// SimpleTabs is not defined in this file; it is expected to come from
// simple-tabs-pure-js.js loaded just above (TODO confirm).
// addEventListener is used instead of assigning window.onload so that any
// other load handler on the page is left in place.
window.addEventListener('load', function () {
  var tabsRoot = document.getElementById('code-tabs');
  // Only initialise when the page actually contains a #code-tabs element;
  // otherwise SimpleTabs would be handed null.
  if (tabsRoot) {
    new SimpleTabs(tabsRoot);
  }
});
</script>