Merge pull request #559 from realpython/scrapy-mongodb
Add project code for Scrapy and MongoDB overhaul
Showing 13 changed files with 550 additions and 0 deletions.
@@ -0,0 +1,41 @@
# Web Scraping With Scrapy and MongoDB

[Web Scraping With Scrapy and MongoDB](https://realpython.com/web-scraping-with-scrapy-and-mongodb/) is an example project that builds a robust web scraper for static sites using Scrapy and MongoDB.

## Installation and Setup

1. Create a Python virtual environment:

```sh
$ python -m venv ./venv
$ source venv/bin/activate
(venv) $
```

2. Install the requirements:

```sh
(venv) $ pip install -r requirements.txt
```

You'll also need to [set up a MongoDB collection](https://realpython.com/web-scraping-with-scrapy-and-mongodb/#set-up-a-mongodb-collection-on-your-computer) as described in the tutorial.
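
If you want to confirm that MongoDB is reachable before you crawl, you can ping it with `pymongo` — a minimal sketch, assuming the default local connection string and the `books_db` database name that this project's `settings.py` uses:

```python
import pymongo

client = pymongo.MongoClient("mongodb://localhost:27017")
client.admin.command("ping")  # raises an error if the server is unreachable
print(client["books_db"].list_collection_names())  # includes "books" once set up
```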

## Run the Scraper

Navigate into the `books/` project directory.

Then you can start crawling the site:

```sh
(venv) $ scrapy crawl book
```

If set up correctly, this will populate your MongoDB collection with the book information scraped from the example site.
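
To spot-check what landed in the database, you can query the collection directly with `pymongo` — a short sketch, assuming the `books_db` database and `books` collection names from the project's settings and pipeline:

```python
import pymongo

collection = pymongo.MongoClient("mongodb://localhost:27017")["books_db"]["books"]

print(collection.count_documents({}))  # 1000 after a full crawl of the example site
print(collection.find_one())  # one stored book with _id, url, title, and price
```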

## About the Author

Martin Breuss - Email: [email protected]

## License

Distributed under the MIT license. See `LICENSE` for more information.
@@ -0,0 +1,8 @@
import scrapy


class BooksItem(scrapy.Item):
    _id = scrapy.Field()
    url = scrapy.Field()
    title = scrapy.Field()
    price = scrapy.Field()
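
A `scrapy.Item` works like a dictionary restricted to its declared fields, and `ItemAdapter` (which the pipeline below uses) exposes it through a uniform dict-like interface. A small illustration with made-up values:

```python
from itemadapter import ItemAdapter

from books.items import BooksItem

item = BooksItem()
item["url"] = "catalogue/some-book_1/index.html"  # hypothetical values
item["title"] = "Some Book"
item["price"] = "£19.99"

# Assigning an undeclared field, e.g. item["author"], would raise a KeyError.
print(ItemAdapter(item).asdict())
# {'url': 'catalogue/some-book_1/index.html', 'title': 'Some Book', 'price': '£19.99'}
```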
101 changes: 101 additions & 0 deletions
web-scraping-with-scrapy-and-mongodb/books/books/middlewares.py
@@ -0,0 +1,101 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class BooksSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


class BooksDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
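
Both classes are the stubs that `scrapy startproject` generates, and they stay inactive unless you opt in. If you wanted them to run, you'd enable them in `settings.py` — a sketch (not part of this commit) using the template's default priority of 543:

```python
# In books/settings.py -- only needed if you want these stubs to run.
SPIDER_MIDDLEWARES = {
    "books.middlewares.BooksSpiderMiddleware": 543,
}
DOWNLOADER_MIDDLEWARES = {
    "books.middlewares.BooksDownloaderMiddleware": 543,
}
```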
42 changes: 42 additions & 0 deletions
web-scraping-with-scrapy-and-mongodb/books/books/pipelines.py
@@ -0,0 +1,42 @@
import hashlib

import pymongo
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem


class MongoPipeline:
    COLLECTION_NAME = "books"

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get("MONGO_URI"),
            mongo_db=crawler.settings.get("MONGO_DATABASE"),
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        item_id = self.compute_item_id(item)
        if self.db[self.COLLECTION_NAME].find_one({"_id": item_id}):
            raise DropItem(f"Duplicate item found: {item}")
        else:
            item["_id"] = item_id
            self.db[self.COLLECTION_NAME].insert_one(
                ItemAdapter(item).asdict()
            )
            return item

    def compute_item_id(self, item):
        url = item["url"]
        return hashlib.sha256(url.encode("utf-8")).hexdigest()
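
The pipeline deduplicates on a SHA-256 hash of each book's URL, so re-running the crawl skips books that are already stored. The `_id` is stable because the same URL always hashes to the same digest — a quick illustration with a hypothetical URL:

```python
import hashlib

url = "catalogue/a-light-in-the-attic_1000/index.html"  # hypothetical URL
item_id = hashlib.sha256(url.encode("utf-8")).hexdigest()
print(item_id)  # the same URL always yields the same 64-character hex digest
```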
33 changes: 33 additions & 0 deletions
web-scraping-with-scrapy-and-mongodb/books/books/settings.py
@@ -0,0 +1,33 @@
# Scrapy settings for books project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings by consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "books"

SPIDER_MODULES = ["books.spiders"]
NEWSPIDER_MODULE = "books.spiders"

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    "books.pipelines.MongoPipeline": 300,
}

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

MONGO_URI = "mongodb://localhost:27017"
MONGO_DATABASE = "books_db"

LOG_LEVEL = "WARNING"
LOG_FILE = "book_scraper.log"
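
Because `MONGO_URI` and `MONGO_DATABASE` are ordinary Scrapy settings, `MongoPipeline.from_crawler()` receives them through `crawler.settings`. The same settings also apply if you run the spider programmatically — a sketch, assuming it's executed from the `books/` project directory so that `scrapy.cfg` is found:

```python
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Loads settings.py (including MONGO_URI and MONGO_DATABASE) via scrapy.cfg.
process = CrawlerProcess(get_project_settings())
process.crawl("book")  # the spider's `name` attribute
process.start()  # blocks until the crawl finishes
```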
4 changes: 4 additions & 0 deletions
web-scraping-with-scrapy-and-mongodb/books/books/spiders/__init__.py
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
44 changes: 44 additions & 0 deletions
web-scraping-with-scrapy-and-mongodb/books/books/spiders/book.py
@@ -0,0 +1,44 @@
import scrapy

from books.items import BooksItem


class BookSpider(scrapy.Spider):
    name = "book"
    allowed_domains = ["books.toscrape.com"]
    start_urls = ["https://books.toscrape.com/"]

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url, callback=self.parse, errback=self.log_error
            )

    def parse(self, response):
        """
        @url https://books.toscrape.com
        @returns items 20 20
        @returns request 1 50
        @scrapes url title price
        """
        for book in response.css("article.product_pod"):
            item = BooksItem()
            item["url"] = book.css("h3 > a::attr(href)").get()
            item["title"] = book.css("h3 > a::attr(title)").get()
            item["price"] = book.css(".price_color::text").get()
            yield item

        next_page = response.css("li.next > a::attr(href)").get()
        if next_page:
            next_page_url = response.urljoin(next_page)
            self.logger.info(
                f"Navigating to next page with URL {next_page_url}."
            )
            yield scrapy.Request(
                url=next_page_url,
                callback=self.parse,
                errback=self.log_error,
            )

    def log_error(self, failure):
        self.logger.error(repr(failure))
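
The `@`-annotations in the `parse()` docstring are Scrapy contracts, which `scrapy check book` runs against the live site. Since `parse()` is just a generator over a response, you can also exercise it offline — a sketch with hand-written HTML, so the markup and values are illustrative only:

```python
from scrapy.http import HtmlResponse

from books.spiders.book import BookSpider

# Hand-crafted HTML mimicking a single product card on books.toscrape.com.
html = b"""
<article class="product_pod">
  <h3><a href="catalogue/some-book_1/index.html" title="Some Book">Some...</a></h3>
  <p class="price_color">&pound;19.99</p>
</article>
"""

response = HtmlResponse(url="https://books.toscrape.com/", body=html, encoding="utf-8")
items = list(BookSpider().parse(response))
print(items[0]["title"], items[0]["price"])  # Some Book £19.99
```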
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = books.settings

[deploy]
#url = http://localhost:6800/
project = books