initial DockerhubDockerRegistrySpider scraping logic for the search page
jonasfroeller committed Jun 30, 2024

Verified: this commit was created on GitHub.com and signed with GitHub's verified signature.
1 parent 0bb9f3d commit 6c83142
Showing 7 changed files with 312 additions and 21 deletions.
4 changes: 4 additions & 0 deletions neptun_webscraper/__init__.py
@@ -0,0 +1,4 @@
from .spiders.dockerhub import DockerhubDockerRegistrySpider
from .spiders.quay import QuayDockerRegistrySpider

__all__ = ['DockerhubDockerRegistrySpider', 'QuayDockerRegistrySpider']
34 changes: 33 additions & 1 deletion neptun_webscraper/cli.py
@@ -8,6 +8,10 @@
- Import things from your .base module
"""

import argparse
from scrapy.crawler import CrawlerProcess
from neptun_webscraper.spiders.dockerhub import DockerhubDockerRegistrySpider
from neptun_webscraper.spiders.quay import QuayDockerRegistrySpider

def main(): # pragma: no cover
    """
@@ -24,5 +28,33 @@ def main(): # pragma: no cover
        * Run a command line application (Click, Typer, ArgParse)
        * List all available tasks
        * Run an application (Flask, FastAPI, Django, etc.)
    ---
    Choose between different spiders.
    Examples:
    ```
    python -m neptun_webscraper dockerhub --query=python
    ```
    ```
    python -m neptun_webscraper quay --query=python
    ```
    """
    print("This will do something")

    parser = argparse.ArgumentParser(description="Neptune WebScraper CLI")
    parser.add_argument("spider", choices=["dockerhub", "quay"], help="Choose the spider to run")
    parser.add_argument("--query", default="", help="Search query for the registry")
    args = parser.parse_args()

    process = CrawlerProcess()

    if args.spider == "dockerhub":
        spider = DockerhubDockerRegistrySpider
        start_url = f"https://hub.docker.com/search?q={args.query}"
    elif args.spider == "quay":
        spider = QuayDockerRegistrySpider
        start_url = f"https://quay.io/search?q={args.query}"

    process.crawl(spider, start_urls=[start_url])
    process.start()
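A note on the design: `process.crawl(spider, start_urls=[start_url])` relies on Scrapy forwarding extra keyword arguments to the spider's constructor, and `scrapy.Spider` stores unknown keyword arguments as instance attributes, so the per-run search URL overrides any class-level `start_urls`. A small illustrative check (the `Demo` spider and URL below are made up for the example):

```python
import scrapy


class Demo(scrapy.Spider):
    name = "demo"


# Keyword arguments passed at construction time (or via process.crawl)
# end up as attributes on the spider instance.
spider = Demo(start_urls=["https://example.com"])
print(spider.start_urls)  # ['https://example.com']
```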
10 changes: 0 additions & 10 deletions neptun_webscraper/dockerhub_registry_spider.py

This file was deleted.

10 changes: 0 additions & 10 deletions neptun_webscraper/quay_registry_spider.py

This file was deleted.

0 changes: 0 additions & 0 deletions neptun_webscraper/spiders/__init__.py

Empty file.
266 changes: 266 additions & 0 deletions neptun_webscraper/spiders/dockerhub.py

Large diffs are not rendered by default.
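Since the 266-line spiders/dockerhub.py is not rendered above, the sketch below gives a rough idea of what a minimal Scrapy spider for the Docker Hub search page might look like. The selectors, field names, and result-card structure are assumptions rather than the committed implementation, and Docker Hub's search results are largely rendered client-side, so the real logic may differ substantially.

```python
import scrapy


class DockerhubDockerRegistrySpider(scrapy.Spider):
    # Sketch only -- NOT the committed 266-line implementation.
    # The spider name is assumed by analogy with the Quay spider below.
    name = "dockerhubDockerRegistry"

    def parse(self, response):
        # Assumed markup: each search result is an anchor card linking to a repository.
        for card in response.css("a[data-testid='searchResult']"):
            yield {
                "name": card.css("h4::text").get(),
                "url": response.urljoin(card.attrib.get("href", "")),
                "description": card.css("p::text").get(),
            }
```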

9 changes: 9 additions & 0 deletions neptun_webscraper/spiders/quay.py
@@ -0,0 +1,9 @@
import scrapy


class QuayDockerRegistrySpider(scrapy.Spider):
name = "quayDockerRegistry"

def parse(self, response):
# TODO: Extract data from Quay.io search results
item = {}

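The Quay spider is still a stub: `parse` builds an empty dict and never yields it, so a crawl currently produces no items. Once the search-result markup is known, the TODO might be filled roughly as follows; the selector and field names here are guesses, not part of this commit:

```python
import scrapy


class QuayDockerRegistrySpider(scrapy.Spider):
    name = "quayDockerRegistry"

    def parse(self, response):
        # Assumed structure of Quay.io search results -- selectors are placeholders.
        for result in response.css("div.search-result"):
            yield {
                "name": result.css("a::text").get(),
                "url": response.urljoin(result.css("a::attr(href)").get("")),
            }
```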