initial DockerhubDockerRegistrySpider scraping logic for the search page
jonasfroeller committed Jun 30, 2024

Verified: this commit was created on GitHub.com and signed with GitHub's verified signature.
1 parent 0bb9f3d commit 6c83142
Showing 7 changed files with 312 additions and 21 deletions.
4 changes: 4 additions & 0 deletions neptun_webscraper/__init__.py
@@ -0,0 +1,4 @@
from .spiders.dockerhub import DockerhubDockerRegistrySpider
from .spiders.quay import QuayDockerRegistrySpider

__all__ = ['DockerhubDockerRegistrySpider', 'QuayDockerRegistrySpider']
34 changes: 33 additions & 1 deletion neptun_webscraper/cli.py
@@ -8,6 +8,10 @@
- Import things from your .base module
"""

import argparse
from scrapy.crawler import CrawlerProcess
from neptun_webscraper.spiders.dockerhub import DockerhubDockerRegistrySpider
from neptun_webscraper.spiders.quay import QuayDockerRegistrySpider

def main(): # pragma: no cover
    """
@@ -24,5 +28,33 @@ def main(): # pragma: no cover
        * Run a command line application (Click, Typer, ArgParse)
        * List all available tasks
        * Run an application (Flask, FastAPI, Django, etc.)
    ---
    Choose between different spiders.
    Examples:
    ```
    python -m neptun_webscraper dockerhub --query=python
    ```
    ```
    python -m neptun_webscraper quay --query=python
    ```
    """
    print("This will do something")

    parser = argparse.ArgumentParser(description="Neptune WebScraper CLI")
    parser.add_argument("spider", choices=["dockerhub", "quay"], help="Choose the spider to run")
    parser.add_argument("--query", default="", help="Search query for the registry")
    args = parser.parse_args()

    process = CrawlerProcess()

    if args.spider == "dockerhub":
        spider = DockerhubDockerRegistrySpider
        start_url = f"https://hub.docker.com/search?q={args.query}"
    elif args.spider == "quay":
        spider = QuayDockerRegistrySpider
        start_url = f"https://quay.io/search?q={args.query}"

    process.crawl(spider, start_urls=[start_url])
    process.start()
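A note on the design: `process.crawl(spider, start_urls=[start_url])` relies on Scrapy forwarding extra keyword arguments to the spider's constructor, and `scrapy.Spider` stores unknown keyword arguments as instance attributes, so the per-run search URL overrides any class-level `start_urls`. A small illustrative check (the `Demo` spider and URL below are made up for the example):

```python
import scrapy


class Demo(scrapy.Spider):
    name = "demo"


# Keyword arguments passed at construction time (or via process.crawl)
# end up as attributes on the spider instance.
spider = Demo(start_urls=["https://example.com"])
print(spider.start_urls)  # ['https://example.com']
```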
10 changes: 0 additions & 10 deletions neptun_webscraper/dockerhub_registry_spider.py

This file was deleted.

10 changes: 0 additions & 10 deletions neptun_webscraper/quay_registry_spider.py

This file was deleted.

0 changes: 0 additions & 0 deletions neptun_webscraper/spiders/__init__.py

Empty file.
266 changes: 266 additions & 0 deletions neptun_webscraper/spiders/dockerhub.py

Large diffs are not rendered by default.
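Since the 266-line spiders/dockerhub.py is not rendered above, the sketch below gives a rough idea of what a minimal Scrapy spider for the Docker Hub search page might look like. The selectors, field names, and result-card structure are assumptions rather than the committed implementation, and Docker Hub's search results are largely rendered client-side, so the real logic may differ substantially.

```python
import scrapy


class DockerhubDockerRegistrySpider(scrapy.Spider):
    # Sketch only -- NOT the committed 266-line implementation.
    # The spider name is assumed by analogy with the Quay spider below.
    name = "dockerhubDockerRegistry"

    def parse(self, response):
        # Assumed markup: each search result is an anchor card linking to a repository.
        for card in response.css("a[data-testid='searchResult']"):
            yield {
                "name": card.css("h4::text").get(),
                "url": response.urljoin(card.attrib.get("href", "")),
                "description": card.css("p::text").get(),
            }
```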

9 changes: 9 additions & 0 deletions neptun_webscraper/spiders/quay.py
@@ -0,0 +1,9 @@
import scrapy


class QuayDockerRegistrySpider(scrapy.Spider):
name = "quayDockerRegistry"

def parse(self, response):
# TODO: Extract data from Quay.io search results
item = {}

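The Quay spider is still a stub: `parse` builds an empty dict and never yields it, so a crawl currently produces no items. Once the search-result markup is known, the TODO might be filled roughly as follows; the selector and field names here are guesses, not part of this commit:

```python
import scrapy


class QuayDockerRegistrySpider(scrapy.Spider):
    name = "quayDockerRegistry"

    def parse(self, response):
        # Assumed structure of Quay.io search results -- selectors are placeholders.
        for result in response.css("div.search-result"):
            yield {
                "name": result.css("a::text").get(),
                "url": response.urljoin(result.css("a::attr(href)").get("")),
            }
```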