From 3b2ad6da1615c0db92451592e2d6529bfa1ea086 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20Busso?=
Date: Tue, 26 Mar 2024 13:57:35 +1300
Subject: [PATCH] add fake user agent and scrapeops proxy

---
 poetry.lock              | 15 +++++++++++++--
 pyproject.toml           |  3 ++-
 scrapework/middleware.py | 22 +++++++++++++++++++++-
 scrapework/request.py    | 13 ++++++++-----
 4 files changed, 44 insertions(+), 9 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index 40679cd..77bcf63 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
 
 [[package]]
 name = "annotated-types"
@@ -297,6 +297,17 @@ calendars = ["convertdate", "hijri-converter"]
 fasttext = ["fasttext"]
 langdetect = ["langdetect"]
 
+[[package]]
+name = "fake-useragent"
+version = "1.5.1"
+description = "Up-to-date simple useragent faker with real world database"
+optional = false
+python-versions = "*"
+files = [
+    {file = "fake-useragent-1.5.1.tar.gz", hash = "sha256:6387269f5a2196b5ba7ed8935852f75486845a1c95c50e72460e6a8e762f5c49"},
+    {file = "fake_useragent-1.5.1-py3-none-any.whl", hash = "sha256:57415096557c8a4e23b62a375c21c55af5fd4ba30549227f562d2c4f5b60e3b3"},
+]
+
 [[package]]
 name = "h11"
 version = "0.14.0"
@@ -1161,4 +1172,4 @@ files = [
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.11"
-content-hash = "25c3effe76005c9c9613f0b2d55b7bba1d661c233e2e49ebfc5e49007ae8fa13"
+content-hash = "4fe056cbd82b4d5da7d159e58b0cc8d69cbbe82359907d5435a745bb55d7cbb8"
diff --git a/pyproject.toml b/pyproject.toml
index 2a8cf15..2e225cf 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "scrapework"
-version = "0.3.2"
+version = "0.3.3"
 description = "simple scraping framework"
 authors = ["Stéphane Busso "]
 license = "MIT"
@@ -20,6 +20,7 @@ trafilatura = "^1.8.0"
 httpx = "^0.27.0"
 hishel = "^0.0.24"
 rich = "^13.7.1"
+fake-useragent = "^1.5.1"
 
 [tool.poetry.group.dev.dependencies]
 black = "^24.3.0"
diff --git a/scrapework/middleware.py b/scrapework/middleware.py
index 79db60f..a3130df 100644
--- a/scrapework/middleware.py
+++ b/scrapework/middleware.py
@@ -3,6 +3,8 @@
 from typing import List
 from urllib.parse import urlencode
 
+from fake_useragent import UserAgent
+
 from scrapework.core.context import Context
 from scrapework.module import Module
 from scrapework.request import Request
@@ -67,8 +69,26 @@ def __init__(self, api_key: str):
 
     def process_request(self, ctx: Context, request: Request):
         payload = {"api_key": self.api_key, "url": request.url}
-        request.proxy = "https://proxy.scrapeops.io/v1/" + urlencode(payload)
+        # request.client_kwargs["proxy"] = "https://proxy.scrapeops.io/v1/?" + urlencode(
+        #     payload
+        # )
+
+        request.request_url = "https://proxy.scrapeops.io/v1/?" + urlencode(payload)
+
+        # self.logger.debug(f"Making request to {request.url}")
+
+        return request
+
+
+class FakeUserAgentMiddleware(RequestMiddleware):
+    ua: UserAgent
+
+    def __init__(self) -> None:
+        super().__init__()
+        self.ua = UserAgent()
+
+    def process_request(self, ctx: Context, request: Request):
+        request.headers.update({"User-Agent": self.ua.random})
 
         return request
diff --git a/scrapework/request.py b/scrapework/request.py
index 4fa18b5..1650f13 100644
--- a/scrapework/request.py
+++ b/scrapework/request.py
@@ -3,7 +3,7 @@
 from typing import Any, Dict
 
 import httpx
-from httpx import HTTPError, TimeoutException
+from httpx import URL, HTTPError, TimeoutException
 
 from scrapework.core.context import Context
 from scrapework.core.logger import Logger
@@ -26,6 +26,7 @@ def build_client(cls, **kwargs) -> httpx.Client:
 
 class Request:
     url: str
+    request_url: str
     logger: logging.Logger
     headers: Dict[str, str] = {}
     timeout: int = 10
@@ -38,11 +39,12 @@ class Request:
 
     def __init__(self, url: str, **kwargs):
         self.url = url
+        self.request_url = url
         self.logger = kwargs.get("logger", logging.getLogger("request"))
         self.headers = kwargs.get("headers", {})
         self.timeout = kwargs.get("timeout", 10)
         self.follow_redirects = kwargs.get("follow_redirects", False)
-        self.proxy = kwargs.get("proxy", None)
+        # self.proxy = kwargs.get("proxy", None)
         self.retries = kwargs.get("retries", 0)
         self.cls_client = kwargs.get("cls_client", HttpxClient)
         self.client_kwargs = kwargs.get("client_kwargs", {})
@@ -61,6 +63,7 @@ def fetch(self) -> httpx.Response:
         :return: The fetched HTML content as a string, or None if there was an error.
         """
         if self.proxy:
+            self.logger.debug(f"Using proxy: {self.proxy}")
             mounts = {
                 "https://": httpx.HTTPTransport(proxy=self.proxy),
                 "http://": httpx.HTTPTransport(proxy=self.proxy),
@@ -75,12 +78,12 @@ def fetch(self) -> httpx.Response:
             **self.client_kwargs,
         )
         try:
-
-            response = client.get(
-                self.url,
+            response: httpx.Response = client.get(
+                self.request_url,
                 **self.request_kwargs,
            )
 
+            response.request.url = URL(self.url)
             return response
 
         except TimeoutException as err:
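
Usage sketch (illustrative, not part of the commit): the snippet below reproduces
with plain httpx what the two middlewares now do to a Request — pick a random
real-world User-Agent via fake-useragent, and rewrite the URL actually fetched
(request.request_url) to go through the ScrapeOps proxy endpoint while the original
URL is kept for downstream parsing. SCRAPEOPS_API_KEY and TARGET_URL are
placeholder names, not identifiers from the codebase.

    from urllib.parse import urlencode

    import httpx
    from fake_useragent import UserAgent

    SCRAPEOPS_API_KEY = "your-api-key"   # placeholder; passed to the proxy middleware as api_key
    TARGET_URL = "https://example.com"   # placeholder target, stays in request.url

    # FakeUserAgentMiddleware: one random real-world User-Agent per request
    headers = {"User-Agent": UserAgent().random}

    # ScrapeOps proxy middleware: the fetch goes to the proxy endpoint with the
    # original URL passed as a query parameter
    payload = {"api_key": SCRAPEOPS_API_KEY, "url": TARGET_URL}
    request_url = "https://proxy.scrapeops.io/v1/?" + urlencode(payload)

    response = httpx.get(request_url, headers=headers, timeout=10)
    print(response.status_code, len(response.text))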