add fake user agent and scrapeops proxy
sbusso committed Mar 26, 2024
1 parent 7adeeb0 commit 3b2ad6d
Showing 4 changed files with 44 additions and 9 deletions.
15 changes: 13 additions & 2 deletions poetry.lock

Some generated files are not rendered by default.

3 changes: 2 additions & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "scrapework"
version = "0.3.2"
version = "0.3.3"
description = "simple scraping framework"
authors = ["Stéphane Busso <[email protected]>"]
license = "MIT"
@@ -20,6 +20,7 @@ trafilatura = "^1.8.0"
httpx = "^0.27.0"
hishel = "^0.0.24"
rich = "^13.7.1"
+fake-useragent = "^1.5.1"

[tool.poetry.group.dev.dependencies]
black = "^24.3.0"
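
The new runtime dependency is fake-useragent, which supplies randomized real-browser User-Agent strings. A minimal sketch of the API the new middleware below relies on (version pin as above; the printed value varies per call):

from fake_useragent import UserAgent

ua = UserAgent()   # loads its database of browser User-Agent strings
print(ua.random)   # a random User-Agent, e.g. "Mozilla/5.0 (Windows NT 10.0; ...)"
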
22 changes: 21 additions & 1 deletion scrapework/middleware.py
@@ -3,6 +3,8 @@
from typing import List
from urllib.parse import urlencode

+from fake_useragent import UserAgent

from scrapework.core.context import Context
from scrapework.module import Module
from scrapework.request import Request
@@ -67,8 +69,26 @@ def __init__(self, api_key: str):
    def process_request(self, ctx: Context, request: Request):

        payload = {"api_key": self.api_key, "url": request.url}
-        request.proxy = "https://proxy.scrapeops.io/v1/" + urlencode(payload)
+        # request.client_kwargs["proxy"] = "https://proxy.scrapeops.io/v1/?" + urlencode(
+        #     payload
+        # )
+
+        request.request_url = "https://proxy.scrapeops.io/v1/?" + urlencode(payload)
+
+        # self.logger.debug(f"Making request to {request.url}")
+
        return request
+
+
+class FakeUserAgentMiddleware(RequestMiddleware):
+    ua: UserAgent
+
+    def __init__(self) -> None:
+        super().__init__()
+        self.ua = UserAgent()
+
+    def process_request(self, ctx: Context, request: Request):
+        request.headers.update({"User-Agent": self.ua.random})
+        return request


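For reference, the proxied URL this middleware now assigns to request.request_url can be reproduced with the standard library alone; the API key and target URL below are placeholders:

from urllib.parse import urlencode

payload = {"api_key": "YOUR_API_KEY", "url": "https://example.com/page"}
print("https://proxy.scrapeops.io/v1/?" + urlencode(payload))
# https://proxy.scrapeops.io/v1/?api_key=YOUR_API_KEY&url=https%3A%2F%2Fexample.com%2Fpage

And a sketch of the new FakeUserAgentMiddleware in isolation, assuming RequestMiddleware can be constructed without arguments and that ctx is never dereferenced (both hold for the code shown in this diff, but the base class lives outside it):

from scrapework.middleware import FakeUserAgentMiddleware
from scrapework.request import Request

mw = FakeUserAgentMiddleware()
req = mw.process_request(None, Request("https://example.com/page"))  # ctx unused here
print(req.headers["User-Agent"])   # randomized browser User-Agent set by the middleware
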
13 changes: 8 additions & 5 deletions scrapework/request.py
@@ -3,7 +3,7 @@
from typing import Any, Dict

import httpx
-from httpx import HTTPError, TimeoutException
+from httpx import URL, HTTPError, TimeoutException

from scrapework.core.context import Context
from scrapework.core.logger import Logger
@@ -26,6 +26,7 @@ def build_client(cls, **kwargs) -> httpx.Client:

class Request:
    url: str
+    request_url: str
    logger: logging.Logger
    headers: Dict[str, str] = {}
    timeout: int = 10
@@ -38,11 +39,12 @@ class Request:

    def __init__(self, url: str, **kwargs):
        self.url = url
+        self.request_url = url
        self.logger = kwargs.get("logger", logging.getLogger("request"))
        self.headers = kwargs.get("headers", {})
        self.timeout = kwargs.get("timeout", 10)
        self.follow_redirects = kwargs.get("follow_redirects", False)
-        self.proxy = kwargs.get("proxy", None)
+        # self.proxy = kwargs.get("proxy", None)
        self.retries = kwargs.get("retries", 0)
        self.cls_client = kwargs.get("cls_client", HttpxClient)
        self.client_kwargs = kwargs.get("client_kwargs", {})
@@ -61,6 +63,7 @@ def fetch(self) -> httpx.Response:
        :return: The fetched HTML content as a string, or None if there was an error.
        """
        if self.proxy:
+            self.logger.debug(f"Using proxy: {self.proxy}")
            mounts = {
                "https://": httpx.HTTPTransport(proxy=self.proxy),
                "http://": httpx.HTTPTransport(proxy=self.proxy),
@@ -75,12 +78,12 @@
                **self.client_kwargs,
            )
            try:
-                response = client.get(
-                    self.url,
+                response: httpx.Response = client.get(
+                    self.request_url,
                    **self.request_kwargs,
                )
+                response.request.url = URL(self.url)
                return response

            except TimeoutException as err:
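
The url/request_url split means fetch() sends the HTTP call to request_url (which a proxy middleware may have rewritten to the ScrapeOps endpoint) and then restores response.request.url to the original url, so downstream consumers see the logical page address rather than the proxy URL. The proxy mounting itself is plain httpx; a minimal standalone sketch with a placeholder proxy URL (httpx 0.27 API):

import httpx

proxy = "http://user:pass@proxy.example.com:8080"  # placeholder proxy URL

mounts = {
    "https://": httpx.HTTPTransport(proxy=proxy),  # route https:// traffic through the proxy
    "http://": httpx.HTTPTransport(proxy=proxy),   # route http:// traffic through the proxy
}
client = httpx.Client(mounts=mounts)  # every request from this client uses the proxy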
