feat: add transform_request_function for enqueue_links #923

Merged
merged 13 commits into from
Jan 27, 2025
11 changes: 7 additions & 4 deletions docs/introduction/03_adding_more_urls.mdx
@@ -10,6 +10,7 @@ import OriginalCodeExample from '!!raw-loader!./code/03_original_code.py';
import FindingNewLinksExample from '!!raw-loader!./code/03_finding_new_links.py';
import EnqueueStrategyExample from '!!raw-loader!./code/03_enqueue_strategy.py';
import GlobsExample from '!!raw-loader!./code/03_globs.py';
import TransformExample from '!!raw-loader!./code/03_transform_request.py';

Previously you've built a very simple crawler that downloads HTML of a single page, reads its title and prints it to the console. This is the original source code:

@@ -106,11 +107,13 @@ For even more control, you can use the `include` or `exclude` parameters, either
{GlobsExample}
</CodeBlock>

{/* TODO:
### Transform requests
### Transform requests before enqueuing

...
*/}
For cases where you need to modify or filter requests before they are enqueued, you can use the `transform_request_function` parameter. This function takes `RequestOptions` as input and should return either modified `RequestOptions`, the string `'skip'` to exclude the request from being enqueued, or `'unchanged'` to enqueue it without modification.

<CodeBlock className="language-python">
{TransformExample}
</CodeBlock>

## Next steps

41 changes: 41 additions & 0 deletions docs/introduction/code/03_transform_request.py
@@ -0,0 +1,41 @@
from __future__ import annotations

from crawlee import HttpHeaders, RequestOptions, RequestTransformAction
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


def transform_request(request_options: RequestOptions) -> RequestOptions | RequestTransformAction:
    # Skip requests to PDF files
    if request_options['url'].endswith('.pdf'):
        return 'skip'

    if '/docs' in request_options['url']:
        # Add custom headers to requests to specific URLs
        request_options['headers'] = HttpHeaders({'Custom-Header': 'value'})

    elif '/blog' in request_options['url']:
        # Add label for certain URLs
        request_options['label'] = 'BLOG'

    else:
        # Signal that the request should proceed without any transformation
        return 'unchanged'

    return request_options


async def main() -> None:
    crawler = BeautifulSoupCrawler(max_requests_per_crawl=50)

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url}.')

        # Transform the request before enqueueing
        await context.enqueue_links(transform_request_function=transform_request)

    @crawler.router.handler('BLOG')
    async def blog_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Blog Processing {context.request.url}.')

    await crawler.run(['https://crawlee.dev/'])
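
The example stops at `crawler.run`; to execute it as a standalone script you would typically add an asyncio entry point. A minimal sketch, assuming the file above is saved as-is (the guard below is not part of the diff):

```python
import asyncio

if __name__ == '__main__':
    # Run the crawler's asynchronous entry point.
    asyncio.run(main())
```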
15 changes: 12 additions & 3 deletions src/crawlee/__init__.py
@@ -1,10 +1,19 @@
from importlib import metadata

from ._request import Request
from ._request import Request, RequestOptions
from ._service_locator import service_locator
from ._types import ConcurrencySettings, EnqueueStrategy, HttpHeaders
from ._types import ConcurrencySettings, EnqueueStrategy, HttpHeaders, RequestTransformAction
from ._utils.globs import Glob

__version__ = metadata.version('crawlee')

__all__ = ['ConcurrencySettings', 'EnqueueStrategy', 'Glob', 'HttpHeaders', 'Request', 'service_locator']
__all__ = [
'ConcurrencySettings',
'EnqueueStrategy',
'Glob',
'HttpHeaders',
'Request',
'RequestOptions',
'RequestTransformAction',
'service_locator',
]
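
With the expanded `__all__`, the new names are importable from the package root. A minimal sketch of a typed transform callback using them (illustrative only, not part of the diff):

```python
from crawlee import RequestOptions, RequestTransformAction


def drop_pdf_links(request_options: RequestOptions) -> RequestOptions | RequestTransformAction:
    # Skip links to PDF files and leave every other request untouched.
    if request_options['url'].endswith('.pdf'):
        return 'skip'
    return 'unchanged'
```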
151 changes: 58 additions & 93 deletions src/crawlee/_request.py
@@ -3,7 +3,7 @@
from collections.abc import Iterator, MutableMapping
from datetime import datetime
from enum import IntEnum
from typing import TYPE_CHECKING, Annotated, Any, cast
from typing import TYPE_CHECKING, Annotated, Any, TypedDict, cast

from pydantic import BaseModel, BeforeValidator, ConfigDict, Field, PlainSerializer, PlainValidator, TypeAdapter
from yarl import URL
@@ -15,7 +15,7 @@
from crawlee._utils.urls import validate_http_url

if TYPE_CHECKING:
from typing_extensions import Self
from typing_extensions import NotRequired, Required, Self


class RequestState(IntEnum):
@@ -108,27 +108,57 @@ def __eq__(self, other: object) -> bool:
user_data_adapter = TypeAdapter(UserData)


class BaseRequestData(BaseModel):
"""Data needed to create a new crawling request."""
class RequestOptions(TypedDict):
"""Options that can be used to customize request creation.

model_config = ConfigDict(populate_by_name=True)
This type exactly matches the parameters of the `Request.from_url` method.
"""

url: Annotated[str, BeforeValidator(validate_http_url), Field()]
"""The URL of the web page to crawl. Must be a valid HTTP or HTTPS URL, and may include query parameters
and fragments."""
url: Required[str]
method: NotRequired[HttpMethod]
headers: NotRequired[HttpHeaders | dict[str, str] | None]
payload: NotRequired[HttpPayload | str | None]
label: NotRequired[str | None]
unique_key: NotRequired[str | None]
id: NotRequired[str | None]
keep_url_fragment: NotRequired[bool]
use_extended_unique_key: NotRequired[bool]
always_enqueue: NotRequired[bool]
user_data: NotRequired[dict[str, JsonSerializable]]
no_retry: NotRequired[bool]
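
Because the TypedDict mirrors the `Request.from_url` parameters, a populated options dict can be unpacked straight into that constructor. A minimal sketch (illustrative only, not part of the diff):

```python
from crawlee import Request, RequestOptions

options: RequestOptions = {
    'url': 'https://crawlee.dev/docs',
    'label': 'DOCS',
}

# The keys of RequestOptions line up with Request.from_url's parameters,
# so the dict can be unpacked directly.
request = Request.from_url(**options)
```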

unique_key: Annotated[str, Field(alias='uniqueKey')]
"""A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing
to the same URL.

If `unique_key` is not provided, then it is automatically generated by normalizing the URL.
For example, the URL of `HTTP://www.EXAMPLE.com/something/` will produce the `unique_key`
of `http://www.example.com/something`.
@docs_group('Data structures')
class Request(BaseModel):
"""Represents a request in the Crawlee framework, containing the necessary information for crawling operations.

Pass an arbitrary non-empty text value to the `unique_key` property
to override the default behavior and specify which URLs shall be considered equal.
The `Request` class is one of the core components in Crawlee, utilized by various components such as request
providers, HTTP clients, crawlers, and more. It encapsulates the essential data for executing web requests,
including the URL, HTTP method, headers, payload, and user data. The user data allows custom information
to be stored and persisted throughout the request lifecycle, including its retries.

Key functionalities include managing the request's identifier (`id`), unique key (`unique_key`) that is used
for request deduplication, controlling retries, handling state management, and enabling configuration for session
rotation and proxy handling.

The recommended way to create a new instance is by using the `Request.from_url` constructor, which automatically
generates a unique key and identifier based on the URL and request parameters.

### Usage

```python
from crawlee import Request

request = Request.from_url('https://crawlee.dev')
```
"""

model_config = ConfigDict(populate_by_name=True)

url: Annotated[str, BeforeValidator(validate_http_url), Field()]
"""The URL of the web page to crawl. Must be a valid HTTP or HTTPS URL, and may include query parameters
and fragments."""

method: HttpMethod = 'GET'
"""HTTP request method."""

@@ -172,79 +202,16 @@ class BaseRequestData(BaseModel):
handled_at: Annotated[datetime | None, Field(alias='handledAt')] = None
"""Timestamp when the request was handled."""

@classmethod
def from_url(
cls,
url: str,
*,
method: HttpMethod = 'GET',
headers: HttpHeaders | dict[str, str] | None = None,
payload: HttpPayload | str | None = None,
label: str | None = None,
unique_key: str | None = None,
keep_url_fragment: bool = False,
use_extended_unique_key: bool = False,
**kwargs: Any,
) -> Self:
"""Create a new `BaseRequestData` instance from a URL. See `Request.from_url` for more details."""
if isinstance(headers, dict) or headers is None:
headers = HttpHeaders(headers or {})

if isinstance(payload, str):
payload = payload.encode()

unique_key = unique_key or compute_unique_key(
url,
method=method,
headers=headers,
payload=payload,
keep_url_fragment=keep_url_fragment,
use_extended_unique_key=use_extended_unique_key,
)

request = cls(
url=url,
unique_key=unique_key,
method=method,
headers=headers,
payload=payload,
**kwargs,
)

if label is not None:
request.user_data['label'] = label

return request

def get_query_param_from_url(self, param: str, *, default: str | None = None) -> str | None:
"""Get the value of a specific query parameter from the URL."""
query_params = URL(self.url).query
return query_params.get(param, default)


@docs_group('Data structures')
class Request(BaseRequestData):
"""Represents a request in the Crawlee framework, containing the necessary information for crawling operations.

The `Request` class is one of the core components in Crawlee, utilized by various components such as request
providers, HTTP clients, crawlers, and more. It encapsulates the essential data for executing web requests,
including the URL, HTTP method, headers, payload, and user data. The user data allows custom information
to be stored and persisted throughout the request lifecycle, including its retries.

Key functionalities include managing the request's identifier (`id`), unique key (`unique_key`) that is used
for request deduplication, controlling retries, handling state management, and enabling configuration for session
rotation and proxy handling.

The recommended way to create a new instance is by using the `Request.from_url` constructor, which automatically
generates a unique key and identifier based on the URL and request parameters.

### Usage
unique_key: Annotated[str, Field(alias='uniqueKey')]
"""A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing
to the same URL.

```python
from crawlee import Request
If `unique_key` is not provided, then it is automatically generated by normalizing the URL.
For example, the URL of `HTTP://www.EXAMPLE.com/something/` will produce the `unique_key`
of `http://www.example.com/something`.

request = Request.from_url('https://crawlee.dev')
```
Pass an arbitrary non-empty text value to the `unique_key` property
to override the default behavior and specify which URLs shall be considered equal.
"""

id: str
@@ -331,12 +298,10 @@ def from_url(

return request

@classmethod
def from_base_request_data(cls, base_request_data: BaseRequestData, *, id: str | None = None) -> Self:
"""Create a complete Request object based on a BaseRequestData instance."""
kwargs = base_request_data.model_dump()
kwargs['id'] = id or unique_key_to_request_id(base_request_data.unique_key)
return cls(**kwargs)
def get_query_param_from_url(self, param: str, *, default: str | None = None) -> str | None:
"""Get the value of a specific query parameter from the URL."""
query_params = URL(self.url).query
return query_params.get(param, default)

@property
def label(self) -> str | None:
17 changes: 12 additions & 5 deletions src/crawlee/_types.py
@@ -3,7 +3,7 @@
from collections.abc import Iterator, Mapping
from dataclasses import dataclass
from enum import Enum
from typing import TYPE_CHECKING, Annotated, Any, Literal, Optional, Protocol, TypeVar, Union, cast, overload
from typing import TYPE_CHECKING, Annotated, Any, Callable, Literal, Optional, Protocol, TypeVar, Union, cast, overload

from pydantic import ConfigDict, Field, PlainValidator, RootModel
from typing_extensions import NotRequired, TypeAlias, TypedDict, Unpack
@@ -16,7 +16,7 @@
from collections.abc import Coroutine, Sequence

from crawlee import Glob, Request
from crawlee._request import BaseRequestData
from crawlee._request import RequestOptions
from crawlee.http_clients import HttpResponse
from crawlee.proxy_configuration import ProxyInfo
from crawlee.sessions import Session
@@ -44,6 +44,8 @@

HttpPayload: TypeAlias = bytes

RequestTransformAction: TypeAlias = Literal['skip', 'unchanged']


def _normalize_headers(headers: Mapping[str, str]) -> dict[str, str]:
"""Converts all header keys to lowercase, strips whitespace, and returns them sorted by key."""
@@ -182,7 +184,7 @@ class AddRequestsKwargs(EnqueueLinksKwargs):
class AddRequestsKwargs(EnqueueLinksKwargs):
"""Keyword arguments for the `add_requests` methods."""

requests: Sequence[str | BaseRequestData | Request]
requests: Sequence[str | Request]
"""Requests to be added to the `RequestManager`."""


@@ -264,7 +266,7 @@ def __init__(self, *, key_value_store_getter: GetKeyValueStoreFunction) -> None:

async def add_requests(
self,
requests: Sequence[str | BaseRequestData],
requests: Sequence[str | Request],
**kwargs: Unpack[EnqueueLinksKwargs],
) -> None:
"""Track a call to the `add_requests` context helper."""
@@ -315,7 +317,7 @@ class AddRequestsFunction(Protocol):

def __call__(
self,
requests: Sequence[str | BaseRequestData | Request],
requests: Sequence[str | Request],
**kwargs: Unpack[EnqueueLinksKwargs],
) -> Coroutine[None, None, None]:
"""Call dunder method.
@@ -341,6 +343,7 @@ def __call__(
selector: str = 'a',
label: str | None = None,
user_data: dict[str, Any] | None = None,
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
**kwargs: Unpack[EnqueueLinksKwargs],
) -> Coroutine[None, None, None]:
"""A call dunder method.
@@ -353,6 +356,10 @@ def __call__(
- `BeautifulSoupCrawler` supports CSS selectors.
label: Label for the newly created `Request` objects, used for request routing.
user_data: User data to be provided to the newly created `Request` objects.
transform_request_function: A function that takes `RequestOptions` and returns either:
- Modified `RequestOptions` to update the request configuration,
- `'skip'` to exclude the request from being enqueued,
- `'unchanged'` to use the original request options without modification.
**kwargs: Additional keyword arguments.
"""
