feat: add transform_request_function for enqueue_links #923

Merged
merged 13 commits into from
Jan 27, 2025
11 changes: 7 additions & 4 deletions docs/introduction/03_adding_more_urls.mdx
@@ -10,6 +10,7 @@ import OriginalCodeExample from '!!raw-loader!./code/03_original_code.py';
import FindingNewLinksExample from '!!raw-loader!./code/03_finding_new_links.py';
import EnqueueStrategyExample from '!!raw-loader!./code/03_enqueue_strategy.py';
import GlobsExample from '!!raw-loader!./code/03_globs.py';
import TransformExample from '!!raw-loader!./code/03_transform_request.py';

Previously you've built a very simple crawler that downloads HTML of a single page, reads its title and prints it to the console. This is the original source code:

@@ -106,11 +107,13 @@ For even more control, you can use the `include` or `exclude` parameters, either
{GlobsExample}
</CodeBlock>

{/* TODO:
### Transform requests
### Transform requests before enqueuing

...
*/}
For cases where you need to modify or filter requests before they are enqueued, you can use the `transform_request_function` parameter. This function takes `RequestOptions` as input and should return either modified `RequestOptions`, the string `'skip'` to exclude the request from being enqueued, or `'unchanged'` to enqueue it without modification.

<CodeBlock className="language-python">
{TransformExample}
</CodeBlock>

## Next steps

41 changes: 41 additions & 0 deletions docs/introduction/code/03_transform_request.py
@@ -0,0 +1,41 @@
from __future__ import annotations

from crawlee import HttpHeaders, RequestOptions, RequestTransformAction
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


def transform_request(request_options: RequestOptions) -> RequestOptions | RequestTransformAction:
    # Skip requests to PDF files
    if request_options['url'].endswith('.pdf'):
        return 'skip'

    if '/docs' in request_options['url']:
        # Add custom headers to requests to specific URLs
        request_options['headers'] = HttpHeaders({'Custom-Header': 'value'})

    elif '/blog' in request_options['url']:
        # Add label for certain URLs
        request_options['label'] = 'BLOG'

    else:
        # Signal that the request should proceed without any transformation
        return 'unchanged'

    return request_options


async def main() -> None:
    crawler = BeautifulSoupCrawler(max_requests_per_crawl=50)

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url}.')

        # Transform the request before enqueueing
        await context.enqueue_links(transform_request_function=transform_request)

    @crawler.router.handler('BLOG')
    async def blog_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Blog Processing {context.request.url}.')

    await crawler.run(['https://crawlee.dev/'])
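
The example stops at `crawler.run`; to execute it as a standalone script you would typically add an asyncio entry point. A minimal sketch, assuming the file above is saved as-is (the guard below is not part of the diff):

```python
import asyncio

if __name__ == '__main__':
    # Run the crawler's asynchronous entry point.
    asyncio.run(main())
```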
15 changes: 12 additions & 3 deletions src/crawlee/__init__.py
@@ -1,10 +1,19 @@
from importlib import metadata

from ._request import Request
from ._request import Request, RequestOptions
from ._service_locator import service_locator
from ._types import ConcurrencySettings, EnqueueStrategy, HttpHeaders
from ._types import ConcurrencySettings, EnqueueStrategy, HttpHeaders, RequestTransformAction
from ._utils.globs import Glob

__version__ = metadata.version('crawlee')

__all__ = ['ConcurrencySettings', 'EnqueueStrategy', 'Glob', 'HttpHeaders', 'Request', 'service_locator']
__all__ = [
'ConcurrencySettings',
'EnqueueStrategy',
'Glob',
'HttpHeaders',
'Request',
'RequestOptions',
'RequestTransformAction',
'service_locator',
]
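
With the expanded `__all__`, the new names are importable from the package root. A minimal sketch of a typed transform callback using them (illustrative only, not part of the diff):

```python
from crawlee import RequestOptions, RequestTransformAction


def drop_pdf_links(request_options: RequestOptions) -> RequestOptions | RequestTransformAction:
    # Skip links to PDF files and leave every other request untouched.
    if request_options['url'].endswith('.pdf'):
        return 'skip'
    return 'unchanged'
```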
151 changes: 58 additions & 93 deletions src/crawlee/_request.py
@@ -3,7 +3,7 @@
from collections.abc import Iterator, MutableMapping
from datetime import datetime
from enum import IntEnum
from typing import TYPE_CHECKING, Annotated, Any, cast
from typing import TYPE_CHECKING, Annotated, Any, TypedDict, cast

from pydantic import BaseModel, BeforeValidator, ConfigDict, Field, PlainSerializer, PlainValidator, TypeAdapter
from yarl import URL
@@ -15,7 +15,7 @@
from crawlee._utils.urls import validate_http_url

if TYPE_CHECKING:
from typing_extensions import Self
from typing_extensions import NotRequired, Required, Self


class RequestState(IntEnum):
@@ -108,27 +108,57 @@ def __eq__(self, other: object) -> bool:
user_data_adapter = TypeAdapter(UserData)


class BaseRequestData(BaseModel):
"""Data needed to create a new crawling request."""
class RequestOptions(TypedDict):
"""Options that can be used to customize request creation.

model_config = ConfigDict(populate_by_name=True)
This type exactly matches the parameters of the `Request.from_url` method.
"""

url: Annotated[str, BeforeValidator(validate_http_url), Field()]
"""The URL of the web page to crawl. Must be a valid HTTP or HTTPS URL, and may include query parameters
and fragments."""
url: Required[str]
method: NotRequired[HttpMethod]
headers: NotRequired[HttpHeaders | dict[str, str] | None]
payload: NotRequired[HttpPayload | str | None]
label: NotRequired[str | None]
unique_key: NotRequired[str | None]
id: NotRequired[str | None]
keep_url_fragment: NotRequired[bool]
use_extended_unique_key: NotRequired[bool]
always_enqueue: NotRequired[bool]
user_data: NotRequired[dict[str, JsonSerializable]]
no_retry: NotRequired[bool]
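
Because the TypedDict mirrors the `Request.from_url` parameters, a populated options dict can be unpacked straight into that constructor. A minimal sketch (illustrative only, not part of the diff):

```python
from crawlee import Request, RequestOptions

options: RequestOptions = {
    'url': 'https://crawlee.dev/docs',
    'label': 'DOCS',
}

# The keys of RequestOptions line up with Request.from_url's parameters,
# so the dict can be unpacked directly.
request = Request.from_url(**options)
```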

unique_key: Annotated[str, Field(alias='uniqueKey')]
"""A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing
to the same URL.

If `unique_key` is not provided, then it is automatically generated by normalizing the URL.
For example, the URL of `HTTP://www.EXAMPLE.com/something/` will produce the `unique_key`
of `http://www.example.com/something`.
@docs_group('Data structures')
class Request(BaseModel):
"""Represents a request in the Crawlee framework, containing the necessary information for crawling operations.

Pass an arbitrary non-empty text value to the `unique_key` property
to override the default behavior and specify which URLs shall be considered equal.
The `Request` class is one of the core components in Crawlee, utilized by various components such as request
providers, HTTP clients, crawlers, and more. It encapsulates the essential data for executing web requests,
including the URL, HTTP method, headers, payload, and user data. The user data allows custom information
to be stored and persisted throughout the request lifecycle, including its retries.

Key functionalities include managing the request's identifier (`id`), unique key (`unique_key`) that is used
for request deduplication, controlling retries, handling state management, and enabling configuration for session
rotation and proxy handling.

The recommended way to create a new instance is by using the `Request.from_url` constructor, which automatically
generates a unique key and identifier based on the URL and request parameters.

### Usage

```python
from crawlee import Request

request = Request.from_url('https://crawlee.dev')
```
"""

model_config = ConfigDict(populate_by_name=True)

url: Annotated[str, BeforeValidator(validate_http_url), Field()]
"""The URL of the web page to crawl. Must be a valid HTTP or HTTPS URL, and may include query parameters
and fragments."""

method: HttpMethod = 'GET'
"""HTTP request method."""

@@ -172,79 +202,16 @@ class BaseRequestData(BaseModel):
handled_at: Annotated[datetime | None, Field(alias='handledAt')] = None
"""Timestamp when the request was handled."""

@classmethod
def from_url(
cls,
url: str,
*,
method: HttpMethod = 'GET',
headers: HttpHeaders | dict[str, str] | None = None,
payload: HttpPayload | str | None = None,
label: str | None = None,
unique_key: str | None = None,
keep_url_fragment: bool = False,
use_extended_unique_key: bool = False,
**kwargs: Any,
) -> Self:
"""Create a new `BaseRequestData` instance from a URL. See `Request.from_url` for more details."""
if isinstance(headers, dict) or headers is None:
headers = HttpHeaders(headers or {})

if isinstance(payload, str):
payload = payload.encode()

unique_key = unique_key or compute_unique_key(
url,
method=method,
headers=headers,
payload=payload,
keep_url_fragment=keep_url_fragment,
use_extended_unique_key=use_extended_unique_key,
)

request = cls(
url=url,
unique_key=unique_key,
method=method,
headers=headers,
payload=payload,
**kwargs,
)

if label is not None:
request.user_data['label'] = label

return request

def get_query_param_from_url(self, param: str, *, default: str | None = None) -> str | None:
"""Get the value of a specific query parameter from the URL."""
query_params = URL(self.url).query
return query_params.get(param, default)


@docs_group('Data structures')
class Request(BaseRequestData):
"""Represents a request in the Crawlee framework, containing the necessary information for crawling operations.

The `Request` class is one of the core components in Crawlee, utilized by various components such as request
providers, HTTP clients, crawlers, and more. It encapsulates the essential data for executing web requests,
including the URL, HTTP method, headers, payload, and user data. The user data allows custom information
to be stored and persisted throughout the request lifecycle, including its retries.

Key functionalities include managing the request's identifier (`id`), unique key (`unique_key`) that is used
for request deduplication, controlling retries, handling state management, and enabling configuration for session
rotation and proxy handling.

The recommended way to create a new instance is by using the `Request.from_url` constructor, which automatically
generates a unique key and identifier based on the URL and request parameters.

### Usage
unique_key: Annotated[str, Field(alias='uniqueKey')]
"""A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing
to the same URL.

```python
from crawlee import Request
If `unique_key` is not provided, then it is automatically generated by normalizing the URL.
For example, the URL of `HTTP://www.EXAMPLE.com/something/` will produce the `unique_key`
of `http://www.example.com/something`.

request = Request.from_url('https://crawlee.dev')
```
Pass an arbitrary non-empty text value to the `unique_key` property
to override the default behavior and specify which URLs shall be considered equal.
"""

id: str
@@ -331,12 +298,10 @@ def from_url(

return request

@classmethod
def from_base_request_data(cls, base_request_data: BaseRequestData, *, id: str | None = None) -> Self:
"""Create a complete Request object based on a BaseRequestData instance."""
kwargs = base_request_data.model_dump()
kwargs['id'] = id or unique_key_to_request_id(base_request_data.unique_key)
return cls(**kwargs)
def get_query_param_from_url(self, param: str, *, default: str | None = None) -> str | None:
"""Get the value of a specific query parameter from the URL."""
query_params = URL(self.url).query
return query_params.get(param, default)

@property
def label(self) -> str | None:
17 changes: 12 additions & 5 deletions src/crawlee/_types.py
@@ -3,7 +3,7 @@
from collections.abc import Iterator, Mapping
from dataclasses import dataclass
from enum import Enum
from typing import TYPE_CHECKING, Annotated, Any, Literal, Optional, Protocol, TypeVar, Union, cast, overload
from typing import TYPE_CHECKING, Annotated, Any, Callable, Literal, Optional, Protocol, TypeVar, Union, cast, overload

from pydantic import ConfigDict, Field, PlainValidator, RootModel
from typing_extensions import NotRequired, TypeAlias, TypedDict, Unpack
@@ -16,7 +16,7 @@
from collections.abc import Coroutine, Sequence

from crawlee import Glob, Request
from crawlee._request import BaseRequestData
from crawlee._request import RequestOptions
from crawlee.http_clients import HttpResponse
from crawlee.proxy_configuration import ProxyInfo
from crawlee.sessions import Session
@@ -44,6 +44,8 @@

HttpPayload: TypeAlias = bytes

RequestTransformAction: TypeAlias = Literal['skip', 'unchanged']


def _normalize_headers(headers: Mapping[str, str]) -> dict[str, str]:
"""Converts all header keys to lowercase, strips whitespace, and returns them sorted by key."""
@@ -182,7 +184,7 @@ class AddRequestsKwargs(EnqueueLinksKwargs):
class AddRequestsKwargs(EnqueueLinksKwargs):
"""Keyword arguments for the `add_requests` methods."""

requests: Sequence[str | BaseRequestData | Request]
requests: Sequence[str | Request]
"""Requests to be added to the `RequestManager`."""


@@ -264,7 +266,7 @@ def __init__(self, *, key_value_store_getter: GetKeyValueStoreFunction) -> None:

async def add_requests(
self,
requests: Sequence[str | BaseRequestData],
requests: Sequence[str | Request],
**kwargs: Unpack[EnqueueLinksKwargs],
) -> None:
"""Track a call to the `add_requests` context helper."""
@@ -315,7 +317,7 @@ class AddRequestsFunction(Protocol):

def __call__(
self,
requests: Sequence[str | BaseRequestData | Request],
requests: Sequence[str | Request],
**kwargs: Unpack[EnqueueLinksKwargs],
) -> Coroutine[None, None, None]:
"""Call dunder method.
@@ -341,6 +343,7 @@ def __call__(
selector: str = 'a',
label: str | None = None,
user_data: dict[str, Any] | None = None,
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
**kwargs: Unpack[EnqueueLinksKwargs],
) -> Coroutine[None, None, None]:
"""A call dunder method.
@@ -353,6 +356,10 @@ def __call__(
- `BeautifulSoupCrawler` supports CSS selectors.
label: Label for the newly created `Request` objects, used for request routing.
user_data: User data to be provided to the newly created `Request` objects.
transform_request_function: A function that takes `RequestOptions` and returns either:
- Modified `RequestOptions` to update the request configuration,
- `'skip'` to exclude the request from being enqueued,
- `'unchanged'` to use the original request options without modification.
**kwargs: Additional keyword arguments.
"""
