From b56ce661dca015572a1a1add91208b67699e3ae1 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Tue, 29 Apr 2025 00:47:31 +0000 Subject: [PATCH 01/10] add CRAWLER_STATUS event --- src/crawlee/crawlers/_basic/_basic_crawler.py | 8 ++++++++ src/crawlee/events/_types.py | 3 +++ 2 files changed, 11 insertions(+) diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 7e07c87f16..40391b3ea2 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -35,6 +35,7 @@ SendRequestFunction, ) from crawlee._utils.docs import docs_group +from crawlee._utils.recurring_task import RecurringTask from crawlee._utils.robots import RobotsTxtFile from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute from crawlee._utils.wait import wait_for @@ -49,6 +50,7 @@ SessionError, UserDefinedErrorHandlerError, ) +from crawlee.events._types import Event from crawlee.http_clients import HttpxHttpClient from crawlee.router import Router from crawlee.sessions import SessionPool @@ -400,6 +402,7 @@ def __init__( is_task_ready_function=self.__is_task_ready_function, run_task_function=self.__run_task_function, ) + self._crawler_state_rec_task = RecurringTask(func=self._crawler_state_task, delay=timedelta(seconds=5)) # State flags self._keep_alive = keep_alive @@ -1391,3 +1394,8 @@ async def _find_txt_file_for_url(self, url: str) -> RobotsTxtFile: url: The URL whose domain will be used to locate and fetch the corresponding robots.txt file. """ return await RobotsTxtFile.find(url, self._http_client) + + async def _crawler_state_task(self) -> None: + """Emit a persist state event with the given migration status.""" + event_manager = service_locator.get_event_manager() + event_manager.emit(event=Event.CRAWLER_STATUS, event_data=None) diff --git a/src/crawlee/events/_types.py b/src/crawlee/events/_types.py index aaa59f4b54..bda93d2b73 100644 --- a/src/crawlee/events/_types.py +++ b/src/crawlee/events/_types.py @@ -31,6 +31,9 @@ class Event(str, Enum): PAGE_CREATED = 'pageCreated' PAGE_CLOSED = 'pageClosed' + # State events + CRAWLER_STATUS = 'crawlerStatus' + @docs_group('Event payloads') class EventPersistStateData(BaseModel): From 9a6181f9a55a9e2cc98df4e66012518a489e3325 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Mon, 23 Jun 2025 23:50:38 +0000 Subject: [PATCH 02/10] add periodic status log --- src/crawlee/_log_config.py | 31 ++++---- src/crawlee/crawlers/_basic/_basic_crawler.py | 73 ++++++++++++++++++- src/crawlee/events/__init__.py | 2 + src/crawlee/events/_event_manager.py | 5 ++ src/crawlee/events/_types.py | 18 ++++- .../crawlers/_basic/test_basic_crawler.py | 57 +++++++++++++++ 6 files changed, 168 insertions(+), 18 deletions(-) diff --git a/src/crawlee/_log_config.py b/src/crawlee/_log_config.py index 914cc32f24..093e876554 100644 --- a/src/crawlee/_log_config.py +++ b/src/crawlee/_log_config.py @@ -4,7 +4,7 @@ import logging import sys import textwrap -from typing import Any +from typing import Any, Literal from colorama import Fore, Style, just_fix_windows_console from typing_extensions import assert_never @@ -34,22 +34,27 @@ _LOG_MESSAGE_INDENT = ' ' * 6 +def string_to_log_level(level: Literal['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']) -> int: + """Convert a string representation of a log level to an integer log level.""" + if level == 'DEBUG': + return logging.DEBUG + if level == 'INFO': + return logging.INFO + if level == 'WARNING': + return logging.WARNING + if level == 
'ERROR': + return logging.ERROR + if level == 'CRITICAL': + return logging.CRITICAL + + assert_never(level) + + def get_configured_log_level() -> int: config = service_locator.get_configuration() if 'log_level' in config.model_fields_set: - if config.log_level == 'DEBUG': - return logging.DEBUG - if config.log_level == 'INFO': - return logging.INFO - if config.log_level == 'WARNING': - return logging.WARNING - if config.log_level == 'ERROR': - return logging.ERROR - if config.log_level == 'CRITICAL': - return logging.CRITICAL - - assert_never(config.log_level) + return string_to_log_level(config.log_level) if sys.flags.dev_mode: return logging.DEBUG diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 40391b3ea2..1ff578238f 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -25,7 +25,7 @@ from crawlee import EnqueueStrategy, Glob, service_locator from crawlee._autoscaling import AutoscaledPool, Snapshotter, SystemStatus -from crawlee._log_config import configure_logger, get_configured_log_level +from crawlee._log_config import configure_logger, get_configured_log_level, string_to_log_level from crawlee._request import Request, RequestState from crawlee._types import ( BasicCrawlingContext, @@ -50,7 +50,7 @@ SessionError, UserDefinedErrorHandlerError, ) -from crawlee.events._types import Event +from crawlee.events._types import Event, EventCrawlerStatusData from crawlee.http_clients import HttpxHttpClient from crawlee.router import Router from crawlee.sessions import SessionPool @@ -172,6 +172,13 @@ class _BasicCrawlerOptions(TypedDict): """If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain, and skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`.""" + status_message_logging_interval: NotRequired[timedelta] + """Interval for logging the crawler status messages.""" + + status_message_callback: NotRequired[Callable[[StatisticsState, StatisticsState | None, str], None]] + """Allows overriding the default status message. The callback needs to call `crawler.setStatusMessage()` explicitly. + The default status message is provided in the parameters.""" + class _BasicCrawlerOptionsGeneric(Generic[TCrawlingContext, TStatisticsState], TypedDict): """Generic options the `BasicCrawler` constructor.""" @@ -254,6 +261,8 @@ def __init__( configure_logging: bool = True, statistics_log_format: Literal['table', 'inline'] = 'table', respect_robots_txt_file: bool = False, + status_message_logging_interval: timedelta = timedelta(seconds=10), + status_message_callback: Callable[[StatisticsState, StatisticsState | None, str], None] | None = None, _context_pipeline: ContextPipeline[TCrawlingContext] | None = None, _additional_context_managers: Sequence[AbstractAsyncContextManager] | None = None, _logger: logging.Logger | None = None, @@ -299,6 +308,11 @@ def __init__( respect_robots_txt_file: If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain, and skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction` + status_message_logging_interval: Interval for logging the crawler status messages + status_message_callback: A callback function for customizing crawler status messages. When provided, + this function will be called instead of the default status message logging. 
The function receives + the current statistics state, the previous state (if available), and the default status message + as parameters. _context_pipeline: Enables extending the request lifecycle and modifying the crawling context. Intended for use by subclasses rather than direct instantiation of `BasicCrawler`. _additional_context_managers: Additional context managers used throughout the crawler lifecycle. @@ -342,6 +356,9 @@ def __init__( self._failed_request_handler: FailedRequestHandler[TCrawlingContext | BasicCrawlingContext] | None = None self._abort_on_error = abort_on_error + # Crawler callbacks + self._status_message_callback = status_message_callback + # Context of each request with matching result of request handler. # Inheritors can use this to override the result of individual request handler runs in `_run_request_handler`. self._context_result_map = WeakKeyDictionary[BasicCrawlingContext, RequestHandlerRunResult]() @@ -402,7 +419,10 @@ def __init__( is_task_ready_function=self.__is_task_ready_function, run_task_function=self.__run_task_function, ) - self._crawler_state_rec_task = RecurringTask(func=self._crawler_state_task, delay=timedelta(seconds=5)) + self._crawler_state_rec_task = RecurringTask( + func=self._crawler_state_task, delay=status_message_logging_interval + ) + self._previous_crawler_state: TStatisticsState | None = None # State flags self._keep_alive = keep_alive @@ -599,6 +619,7 @@ def sigint_handler() -> None: except CancelledError: pass finally: + await self._crawler_state_rec_task.stop() if threading.current_thread() is threading.main_thread(): with suppress(NotImplementedError): asyncio.get_running_loop().remove_signal_handler(signal.SIGINT) @@ -630,6 +651,8 @@ def sigint_handler() -> None: async def _run_crawler(self) -> None: event_manager = service_locator.get_event_manager() + self._crawler_state_rec_task.start() + # Collect the context managers to be entered. Context managers that are already active are excluded, # as they were likely entered by the caller, who will also be responsible for exiting them. contexts_to_enter = [ @@ -1395,7 +1418,49 @@ async def _find_txt_file_for_url(self, url: str) -> RobotsTxtFile: """ return await RobotsTxtFile.find(url, self._http_client) + def set_status_message( + self, message: str, level: Literal['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'] = 'DEBUG' + ) -> None: + """Set a status message for the crawler. + + Args: + message: The status message to log. + level: The logging level for the message. + """ + log_level = string_to_log_level(level) + self.log.log(log_level, message) + async def _crawler_state_task(self) -> None: """Emit a persist state event with the given migration status.""" event_manager = service_locator.get_event_manager() - event_manager.emit(event=Event.CRAWLER_STATUS, event_data=None) + + current_state = self.statistics.state + + if ( + failed_requests := ( + current_state.requests_failed - (self._previous_crawler_state or current_state).requests_failed + ) + > 0 + ): + message = f'Experiencing problems, {failed_requests} failed requests since last status update.' 
+ else: + request_manager = await self.get_request_manager() + total_count = await request_manager.get_total_count() + if total_count is not None and total_count > 0: + pages_info = f'{self._statistics.state.requests_finished}/{total_count}' + else: + pages_info = str(self._statistics.state.requests_finished) + + message = ( + f'Crawled {pages_info} pages, {self._statistics.state.requests_failed} failed requests, ' + f'desired concurrency {self._autoscaled_pool.desired_concurrency}.' + ) + + if self._status_message_callback: + self._status_message_callback(current_state, self._previous_crawler_state, message) + else: + self.set_status_message(message) + + event_manager.emit(event=Event.CRAWLER_STATUS, event_data=EventCrawlerStatusData(message=message)) + + self._previous_crawler_state = current_state diff --git a/src/crawlee/events/__init__.py b/src/crawlee/events/__init__.py index 1c2cda0173..2aa2beecfd 100644 --- a/src/crawlee/events/__init__.py +++ b/src/crawlee/events/__init__.py @@ -3,6 +3,7 @@ from ._types import ( Event, EventAbortingData, + EventCrawlerStatusData, EventData, EventExitData, EventListener, @@ -14,6 +15,7 @@ __all__ = [ 'Event', 'EventAbortingData', + 'EventCrawlerStatusData', 'EventData', 'EventExitData', 'EventListener', diff --git a/src/crawlee/events/_event_manager.py b/src/crawlee/events/_event_manager.py index b08727e4da..3723fc78fe 100644 --- a/src/crawlee/events/_event_manager.py +++ b/src/crawlee/events/_event_manager.py @@ -19,6 +19,7 @@ from crawlee.events._types import ( Event, EventAbortingData, + EventCrawlerStatusData, EventExitData, EventListener, EventMigratingData, @@ -147,6 +148,8 @@ def on(self, *, event: Literal[Event.ABORTING], listener: EventListener[EventAbo @overload def on(self, *, event: Literal[Event.EXIT], listener: EventListener[EventExitData]) -> None: ... @overload + def on(self, *, event: Literal[Event.CRAWLER_STATUS], listener: EventListener[EventCrawlerStatusData]) -> None: ... + @overload def on(self, *, event: Event, listener: EventListener[None]) -> None: ... def on(self, *, event: Event, listener: EventListener[Any]) -> None: @@ -222,6 +225,8 @@ def emit(self, *, event: Literal[Event.ABORTING], event_data: EventAbortingData) @overload def emit(self, *, event: Literal[Event.EXIT], event_data: EventExitData) -> None: ... @overload + def emit(self, *, event: Literal[Event.CRAWLER_STATUS], event_data: EventCrawlerStatusData) -> None: ... + @overload def emit(self, *, event: Event, event_data: Any) -> None: ... 
@ensure_context diff --git a/src/crawlee/events/_types.py b/src/crawlee/events/_types.py index bda93d2b73..c5afeb68cd 100644 --- a/src/crawlee/events/_types.py +++ b/src/crawlee/events/_types.py @@ -82,7 +82,23 @@ class EventExitData(BaseModel): model_config = ConfigDict(populate_by_name=True) -EventData = Union[EventPersistStateData, EventSystemInfoData, EventMigratingData, EventAbortingData, EventExitData] +@docs_group('Event payloads') +class EventCrawlerStatusData(BaseModel): + """Data for the crawler status event.""" + + model_config = ConfigDict(populate_by_name=True) + + message: str + + +EventData = Union[ + EventPersistStateData, + EventSystemInfoData, + EventMigratingData, + EventAbortingData, + EventExitData, + EventCrawlerStatusData, +] """A helper type for all possible event payloads""" WrappedListener = Callable[..., Coroutine[Any, Any, None]] diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py index 4f151ad621..c40b1ef53f 100644 --- a/tests/unit/crawlers/_basic/test_basic_crawler.py +++ b/tests/unit/crawlers/_basic/test_basic_crawler.py @@ -23,6 +23,7 @@ from crawlee.configuration import Configuration from crawlee.crawlers import BasicCrawler from crawlee.errors import RequestCollisionError, SessionError, UserDefinedErrorHandlerError +from crawlee.events import Event, EventCrawlerStatusData from crawlee.events._local_event_manager import LocalEventManager from crawlee.request_loaders import RequestList, RequestManagerTandem from crawlee.sessions import Session, SessionPool @@ -36,6 +37,7 @@ from yarl import URL from crawlee._types import JsonSerializable + from crawlee.statistics import StatisticsState from crawlee.storage_clients._memory import DatasetClient @@ -1345,3 +1347,58 @@ async def handler(context: BasicCrawlingContext) -> None: break else: raise AssertionError('Expected log message about request handler error was not found.') + + +async def test_status_message_callback() -> None: + """Test that status message callback is called with the correct message.""" + status_message_callback = AsyncMock() + states: list[dict[str, StatisticsState | None]] = [] + + def status_callback(state: StatisticsState, previous_state: StatisticsState | None, message: str) -> None: + status_message_callback(message) + states.append({'state': state, 'previous_state': previous_state}) + + crawler = BasicCrawler( + status_message_callback=status_callback, status_message_logging_interval=timedelta(seconds=0.01) + ) + + @crawler.router.default_handler + async def handler(context: BasicCrawlingContext) -> None: + await asyncio.sleep(0.1) # Simulate some processing time + + await crawler.run(['http://a.com/']) + + assert status_message_callback.called + + assert len(states) > 1 + + first_call = states[0] + second_call = states[1] + + # For the first call, `previous_state` is None + assert first_call['state'] is not None + assert first_call['previous_state'] is None + + # For second call, `previous_state` is the first state + assert second_call['state'] is not None + assert second_call['previous_state'] is not None + assert second_call['previous_state'] == first_call['state'] + + +async def test_status_message_emit() -> None: + event_manager = service_locator.get_event_manager() + + status_message_listener = Mock() + + def listener(event_data: EventCrawlerStatusData) -> None: + status_message_listener(event_data) + + event_manager.on(event=Event.CRAWLER_STATUS, listener=listener) + + crawler = BasicCrawler(request_handler=AsyncMock()) 
+ + await crawler.run(['http://a.com/']) + + event_manager.off(event=Event.CRAWLER_STATUS, listener=listener) + + assert status_message_listener.called From edb7cb814fbda5325d2cb7cedaf811dfb08983de Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Mon, 23 Jun 2025 23:52:59 +0000 Subject: [PATCH 03/10] up log level --- src/crawlee/crawlers/_basic/_basic_crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 1ff578238f..355b19e8b8 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -1459,7 +1459,7 @@ async def _crawler_state_task(self) -> None: if self._status_message_callback: self._status_message_callback(current_state, self._previous_crawler_state, message) else: - self.set_status_message(message) + self.set_status_message(message, level='INFO') event_manager.emit(event=Event.CRAWLER_STATUS, event_data=EventCrawlerStatusData(message=message)) From c2680a0fbd2311d4058f4f0eaf6983c49c8a60ae Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Tue, 24 Jun 2025 00:30:00 +0000 Subject: [PATCH 04/10] add `crawler` parameter in `status_message_callback` --- src/crawlee/crawlers/_basic/_basic_crawler.py | 16 ++++++++++------ tests/unit/crawlers/_basic/test_basic_crawler.py | 4 +++- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 40e4593736..a36f8175c5 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -194,9 +194,11 @@ class _BasicCrawlerOptions(TypedDict): status_message_logging_interval: NotRequired[timedelta] """Interval for logging the crawler status messages.""" - status_message_callback: NotRequired[Callable[[StatisticsState, StatisticsState | None, str], None]] - """Allows overriding the default status message. The callback needs to call `crawler.setStatusMessage()` explicitly. - The default status message is provided in the parameters.""" + status_message_callback: NotRequired[ + Callable[[BasicCrawler[TCrawlingContext, TStatisticsState], StatisticsState, StatisticsState | None, str], None] + ] + """Allows overriding the default status message. The callback needs to call `crawler.set_status_message()` + explicitly. The default status message is provided in the parameters.""" class _BasicCrawlerOptionsGeneric(Generic[TCrawlingContext, TStatisticsState], TypedDict): @@ -281,7 +283,10 @@ def __init__( statistics_log_format: Literal['table', 'inline'] = 'table', respect_robots_txt_file: bool = False, status_message_logging_interval: timedelta = timedelta(seconds=10), - status_message_callback: Callable[[StatisticsState, StatisticsState | None, str], None] | None = None, + status_message_callback: Callable[ + [BasicCrawler[TCrawlingContext, TStatisticsState], StatisticsState, StatisticsState | None, str], None + ] + | None = None, _context_pipeline: ContextPipeline[TCrawlingContext] | None = None, _additional_context_managers: Sequence[AbstractAsyncContextManager] | None = None, _logger: logging.Logger | None = None, @@ -309,7 +314,6 @@ def __init__( `max_requests_per_crawl` is achieved. max_session_rotations: Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs or if the website blocks the request. - The session rotations are not counted towards the `max_request_retries` limit. 
max_crawl_depth: Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond this depth. The crawl depth starts at 0 for initial requests and increases with each subsequent level @@ -1588,7 +1592,7 @@ async def _crawler_state_task(self) -> None: ) if self._status_message_callback: - self._status_message_callback(current_state, self._previous_crawler_state, message) + self._status_message_callback(self, current_state, self._previous_crawler_state, message) else: self.set_status_message(message, level='INFO') diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py index ce99b7a0c3..38f03cf65b 100644 --- a/tests/unit/crawlers/_basic/test_basic_crawler.py +++ b/tests/unit/crawlers/_basic/test_basic_crawler.py @@ -1426,7 +1426,9 @@ async def test_status_message_callback() -> None: status_message_callback = AsyncMock() states: list[dict[str, StatisticsState | None]] = [] - def status_callback(state: StatisticsState, previous_state: StatisticsState | None, message: str) -> None: + def status_callback( + crawler: BasicCrawler, state: StatisticsState, previous_state: StatisticsState | None, message: str + ) -> None: status_message_callback(message) states.append({'state': state, 'previous_state': previous_state}) From cb89c8ee75bd90ec1b5154d061be55d93724b375 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Tue, 24 Jun 2025 18:06:58 +0000 Subject: [PATCH 05/10] update signature --- src/crawlee/_log_config.py | 8 +++-- src/crawlee/_types.py | 2 ++ src/crawlee/configuration.py | 5 +-- src/crawlee/crawlers/_basic/_basic_crawler.py | 31 +++++++++---------- src/crawlee/events/_types.py | 2 ++ .../crawlers/_basic/test_basic_crawler.py | 7 +++-- 6 files changed, 32 insertions(+), 23 deletions(-) diff --git a/src/crawlee/_log_config.py b/src/crawlee/_log_config.py index 093e876554..5fc9e94b8a 100644 --- a/src/crawlee/_log_config.py +++ b/src/crawlee/_log_config.py @@ -4,15 +4,19 @@ import logging import sys import textwrap -from typing import Any, Literal +from typing import TYPE_CHECKING, Any from colorama import Fore, Style, just_fix_windows_console from typing_extensions import assert_never from crawlee import service_locator +if TYPE_CHECKING: + from crawlee._types import LogLevel + just_fix_windows_console() + _LOG_NAME_COLOR = Fore.LIGHTBLACK_EX _LOG_LEVEL_COLOR = { @@ -34,7 +38,7 @@ _LOG_MESSAGE_INDENT = ' ' * 6 -def string_to_log_level(level: Literal['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']) -> int: +def string_to_log_level(level: LogLevel) -> int: """Convert a string representation of a log level to an integer log level.""" if level == 'DEBUG': return logging.DEBUG diff --git a/src/crawlee/_types.py b/src/crawlee/_types.py index 3cb84111fe..72a108cb6c 100644 --- a/src/crawlee/_types.py +++ b/src/crawlee/_types.py @@ -52,6 +52,8 @@ SkippedReason: TypeAlias = Literal['robots_txt'] +LogLevel: TypeAlias = Literal['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'] + def _normalize_headers(headers: Mapping[str, str]) -> dict[str, str]: """Convert all header keys to lowercase, strips whitespace, and returns them sorted by key.""" diff --git a/src/crawlee/configuration.py b/src/crawlee/configuration.py index de22118816..2158fe4d0a 100644 --- a/src/crawlee/configuration.py +++ b/src/crawlee/configuration.py @@ -1,11 +1,12 @@ from __future__ import annotations from datetime import timedelta -from typing import TYPE_CHECKING, Annotated, Literal +from typing import TYPE_CHECKING, Annotated from pydantic import 
AliasChoices, BeforeValidator, Field from pydantic_settings import BaseSettings, SettingsConfigDict +from crawlee._types import LogLevel from crawlee._utils.docs import docs_group from crawlee._utils.models import timedelta_ms @@ -62,7 +63,7 @@ class Configuration(BaseSettings): https://playwright.dev/docs/api/class-browsertype#browser-type-launch.""" log_level: Annotated[ - Literal['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], + LogLevel, Field( validation_alias=AliasChoices( 'apify_log_level', diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index a36f8175c5..7d9f2fc08c 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -33,6 +33,7 @@ GetKeyValueStoreFromRequestHandlerFunction, HttpHeaders, HttpPayload, + LogLevel, RequestHandlerRunResult, SendRequestFunction, SkippedReason, @@ -195,10 +196,10 @@ class _BasicCrawlerOptions(TypedDict): """Interval for logging the crawler status messages.""" status_message_callback: NotRequired[ - Callable[[BasicCrawler[TCrawlingContext, TStatisticsState], StatisticsState, StatisticsState | None, str], None] + Callable[[StatisticsState, StatisticsState | None, str], Awaitable[str | None]] ] - """Allows overriding the default status message. The callback needs to call `crawler.set_status_message()` - explicitly. The default status message is provided in the parameters.""" + """Allows overriding the default status message. The default status message is provided in the parameters. + Returning `None` suppresses the status message.""" class _BasicCrawlerOptionsGeneric(Generic[TCrawlingContext, TStatisticsState], TypedDict): @@ -283,9 +284,7 @@ def __init__( statistics_log_format: Literal['table', 'inline'] = 'table', respect_robots_txt_file: bool = False, status_message_logging_interval: timedelta = timedelta(seconds=10), - status_message_callback: Callable[ - [BasicCrawler[TCrawlingContext, TStatisticsState], StatisticsState, StatisticsState | None, str], None - ] + status_message_callback: Callable[[StatisticsState, StatisticsState | None, str], Awaitable[str | None]] | None = None, _context_pipeline: ContextPipeline[TCrawlingContext] | None = None, _additional_context_managers: Sequence[AbstractAsyncContextManager] | None = None, @@ -305,7 +304,6 @@ def __init__( max_request_retries: Specifies the maximum number of retries allowed for a request if its processing fails. This includes retries due to navigation errors or errors thrown from user-supplied functions (`request_handler`, `pre_navigation_hooks` etc.). - This limit does not apply to retries triggered by session rotation (see `max_session_rotations`). max_requests_per_crawl: Maximum number of pages to open during a crawl. The crawl stops upon reaching this limit. Setting this value can help avoid infinite loops in misconfigured crawlers. `None` means @@ -338,10 +336,8 @@ def __init__( for each domain, and skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction` status_message_logging_interval: Interval for logging the crawler status messages - status_message_callback: A callback function for customizing crawler status messages. When provided, - this function will be called instead of the default status message logging. The function receives - the current statistics state, the previous state (if available), and the default status message - as parameters. + status_message_callback: Allows overriding the default status message. 
The default status message is + provided in the parameters. Returning `None` suppresses the status message. _context_pipeline: Enables extending the request lifecycle and modifying the crawling context. Intended for use by subclasses rather than direct instantiation of `BasicCrawler`. _additional_context_managers: Additional context managers used throughout the crawler lifecycle. @@ -1553,9 +1549,7 @@ async def _find_txt_file_for_url(self, url: str) -> RobotsTxtFile: """ return await RobotsTxtFile.find(url, self._http_client) - def set_status_message( - self, message: str, level: Literal['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'] = 'DEBUG' - ) -> None: + def set_status_message(self, message: str, level: LogLevel = 'DEBUG') -> None: """Set a status message for the crawler. Args: @@ -1592,10 +1586,15 @@ async def _crawler_state_task(self) -> None: ) if self._status_message_callback: - self._status_message_callback(self, current_state, self._previous_crawler_state, message) + new_message = await self._status_message_callback(current_state, self._previous_crawler_state, message) + if new_message: + message = new_message + self.set_status_message(message, level='INFO') else: self.set_status_message(message, level='INFO') - event_manager.emit(event=Event.CRAWLER_STATUS, event_data=EventCrawlerStatusData(message=message)) + event_manager.emit( + event=Event.CRAWLER_STATUS, event_data=EventCrawlerStatusData(message=message, crawler_id=id(self)) + ) self._previous_crawler_state = current_state diff --git a/src/crawlee/events/_types.py b/src/crawlee/events/_types.py index c5afeb68cd..0634b02794 100644 --- a/src/crawlee/events/_types.py +++ b/src/crawlee/events/_types.py @@ -90,6 +90,8 @@ class EventCrawlerStatusData(BaseModel): message: str + crawler_id: int + EventData = Union[ EventPersistStateData, diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py index 38f03cf65b..2c722e8e16 100644 --- a/tests/unit/crawlers/_basic/test_basic_crawler.py +++ b/tests/unit/crawlers/_basic/test_basic_crawler.py @@ -1426,11 +1426,12 @@ async def test_status_message_callback() -> None: status_message_callback = AsyncMock() states: list[dict[str, StatisticsState | None]] = [] - def status_callback( - crawler: BasicCrawler, state: StatisticsState, previous_state: StatisticsState | None, message: str - ) -> None: + async def status_callback( + state: StatisticsState, previous_state: StatisticsState | None, message: str + ) -> str | None: status_message_callback(message) states.append({'state': state, 'previous_state': previous_state}) + return message crawler = BasicCrawler( status_message_callback=status_callback, status_message_logging_interval=timedelta(seconds=0.01) From 3303d680b4741d6095302930e72bb903bdabc6ab Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Wed, 25 Jun 2025 12:31:39 +0000 Subject: [PATCH 06/10] `set_status_message` to `_log_status_message` --- src/crawlee/crawlers/_basic/_basic_crawler.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 7d9f2fc08c..567012e3d3 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -1549,8 +1549,8 @@ async def _find_txt_file_for_url(self, url: str) -> RobotsTxtFile: """ return await RobotsTxtFile.find(url, self._http_client) - def set_status_message(self, message: str, level: LogLevel = 'DEBUG') -> None: - """Set a 
status message for the crawler. + def _log_status_message(self, message: str, level: LogLevel = 'DEBUG') -> None: + """Log a status message for the crawler. Args: message: The status message to log. @@ -1589,9 +1589,9 @@ async def _crawler_state_task(self) -> None: new_message = await self._status_message_callback(current_state, self._previous_crawler_state, message) if new_message: message = new_message - self.set_status_message(message, level='INFO') + self._log_status_message(message, level='INFO') else: - self.set_status_message(message, level='INFO') + self._log_status_message(message, level='INFO') event_manager.emit( event=Event.CRAWLER_STATUS, event_data=EventCrawlerStatusData(message=message, crawler_id=id(self)) From 2af8a4b4587a864a94a7f8f786ca2fefcffa0c80 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Fri, 27 Jun 2025 20:00:55 +0000 Subject: [PATCH 07/10] add docs for `EventCrawlerStatusData` --- src/crawlee/events/_types.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/crawlee/events/_types.py b/src/crawlee/events/_types.py index 0634b02794..23834171f0 100644 --- a/src/crawlee/events/_types.py +++ b/src/crawlee/events/_types.py @@ -89,8 +89,10 @@ class EventCrawlerStatusData(BaseModel): model_config = ConfigDict(populate_by_name=True) message: str + """A message describing the current status of the crawler.""" crawler_id: int + """The ID of the crawler that emitted the event.""" EventData = Union[ From 09a73a6a661e9f650cc78bbee22dbb082ede27e3 Mon Sep 17 00:00:00 2001 From: Max Bohomolov <34358312+Mantisus@users.noreply.github.com> Date: Wed, 2 Jul 2025 11:12:16 +0300 Subject: [PATCH 08/10] Update src/crawlee/crawlers/_basic/_basic_crawler.py Co-authored-by: Vlada Dusek --- src/crawlee/crawlers/_basic/_basic_crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 567012e3d3..130a304eb1 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -335,7 +335,7 @@ def __init__( respect_robots_txt_file: If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain, and skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction` - status_message_logging_interval: Interval for logging the crawler status messages + status_message_logging_interval: Interval for logging the crawler status messages. status_message_callback: Allows overriding the default status message. The default status message is provided in the parameters. Returning `None` suppresses the status message. _context_pipeline: Enables extending the request lifecycle and modifying the crawling context. 
From c7d395426d0a2bcb00d94320fb30af51ddd3afaf Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Wed, 2 Jul 2025 11:47:02 +0000 Subject: [PATCH 09/10] change abcde.com links to placeholders links --- .../crawlers/_basic/test_basic_crawler.py | 165 ++++++++++-------- .../unit/request_loaders/test_request_list.py | 10 +- .../storages/test_request_manager_tandem.py | 24 +-- 3 files changed, 108 insertions(+), 91 deletions(-) diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py index 4f7d612827..8a151f3312 100644 --- a/tests/unit/crawlers/_basic/test_basic_crawler.py +++ b/tests/unit/crawlers/_basic/test_basic_crawler.py @@ -42,7 +42,7 @@ async def test_processes_requests_from_explicit_queue() -> None: queue = await RequestQueue.open() - await queue.add_requests(['http://a.com/', 'http://b.com/', 'http://c.com/']) + await queue.add_requests(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) crawler = BasicCrawler(request_manager=queue) calls = list[str]() @@ -53,14 +53,14 @@ async def handler(context: BasicCrawlingContext) -> None: await crawler.run() - assert calls == ['http://a.com/', 'http://b.com/', 'http://c.com/'] + assert calls == ['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev'] async def test_processes_requests_from_request_source_tandem() -> None: request_queue = await RequestQueue.open() - await request_queue.add_requests(['http://a.com/', 'http://b.com/', 'http://c.com/']) + await request_queue.add_requests(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) - request_list = RequestList(['http://a.com/', 'http://d.com', 'http://e.com']) + request_list = RequestList(['https://placeholder.com', 'https://placeholder.gov', 'https://placeholder.biz']) crawler = BasicCrawler(request_manager=RequestManagerTandem(request_list, request_queue)) calls = set[str]() @@ -71,7 +71,13 @@ async def handler(context: BasicCrawlingContext) -> None: await crawler.run() - assert calls == {'http://a.com/', 'http://b.com/', 'http://c.com/', 'http://d.com', 'http://e.com'} + assert calls == { + 'https://placeholder.com', + 'https://placeholder.io', + 'https://placeholder.dev', + 'https://placeholder.gov', + 'https://placeholder.biz', + } async def test_processes_requests_from_run_args() -> None: @@ -82,9 +88,9 @@ async def test_processes_requests_from_run_args() -> None: async def handler(context: BasicCrawlingContext) -> None: calls.append(context.request.url) - await crawler.run(['http://a.com/', 'http://b.com/', 'http://c.com/']) + await crawler.run(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) - assert calls == ['http://a.com/', 'http://b.com/', 'http://c.com/'] + assert calls == ['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev'] async def test_allows_multiple_run_calls() -> None: @@ -95,16 +101,16 @@ async def test_allows_multiple_run_calls() -> None: async def handler(context: BasicCrawlingContext) -> None: calls.append(context.request.url) - await crawler.run(['http://a.com/', 'http://b.com/', 'http://c.com/']) - await crawler.run(['http://a.com/', 'http://b.com/', 'http://c.com/']) + await crawler.run(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) + await crawler.run(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) assert calls == [ - 'http://a.com/', - 'http://b.com/', - 'http://c.com/', - 'http://a.com/', - 
'http://b.com/', - 'http://c.com/', + 'https://placeholder.com', + 'https://placeholder.io', + 'https://placeholder.dev', + 'https://placeholder.com', + 'https://placeholder.io', + 'https://placeholder.dev', ] @@ -116,17 +122,17 @@ async def test_retries_failed_requests() -> None: async def handler(context: BasicCrawlingContext) -> None: calls.append(context.request.url) - if context.request.url == 'http://b.com/': + if context.request.url == 'https://placeholder.io': raise RuntimeError('Arbitrary crash for testing purposes') - await crawler.run(['http://a.com/', 'http://b.com/', 'http://c.com/']) + await crawler.run(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) assert calls == [ - 'http://a.com/', - 'http://b.com/', - 'http://c.com/', - 'http://b.com/', - 'http://b.com/', + 'https://placeholder.com', + 'https://placeholder.io', + 'https://placeholder.dev', + 'https://placeholder.io', + 'https://placeholder.io', ] @@ -139,16 +145,22 @@ async def handler(context: BasicCrawlingContext) -> None: calls.append(context.request.url) raise RuntimeError('Arbitrary crash for testing purposes') - await crawler.run(['http://a.com/', 'http://b.com/', Request.from_url(url='http://c.com/', no_retry=True)]) + await crawler.run( + [ + 'https://placeholder.com', + 'https://placeholder.io', + Request.from_url(url='https://placeholder.dev', no_retry=True), + ] + ) assert calls == [ - 'http://a.com/', - 'http://b.com/', - 'http://c.com/', - 'http://a.com/', - 'http://b.com/', - 'http://a.com/', - 'http://b.com/', + 'https://placeholder.com', + 'https://placeholder.io', + 'https://placeholder.dev', + 'https://placeholder.com', + 'https://placeholder.io', + 'https://placeholder.com', + 'https://placeholder.io', ] @@ -163,19 +175,19 @@ async def handler(context: BasicCrawlingContext) -> None: await crawler.run( [ - 'http://a.com/', - 'http://b.com/', - Request.from_url(url='http://c.com/', user_data={'__crawlee': {'maxRetries': 4}}), + 'https://placeholder.com', + 'https://placeholder.io', + Request.from_url(url='https://placeholder.dev', user_data={'__crawlee': {'maxRetries': 4}}), ] ) assert calls == [ - 'http://a.com/', - 'http://b.com/', - 'http://c.com/', - 'http://c.com/', - 'http://c.com/', - 'http://c.com/', + 'https://placeholder.com', + 'https://placeholder.io', + 'https://placeholder.dev', + 'https://placeholder.dev', + 'https://placeholder.dev', + 'https://placeholder.dev', ] @@ -194,7 +206,7 @@ class Call: @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: - if context.request.url == 'http://b.com/': + if context.request.url == 'https://placeholder.io': raise RuntimeError('Arbitrary crash for testing purposes') @crawler.error_handler @@ -211,20 +223,20 @@ async def error_handler(context: BasicCrawlingContext, error: Exception) -> Requ request['headers'] = HttpHeaders({'custom_retry_count': str(custom_retry_count + 1)}) return Request.model_validate(request) - await crawler.run(['http://a.com/', 'http://b.com/', 'http://c.com/']) + await crawler.run(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) # Verify that the error handler was called twice assert len(calls) == 2 # Check the first call... first_call = calls[0] - assert first_call.url == 'http://b.com/' + assert first_call.url == 'https://placeholder.io' assert isinstance(first_call.error, RuntimeError) assert first_call.custom_retry_count == 0 # Check the second call... 
second_call = calls[1] - assert second_call.url == 'http://b.com/' + assert second_call.url == 'https://placeholder.io' assert isinstance(second_call.error, RuntimeError) assert second_call.custom_retry_count == 1 @@ -254,7 +266,7 @@ async def test_handles_error_in_error_handler() -> None: @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: - if context.request.url == 'http://b.com/': + if context.request.url == 'https://placeholder.io': raise RuntimeError('Arbitrary crash for testing purposes') @crawler.error_handler @@ -262,7 +274,7 @@ async def error_handler(context: BasicCrawlingContext, error: Exception) -> None raise RuntimeError('Crash in error handler') with pytest.raises(UserDefinedErrorHandlerError): - await crawler.run(['http://a.com/', 'http://b.com/', 'http://c.com/']) + await crawler.run(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) async def test_calls_failed_request_handler() -> None: @@ -271,17 +283,17 @@ async def test_calls_failed_request_handler() -> None: @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: - if context.request.url == 'http://b.com/': + if context.request.url == 'https://placeholder.io': raise RuntimeError('Arbitrary crash for testing purposes') @crawler.failed_request_handler async def failed_request_handler(context: BasicCrawlingContext, error: Exception) -> None: calls.append((context, error)) - await crawler.run(['http://a.com/', 'http://b.com/', 'http://c.com/']) + await crawler.run(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) assert len(calls) == 1 - assert calls[0][0].request.url == 'http://b.com/' + assert calls[0][0].request.url == 'https://placeholder.io' assert isinstance(calls[0][1], RuntimeError) @@ -290,7 +302,7 @@ async def test_handles_error_in_failed_request_handler() -> None: @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: - if context.request.url == 'http://b.com/': + if context.request.url == 'https://placeholder.io': raise RuntimeError('Arbitrary crash for testing purposes') @crawler.failed_request_handler @@ -298,7 +310,7 @@ async def failed_request_handler(context: BasicCrawlingContext, error: Exception raise RuntimeError('Crash in failed request handler') with pytest.raises(UserDefinedErrorHandlerError): - await crawler.run(['http://a.com/', 'http://b.com/', 'http://c.com/']) + await crawler.run(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) @pytest.mark.parametrize( @@ -320,7 +332,7 @@ async def handler(context: BasicCrawlingContext) -> None: response_data['body'] = json.loads(response.read()) response_data['headers'] = response.headers - await crawler.run(['http://a.com/', 'http://b.com/', 'http://c.com/']) + await crawler.run(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) response_body = response_data.get('body') assert response_body is not None @@ -365,15 +377,15 @@ class AddRequestsTestInput: # Basic use case pytest.param( AddRequestsTestInput( - start_url='https://a.com/', - loaded_url='https://a.com/', + start_url='https://placeholder.com', + loaded_url='https://placeholder.com', requests=[ - 'https://a.com/', - Request.from_url('http://b.com/'), - 'http://c.com/', + 'https://placeholder.com', + Request.from_url('https://placeholder.io'), + 'https://placeholder.dev', ], kwargs={}, - expected_urls=['http://b.com/', 'http://c.com/'], + 
expected_urls=['https://placeholder.io', 'https://placeholder.dev'], ), id='basic', ), @@ -669,7 +681,7 @@ async def handler(context: BasicCrawlingContext) -> None: await context.push_data({'b': 2}) raise RuntimeError('Watch me crash') - stats = await crawler.run(['https://a.com']) + stats = await crawler.run(['https://placeholder.com']) assert (await crawler.get_data()).items == [] assert stats.requests_total == 1 @@ -895,15 +907,15 @@ async def test_consecutive_runs_purge_request_queue() -> None: async def handler(context: BasicCrawlingContext) -> None: visit(context.request.url) - await crawler.run(['http://a.com', 'http://b.com', 'http://c.com']) - await crawler.run(['http://a.com', 'http://b.com', 'http://c.com']) - await crawler.run(['http://a.com', 'http://b.com', 'http://c.com']) + await crawler.run(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) + await crawler.run(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) + await crawler.run(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) counter = Counter(args[0][0] for args in visit.call_args_list) assert counter == { - 'http://a.com': 3, - 'http://b.com': 3, - 'http://c.com': 3, + 'https://placeholder.com': 3, + 'https://placeholder.io': 3, + 'https://placeholder.dev': 3, } @@ -1156,7 +1168,7 @@ async def handler(context: BasicCrawlingContext) -> None: # Timeout in pytest, because previous implementation would run crawler until following: # "The request queue seems to be stuck for 300.0s, resetting internal state." async with timeout(max_request_retries * double_handler_timeout_s): - await crawler.run(['http://a.com/']) + await crawler.run(['https://placeholder.com']) assert crawler.statistics.state.requests_finished == 1 assert mocked_handler_before_sleep.call_count == max_request_retries @@ -1177,7 +1189,7 @@ async def test_keep_alive( """Test that crawler can be kept alive without any requests and stopped with `crawler.stop()`. 
Crawler should stop if `max_requests_per_crawl` is reached regardless of the `keep_alive` flag.""" - additional_urls = ['http://a.com/', 'http://b.com/'] + additional_urls = ['https://placeholder.com', 'https://placeholder.io'] expected_handler_calls = [call(url) for url in additional_urls[:expected_handled_requests_count]] crawler = BasicCrawler( @@ -1226,9 +1238,9 @@ async def handler(context: BasicCrawlingContext) -> None: context.session.retire() if retire else None - await context.add_requests(['http://b.com/']) + await context.add_requests(['https://placeholder.io']) - await crawler.run(['http://a.com/']) + await crawler.run(['https://placeholder.com']) # The session should differ if `retire` was called and match otherwise since pool size == 1 if retire: @@ -1249,7 +1261,8 @@ async def handler(context: BasicCrawlingContext) -> None: used_sessions.append(context.session.id) requests = [ - Request.from_url('http://a.com/', session_id=check_session.id, always_enqueue=True) for _ in range(10) + Request.from_url('https://placeholder.com', session_id=check_session.id, always_enqueue=True) + for _ in range(10) ] await crawler.run(requests) @@ -1280,7 +1293,7 @@ async def handler(context: BasicCrawlingContext) -> None: used_sessions.append(context.session.id) requests = [ - Request.from_url('http://a.com/', session_id=str(session_id), use_extended_unique_key=True) + Request.from_url('https://placeholder.com', session_id=str(session_id), use_extended_unique_key=True) for session_id in range(10) ] @@ -1293,7 +1306,7 @@ async def handler(context: BasicCrawlingContext) -> None: async def test_error_bound_session_to_request() -> None: crawler = BasicCrawler(request_handler=AsyncMock()) - requests = [Request.from_url('http://a.com/', session_id='1', always_enqueue=True) for _ in range(10)] + requests = [Request.from_url('https://placeholder.com', session_id='1', always_enqueue=True) for _ in range(10)] stats = await crawler.run(requests) @@ -1311,7 +1324,7 @@ async def error_req_hook(context: BasicCrawlingContext, error: Exception) -> Non if isinstance(error, RequestCollisionError): await error_handler_mock(context, error) - requests = [Request.from_url('http://a.com/', session_id='1')] + requests = [Request.from_url('https://placeholder.com', session_id='1')] await crawler.run(requests) @@ -1330,7 +1343,7 @@ async def handler(context: BasicCrawlingContext) -> None: async def failed_request_handler(context: BasicCrawlingContext, error: Exception) -> None: handler_requests.add(context.request.url) - requests = ['http://a.com/', 'http://b.com/', 'http://c.com/'] + requests = ['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev'] await crawler.run(requests) @@ -1363,7 +1376,7 @@ async def handler(context: BasicCrawlingContext) -> None: # Capture all logs from the 'crawlee' logger at INFO level or higher with caplog.at_level(logging.INFO, logger='crawlee'): - await crawler.run([Request.from_url('http://a.com/')]) + await crawler.run([Request.from_url('https://placeholder.com')]) # Check for the timeout message in any of the logs found_timeout_message = False @@ -1398,7 +1411,7 @@ async def status_callback( async def handler(context: BasicCrawlingContext) -> None: await asyncio.sleep(0.1) # Simulate some processing time - await crawler.run(['http://a.com/']) + await crawler.run(['https://placeholder.com']) assert status_message_callback.called @@ -1429,7 +1442,7 @@ def listener(event_data: EventCrawlerStatusData) -> None: crawler = BasicCrawler(request_handler=AsyncMock()) - 
await crawler.run(['http://a.com/']) + await crawler.run(['https://placeholder.com']) event_manager.off(event=Event.CRAWLER_STATUS, listener=listener) diff --git a/tests/unit/request_loaders/test_request_list.py b/tests/unit/request_loaders/test_request_list.py index 5142b7719d..1f2345a6af 100644 --- a/tests/unit/request_loaders/test_request_list.py +++ b/tests/unit/request_loaders/test_request_list.py @@ -4,7 +4,7 @@ async def test_sync_traversal() -> None: - request_list = RequestList(['https://a.com', 'https://b.com', 'https://c.com']) + request_list = RequestList(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) while not await request_list.is_finished(): item = await request_list.fetch_next_request() @@ -17,9 +17,9 @@ async def test_sync_traversal() -> None: async def test_async_traversal() -> None: async def generator() -> AsyncGenerator[str]: - yield 'https://a.com' - yield 'https://b.com' - yield 'https://c.com' + yield 'https://placeholder.com' + yield 'https://placeholder.io' + yield 'https://placeholder.dev' request_list = RequestList(generator()) @@ -33,7 +33,7 @@ async def generator() -> AsyncGenerator[str]: async def test_is_empty_does_not_depend_on_fetch_next_request() -> None: - request_list = RequestList(['https://a.com', 'https://b.com', 'https://c.com']) + request_list = RequestList(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) item_1 = await request_list.fetch_next_request() assert item_1 is not None diff --git a/tests/unit/storages/test_request_manager_tandem.py b/tests/unit/storages/test_request_manager_tandem.py index 70240914ec..cb31f5e4e1 100644 --- a/tests/unit/storages/test_request_manager_tandem.py +++ b/tests/unit/storages/test_request_manager_tandem.py @@ -25,26 +25,30 @@ class TestInput: argvalues=[ pytest.param( TestInput( - request_loader_items=['http://a.com', 'http://b.com'], + request_loader_items=['https://placeholder.com', 'https://placeholder.io'], request_manager_items=[], - discovered_items=[Request.from_url('http://c.com')], + discovered_items=[Request.from_url('https://placeholder.dev')], expected_result={ - 'http://a.com', - 'http://b.com', - 'http://c.com', + 'https://placeholder.com', + 'https://placeholder.io', + 'https://placeholder.dev', }, ), id='basic_usage', ), pytest.param( TestInput( - request_loader_items=[Request.from_url('http://a.com'), None, Request.from_url('http://c.com')], - request_manager_items=['http://b.com', 'http://d.com'], + request_loader_items=[ + Request.from_url('https://placeholder.com'), + None, + Request.from_url('https://placeholder.dev'), + ], + request_manager_items=['https://placeholder.io', 'http://d.com'], discovered_items=[], expected_result={ - 'http://a.com', - 'http://b.com', - 'http://c.com', + 'https://placeholder.com', + 'https://placeholder.io', + 'https://placeholder.dev', 'http://d.com', }, ), From 65b46ccf5e4bbb5faabd135db7992e0532c93f48 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Wed, 2 Jul 2025 12:00:41 +0000 Subject: [PATCH 10/10] placeholders with subdomains --- .../session_management/multi_sessions_http.py | 2 +- .../crawlers/_basic/test_basic_crawler.py | 166 +++++++++--------- .../unit/request_loaders/test_request_list.py | 10 +- .../storages/test_request_manager_tandem.py | 22 +-- 4 files changed, 101 insertions(+), 99 deletions(-) diff --git a/docs/guides/code_examples/session_management/multi_sessions_http.py b/docs/guides/code_examples/session_management/multi_sessions_http.py index 74f1bafc4c..0bd4a88beb 100644 
--- a/docs/guides/code_examples/session_management/multi_sessions_http.py +++ b/docs/guides/code_examples/session_management/multi_sessions_http.py @@ -49,7 +49,7 @@ async def session_init(context: HttpCrawlingContext) -> None: if context.session: context.log.info(f'Init session {context.session.id}') next_request = Request.from_url( - 'https://placeholder.dev', session_id=context.session.id + 'https://a.placeholder.com', session_id=context.session.id ) next_requests.append(next_request) diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py index 8a151f3312..4e8a513118 100644 --- a/tests/unit/crawlers/_basic/test_basic_crawler.py +++ b/tests/unit/crawlers/_basic/test_basic_crawler.py @@ -42,7 +42,7 @@ async def test_processes_requests_from_explicit_queue() -> None: queue = await RequestQueue.open() - await queue.add_requests(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) + await queue.add_requests(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) crawler = BasicCrawler(request_manager=queue) calls = list[str]() @@ -53,14 +53,16 @@ async def handler(context: BasicCrawlingContext) -> None: await crawler.run() - assert calls == ['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev'] + assert calls == ['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com'] async def test_processes_requests_from_request_source_tandem() -> None: request_queue = await RequestQueue.open() - await request_queue.add_requests(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) + await request_queue.add_requests( + ['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com'] + ) - request_list = RequestList(['https://placeholder.com', 'https://placeholder.gov', 'https://placeholder.biz']) + request_list = RequestList(['https://a.placeholder.com', 'https://d.placeholder.com', 'https://e.placeholder.com']) crawler = BasicCrawler(request_manager=RequestManagerTandem(request_list, request_queue)) calls = set[str]() @@ -72,11 +74,11 @@ async def handler(context: BasicCrawlingContext) -> None: await crawler.run() assert calls == { - 'https://placeholder.com', - 'https://placeholder.io', - 'https://placeholder.dev', - 'https://placeholder.gov', - 'https://placeholder.biz', + 'https://a.placeholder.com', + 'https://b.placeholder.com', + 'https://c.placeholder.com', + 'https://d.placeholder.com', + 'https://e.placeholder.com', } @@ -88,9 +90,9 @@ async def test_processes_requests_from_run_args() -> None: async def handler(context: BasicCrawlingContext) -> None: calls.append(context.request.url) - await crawler.run(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) + await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) - assert calls == ['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev'] + assert calls == ['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com'] async def test_allows_multiple_run_calls() -> None: @@ -101,16 +103,16 @@ async def test_allows_multiple_run_calls() -> None: async def handler(context: BasicCrawlingContext) -> None: calls.append(context.request.url) - await crawler.run(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) - await crawler.run(['https://placeholder.com', 
'https://placeholder.io', 'https://placeholder.dev']) + await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) + await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) assert calls == [ - 'https://placeholder.com', - 'https://placeholder.io', - 'https://placeholder.dev', - 'https://placeholder.com', - 'https://placeholder.io', - 'https://placeholder.dev', + 'https://a.placeholder.com', + 'https://b.placeholder.com', + 'https://c.placeholder.com', + 'https://a.placeholder.com', + 'https://b.placeholder.com', + 'https://c.placeholder.com', ] @@ -122,17 +124,17 @@ async def test_retries_failed_requests() -> None: async def handler(context: BasicCrawlingContext) -> None: calls.append(context.request.url) - if context.request.url == 'https://placeholder.io': + if context.request.url == 'https://b.placeholder.com': raise RuntimeError('Arbitrary crash for testing purposes') - await crawler.run(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) + await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) assert calls == [ - 'https://placeholder.com', - 'https://placeholder.io', - 'https://placeholder.dev', - 'https://placeholder.io', - 'https://placeholder.io', + 'https://a.placeholder.com', + 'https://b.placeholder.com', + 'https://c.placeholder.com', + 'https://b.placeholder.com', + 'https://b.placeholder.com', ] @@ -147,20 +149,20 @@ async def handler(context: BasicCrawlingContext) -> None: await crawler.run( [ - 'https://placeholder.com', - 'https://placeholder.io', - Request.from_url(url='https://placeholder.dev', no_retry=True), + 'https://a.placeholder.com', + 'https://b.placeholder.com', + Request.from_url(url='https://c.placeholder.com', no_retry=True), ] ) assert calls == [ - 'https://placeholder.com', - 'https://placeholder.io', - 'https://placeholder.dev', - 'https://placeholder.com', - 'https://placeholder.io', - 'https://placeholder.com', - 'https://placeholder.io', + 'https://a.placeholder.com', + 'https://b.placeholder.com', + 'https://c.placeholder.com', + 'https://a.placeholder.com', + 'https://b.placeholder.com', + 'https://a.placeholder.com', + 'https://b.placeholder.com', ] @@ -175,19 +177,19 @@ async def handler(context: BasicCrawlingContext) -> None: await crawler.run( [ - 'https://placeholder.com', - 'https://placeholder.io', - Request.from_url(url='https://placeholder.dev', user_data={'__crawlee': {'maxRetries': 4}}), + 'https://a.placeholder.com', + 'https://b.placeholder.com', + Request.from_url(url='https://c.placeholder.com', user_data={'__crawlee': {'maxRetries': 4}}), ] ) assert calls == [ - 'https://placeholder.com', - 'https://placeholder.io', - 'https://placeholder.dev', - 'https://placeholder.dev', - 'https://placeholder.dev', - 'https://placeholder.dev', + 'https://a.placeholder.com', + 'https://b.placeholder.com', + 'https://c.placeholder.com', + 'https://c.placeholder.com', + 'https://c.placeholder.com', + 'https://c.placeholder.com', ] @@ -206,7 +208,7 @@ class Call: @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: - if context.request.url == 'https://placeholder.io': + if context.request.url == 'https://b.placeholder.com': raise RuntimeError('Arbitrary crash for testing purposes') @crawler.error_handler @@ -223,20 +225,20 @@ async def error_handler(context: BasicCrawlingContext, error: Exception) -> Requ request['headers'] = 
HttpHeaders({'custom_retry_count': str(custom_retry_count + 1)}) return Request.model_validate(request) - await crawler.run(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) + await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) # Verify that the error handler was called twice assert len(calls) == 2 # Check the first call... first_call = calls[0] - assert first_call.url == 'https://placeholder.io' + assert first_call.url == 'https://b.placeholder.com' assert isinstance(first_call.error, RuntimeError) assert first_call.custom_retry_count == 0 # Check the second call... second_call = calls[1] - assert second_call.url == 'https://placeholder.io' + assert second_call.url == 'https://b.placeholder.com' assert isinstance(second_call.error, RuntimeError) assert second_call.custom_retry_count == 1 @@ -266,7 +268,7 @@ async def test_handles_error_in_error_handler() -> None: @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: - if context.request.url == 'https://placeholder.io': + if context.request.url == 'https://b.placeholder.com': raise RuntimeError('Arbitrary crash for testing purposes') @crawler.error_handler @@ -274,7 +276,7 @@ async def error_handler(context: BasicCrawlingContext, error: Exception) -> None raise RuntimeError('Crash in error handler') with pytest.raises(UserDefinedErrorHandlerError): - await crawler.run(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) + await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) async def test_calls_failed_request_handler() -> None: @@ -283,17 +285,17 @@ async def test_calls_failed_request_handler() -> None: @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: - if context.request.url == 'https://placeholder.io': + if context.request.url == 'https://b.placeholder.com': raise RuntimeError('Arbitrary crash for testing purposes') @crawler.failed_request_handler async def failed_request_handler(context: BasicCrawlingContext, error: Exception) -> None: calls.append((context, error)) - await crawler.run(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) + await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) assert len(calls) == 1 - assert calls[0][0].request.url == 'https://placeholder.io' + assert calls[0][0].request.url == 'https://b.placeholder.com' assert isinstance(calls[0][1], RuntimeError) @@ -302,7 +304,7 @@ async def test_handles_error_in_failed_request_handler() -> None: @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: - if context.request.url == 'https://placeholder.io': + if context.request.url == 'https://b.placeholder.com': raise RuntimeError('Arbitrary crash for testing purposes') @crawler.failed_request_handler @@ -310,7 +312,7 @@ async def failed_request_handler(context: BasicCrawlingContext, error: Exception raise RuntimeError('Crash in failed request handler') with pytest.raises(UserDefinedErrorHandlerError): - await crawler.run(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) + await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) @pytest.mark.parametrize( @@ -332,7 +334,7 @@ async def handler(context: BasicCrawlingContext) -> None: response_data['body'] = json.loads(response.read()) 
response_data['headers'] = response.headers - await crawler.run(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) + await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) response_body = response_data.get('body') assert response_body is not None @@ -377,15 +379,15 @@ class AddRequestsTestInput: # Basic use case pytest.param( AddRequestsTestInput( - start_url='https://placeholder.com', - loaded_url='https://placeholder.com', + start_url='https://a.placeholder.com', + loaded_url='https://a.placeholder.com', requests=[ - 'https://placeholder.com', - Request.from_url('https://placeholder.io'), - 'https://placeholder.dev', + 'https://a.placeholder.com', + Request.from_url('https://b.placeholder.com'), + 'https://c.placeholder.com', ], kwargs={}, - expected_urls=['https://placeholder.io', 'https://placeholder.dev'], + expected_urls=['https://b.placeholder.com', 'https://c.placeholder.com'], ), id='basic', ), @@ -681,7 +683,7 @@ async def handler(context: BasicCrawlingContext) -> None: await context.push_data({'b': 2}) raise RuntimeError('Watch me crash') - stats = await crawler.run(['https://placeholder.com']) + stats = await crawler.run(['https://a.placeholder.com']) assert (await crawler.get_data()).items == [] assert stats.requests_total == 1 @@ -907,15 +909,15 @@ async def test_consecutive_runs_purge_request_queue() -> None: async def handler(context: BasicCrawlingContext) -> None: visit(context.request.url) - await crawler.run(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) - await crawler.run(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) - await crawler.run(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) + await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) + await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) + await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) counter = Counter(args[0][0] for args in visit.call_args_list) assert counter == { - 'https://placeholder.com': 3, - 'https://placeholder.io': 3, - 'https://placeholder.dev': 3, + 'https://a.placeholder.com': 3, + 'https://b.placeholder.com': 3, + 'https://c.placeholder.com': 3, } @@ -1168,7 +1170,7 @@ async def handler(context: BasicCrawlingContext) -> None: # Timeout in pytest, because previous implementation would run crawler until following: # "The request queue seems to be stuck for 300.0s, resetting internal state." async with timeout(max_request_retries * double_handler_timeout_s): - await crawler.run(['https://placeholder.com']) + await crawler.run(['https://a.placeholder.com']) assert crawler.statistics.state.requests_finished == 1 assert mocked_handler_before_sleep.call_count == max_request_retries @@ -1189,7 +1191,7 @@ async def test_keep_alive( """Test that crawler can be kept alive without any requests and stopped with `crawler.stop()`. 
Crawler should stop if `max_requests_per_crawl` is reached regardless of the `keep_alive` flag.""" - additional_urls = ['https://placeholder.com', 'https://placeholder.io'] + additional_urls = ['https://a.placeholder.com', 'https://b.placeholder.com'] expected_handler_calls = [call(url) for url in additional_urls[:expected_handled_requests_count]] crawler = BasicCrawler( @@ -1238,9 +1240,9 @@ async def handler(context: BasicCrawlingContext) -> None: context.session.retire() if retire else None - await context.add_requests(['https://placeholder.io']) + await context.add_requests(['https://b.placeholder.com']) - await crawler.run(['https://placeholder.com']) + await crawler.run(['https://a.placeholder.com']) # The session should differ if `retire` was called and match otherwise since pool size == 1 if retire: @@ -1261,7 +1263,7 @@ async def handler(context: BasicCrawlingContext) -> None: used_sessions.append(context.session.id) requests = [ - Request.from_url('https://placeholder.com', session_id=check_session.id, always_enqueue=True) + Request.from_url('https://a.placeholder.com', session_id=check_session.id, always_enqueue=True) for _ in range(10) ] @@ -1293,7 +1295,7 @@ async def handler(context: BasicCrawlingContext) -> None: used_sessions.append(context.session.id) requests = [ - Request.from_url('https://placeholder.com', session_id=str(session_id), use_extended_unique_key=True) + Request.from_url('https://a.placeholder.com', session_id=str(session_id), use_extended_unique_key=True) for session_id in range(10) ] @@ -1306,7 +1308,7 @@ async def handler(context: BasicCrawlingContext) -> None: async def test_error_bound_session_to_request() -> None: crawler = BasicCrawler(request_handler=AsyncMock()) - requests = [Request.from_url('https://placeholder.com', session_id='1', always_enqueue=True) for _ in range(10)] + requests = [Request.from_url('https://a.placeholder.com', session_id='1', always_enqueue=True) for _ in range(10)] stats = await crawler.run(requests) @@ -1324,7 +1326,7 @@ async def error_req_hook(context: BasicCrawlingContext, error: Exception) -> Non if isinstance(error, RequestCollisionError): await error_handler_mock(context, error) - requests = [Request.from_url('https://placeholder.com', session_id='1')] + requests = [Request.from_url('https://a.placeholder.com', session_id='1')] await crawler.run(requests) @@ -1343,7 +1345,7 @@ async def handler(context: BasicCrawlingContext) -> None: async def failed_request_handler(context: BasicCrawlingContext, error: Exception) -> None: handler_requests.add(context.request.url) - requests = ['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev'] + requests = ['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com'] await crawler.run(requests) @@ -1376,7 +1378,7 @@ async def handler(context: BasicCrawlingContext) -> None: # Capture all logs from the 'crawlee' logger at INFO level or higher with caplog.at_level(logging.INFO, logger='crawlee'): - await crawler.run([Request.from_url('https://placeholder.com')]) + await crawler.run([Request.from_url('https://a.placeholder.com')]) # Check for the timeout message in any of the logs found_timeout_message = False @@ -1411,7 +1413,7 @@ async def status_callback( async def handler(context: BasicCrawlingContext) -> None: await asyncio.sleep(0.1) # Simulate some processing time - await crawler.run(['https://placeholder.com']) + await crawler.run(['https://a.placeholder.com']) assert status_message_callback.called @@ -1442,7 +1444,7 @@ def 
listener(event_data: EventCrawlerStatusData) -> None: crawler = BasicCrawler(request_handler=AsyncMock()) - await crawler.run(['https://placeholder.com']) + await crawler.run(['https://a.placeholder.com']) event_manager.off(event=Event.CRAWLER_STATUS, listener=listener) diff --git a/tests/unit/request_loaders/test_request_list.py b/tests/unit/request_loaders/test_request_list.py index 1f2345a6af..e3ded91b7f 100644 --- a/tests/unit/request_loaders/test_request_list.py +++ b/tests/unit/request_loaders/test_request_list.py @@ -4,7 +4,7 @@ async def test_sync_traversal() -> None: - request_list = RequestList(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) + request_list = RequestList(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) while not await request_list.is_finished(): item = await request_list.fetch_next_request() @@ -17,9 +17,9 @@ async def test_sync_traversal() -> None: async def test_async_traversal() -> None: async def generator() -> AsyncGenerator[str]: - yield 'https://placeholder.com' - yield 'https://placeholder.io' - yield 'https://placeholder.dev' + yield 'https://a.placeholder.com' + yield 'https://b.placeholder.com' + yield 'https://c.placeholder.com' request_list = RequestList(generator()) @@ -33,7 +33,7 @@ async def generator() -> AsyncGenerator[str]: async def test_is_empty_does_not_depend_on_fetch_next_request() -> None: - request_list = RequestList(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) + request_list = RequestList(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) item_1 = await request_list.fetch_next_request() assert item_1 is not None diff --git a/tests/unit/storages/test_request_manager_tandem.py b/tests/unit/storages/test_request_manager_tandem.py index cb31f5e4e1..69bd944348 100644 --- a/tests/unit/storages/test_request_manager_tandem.py +++ b/tests/unit/storages/test_request_manager_tandem.py @@ -25,13 +25,13 @@ class TestInput: argvalues=[ pytest.param( TestInput( - request_loader_items=['https://placeholder.com', 'https://placeholder.io'], + request_loader_items=['https://a.placeholder.com', 'https://b.placeholder.com'], request_manager_items=[], - discovered_items=[Request.from_url('https://placeholder.dev')], + discovered_items=[Request.from_url('https://c.placeholder.com')], expected_result={ - 'https://placeholder.com', - 'https://placeholder.io', - 'https://placeholder.dev', + 'https://a.placeholder.com', + 'https://b.placeholder.com', + 'https://c.placeholder.com', }, ), id='basic_usage', @@ -39,16 +39,16 @@ class TestInput: pytest.param( TestInput( request_loader_items=[ - Request.from_url('https://placeholder.com'), + Request.from_url('https://a.placeholder.com'), None, - Request.from_url('https://placeholder.dev'), + Request.from_url('https://c.placeholder.com'), ], - request_manager_items=['https://placeholder.io', 'http://d.com'], + request_manager_items=['https://b.placeholder.com', 'http://d.com'], discovered_items=[], expected_result={ - 'https://placeholder.com', - 'https://placeholder.io', - 'https://placeholder.dev', + 'https://a.placeholder.com', + 'https://b.placeholder.com', + 'https://c.placeholder.com', 'http://d.com', }, ),
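
For readers following the test changes above, here is a minimal, illustrative sketch of how the CRAWLER_STATUS event exercised by these tests might be consumed from user code. It is not part of the patch: it assumes that Event and EventCrawlerStatusData are exported from crawlee.events (as the diffstat for src/crawlee/events/__init__.py suggests), that EventManager.on mirrors the EventManager.off call shown in the test, and it reuses the https://a.placeholder.com URL purely as a stand-in. Treat the exact payload fields and import paths as assumptions rather than the final API.

    import asyncio

    from crawlee import service_locator
    from crawlee.crawlers import BasicCrawler, BasicCrawlingContext
    from crawlee.events import Event, EventCrawlerStatusData  # assumed export path


    async def main() -> None:
        # Obtain the global event manager, the same object the crawler emits status events on.
        event_manager = service_locator.get_event_manager()

        def listener(event_data: EventCrawlerStatusData) -> None:
            # Exact payload fields are an assumption here; print the whole object for inspection.
            print(f'Crawler status event received: {event_data}')

        # Register the listener before the crawl starts so periodic status events are not missed.
        event_manager.on(event=Event.CRAWLER_STATUS, listener=listener)

        crawler = BasicCrawler()

        @crawler.router.default_handler
        async def handler(context: BasicCrawlingContext) -> None:
            context.log.info(f'Processing {context.request.url}')

        try:
            await crawler.run(['https://a.placeholder.com'])  # placeholder URL, as in the tests
        finally:
            # Unregister the listener, mirroring the teardown performed in the test above.
            event_manager.off(event=Event.CRAWLER_STATUS, listener=listener)


    if __name__ == '__main__':
        asyncio.run(main())

Unregistering in a finally block keeps the listener from leaking into later crawler runs, which is the same hygiene the unit test applies after asserting on the received events.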