From b56ce661dca015572a1a1add91208b67699e3ae1 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Tue, 29 Apr 2025 00:47:31 +0000 Subject: [PATCH 01/10] add CRAWLER_STATUS event --- src/crawlee/crawlers/_basic/_basic_crawler.py | 8 ++++++++ src/crawlee/events/_types.py | 3 +++ 2 files changed, 11 insertions(+) diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 7e07c87f16..40391b3ea2 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -35,6 +35,7 @@ SendRequestFunction, ) from crawlee._utils.docs import docs_group +from crawlee._utils.recurring_task import RecurringTask from crawlee._utils.robots import RobotsTxtFile from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute from crawlee._utils.wait import wait_for @@ -49,6 +50,7 @@ SessionError, UserDefinedErrorHandlerError, ) +from crawlee.events._types import Event from crawlee.http_clients import HttpxHttpClient from crawlee.router import Router from crawlee.sessions import SessionPool @@ -400,6 +402,7 @@ def __init__( is_task_ready_function=self.__is_task_ready_function, run_task_function=self.__run_task_function, ) + self._crawler_state_rec_task = RecurringTask(func=self._crawler_state_task, delay=timedelta(seconds=5)) # State flags self._keep_alive = keep_alive @@ -1391,3 +1394,8 @@ async def _find_txt_file_for_url(self, url: str) -> RobotsTxtFile: url: The URL whose domain will be used to locate and fetch the corresponding robots.txt file. """ return await RobotsTxtFile.find(url, self._http_client) + + async def _crawler_state_task(self) -> None: + """Emit a persist state event with the given migration status.""" + event_manager = service_locator.get_event_manager() + event_manager.emit(event=Event.CRAWLER_STATUS, event_data=None) diff --git a/src/crawlee/events/_types.py b/src/crawlee/events/_types.py index aaa59f4b54..bda93d2b73 100644 --- a/src/crawlee/events/_types.py +++ b/src/crawlee/events/_types.py @@ -31,6 +31,9 @@ class Event(str, Enum): PAGE_CREATED = 'pageCreated' PAGE_CLOSED = 'pageClosed' + # State events + CRAWLER_STATUS = 'crawlerStatus' + @docs_group('Event payloads') class EventPersistStateData(BaseModel): From 9a6181f9a55a9e2cc98df4e66012518a489e3325 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Mon, 23 Jun 2025 23:50:38 +0000 Subject: [PATCH 02/10] add periodic status log --- src/crawlee/_log_config.py | 31 ++++---- src/crawlee/crawlers/_basic/_basic_crawler.py | 73 ++++++++++++++++++- src/crawlee/events/__init__.py | 2 + src/crawlee/events/_event_manager.py | 5 ++ src/crawlee/events/_types.py | 18 ++++- .../crawlers/_basic/test_basic_crawler.py | 57 +++++++++++++++ 6 files changed, 168 insertions(+), 18 deletions(-) diff --git a/src/crawlee/_log_config.py b/src/crawlee/_log_config.py index 914cc32f24..093e876554 100644 --- a/src/crawlee/_log_config.py +++ b/src/crawlee/_log_config.py @@ -4,7 +4,7 @@ import logging import sys import textwrap -from typing import Any +from typing import Any, Literal from colorama import Fore, Style, just_fix_windows_console from typing_extensions import assert_never @@ -34,22 +34,27 @@ _LOG_MESSAGE_INDENT = ' ' * 6 +def string_to_log_level(level: Literal['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']) -> int: + """Convert a string representation of a log level to an integer log level.""" + if level == 'DEBUG': + return logging.DEBUG + if level == 'INFO': + return logging.INFO + if level == 'WARNING': + return logging.WARNING + if level == 
'ERROR': + return logging.ERROR + if level == 'CRITICAL': + return logging.CRITICAL + + assert_never(level) + + def get_configured_log_level() -> int: config = service_locator.get_configuration() if 'log_level' in config.model_fields_set: - if config.log_level == 'DEBUG': - return logging.DEBUG - if config.log_level == 'INFO': - return logging.INFO - if config.log_level == 'WARNING': - return logging.WARNING - if config.log_level == 'ERROR': - return logging.ERROR - if config.log_level == 'CRITICAL': - return logging.CRITICAL - - assert_never(config.log_level) + return string_to_log_level(config.log_level) if sys.flags.dev_mode: return logging.DEBUG diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 40391b3ea2..1ff578238f 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -25,7 +25,7 @@ from crawlee import EnqueueStrategy, Glob, service_locator from crawlee._autoscaling import AutoscaledPool, Snapshotter, SystemStatus -from crawlee._log_config import configure_logger, get_configured_log_level +from crawlee._log_config import configure_logger, get_configured_log_level, string_to_log_level from crawlee._request import Request, RequestState from crawlee._types import ( BasicCrawlingContext, @@ -50,7 +50,7 @@ SessionError, UserDefinedErrorHandlerError, ) -from crawlee.events._types import Event +from crawlee.events._types import Event, EventCrawlerStatusData from crawlee.http_clients import HttpxHttpClient from crawlee.router import Router from crawlee.sessions import SessionPool @@ -172,6 +172,13 @@ class _BasicCrawlerOptions(TypedDict): """If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain, and skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`.""" + status_message_logging_interval: NotRequired[timedelta] + """Interval for logging the crawler status messages.""" + + status_message_callback: NotRequired[Callable[[StatisticsState, StatisticsState | None, str], None]] + """Allows overriding the default status message. The callback needs to call `crawler.setStatusMessage()` explicitly. + The default status message is provided in the parameters.""" + class _BasicCrawlerOptionsGeneric(Generic[TCrawlingContext, TStatisticsState], TypedDict): """Generic options the `BasicCrawler` constructor.""" @@ -254,6 +261,8 @@ def __init__( configure_logging: bool = True, statistics_log_format: Literal['table', 'inline'] = 'table', respect_robots_txt_file: bool = False, + status_message_logging_interval: timedelta = timedelta(seconds=10), + status_message_callback: Callable[[StatisticsState, StatisticsState | None, str], None] | None = None, _context_pipeline: ContextPipeline[TCrawlingContext] | None = None, _additional_context_managers: Sequence[AbstractAsyncContextManager] | None = None, _logger: logging.Logger | None = None, @@ -299,6 +308,11 @@ def __init__( respect_robots_txt_file: If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain, and skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction` + status_message_logging_interval: Interval for logging the crawler status messages + status_message_callback: A callback function for customizing crawler status messages. When provided, + this function will be called instead of the default status message logging. 
The function receives + the current statistics state, the previous state (if available), and the default status message + as parameters. _context_pipeline: Enables extending the request lifecycle and modifying the crawling context. Intended for use by subclasses rather than direct instantiation of `BasicCrawler`. _additional_context_managers: Additional context managers used throughout the crawler lifecycle. @@ -342,6 +356,9 @@ def __init__( self._failed_request_handler: FailedRequestHandler[TCrawlingContext | BasicCrawlingContext] | None = None self._abort_on_error = abort_on_error + # Crawler callbacks + self._status_message_callback = status_message_callback + # Context of each request with matching result of request handler. # Inheritors can use this to override the result of individual request handler runs in `_run_request_handler`. self._context_result_map = WeakKeyDictionary[BasicCrawlingContext, RequestHandlerRunResult]() @@ -402,7 +419,10 @@ def __init__( is_task_ready_function=self.__is_task_ready_function, run_task_function=self.__run_task_function, ) - self._crawler_state_rec_task = RecurringTask(func=self._crawler_state_task, delay=timedelta(seconds=5)) + self._crawler_state_rec_task = RecurringTask( + func=self._crawler_state_task, delay=status_message_logging_interval + ) + self._previous_crawler_state: TStatisticsState | None = None # State flags self._keep_alive = keep_alive @@ -599,6 +619,7 @@ def sigint_handler() -> None: except CancelledError: pass finally: + await self._crawler_state_rec_task.stop() if threading.current_thread() is threading.main_thread(): with suppress(NotImplementedError): asyncio.get_running_loop().remove_signal_handler(signal.SIGINT) @@ -630,6 +651,8 @@ def sigint_handler() -> None: async def _run_crawler(self) -> None: event_manager = service_locator.get_event_manager() + self._crawler_state_rec_task.start() + # Collect the context managers to be entered. Context managers that are already active are excluded, # as they were likely entered by the caller, who will also be responsible for exiting them. contexts_to_enter = [ @@ -1395,7 +1418,49 @@ async def _find_txt_file_for_url(self, url: str) -> RobotsTxtFile: """ return await RobotsTxtFile.find(url, self._http_client) + def set_status_message( + self, message: str, level: Literal['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'] = 'DEBUG' + ) -> None: + """Set a status message for the crawler. + + Args: + message: The status message to log. + level: The logging level for the message. + """ + log_level = string_to_log_level(level) + self.log.log(log_level, message) + async def _crawler_state_task(self) -> None: """Emit a persist state event with the given migration status.""" event_manager = service_locator.get_event_manager() - event_manager.emit(event=Event.CRAWLER_STATUS, event_data=None) + + current_state = self.statistics.state + + if ( + failed_requests := ( + current_state.requests_failed - (self._previous_crawler_state or current_state).requests_failed + ) + > 0 + ): + message = f'Experiencing problems, {failed_requests} failed requests since last status update.' 
+ else: + request_manager = await self.get_request_manager() + total_count = await request_manager.get_total_count() + if total_count is not None and total_count > 0: + pages_info = f'{self._statistics.state.requests_finished}/{total_count}' + else: + pages_info = str(self._statistics.state.requests_finished) + + message = ( + f'Crawled {pages_info} pages, {self._statistics.state.requests_failed} failed requests, ' + f'desired concurrency {self._autoscaled_pool.desired_concurrency}.' + ) + + if self._status_message_callback: + self._status_message_callback(current_state, self._previous_crawler_state, message) + else: + self.set_status_message(message) + + event_manager.emit(event=Event.CRAWLER_STATUS, event_data=EventCrawlerStatusData(message=message)) + + self._previous_crawler_state = current_state diff --git a/src/crawlee/events/__init__.py b/src/crawlee/events/__init__.py index 1c2cda0173..2aa2beecfd 100644 --- a/src/crawlee/events/__init__.py +++ b/src/crawlee/events/__init__.py @@ -3,6 +3,7 @@ from ._types import ( Event, EventAbortingData, + EventCrawlerStatusData, EventData, EventExitData, EventListener, @@ -14,6 +15,7 @@ __all__ = [ 'Event', 'EventAbortingData', + 'EventCrawlerStatusData', 'EventData', 'EventExitData', 'EventListener', diff --git a/src/crawlee/events/_event_manager.py b/src/crawlee/events/_event_manager.py index b08727e4da..3723fc78fe 100644 --- a/src/crawlee/events/_event_manager.py +++ b/src/crawlee/events/_event_manager.py @@ -19,6 +19,7 @@ from crawlee.events._types import ( Event, EventAbortingData, + EventCrawlerStatusData, EventExitData, EventListener, EventMigratingData, @@ -147,6 +148,8 @@ def on(self, *, event: Literal[Event.ABORTING], listener: EventListener[EventAbo @overload def on(self, *, event: Literal[Event.EXIT], listener: EventListener[EventExitData]) -> None: ... @overload + def on(self, *, event: Literal[Event.CRAWLER_STATUS], listener: EventListener[EventCrawlerStatusData]) -> None: ... + @overload def on(self, *, event: Event, listener: EventListener[None]) -> None: ... def on(self, *, event: Event, listener: EventListener[Any]) -> None: @@ -222,6 +225,8 @@ def emit(self, *, event: Literal[Event.ABORTING], event_data: EventAbortingData) @overload def emit(self, *, event: Literal[Event.EXIT], event_data: EventExitData) -> None: ... @overload + def emit(self, *, event: Literal[Event.CRAWLER_STATUS], event_data: EventCrawlerStatusData) -> None: ... + @overload def emit(self, *, event: Event, event_data: Any) -> None: ... 
@ensure_context diff --git a/src/crawlee/events/_types.py b/src/crawlee/events/_types.py index bda93d2b73..c5afeb68cd 100644 --- a/src/crawlee/events/_types.py +++ b/src/crawlee/events/_types.py @@ -82,7 +82,23 @@ class EventExitData(BaseModel): model_config = ConfigDict(populate_by_name=True) -EventData = Union[EventPersistStateData, EventSystemInfoData, EventMigratingData, EventAbortingData, EventExitData] +@docs_group('Event payloads') +class EventCrawlerStatusData(BaseModel): + """Data for the crawler status event.""" + + model_config = ConfigDict(populate_by_name=True) + + message: str + + +EventData = Union[ + EventPersistStateData, + EventSystemInfoData, + EventMigratingData, + EventAbortingData, + EventExitData, + EventCrawlerStatusData, +] """A helper type for all possible event payloads""" WrappedListener = Callable[..., Coroutine[Any, Any, None]] diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py index 4f151ad621..c40b1ef53f 100644 --- a/tests/unit/crawlers/_basic/test_basic_crawler.py +++ b/tests/unit/crawlers/_basic/test_basic_crawler.py @@ -23,6 +23,7 @@ from crawlee.configuration import Configuration from crawlee.crawlers import BasicCrawler from crawlee.errors import RequestCollisionError, SessionError, UserDefinedErrorHandlerError +from crawlee.events import Event, EventCrawlerStatusData from crawlee.events._local_event_manager import LocalEventManager from crawlee.request_loaders import RequestList, RequestManagerTandem from crawlee.sessions import Session, SessionPool @@ -36,6 +37,7 @@ from yarl import URL from crawlee._types import JsonSerializable + from crawlee.statistics import StatisticsState from crawlee.storage_clients._memory import DatasetClient @@ -1345,3 +1347,58 @@ async def handler(context: BasicCrawlingContext) -> None: break else: raise AssertionError('Expected log message about request handler error was not found.') + + +async def test_status_message_callback() -> None: + """Test that status message callback is called with the correct message.""" + status_message_callback = AsyncMock() + states: list[dict[str, StatisticsState | None]] = [] + + def status_callback(state: StatisticsState, previous_state: StatisticsState | None, message: str) -> None: + status_message_callback(message) + states.append({'state': state, 'previous_state': previous_state}) + + crawler = BasicCrawler( + status_message_callback=status_callback, status_message_logging_interval=timedelta(seconds=0.01) + ) + + @crawler.router.default_handler + async def handler(context: BasicCrawlingContext) -> None: + await asyncio.sleep(0.1) # Simulate some processing time + + await crawler.run(['http://a.com/']) + + assert status_message_callback.called + + assert len(states) > 1 + + first_call = states[0] + second_call = states[1] + + # For the first call, `previous_state` is None + assert first_call['state'] is not None + assert first_call['previous_state'] is None + + # For second call, `previous_state` is the first state + assert second_call['state'] is not None + assert second_call['previous_state'] is not None + assert second_call['previous_state'] == first_call['state'] + + +async def test_status_message_emit() -> None: + event_manager = service_locator.get_event_manager() + + status_message_listener = Mock() + + def listener(event_data: EventCrawlerStatusData) -> None: + status_message_listener(event_data) + + event_manager.on(event=Event.CRAWLER_STATUS, listener=listener) + + crawler = BasicCrawler(request_handler=AsyncMock()) 
+ + await crawler.run(['http://a.com/']) + + event_manager.off(event=Event.CRAWLER_STATUS, listener=listener) + + assert status_message_listener.called From edb7cb814fbda5325d2cb7cedaf811dfb08983de Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Mon, 23 Jun 2025 23:52:59 +0000 Subject: [PATCH 03/10] up log level --- src/crawlee/crawlers/_basic/_basic_crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 1ff578238f..355b19e8b8 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -1459,7 +1459,7 @@ async def _crawler_state_task(self) -> None: if self._status_message_callback: self._status_message_callback(current_state, self._previous_crawler_state, message) else: - self.set_status_message(message) + self.set_status_message(message, level='INFO') event_manager.emit(event=Event.CRAWLER_STATUS, event_data=EventCrawlerStatusData(message=message)) From c2680a0fbd2311d4058f4f0eaf6983c49c8a60ae Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Tue, 24 Jun 2025 00:30:00 +0000 Subject: [PATCH 04/10] add `crawler` parameter in `status_message_callback` --- src/crawlee/crawlers/_basic/_basic_crawler.py | 16 ++++++++++------ tests/unit/crawlers/_basic/test_basic_crawler.py | 4 +++- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 40e4593736..a36f8175c5 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -194,9 +194,11 @@ class _BasicCrawlerOptions(TypedDict): status_message_logging_interval: NotRequired[timedelta] """Interval for logging the crawler status messages.""" - status_message_callback: NotRequired[Callable[[StatisticsState, StatisticsState | None, str], None]] - """Allows overriding the default status message. The callback needs to call `crawler.setStatusMessage()` explicitly. - The default status message is provided in the parameters.""" + status_message_callback: NotRequired[ + Callable[[BasicCrawler[TCrawlingContext, TStatisticsState], StatisticsState, StatisticsState | None, str], None] + ] + """Allows overriding the default status message. The callback needs to call `crawler.set_status_message()` + explicitly. The default status message is provided in the parameters.""" class _BasicCrawlerOptionsGeneric(Generic[TCrawlingContext, TStatisticsState], TypedDict): @@ -281,7 +283,10 @@ def __init__( statistics_log_format: Literal['table', 'inline'] = 'table', respect_robots_txt_file: bool = False, status_message_logging_interval: timedelta = timedelta(seconds=10), - status_message_callback: Callable[[StatisticsState, StatisticsState | None, str], None] | None = None, + status_message_callback: Callable[ + [BasicCrawler[TCrawlingContext, TStatisticsState], StatisticsState, StatisticsState | None, str], None + ] + | None = None, _context_pipeline: ContextPipeline[TCrawlingContext] | None = None, _additional_context_managers: Sequence[AbstractAsyncContextManager] | None = None, _logger: logging.Logger | None = None, @@ -309,7 +314,6 @@ def __init__( `max_requests_per_crawl` is achieved. max_session_rotations: Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs or if the website blocks the request. - The session rotations are not counted towards the `max_request_retries` limit. 
max_crawl_depth: Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond this depth. The crawl depth starts at 0 for initial requests and increases with each subsequent level @@ -1588,7 +1592,7 @@ async def _crawler_state_task(self) -> None: ) if self._status_message_callback: - self._status_message_callback(current_state, self._previous_crawler_state, message) + self._status_message_callback(self, current_state, self._previous_crawler_state, message) else: self.set_status_message(message, level='INFO') diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py index ce99b7a0c3..38f03cf65b 100644 --- a/tests/unit/crawlers/_basic/test_basic_crawler.py +++ b/tests/unit/crawlers/_basic/test_basic_crawler.py @@ -1426,7 +1426,9 @@ async def test_status_message_callback() -> None: status_message_callback = AsyncMock() states: list[dict[str, StatisticsState | None]] = [] - def status_callback(state: StatisticsState, previous_state: StatisticsState | None, message: str) -> None: + def status_callback( + crawler: BasicCrawler, state: StatisticsState, previous_state: StatisticsState | None, message: str + ) -> None: status_message_callback(message) states.append({'state': state, 'previous_state': previous_state}) From cb89c8ee75bd90ec1b5154d061be55d93724b375 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Tue, 24 Jun 2025 18:06:58 +0000 Subject: [PATCH 05/10] update signature --- src/crawlee/_log_config.py | 8 +++-- src/crawlee/_types.py | 2 ++ src/crawlee/configuration.py | 5 +-- src/crawlee/crawlers/_basic/_basic_crawler.py | 31 +++++++++---------- src/crawlee/events/_types.py | 2 ++ .../crawlers/_basic/test_basic_crawler.py | 7 +++-- 6 files changed, 32 insertions(+), 23 deletions(-) diff --git a/src/crawlee/_log_config.py b/src/crawlee/_log_config.py index 093e876554..5fc9e94b8a 100644 --- a/src/crawlee/_log_config.py +++ b/src/crawlee/_log_config.py @@ -4,15 +4,19 @@ import logging import sys import textwrap -from typing import Any, Literal +from typing import TYPE_CHECKING, Any from colorama import Fore, Style, just_fix_windows_console from typing_extensions import assert_never from crawlee import service_locator +if TYPE_CHECKING: + from crawlee._types import LogLevel + just_fix_windows_console() + _LOG_NAME_COLOR = Fore.LIGHTBLACK_EX _LOG_LEVEL_COLOR = { @@ -34,7 +38,7 @@ _LOG_MESSAGE_INDENT = ' ' * 6 -def string_to_log_level(level: Literal['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']) -> int: +def string_to_log_level(level: LogLevel) -> int: """Convert a string representation of a log level to an integer log level.""" if level == 'DEBUG': return logging.DEBUG diff --git a/src/crawlee/_types.py b/src/crawlee/_types.py index 3cb84111fe..72a108cb6c 100644 --- a/src/crawlee/_types.py +++ b/src/crawlee/_types.py @@ -52,6 +52,8 @@ SkippedReason: TypeAlias = Literal['robots_txt'] +LogLevel: TypeAlias = Literal['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'] + def _normalize_headers(headers: Mapping[str, str]) -> dict[str, str]: """Convert all header keys to lowercase, strips whitespace, and returns them sorted by key.""" diff --git a/src/crawlee/configuration.py b/src/crawlee/configuration.py index de22118816..2158fe4d0a 100644 --- a/src/crawlee/configuration.py +++ b/src/crawlee/configuration.py @@ -1,11 +1,12 @@ from __future__ import annotations from datetime import timedelta -from typing import TYPE_CHECKING, Annotated, Literal +from typing import TYPE_CHECKING, Annotated from pydantic import 
AliasChoices, BeforeValidator, Field from pydantic_settings import BaseSettings, SettingsConfigDict +from crawlee._types import LogLevel from crawlee._utils.docs import docs_group from crawlee._utils.models import timedelta_ms @@ -62,7 +63,7 @@ class Configuration(BaseSettings): https://playwright.dev/docs/api/class-browsertype#browser-type-launch.""" log_level: Annotated[ - Literal['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], + LogLevel, Field( validation_alias=AliasChoices( 'apify_log_level', diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index a36f8175c5..7d9f2fc08c 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -33,6 +33,7 @@ GetKeyValueStoreFromRequestHandlerFunction, HttpHeaders, HttpPayload, + LogLevel, RequestHandlerRunResult, SendRequestFunction, SkippedReason, @@ -195,10 +196,10 @@ class _BasicCrawlerOptions(TypedDict): """Interval for logging the crawler status messages.""" status_message_callback: NotRequired[ - Callable[[BasicCrawler[TCrawlingContext, TStatisticsState], StatisticsState, StatisticsState | None, str], None] + Callable[[StatisticsState, StatisticsState | None, str], Awaitable[str | None]] ] - """Allows overriding the default status message. The callback needs to call `crawler.set_status_message()` - explicitly. The default status message is provided in the parameters.""" + """Allows overriding the default status message. The default status message is provided in the parameters. + Returning `None` suppresses the status message.""" class _BasicCrawlerOptionsGeneric(Generic[TCrawlingContext, TStatisticsState], TypedDict): @@ -283,9 +284,7 @@ def __init__( statistics_log_format: Literal['table', 'inline'] = 'table', respect_robots_txt_file: bool = False, status_message_logging_interval: timedelta = timedelta(seconds=10), - status_message_callback: Callable[ - [BasicCrawler[TCrawlingContext, TStatisticsState], StatisticsState, StatisticsState | None, str], None - ] + status_message_callback: Callable[[StatisticsState, StatisticsState | None, str], Awaitable[str | None]] | None = None, _context_pipeline: ContextPipeline[TCrawlingContext] | None = None, _additional_context_managers: Sequence[AbstractAsyncContextManager] | None = None, @@ -305,7 +304,6 @@ def __init__( max_request_retries: Specifies the maximum number of retries allowed for a request if its processing fails. This includes retries due to navigation errors or errors thrown from user-supplied functions (`request_handler`, `pre_navigation_hooks` etc.). - This limit does not apply to retries triggered by session rotation (see `max_session_rotations`). max_requests_per_crawl: Maximum number of pages to open during a crawl. The crawl stops upon reaching this limit. Setting this value can help avoid infinite loops in misconfigured crawlers. `None` means @@ -338,10 +336,8 @@ def __init__( for each domain, and skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction` status_message_logging_interval: Interval for logging the crawler status messages - status_message_callback: A callback function for customizing crawler status messages. When provided, - this function will be called instead of the default status message logging. The function receives - the current statistics state, the previous state (if available), and the default status message - as parameters. + status_message_callback: Allows overriding the default status message. 
The default status message is + provided in the parameters. Returning `None` suppresses the status message. _context_pipeline: Enables extending the request lifecycle and modifying the crawling context. Intended for use by subclasses rather than direct instantiation of `BasicCrawler`. _additional_context_managers: Additional context managers used throughout the crawler lifecycle. @@ -1553,9 +1549,7 @@ async def _find_txt_file_for_url(self, url: str) -> RobotsTxtFile: """ return await RobotsTxtFile.find(url, self._http_client) - def set_status_message( - self, message: str, level: Literal['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'] = 'DEBUG' - ) -> None: + def set_status_message(self, message: str, level: LogLevel = 'DEBUG') -> None: """Set a status message for the crawler. Args: @@ -1592,10 +1586,15 @@ async def _crawler_state_task(self) -> None: ) if self._status_message_callback: - self._status_message_callback(self, current_state, self._previous_crawler_state, message) + new_message = await self._status_message_callback(current_state, self._previous_crawler_state, message) + if new_message: + message = new_message + self.set_status_message(message, level='INFO') else: self.set_status_message(message, level='INFO') - event_manager.emit(event=Event.CRAWLER_STATUS, event_data=EventCrawlerStatusData(message=message)) + event_manager.emit( + event=Event.CRAWLER_STATUS, event_data=EventCrawlerStatusData(message=message, crawler_id=id(self)) + ) self._previous_crawler_state = current_state diff --git a/src/crawlee/events/_types.py b/src/crawlee/events/_types.py index c5afeb68cd..0634b02794 100644 --- a/src/crawlee/events/_types.py +++ b/src/crawlee/events/_types.py @@ -90,6 +90,8 @@ class EventCrawlerStatusData(BaseModel): message: str + crawler_id: int + EventData = Union[ EventPersistStateData, diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py index 38f03cf65b..2c722e8e16 100644 --- a/tests/unit/crawlers/_basic/test_basic_crawler.py +++ b/tests/unit/crawlers/_basic/test_basic_crawler.py @@ -1426,11 +1426,12 @@ async def test_status_message_callback() -> None: status_message_callback = AsyncMock() states: list[dict[str, StatisticsState | None]] = [] - def status_callback( - crawler: BasicCrawler, state: StatisticsState, previous_state: StatisticsState | None, message: str - ) -> None: + async def status_callback( + state: StatisticsState, previous_state: StatisticsState | None, message: str + ) -> str | None: status_message_callback(message) states.append({'state': state, 'previous_state': previous_state}) + return message crawler = BasicCrawler( status_message_callback=status_callback, status_message_logging_interval=timedelta(seconds=0.01) From 3303d680b4741d6095302930e72bb903bdabc6ab Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Wed, 25 Jun 2025 12:31:39 +0000 Subject: [PATCH 06/10] `set_status_message` to `_log_status_message` --- src/crawlee/crawlers/_basic/_basic_crawler.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 7d9f2fc08c..567012e3d3 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -1549,8 +1549,8 @@ async def _find_txt_file_for_url(self, url: str) -> RobotsTxtFile: """ return await RobotsTxtFile.find(url, self._http_client) - def set_status_message(self, message: str, level: LogLevel = 'DEBUG') -> None: - """Set a 
status message for the crawler. + def _log_status_message(self, message: str, level: LogLevel = 'DEBUG') -> None: + """Log a status message for the crawler. Args: message: The status message to log. @@ -1589,9 +1589,9 @@ async def _crawler_state_task(self) -> None: new_message = await self._status_message_callback(current_state, self._previous_crawler_state, message) if new_message: message = new_message - self.set_status_message(message, level='INFO') + self._log_status_message(message, level='INFO') else: - self.set_status_message(message, level='INFO') + self._log_status_message(message, level='INFO') event_manager.emit( event=Event.CRAWLER_STATUS, event_data=EventCrawlerStatusData(message=message, crawler_id=id(self)) From 2af8a4b4587a864a94a7f8f786ca2fefcffa0c80 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Fri, 27 Jun 2025 20:00:55 +0000 Subject: [PATCH 07/10] add docs for `EventCrawlerStatusData` --- src/crawlee/events/_types.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/crawlee/events/_types.py b/src/crawlee/events/_types.py index 0634b02794..23834171f0 100644 --- a/src/crawlee/events/_types.py +++ b/src/crawlee/events/_types.py @@ -89,8 +89,10 @@ class EventCrawlerStatusData(BaseModel): model_config = ConfigDict(populate_by_name=True) message: str + """A message describing the current status of the crawler.""" crawler_id: int + """The ID of the crawler that emitted the event.""" EventData = Union[ From 09a73a6a661e9f650cc78bbee22dbb082ede27e3 Mon Sep 17 00:00:00 2001 From: Max Bohomolov <34358312+Mantisus@users.noreply.github.com> Date: Wed, 2 Jul 2025 11:12:16 +0300 Subject: [PATCH 08/10] Update src/crawlee/crawlers/_basic/_basic_crawler.py Co-authored-by: Vlada Dusek --- src/crawlee/crawlers/_basic/_basic_crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 567012e3d3..130a304eb1 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -335,7 +335,7 @@ def __init__( respect_robots_txt_file: If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain, and skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction` - status_message_logging_interval: Interval for logging the crawler status messages + status_message_logging_interval: Interval for logging the crawler status messages. status_message_callback: Allows overriding the default status message. The default status message is provided in the parameters. Returning `None` suppresses the status message. _context_pipeline: Enables extending the request lifecycle and modifying the crawling context. 
From c7d395426d0a2bcb00d94320fb30af51ddd3afaf Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Wed, 2 Jul 2025 11:47:02 +0000 Subject: [PATCH 09/10] change abcde.com links to placeholders links --- .../crawlers/_basic/test_basic_crawler.py | 165 ++++++++++-------- .../unit/request_loaders/test_request_list.py | 10 +- .../storages/test_request_manager_tandem.py | 24 +-- 3 files changed, 108 insertions(+), 91 deletions(-) diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py index 4f7d612827..8a151f3312 100644 --- a/tests/unit/crawlers/_basic/test_basic_crawler.py +++ b/tests/unit/crawlers/_basic/test_basic_crawler.py @@ -42,7 +42,7 @@ async def test_processes_requests_from_explicit_queue() -> None: queue = await RequestQueue.open() - await queue.add_requests(['http://a.com/', 'http://b.com/', 'http://c.com/']) + await queue.add_requests(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) crawler = BasicCrawler(request_manager=queue) calls = list[str]() @@ -53,14 +53,14 @@ async def handler(context: BasicCrawlingContext) -> None: await crawler.run() - assert calls == ['http://a.com/', 'http://b.com/', 'http://c.com/'] + assert calls == ['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev'] async def test_processes_requests_from_request_source_tandem() -> None: request_queue = await RequestQueue.open() - await request_queue.add_requests(['http://a.com/', 'http://b.com/', 'http://c.com/']) + await request_queue.add_requests(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) - request_list = RequestList(['http://a.com/', 'http://d.com', 'http://e.com']) + request_list = RequestList(['https://placeholder.com', 'https://placeholder.gov', 'https://placeholder.biz']) crawler = BasicCrawler(request_manager=RequestManagerTandem(request_list, request_queue)) calls = set[str]() @@ -71,7 +71,13 @@ async def handler(context: BasicCrawlingContext) -> None: await crawler.run() - assert calls == {'http://a.com/', 'http://b.com/', 'http://c.com/', 'http://d.com', 'http://e.com'} + assert calls == { + 'https://placeholder.com', + 'https://placeholder.io', + 'https://placeholder.dev', + 'https://placeholder.gov', + 'https://placeholder.biz', + } async def test_processes_requests_from_run_args() -> None: @@ -82,9 +88,9 @@ async def test_processes_requests_from_run_args() -> None: async def handler(context: BasicCrawlingContext) -> None: calls.append(context.request.url) - await crawler.run(['http://a.com/', 'http://b.com/', 'http://c.com/']) + await crawler.run(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) - assert calls == ['http://a.com/', 'http://b.com/', 'http://c.com/'] + assert calls == ['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev'] async def test_allows_multiple_run_calls() -> None: @@ -95,16 +101,16 @@ async def test_allows_multiple_run_calls() -> None: async def handler(context: BasicCrawlingContext) -> None: calls.append(context.request.url) - await crawler.run(['http://a.com/', 'http://b.com/', 'http://c.com/']) - await crawler.run(['http://a.com/', 'http://b.com/', 'http://c.com/']) + await crawler.run(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) + await crawler.run(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) assert calls == [ - 'http://a.com/', - 'http://b.com/', - 'http://c.com/', - 'http://a.com/', - 
'http://b.com/', - 'http://c.com/', + 'https://placeholder.com', + 'https://placeholder.io', + 'https://placeholder.dev', + 'https://placeholder.com', + 'https://placeholder.io', + 'https://placeholder.dev', ] @@ -116,17 +122,17 @@ async def test_retries_failed_requests() -> None: async def handler(context: BasicCrawlingContext) -> None: calls.append(context.request.url) - if context.request.url == 'http://b.com/': + if context.request.url == 'https://placeholder.io': raise RuntimeError('Arbitrary crash for testing purposes') - await crawler.run(['http://a.com/', 'http://b.com/', 'http://c.com/']) + await crawler.run(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) assert calls == [ - 'http://a.com/', - 'http://b.com/', - 'http://c.com/', - 'http://b.com/', - 'http://b.com/', + 'https://placeholder.com', + 'https://placeholder.io', + 'https://placeholder.dev', + 'https://placeholder.io', + 'https://placeholder.io', ] @@ -139,16 +145,22 @@ async def handler(context: BasicCrawlingContext) -> None: calls.append(context.request.url) raise RuntimeError('Arbitrary crash for testing purposes') - await crawler.run(['http://a.com/', 'http://b.com/', Request.from_url(url='http://c.com/', no_retry=True)]) + await crawler.run( + [ + 'https://placeholder.com', + 'https://placeholder.io', + Request.from_url(url='https://placeholder.dev', no_retry=True), + ] + ) assert calls == [ - 'http://a.com/', - 'http://b.com/', - 'http://c.com/', - 'http://a.com/', - 'http://b.com/', - 'http://a.com/', - 'http://b.com/', + 'https://placeholder.com', + 'https://placeholder.io', + 'https://placeholder.dev', + 'https://placeholder.com', + 'https://placeholder.io', + 'https://placeholder.com', + 'https://placeholder.io', ] @@ -163,19 +175,19 @@ async def handler(context: BasicCrawlingContext) -> None: await crawler.run( [ - 'http://a.com/', - 'http://b.com/', - Request.from_url(url='http://c.com/', user_data={'__crawlee': {'maxRetries': 4}}), + 'https://placeholder.com', + 'https://placeholder.io', + Request.from_url(url='https://placeholder.dev', user_data={'__crawlee': {'maxRetries': 4}}), ] ) assert calls == [ - 'http://a.com/', - 'http://b.com/', - 'http://c.com/', - 'http://c.com/', - 'http://c.com/', - 'http://c.com/', + 'https://placeholder.com', + 'https://placeholder.io', + 'https://placeholder.dev', + 'https://placeholder.dev', + 'https://placeholder.dev', + 'https://placeholder.dev', ] @@ -194,7 +206,7 @@ class Call: @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: - if context.request.url == 'http://b.com/': + if context.request.url == 'https://placeholder.io': raise RuntimeError('Arbitrary crash for testing purposes') @crawler.error_handler @@ -211,20 +223,20 @@ async def error_handler(context: BasicCrawlingContext, error: Exception) -> Requ request['headers'] = HttpHeaders({'custom_retry_count': str(custom_retry_count + 1)}) return Request.model_validate(request) - await crawler.run(['http://a.com/', 'http://b.com/', 'http://c.com/']) + await crawler.run(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) # Verify that the error handler was called twice assert len(calls) == 2 # Check the first call... first_call = calls[0] - assert first_call.url == 'http://b.com/' + assert first_call.url == 'https://placeholder.io' assert isinstance(first_call.error, RuntimeError) assert first_call.custom_retry_count == 0 # Check the second call... 
second_call = calls[1] - assert second_call.url == 'http://b.com/' + assert second_call.url == 'https://placeholder.io' assert isinstance(second_call.error, RuntimeError) assert second_call.custom_retry_count == 1 @@ -254,7 +266,7 @@ async def test_handles_error_in_error_handler() -> None: @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: - if context.request.url == 'http://b.com/': + if context.request.url == 'https://placeholder.io': raise RuntimeError('Arbitrary crash for testing purposes') @crawler.error_handler @@ -262,7 +274,7 @@ async def error_handler(context: BasicCrawlingContext, error: Exception) -> None raise RuntimeError('Crash in error handler') with pytest.raises(UserDefinedErrorHandlerError): - await crawler.run(['http://a.com/', 'http://b.com/', 'http://c.com/']) + await crawler.run(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) async def test_calls_failed_request_handler() -> None: @@ -271,17 +283,17 @@ async def test_calls_failed_request_handler() -> None: @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: - if context.request.url == 'http://b.com/': + if context.request.url == 'https://placeholder.io': raise RuntimeError('Arbitrary crash for testing purposes') @crawler.failed_request_handler async def failed_request_handler(context: BasicCrawlingContext, error: Exception) -> None: calls.append((context, error)) - await crawler.run(['http://a.com/', 'http://b.com/', 'http://c.com/']) + await crawler.run(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) assert len(calls) == 1 - assert calls[0][0].request.url == 'http://b.com/' + assert calls[0][0].request.url == 'https://placeholder.io' assert isinstance(calls[0][1], RuntimeError) @@ -290,7 +302,7 @@ async def test_handles_error_in_failed_request_handler() -> None: @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: - if context.request.url == 'http://b.com/': + if context.request.url == 'https://placeholder.io': raise RuntimeError('Arbitrary crash for testing purposes') @crawler.failed_request_handler @@ -298,7 +310,7 @@ async def failed_request_handler(context: BasicCrawlingContext, error: Exception raise RuntimeError('Crash in failed request handler') with pytest.raises(UserDefinedErrorHandlerError): - await crawler.run(['http://a.com/', 'http://b.com/', 'http://c.com/']) + await crawler.run(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) @pytest.mark.parametrize( @@ -320,7 +332,7 @@ async def handler(context: BasicCrawlingContext) -> None: response_data['body'] = json.loads(response.read()) response_data['headers'] = response.headers - await crawler.run(['http://a.com/', 'http://b.com/', 'http://c.com/']) + await crawler.run(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) response_body = response_data.get('body') assert response_body is not None @@ -365,15 +377,15 @@ class AddRequestsTestInput: # Basic use case pytest.param( AddRequestsTestInput( - start_url='https://a.com/', - loaded_url='https://a.com/', + start_url='https://placeholder.com', + loaded_url='https://placeholder.com', requests=[ - 'https://a.com/', - Request.from_url('http://b.com/'), - 'http://c.com/', + 'https://placeholder.com', + Request.from_url('https://placeholder.io'), + 'https://placeholder.dev', ], kwargs={}, - expected_urls=['http://b.com/', 'http://c.com/'], + 
expected_urls=['https://placeholder.io', 'https://placeholder.dev'], ), id='basic', ), @@ -669,7 +681,7 @@ async def handler(context: BasicCrawlingContext) -> None: await context.push_data({'b': 2}) raise RuntimeError('Watch me crash') - stats = await crawler.run(['https://a.com']) + stats = await crawler.run(['https://placeholder.com']) assert (await crawler.get_data()).items == [] assert stats.requests_total == 1 @@ -895,15 +907,15 @@ async def test_consecutive_runs_purge_request_queue() -> None: async def handler(context: BasicCrawlingContext) -> None: visit(context.request.url) - await crawler.run(['http://a.com', 'http://b.com', 'http://c.com']) - await crawler.run(['http://a.com', 'http://b.com', 'http://c.com']) - await crawler.run(['http://a.com', 'http://b.com', 'http://c.com']) + await crawler.run(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) + await crawler.run(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) + await crawler.run(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) counter = Counter(args[0][0] for args in visit.call_args_list) assert counter == { - 'http://a.com': 3, - 'http://b.com': 3, - 'http://c.com': 3, + 'https://placeholder.com': 3, + 'https://placeholder.io': 3, + 'https://placeholder.dev': 3, } @@ -1156,7 +1168,7 @@ async def handler(context: BasicCrawlingContext) -> None: # Timeout in pytest, because previous implementation would run crawler until following: # "The request queue seems to be stuck for 300.0s, resetting internal state." async with timeout(max_request_retries * double_handler_timeout_s): - await crawler.run(['http://a.com/']) + await crawler.run(['https://placeholder.com']) assert crawler.statistics.state.requests_finished == 1 assert mocked_handler_before_sleep.call_count == max_request_retries @@ -1177,7 +1189,7 @@ async def test_keep_alive( """Test that crawler can be kept alive without any requests and stopped with `crawler.stop()`. 
Crawler should stop if `max_requests_per_crawl` is reached regardless of the `keep_alive` flag.""" - additional_urls = ['http://a.com/', 'http://b.com/'] + additional_urls = ['https://placeholder.com', 'https://placeholder.io'] expected_handler_calls = [call(url) for url in additional_urls[:expected_handled_requests_count]] crawler = BasicCrawler( @@ -1226,9 +1238,9 @@ async def handler(context: BasicCrawlingContext) -> None: context.session.retire() if retire else None - await context.add_requests(['http://b.com/']) + await context.add_requests(['https://placeholder.io']) - await crawler.run(['http://a.com/']) + await crawler.run(['https://placeholder.com']) # The session should differ if `retire` was called and match otherwise since pool size == 1 if retire: @@ -1249,7 +1261,8 @@ async def handler(context: BasicCrawlingContext) -> None: used_sessions.append(context.session.id) requests = [ - Request.from_url('http://a.com/', session_id=check_session.id, always_enqueue=True) for _ in range(10) + Request.from_url('https://placeholder.com', session_id=check_session.id, always_enqueue=True) + for _ in range(10) ] await crawler.run(requests) @@ -1280,7 +1293,7 @@ async def handler(context: BasicCrawlingContext) -> None: used_sessions.append(context.session.id) requests = [ - Request.from_url('http://a.com/', session_id=str(session_id), use_extended_unique_key=True) + Request.from_url('https://placeholder.com', session_id=str(session_id), use_extended_unique_key=True) for session_id in range(10) ] @@ -1293,7 +1306,7 @@ async def handler(context: BasicCrawlingContext) -> None: async def test_error_bound_session_to_request() -> None: crawler = BasicCrawler(request_handler=AsyncMock()) - requests = [Request.from_url('http://a.com/', session_id='1', always_enqueue=True) for _ in range(10)] + requests = [Request.from_url('https://placeholder.com', session_id='1', always_enqueue=True) for _ in range(10)] stats = await crawler.run(requests) @@ -1311,7 +1324,7 @@ async def error_req_hook(context: BasicCrawlingContext, error: Exception) -> Non if isinstance(error, RequestCollisionError): await error_handler_mock(context, error) - requests = [Request.from_url('http://a.com/', session_id='1')] + requests = [Request.from_url('https://placeholder.com', session_id='1')] await crawler.run(requests) @@ -1330,7 +1343,7 @@ async def handler(context: BasicCrawlingContext) -> None: async def failed_request_handler(context: BasicCrawlingContext, error: Exception) -> None: handler_requests.add(context.request.url) - requests = ['http://a.com/', 'http://b.com/', 'http://c.com/'] + requests = ['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev'] await crawler.run(requests) @@ -1363,7 +1376,7 @@ async def handler(context: BasicCrawlingContext) -> None: # Capture all logs from the 'crawlee' logger at INFO level or higher with caplog.at_level(logging.INFO, logger='crawlee'): - await crawler.run([Request.from_url('http://a.com/')]) + await crawler.run([Request.from_url('https://placeholder.com')]) # Check for the timeout message in any of the logs found_timeout_message = False @@ -1398,7 +1411,7 @@ async def status_callback( async def handler(context: BasicCrawlingContext) -> None: await asyncio.sleep(0.1) # Simulate some processing time - await crawler.run(['http://a.com/']) + await crawler.run(['https://placeholder.com']) assert status_message_callback.called @@ -1429,7 +1442,7 @@ def listener(event_data: EventCrawlerStatusData) -> None: crawler = BasicCrawler(request_handler=AsyncMock()) - 
await crawler.run(['http://a.com/']) + await crawler.run(['https://placeholder.com']) event_manager.off(event=Event.CRAWLER_STATUS, listener=listener) diff --git a/tests/unit/request_loaders/test_request_list.py b/tests/unit/request_loaders/test_request_list.py index 5142b7719d..1f2345a6af 100644 --- a/tests/unit/request_loaders/test_request_list.py +++ b/tests/unit/request_loaders/test_request_list.py @@ -4,7 +4,7 @@ async def test_sync_traversal() -> None: - request_list = RequestList(['https://a.com', 'https://b.com', 'https://c.com']) + request_list = RequestList(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) while not await request_list.is_finished(): item = await request_list.fetch_next_request() @@ -17,9 +17,9 @@ async def test_sync_traversal() -> None: async def test_async_traversal() -> None: async def generator() -> AsyncGenerator[str]: - yield 'https://a.com' - yield 'https://b.com' - yield 'https://c.com' + yield 'https://placeholder.com' + yield 'https://placeholder.io' + yield 'https://placeholder.dev' request_list = RequestList(generator()) @@ -33,7 +33,7 @@ async def generator() -> AsyncGenerator[str]: async def test_is_empty_does_not_depend_on_fetch_next_request() -> None: - request_list = RequestList(['https://a.com', 'https://b.com', 'https://c.com']) + request_list = RequestList(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) item_1 = await request_list.fetch_next_request() assert item_1 is not None diff --git a/tests/unit/storages/test_request_manager_tandem.py b/tests/unit/storages/test_request_manager_tandem.py index 70240914ec..cb31f5e4e1 100644 --- a/tests/unit/storages/test_request_manager_tandem.py +++ b/tests/unit/storages/test_request_manager_tandem.py @@ -25,26 +25,30 @@ class TestInput: argvalues=[ pytest.param( TestInput( - request_loader_items=['http://a.com', 'http://b.com'], + request_loader_items=['https://placeholder.com', 'https://placeholder.io'], request_manager_items=[], - discovered_items=[Request.from_url('http://c.com')], + discovered_items=[Request.from_url('https://placeholder.dev')], expected_result={ - 'http://a.com', - 'http://b.com', - 'http://c.com', + 'https://placeholder.com', + 'https://placeholder.io', + 'https://placeholder.dev', }, ), id='basic_usage', ), pytest.param( TestInput( - request_loader_items=[Request.from_url('http://a.com'), None, Request.from_url('http://c.com')], - request_manager_items=['http://b.com', 'http://d.com'], + request_loader_items=[ + Request.from_url('https://placeholder.com'), + None, + Request.from_url('https://placeholder.dev'), + ], + request_manager_items=['https://placeholder.io', 'http://d.com'], discovered_items=[], expected_result={ - 'http://a.com', - 'http://b.com', - 'http://c.com', + 'https://placeholder.com', + 'https://placeholder.io', + 'https://placeholder.dev', 'http://d.com', }, ), From 65b46ccf5e4bbb5faabd135db7992e0532c93f48 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Wed, 2 Jul 2025 12:00:41 +0000 Subject: [PATCH 10/10] placeholders with subdomains --- .../session_management/multi_sessions_http.py | 2 +- .../crawlers/_basic/test_basic_crawler.py | 166 +++++++++--------- .../unit/request_loaders/test_request_list.py | 10 +- .../storages/test_request_manager_tandem.py | 22 +-- 4 files changed, 101 insertions(+), 99 deletions(-) diff --git a/docs/guides/code_examples/session_management/multi_sessions_http.py b/docs/guides/code_examples/session_management/multi_sessions_http.py index 74f1bafc4c..0bd4a88beb 100644 
--- a/docs/guides/code_examples/session_management/multi_sessions_http.py +++ b/docs/guides/code_examples/session_management/multi_sessions_http.py @@ -49,7 +49,7 @@ async def session_init(context: HttpCrawlingContext) -> None: if context.session: context.log.info(f'Init session {context.session.id}') next_request = Request.from_url( - 'https://placeholder.dev', session_id=context.session.id + 'https://a.placeholder.com', session_id=context.session.id ) next_requests.append(next_request) diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py index 8a151f3312..4e8a513118 100644 --- a/tests/unit/crawlers/_basic/test_basic_crawler.py +++ b/tests/unit/crawlers/_basic/test_basic_crawler.py @@ -42,7 +42,7 @@ async def test_processes_requests_from_explicit_queue() -> None: queue = await RequestQueue.open() - await queue.add_requests(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) + await queue.add_requests(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) crawler = BasicCrawler(request_manager=queue) calls = list[str]() @@ -53,14 +53,16 @@ async def handler(context: BasicCrawlingContext) -> None: await crawler.run() - assert calls == ['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev'] + assert calls == ['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com'] async def test_processes_requests_from_request_source_tandem() -> None: request_queue = await RequestQueue.open() - await request_queue.add_requests(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) + await request_queue.add_requests( + ['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com'] + ) - request_list = RequestList(['https://placeholder.com', 'https://placeholder.gov', 'https://placeholder.biz']) + request_list = RequestList(['https://a.placeholder.com', 'https://d.placeholder.com', 'https://e.placeholder.com']) crawler = BasicCrawler(request_manager=RequestManagerTandem(request_list, request_queue)) calls = set[str]() @@ -72,11 +74,11 @@ async def handler(context: BasicCrawlingContext) -> None: await crawler.run() assert calls == { - 'https://placeholder.com', - 'https://placeholder.io', - 'https://placeholder.dev', - 'https://placeholder.gov', - 'https://placeholder.biz', + 'https://a.placeholder.com', + 'https://b.placeholder.com', + 'https://c.placeholder.com', + 'https://d.placeholder.com', + 'https://e.placeholder.com', } @@ -88,9 +90,9 @@ async def test_processes_requests_from_run_args() -> None: async def handler(context: BasicCrawlingContext) -> None: calls.append(context.request.url) - await crawler.run(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) + await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) - assert calls == ['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev'] + assert calls == ['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com'] async def test_allows_multiple_run_calls() -> None: @@ -101,16 +103,16 @@ async def test_allows_multiple_run_calls() -> None: async def handler(context: BasicCrawlingContext) -> None: calls.append(context.request.url) - await crawler.run(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) - await crawler.run(['https://placeholder.com', 
'https://placeholder.io', 'https://placeholder.dev']) + await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) + await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) assert calls == [ - 'https://placeholder.com', - 'https://placeholder.io', - 'https://placeholder.dev', - 'https://placeholder.com', - 'https://placeholder.io', - 'https://placeholder.dev', + 'https://a.placeholder.com', + 'https://b.placeholder.com', + 'https://c.placeholder.com', + 'https://a.placeholder.com', + 'https://b.placeholder.com', + 'https://c.placeholder.com', ] @@ -122,17 +124,17 @@ async def test_retries_failed_requests() -> None: async def handler(context: BasicCrawlingContext) -> None: calls.append(context.request.url) - if context.request.url == 'https://placeholder.io': + if context.request.url == 'https://b.placeholder.com': raise RuntimeError('Arbitrary crash for testing purposes') - await crawler.run(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) + await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) assert calls == [ - 'https://placeholder.com', - 'https://placeholder.io', - 'https://placeholder.dev', - 'https://placeholder.io', - 'https://placeholder.io', + 'https://a.placeholder.com', + 'https://b.placeholder.com', + 'https://c.placeholder.com', + 'https://b.placeholder.com', + 'https://b.placeholder.com', ] @@ -147,20 +149,20 @@ async def handler(context: BasicCrawlingContext) -> None: await crawler.run( [ - 'https://placeholder.com', - 'https://placeholder.io', - Request.from_url(url='https://placeholder.dev', no_retry=True), + 'https://a.placeholder.com', + 'https://b.placeholder.com', + Request.from_url(url='https://c.placeholder.com', no_retry=True), ] ) assert calls == [ - 'https://placeholder.com', - 'https://placeholder.io', - 'https://placeholder.dev', - 'https://placeholder.com', - 'https://placeholder.io', - 'https://placeholder.com', - 'https://placeholder.io', + 'https://a.placeholder.com', + 'https://b.placeholder.com', + 'https://c.placeholder.com', + 'https://a.placeholder.com', + 'https://b.placeholder.com', + 'https://a.placeholder.com', + 'https://b.placeholder.com', ] @@ -175,19 +177,19 @@ async def handler(context: BasicCrawlingContext) -> None: await crawler.run( [ - 'https://placeholder.com', - 'https://placeholder.io', - Request.from_url(url='https://placeholder.dev', user_data={'__crawlee': {'maxRetries': 4}}), + 'https://a.placeholder.com', + 'https://b.placeholder.com', + Request.from_url(url='https://c.placeholder.com', user_data={'__crawlee': {'maxRetries': 4}}), ] ) assert calls == [ - 'https://placeholder.com', - 'https://placeholder.io', - 'https://placeholder.dev', - 'https://placeholder.dev', - 'https://placeholder.dev', - 'https://placeholder.dev', + 'https://a.placeholder.com', + 'https://b.placeholder.com', + 'https://c.placeholder.com', + 'https://c.placeholder.com', + 'https://c.placeholder.com', + 'https://c.placeholder.com', ] @@ -206,7 +208,7 @@ class Call: @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: - if context.request.url == 'https://placeholder.io': + if context.request.url == 'https://b.placeholder.com': raise RuntimeError('Arbitrary crash for testing purposes') @crawler.error_handler @@ -223,20 +225,20 @@ async def error_handler(context: BasicCrawlingContext, error: Exception) -> Requ request['headers'] = 
HttpHeaders({'custom_retry_count': str(custom_retry_count + 1)}) return Request.model_validate(request) - await crawler.run(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) + await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) # Verify that the error handler was called twice assert len(calls) == 2 # Check the first call... first_call = calls[0] - assert first_call.url == 'https://placeholder.io' + assert first_call.url == 'https://b.placeholder.com' assert isinstance(first_call.error, RuntimeError) assert first_call.custom_retry_count == 0 # Check the second call... second_call = calls[1] - assert second_call.url == 'https://placeholder.io' + assert second_call.url == 'https://b.placeholder.com' assert isinstance(second_call.error, RuntimeError) assert second_call.custom_retry_count == 1 @@ -266,7 +268,7 @@ async def test_handles_error_in_error_handler() -> None: @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: - if context.request.url == 'https://placeholder.io': + if context.request.url == 'https://b.placeholder.com': raise RuntimeError('Arbitrary crash for testing purposes') @crawler.error_handler @@ -274,7 +276,7 @@ async def error_handler(context: BasicCrawlingContext, error: Exception) -> None raise RuntimeError('Crash in error handler') with pytest.raises(UserDefinedErrorHandlerError): - await crawler.run(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) + await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) async def test_calls_failed_request_handler() -> None: @@ -283,17 +285,17 @@ async def test_calls_failed_request_handler() -> None: @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: - if context.request.url == 'https://placeholder.io': + if context.request.url == 'https://b.placeholder.com': raise RuntimeError('Arbitrary crash for testing purposes') @crawler.failed_request_handler async def failed_request_handler(context: BasicCrawlingContext, error: Exception) -> None: calls.append((context, error)) - await crawler.run(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) + await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) assert len(calls) == 1 - assert calls[0][0].request.url == 'https://placeholder.io' + assert calls[0][0].request.url == 'https://b.placeholder.com' assert isinstance(calls[0][1], RuntimeError) @@ -302,7 +304,7 @@ async def test_handles_error_in_failed_request_handler() -> None: @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: - if context.request.url == 'https://placeholder.io': + if context.request.url == 'https://b.placeholder.com': raise RuntimeError('Arbitrary crash for testing purposes') @crawler.failed_request_handler @@ -310,7 +312,7 @@ async def failed_request_handler(context: BasicCrawlingContext, error: Exception raise RuntimeError('Crash in failed request handler') with pytest.raises(UserDefinedErrorHandlerError): - await crawler.run(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) + await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) @pytest.mark.parametrize( @@ -332,7 +334,7 @@ async def handler(context: BasicCrawlingContext) -> None: response_data['body'] = json.loads(response.read()) 
response_data['headers'] = response.headers - await crawler.run(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) + await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) response_body = response_data.get('body') assert response_body is not None @@ -377,15 +379,15 @@ class AddRequestsTestInput: # Basic use case pytest.param( AddRequestsTestInput( - start_url='https://placeholder.com', - loaded_url='https://placeholder.com', + start_url='https://a.placeholder.com', + loaded_url='https://a.placeholder.com', requests=[ - 'https://placeholder.com', - Request.from_url('https://placeholder.io'), - 'https://placeholder.dev', + 'https://a.placeholder.com', + Request.from_url('https://b.placeholder.com'), + 'https://c.placeholder.com', ], kwargs={}, - expected_urls=['https://placeholder.io', 'https://placeholder.dev'], + expected_urls=['https://b.placeholder.com', 'https://c.placeholder.com'], ), id='basic', ), @@ -681,7 +683,7 @@ async def handler(context: BasicCrawlingContext) -> None: await context.push_data({'b': 2}) raise RuntimeError('Watch me crash') - stats = await crawler.run(['https://placeholder.com']) + stats = await crawler.run(['https://a.placeholder.com']) assert (await crawler.get_data()).items == [] assert stats.requests_total == 1 @@ -907,15 +909,15 @@ async def test_consecutive_runs_purge_request_queue() -> None: async def handler(context: BasicCrawlingContext) -> None: visit(context.request.url) - await crawler.run(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) - await crawler.run(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) - await crawler.run(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) + await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) + await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) + await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) counter = Counter(args[0][0] for args in visit.call_args_list) assert counter == { - 'https://placeholder.com': 3, - 'https://placeholder.io': 3, - 'https://placeholder.dev': 3, + 'https://a.placeholder.com': 3, + 'https://b.placeholder.com': 3, + 'https://c.placeholder.com': 3, } @@ -1168,7 +1170,7 @@ async def handler(context: BasicCrawlingContext) -> None: # Timeout in pytest, because previous implementation would run crawler until following: # "The request queue seems to be stuck for 300.0s, resetting internal state." async with timeout(max_request_retries * double_handler_timeout_s): - await crawler.run(['https://placeholder.com']) + await crawler.run(['https://a.placeholder.com']) assert crawler.statistics.state.requests_finished == 1 assert mocked_handler_before_sleep.call_count == max_request_retries @@ -1189,7 +1191,7 @@ async def test_keep_alive( """Test that crawler can be kept alive without any requests and stopped with `crawler.stop()`. 
Crawler should stop if `max_requests_per_crawl` is reached regardless of the `keep_alive` flag.""" - additional_urls = ['https://placeholder.com', 'https://placeholder.io'] + additional_urls = ['https://a.placeholder.com', 'https://b.placeholder.com'] expected_handler_calls = [call(url) for url in additional_urls[:expected_handled_requests_count]] crawler = BasicCrawler( @@ -1238,9 +1240,9 @@ async def handler(context: BasicCrawlingContext) -> None: context.session.retire() if retire else None - await context.add_requests(['https://placeholder.io']) + await context.add_requests(['https://b.placeholder.com']) - await crawler.run(['https://placeholder.com']) + await crawler.run(['https://a.placeholder.com']) # The session should differ if `retire` was called and match otherwise since pool size == 1 if retire: @@ -1261,7 +1263,7 @@ async def handler(context: BasicCrawlingContext) -> None: used_sessions.append(context.session.id) requests = [ - Request.from_url('https://placeholder.com', session_id=check_session.id, always_enqueue=True) + Request.from_url('https://a.placeholder.com', session_id=check_session.id, always_enqueue=True) for _ in range(10) ] @@ -1293,7 +1295,7 @@ async def handler(context: BasicCrawlingContext) -> None: used_sessions.append(context.session.id) requests = [ - Request.from_url('https://placeholder.com', session_id=str(session_id), use_extended_unique_key=True) + Request.from_url('https://a.placeholder.com', session_id=str(session_id), use_extended_unique_key=True) for session_id in range(10) ] @@ -1306,7 +1308,7 @@ async def handler(context: BasicCrawlingContext) -> None: async def test_error_bound_session_to_request() -> None: crawler = BasicCrawler(request_handler=AsyncMock()) - requests = [Request.from_url('https://placeholder.com', session_id='1', always_enqueue=True) for _ in range(10)] + requests = [Request.from_url('https://a.placeholder.com', session_id='1', always_enqueue=True) for _ in range(10)] stats = await crawler.run(requests) @@ -1324,7 +1326,7 @@ async def error_req_hook(context: BasicCrawlingContext, error: Exception) -> Non if isinstance(error, RequestCollisionError): await error_handler_mock(context, error) - requests = [Request.from_url('https://placeholder.com', session_id='1')] + requests = [Request.from_url('https://a.placeholder.com', session_id='1')] await crawler.run(requests) @@ -1343,7 +1345,7 @@ async def handler(context: BasicCrawlingContext) -> None: async def failed_request_handler(context: BasicCrawlingContext, error: Exception) -> None: handler_requests.add(context.request.url) - requests = ['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev'] + requests = ['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com'] await crawler.run(requests) @@ -1376,7 +1378,7 @@ async def handler(context: BasicCrawlingContext) -> None: # Capture all logs from the 'crawlee' logger at INFO level or higher with caplog.at_level(logging.INFO, logger='crawlee'): - await crawler.run([Request.from_url('https://placeholder.com')]) + await crawler.run([Request.from_url('https://a.placeholder.com')]) # Check for the timeout message in any of the logs found_timeout_message = False @@ -1411,7 +1413,7 @@ async def status_callback( async def handler(context: BasicCrawlingContext) -> None: await asyncio.sleep(0.1) # Simulate some processing time - await crawler.run(['https://placeholder.com']) + await crawler.run(['https://a.placeholder.com']) assert status_message_callback.called @@ -1442,7 +1444,7 @@ def 
listener(event_data: EventCrawlerStatusData) -> None: crawler = BasicCrawler(request_handler=AsyncMock()) - await crawler.run(['https://placeholder.com']) + await crawler.run(['https://a.placeholder.com']) event_manager.off(event=Event.CRAWLER_STATUS, listener=listener) diff --git a/tests/unit/request_loaders/test_request_list.py b/tests/unit/request_loaders/test_request_list.py index 1f2345a6af..e3ded91b7f 100644 --- a/tests/unit/request_loaders/test_request_list.py +++ b/tests/unit/request_loaders/test_request_list.py @@ -4,7 +4,7 @@ async def test_sync_traversal() -> None: - request_list = RequestList(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) + request_list = RequestList(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) while not await request_list.is_finished(): item = await request_list.fetch_next_request() @@ -17,9 +17,9 @@ async def test_sync_traversal() -> None: async def test_async_traversal() -> None: async def generator() -> AsyncGenerator[str]: - yield 'https://placeholder.com' - yield 'https://placeholder.io' - yield 'https://placeholder.dev' + yield 'https://a.placeholder.com' + yield 'https://b.placeholder.com' + yield 'https://c.placeholder.com' request_list = RequestList(generator()) @@ -33,7 +33,7 @@ async def generator() -> AsyncGenerator[str]: async def test_is_empty_does_not_depend_on_fetch_next_request() -> None: - request_list = RequestList(['https://placeholder.com', 'https://placeholder.io', 'https://placeholder.dev']) + request_list = RequestList(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) item_1 = await request_list.fetch_next_request() assert item_1 is not None diff --git a/tests/unit/storages/test_request_manager_tandem.py b/tests/unit/storages/test_request_manager_tandem.py index cb31f5e4e1..69bd944348 100644 --- a/tests/unit/storages/test_request_manager_tandem.py +++ b/tests/unit/storages/test_request_manager_tandem.py @@ -25,13 +25,13 @@ class TestInput: argvalues=[ pytest.param( TestInput( - request_loader_items=['https://placeholder.com', 'https://placeholder.io'], + request_loader_items=['https://a.placeholder.com', 'https://b.placeholder.com'], request_manager_items=[], - discovered_items=[Request.from_url('https://placeholder.dev')], + discovered_items=[Request.from_url('https://c.placeholder.com')], expected_result={ - 'https://placeholder.com', - 'https://placeholder.io', - 'https://placeholder.dev', + 'https://a.placeholder.com', + 'https://b.placeholder.com', + 'https://c.placeholder.com', }, ), id='basic_usage', @@ -39,16 +39,16 @@ class TestInput: pytest.param( TestInput( request_loader_items=[ - Request.from_url('https://placeholder.com'), + Request.from_url('https://a.placeholder.com'), None, - Request.from_url('https://placeholder.dev'), + Request.from_url('https://c.placeholder.com'), ], - request_manager_items=['https://placeholder.io', 'http://d.com'], + request_manager_items=['https://b.placeholder.com', 'http://d.com'], discovered_items=[], expected_result={ - 'https://placeholder.com', - 'https://placeholder.io', - 'https://placeholder.dev', + 'https://a.placeholder.com', + 'https://b.placeholder.com', + 'https://c.placeholder.com', 'http://d.com', }, ),
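
For readers following the test changes above, here is a minimal, illustrative sketch of how the CRAWLER_STATUS event exercised by these tests might be consumed from user code. It is not part of the patch: it assumes that Event and EventCrawlerStatusData are exported from crawlee.events (as the diffstat for src/crawlee/events/__init__.py suggests), that EventManager.on mirrors the EventManager.off call shown in the test, and it reuses the https://a.placeholder.com URL purely as a stand-in. Treat the exact payload fields and import paths as assumptions rather than the final API.

    import asyncio

    from crawlee import service_locator
    from crawlee.crawlers import BasicCrawler, BasicCrawlingContext
    from crawlee.events import Event, EventCrawlerStatusData  # assumed export path


    async def main() -> None:
        # Obtain the global event manager, the same object the crawler emits status events on.
        event_manager = service_locator.get_event_manager()

        def listener(event_data: EventCrawlerStatusData) -> None:
            # Exact payload fields are an assumption here; print the whole object for inspection.
            print(f'Crawler status event received: {event_data}')

        # Register the listener before the crawl starts so periodic status events are not missed.
        event_manager.on(event=Event.CRAWLER_STATUS, listener=listener)

        crawler = BasicCrawler()

        @crawler.router.default_handler
        async def handler(context: BasicCrawlingContext) -> None:
            context.log.info(f'Processing {context.request.url}')

        try:
            await crawler.run(['https://a.placeholder.com'])  # placeholder URL, as in the tests
        finally:
            # Unregister the listener, mirroring the teardown performed in the test above.
            event_manager.off(event=Event.CRAWLER_STATUS, listener=listener)


    if __name__ == '__main__':
        asyncio.run(main())

Unregistering in a finally block keeps the listener from leaking into later crawler runs, which is the same hygiene the unit test applies after asserting on the received events.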