
Commit b992fb2

feat: add periodic status logging and status_message_callback parameter for customization (#1265)
### Description

- Add periodic logging of the current crawler status
- Add `status_message_logging_interval` parameter to configure the logging interval
- Add `status_message_callback` parameter to allow custom status message handling
- Add a new `CRAWLER_STATUS` `Event` that emits status messages through `EventManager`

### Issues

- Closes: #96
1 parent 11bcc9d commit b992fb2
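
For orientation, here is a minimal usage sketch of the two new constructor options (not part of the commit itself; it assumes `HttpCrawler` as the concrete crawler class and `crawlee.statistics.StatisticsState` as the import path for the statistics model):

from __future__ import annotations

import asyncio
from datetime import timedelta

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.statistics import StatisticsState  # assumed import path


async def custom_status_message(
    state: StatisticsState,
    previous_state: StatisticsState | None,
    default_message: str,
) -> str | None:
    # Prefix the default message; returning None would suppress it entirely.
    return f'[my-crawler] {default_message}'


async def main() -> None:
    crawler = HttpCrawler(
        # Emit a status message every 5 seconds instead of the default 10.
        status_message_logging_interval=timedelta(seconds=5),
        status_message_callback=custom_status_message,
    )

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url}')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())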

File tree

11 files changed: 306 additions & 109 deletions


docs/guides/code_examples/session_management/multi_sessions_http.py

Lines changed: 1 addition & 1 deletion
@@ -49,7 +49,7 @@ async def session_init(context: HttpCrawlingContext) -> None:
     if context.session:
         context.log.info(f'Init session {context.session.id}')
         next_request = Request.from_url(
-            'https://placeholder.dev', session_id=context.session.id
+            'https://a.placeholder.com', session_id=context.session.id
         )
         next_requests.append(next_request)

src/crawlee/_log_config.py

Lines changed: 22 additions & 13 deletions
@@ -4,15 +4,19 @@
 import logging
 import sys
 import textwrap
-from typing import Any
+from typing import TYPE_CHECKING, Any

 from colorama import Fore, Style, just_fix_windows_console
 from typing_extensions import assert_never

 from crawlee import service_locator

+if TYPE_CHECKING:
+    from crawlee._types import LogLevel
+
 just_fix_windows_console()

+
 _LOG_NAME_COLOR = Fore.LIGHTBLACK_EX

 _LOG_LEVEL_COLOR = {
@@ -34,22 +38,27 @@
 _LOG_MESSAGE_INDENT = ' ' * 6


+def string_to_log_level(level: LogLevel) -> int:
+    """Convert a string representation of a log level to an integer log level."""
+    if level == 'DEBUG':
+        return logging.DEBUG
+    if level == 'INFO':
+        return logging.INFO
+    if level == 'WARNING':
+        return logging.WARNING
+    if level == 'ERROR':
+        return logging.ERROR
+    if level == 'CRITICAL':
+        return logging.CRITICAL
+
+    assert_never(level)
+
+
 def get_configured_log_level() -> int:
     config = service_locator.get_configuration()

     if 'log_level' in config.model_fields_set:
-        if config.log_level == 'DEBUG':
-            return logging.DEBUG
-        if config.log_level == 'INFO':
-            return logging.INFO
-        if config.log_level == 'WARNING':
-            return logging.WARNING
-        if config.log_level == 'ERROR':
-            return logging.ERROR
-        if config.log_level == 'CRITICAL':
-            return logging.CRITICAL
-
-        assert_never(config.log_level)
+        return string_to_log_level(config.log_level)

     if sys.flags.dev_mode:
         return logging.DEBUG
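
The extracted helper is a plain mapping from the `LogLevel` literal to the stdlib integer constants, which lets other modules (such as `_log_status_message` in `_basic_crawler.py` below) translate a level name at call time. Illustrative only, calling the internal module directly:

import logging

from crawlee._log_config import string_to_log_level  # internal helper added in this commit

assert string_to_log_level('WARNING') == logging.WARNING
assert string_to_log_level('DEBUG') == logging.DEBUG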

src/crawlee/_types.py

Lines changed: 2 additions & 0 deletions
@@ -55,6 +55,8 @@

 SkippedReason = Literal['robots_txt']

+LogLevel = Literal['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']
+

 def _normalize_headers(headers: Mapping[str, str]) -> dict[str, str]:
     """Convert all header keys to lowercase, strips whitespace, and returns them sorted by key."""

src/crawlee/configuration.py

Lines changed: 3 additions & 2 deletions
@@ -1,11 +1,12 @@
 from __future__ import annotations

 from datetime import timedelta
-from typing import TYPE_CHECKING, Annotated, Literal
+from typing import TYPE_CHECKING, Annotated

 from pydantic import AliasChoices, BeforeValidator, Field
 from pydantic_settings import BaseSettings, SettingsConfigDict

+from crawlee._types import LogLevel
 from crawlee._utils.docs import docs_group
 from crawlee._utils.models import timedelta_ms

@@ -62,7 +63,7 @@ class Configuration(BaseSettings):
     https://playwright.dev/docs/api/class-browsertype#browser-type-launch."""

     log_level: Annotated[
-        Literal['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
+        LogLevel,
         Field(
             validation_alias=AliasChoices(
                 'apify_log_level',
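
Behaviorally nothing changes for users of `Configuration`; the field accepts the same literal values as before. A quick sketch, assuming `Configuration` accepts `log_level` by field name and can be registered through `service_locator.set_configuration`:

from crawlee import service_locator
from crawlee.configuration import Configuration

# Same accepted values as before; only the annotation moved to the shared LogLevel alias.
config = Configuration(log_level='DEBUG')
service_locator.set_configuration(config)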

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 79 additions & 3 deletions
@@ -25,20 +25,22 @@

 from crawlee import EnqueueStrategy, Glob, RequestTransformAction, service_locator
 from crawlee._autoscaling import AutoscaledPool, Snapshotter, SystemStatus
-from crawlee._log_config import configure_logger, get_configured_log_level
+from crawlee._log_config import configure_logger, get_configured_log_level, string_to_log_level
 from crawlee._request import Request, RequestOptions, RequestState
 from crawlee._types import (
     BasicCrawlingContext,
     EnqueueLinksKwargs,
     GetKeyValueStoreFromRequestHandlerFunction,
     HttpHeaders,
     HttpPayload,
+    LogLevel,
     RequestHandlerRunResult,
     SendRequestFunction,
     SkippedReason,
 )
 from crawlee._utils.docs import docs_group
 from crawlee._utils.file import export_csv_to_stream, export_json_to_stream
+from crawlee._utils.recurring_task import RecurringTask
 from crawlee._utils.robots import RobotsTxtFile
 from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
 from crawlee._utils.wait import wait_for
@@ -53,6 +55,7 @@
     SessionError,
     UserDefinedErrorHandlerError,
 )
+from crawlee.events._types import Event, EventCrawlerStatusData
 from crawlee.http_clients import HttpxHttpClient
 from crawlee.router import Router
 from crawlee.sessions import SessionPool
@@ -191,6 +194,15 @@ class _BasicCrawlerOptions(TypedDict):
     """If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,
     and skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`."""

+    status_message_logging_interval: NotRequired[timedelta]
+    """Interval for logging the crawler status messages."""
+
+    status_message_callback: NotRequired[
+        Callable[[StatisticsState, StatisticsState | None, str], Awaitable[str | None]]
+    ]
+    """Allows overriding the default status message. The default status message is provided in the parameters.
+    Returning `None` suppresses the status message."""
+

 class _BasicCrawlerOptionsGeneric(Generic[TCrawlingContext, TStatisticsState], TypedDict):
     """Generic options the `BasicCrawler` constructor."""
@@ -273,6 +285,9 @@ def __init__(
         configure_logging: bool = True,
         statistics_log_format: Literal['table', 'inline'] = 'table',
         respect_robots_txt_file: bool = False,
+        status_message_logging_interval: timedelta = timedelta(seconds=10),
+        status_message_callback: Callable[[StatisticsState, StatisticsState | None, str], Awaitable[str | None]]
+        | None = None,
         _context_pipeline: ContextPipeline[TCrawlingContext] | None = None,
         _additional_context_managers: Sequence[AbstractAsyncContextManager] | None = None,
         _logger: logging.Logger | None = None,
@@ -291,7 +306,6 @@ def __init__(
             max_request_retries: Specifies the maximum number of retries allowed for a request if its processing fails.
                 This includes retries due to navigation errors or errors thrown from user-supplied functions
                 (`request_handler`, `pre_navigation_hooks` etc.).
-
                 This limit does not apply to retries triggered by session rotation (see `max_session_rotations`).
             max_requests_per_crawl: Maximum number of pages to open during a crawl. The crawl stops upon reaching
                 this limit. Setting this value can help avoid infinite loops in misconfigured crawlers. `None` means
@@ -300,7 +314,6 @@ def __init__(
                 `max_requests_per_crawl` is achieved.
             max_session_rotations: Maximum number of session rotations per request. The crawler rotates the session
                 if a proxy error occurs or if the website blocks the request.
-
                 The session rotations are not counted towards the `max_request_retries` limit.
             max_crawl_depth: Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond
                 this depth. The crawl depth starts at 0 for initial requests and increases with each subsequent level
@@ -324,6 +337,9 @@ def __init__(
             respect_robots_txt_file: If set to `True`, the crawler will automatically try to fetch the robots.txt file
                 for each domain, and skip those that are not allowed. This also prevents disallowed URLs to be added
                 via `EnqueueLinksFunction`
+            status_message_logging_interval: Interval for logging the crawler status messages.
+            status_message_callback: Allows overriding the default status message. The default status message is
+                provided in the parameters. Returning `None` suppresses the status message.
             _context_pipeline: Enables extending the request lifecycle and modifying the crawling context.
                 Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.
             _additional_context_managers: Additional context managers used throughout the crawler lifecycle.
@@ -368,6 +384,9 @@ def __init__(
         self._on_skipped_request: SkippedRequestCallback | None = None
         self._abort_on_error = abort_on_error

+        # Crawler callbacks
+        self._status_message_callback = status_message_callback
+
         # Context of each request with matching result of request handler.
         # Inheritors can use this to override the result of individual request handler runs in `_run_request_handler`.
         self._context_result_map = WeakKeyDictionary[BasicCrawlingContext, RequestHandlerRunResult]()
@@ -428,6 +447,10 @@ def __init__(
             is_task_ready_function=self.__is_task_ready_function,
             run_task_function=self.__run_task_function,
         )
+        self._crawler_state_rec_task = RecurringTask(
+            func=self._crawler_state_task, delay=status_message_logging_interval
+        )
+        self._previous_crawler_state: TStatisticsState | None = None

         # State flags
         self._keep_alive = keep_alive
@@ -632,6 +655,7 @@ def sigint_handler() -> None:
         except CancelledError:
             pass
         finally:
+            await self._crawler_state_rec_task.stop()
             if threading.current_thread() is threading.main_thread():
                 with suppress(NotImplementedError):
                     asyncio.get_running_loop().remove_signal_handler(signal.SIGINT)
@@ -663,6 +687,8 @@ def sigint_handler() -> None:
     async def _run_crawler(self) -> None:
         event_manager = service_locator.get_event_manager()

+        self._crawler_state_rec_task.start()
+
         # Collect the context managers to be entered. Context managers that are already active are excluded,
         # as they were likely entered by the caller, who will also be responsible for exiting them.
         contexts_to_enter = [
@@ -1481,3 +1507,53 @@ async def _find_txt_file_for_url(self, url: str) -> RobotsTxtFile:
             url: The URL whose domain will be used to locate and fetch the corresponding robots.txt file.
         """
         return await RobotsTxtFile.find(url, self._http_client)
+
+    def _log_status_message(self, message: str, level: LogLevel = 'DEBUG') -> None:
+        """Log a status message for the crawler.
+
+        Args:
+            message: The status message to log.
+            level: The logging level for the message.
+        """
+        log_level = string_to_log_level(level)
+        self.log.log(log_level, message)
+
+    async def _crawler_state_task(self) -> None:
+        """Emit a persist state event with the given migration status."""
+        event_manager = service_locator.get_event_manager()
+
+        current_state = self.statistics.state
+
+        if (
+            failed_requests := (
+                current_state.requests_failed - (self._previous_crawler_state or current_state).requests_failed
+            )
+            > 0
+        ):
+            message = f'Experiencing problems, {failed_requests} failed requests since last status update.'
+        else:
+            request_manager = await self.get_request_manager()
+            total_count = await request_manager.get_total_count()
+            if total_count is not None and total_count > 0:
+                pages_info = f'{self._statistics.state.requests_finished}/{total_count}'
+            else:
+                pages_info = str(self._statistics.state.requests_finished)
+
+            message = (
+                f'Crawled {pages_info} pages, {self._statistics.state.requests_failed} failed requests, '
+                f'desired concurrency {self._autoscaled_pool.desired_concurrency}.'
+            )
+
+        if self._status_message_callback:
+            new_message = await self._status_message_callback(current_state, self._previous_crawler_state, message)
+            if new_message:
+                message = new_message
+                self._log_status_message(message, level='INFO')
+        else:
+            self._log_status_message(message, level='INFO')

+        event_manager.emit(
+            event=Event.CRAWLER_STATUS, event_data=EventCrawlerStatusData(message=message, crawler_id=id(self))
+        )
+
+        self._previous_crawler_state = current_state
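
As a sketch of how the callback hook can be used, the function below matches the `status_message_callback` signature and uses both state snapshots to suppress the periodic message whenever nothing changed since the previous tick (the suppression logic is an assumption for illustration, not part of the commit):

from __future__ import annotations

from crawlee.statistics import StatisticsState  # assumed import path


async def quiet_status_callback(
    state: StatisticsState,
    previous_state: StatisticsState | None,
    default_message: str,
) -> str | None:
    # Nothing finished or failed since the last status tick -> suppress the log line.
    if (
        previous_state is not None
        and state.requests_finished == previous_state.requests_finished
        and state.requests_failed == previous_state.requests_failed
    ):
        return None
    # Otherwise keep the message produced by _crawler_state_task().
    return default_message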

src/crawlee/events/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -3,6 +3,7 @@
 from ._types import (
     Event,
     EventAbortingData,
+    EventCrawlerStatusData,
     EventData,
     EventExitData,
     EventListener,
@@ -14,6 +15,7 @@
 __all__ = [
     'Event',
     'EventAbortingData',
+    'EventCrawlerStatusData',
     'EventData',
     'EventExitData',
     'EventListener',

src/crawlee/events/_event_manager.py

Lines changed: 5 additions & 0 deletions
@@ -19,6 +19,7 @@
 from crawlee.events._types import (
     Event,
     EventAbortingData,
+    EventCrawlerStatusData,
     EventExitData,
     EventListener,
     EventMigratingData,
@@ -147,6 +148,8 @@ def on(self, *, event: Literal[Event.ABORTING], listener: EventListener[EventAbo
     @overload
     def on(self, *, event: Literal[Event.EXIT], listener: EventListener[EventExitData]) -> None: ...
     @overload
+    def on(self, *, event: Literal[Event.CRAWLER_STATUS], listener: EventListener[EventCrawlerStatusData]) -> None: ...
+    @overload
     def on(self, *, event: Event, listener: EventListener[None]) -> None: ...

     def on(self, *, event: Event, listener: EventListener[Any]) -> None:
@@ -222,6 +225,8 @@ def emit(self, *, event: Literal[Event.ABORTING], event_data: EventAbortingData)
     @overload
     def emit(self, *, event: Literal[Event.EXIT], event_data: EventExitData) -> None: ...
     @overload
+    def emit(self, *, event: Literal[Event.CRAWLER_STATUS], event_data: EventCrawlerStatusData) -> None: ...
+    @overload
     def emit(self, *, event: Event, event_data: Any) -> None: ...

     @ensure_context
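
With the new overloads, a `CRAWLER_STATUS` listener can be registered against the event manager in a type-safe way. A sketch, assuming the listener receives the event payload as its single argument, as the `EventListener[EventCrawlerStatusData]` annotation suggests:

from crawlee import service_locator
from crawlee.events import Event, EventCrawlerStatusData


async def on_crawler_status(event_data: EventCrawlerStatusData) -> None:
    # Forward the periodic status message somewhere else, e.g. a dashboard or a webhook.
    print(f'crawler {event_data.crawler_id}: {event_data.message}')


event_manager = service_locator.get_event_manager()
event_manager.on(event=Event.CRAWLER_STATUS, listener=on_crawler_status)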

src/crawlee/events/_types.py

Lines changed: 24 additions & 1 deletion
@@ -31,6 +31,9 @@ class Event(str, Enum):
     PAGE_CREATED = 'pageCreated'
     PAGE_CLOSED = 'pageClosed'

+    # State events
+    CRAWLER_STATUS = 'crawlerStatus'
+

 @docs_group('Event payloads')
 class EventPersistStateData(BaseModel):
@@ -79,7 +82,27 @@ class EventExitData(BaseModel):
     model_config = ConfigDict(populate_by_name=True)


-EventData = EventPersistStateData | EventSystemInfoData | EventMigratingData | EventAbortingData | EventExitData
+@docs_group('Event payloads')
+class EventCrawlerStatusData(BaseModel):
+    """Data for the crawler status event."""
+
+    model_config = ConfigDict(populate_by_name=True)
+
+    message: str
+    """A message describing the current status of the crawler."""
+
+    crawler_id: int
+    """The ID of the crawler that emitted the event."""
+
+
+EventData = (
+    EventPersistStateData
+    | EventSystemInfoData
+    | EventMigratingData
+    | EventAbortingData
+    | EventExitData
+    | EventCrawlerStatusData
+)
 """A helper type for all possible event payloads"""

 WrappedListener = Callable[..., Coroutine[Any, Any, None]]

0 commit comments