from crawlee import EnqueueStrategy, Glob, RequestTransformAction, service_locator
from crawlee._autoscaling import AutoscaledPool, Snapshotter, SystemStatus
-from crawlee._log_config import configure_logger, get_configured_log_level
+from crawlee._log_config import configure_logger, get_configured_log_level, string_to_log_level
from crawlee._request import Request, RequestOptions, RequestState
from crawlee._types import (
    BasicCrawlingContext,
    EnqueueLinksKwargs,
    GetKeyValueStoreFromRequestHandlerFunction,
    HttpHeaders,
    HttpPayload,
+    LogLevel,
    RequestHandlerRunResult,
    SendRequestFunction,
    SkippedReason,
)
from crawlee._utils.docs import docs_group
from crawlee._utils.file import export_csv_to_stream, export_json_to_stream
+from crawlee._utils.recurring_task import RecurringTask
from crawlee._utils.robots import RobotsTxtFile
from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
from crawlee._utils.wait import wait_for

    SessionError,
    UserDefinedErrorHandlerError,
)
+from crawlee.events._types import Event, EventCrawlerStatusData
from crawlee.http_clients import HttpxHttpClient
from crawlee.router import Router
from crawlee.sessions import SessionPool
@@ -191,6 +194,15 @@ class _BasicCrawlerOptions(TypedDict):
    """If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,
    and skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`."""

+    status_message_logging_interval: NotRequired[timedelta]
+    """Interval for logging the crawler status messages."""
+
+    status_message_callback: NotRequired[
+        Callable[[StatisticsState, StatisticsState | None, str], Awaitable[str | None]]
+    ]
+    """Allows overriding the default status message. The default status message is provided in the parameters.
+    Returning `None` suppresses the status message."""
+


class _BasicCrawlerOptionsGeneric(Generic[TCrawlingContext, TStatisticsState], TypedDict):
    """Generic options the `BasicCrawler` constructor."""
@@ -273,6 +285,9 @@ def __init__(
        configure_logging: bool = True,
        statistics_log_format: Literal['table', 'inline'] = 'table',
        respect_robots_txt_file: bool = False,
+        status_message_logging_interval: timedelta = timedelta(seconds=10),
+        status_message_callback: Callable[[StatisticsState, StatisticsState | None, str], Awaitable[str | None]]
+        | None = None,
        _context_pipeline: ContextPipeline[TCrawlingContext] | None = None,
        _additional_context_managers: Sequence[AbstractAsyncContextManager] | None = None,
        _logger: logging.Logger | None = None,
@@ -291,7 +306,6 @@ def __init__(
            max_request_retries: Specifies the maximum number of retries allowed for a request if its processing fails.
                This includes retries due to navigation errors or errors thrown from user-supplied functions
                (`request_handler`, `pre_navigation_hooks` etc.).
-
                This limit does not apply to retries triggered by session rotation (see `max_session_rotations`).
            max_requests_per_crawl: Maximum number of pages to open during a crawl. The crawl stops upon reaching
                this limit. Setting this value can help avoid infinite loops in misconfigured crawlers. `None` means
@@ -300,7 +314,6 @@ def __init__(
                `max_requests_per_crawl` is achieved.
            max_session_rotations: Maximum number of session rotations per request. The crawler rotates the session
                if a proxy error occurs or if the website blocks the request.
-
                The session rotations are not counted towards the `max_request_retries` limit.
            max_crawl_depth: Specifies the maximum crawl depth. If set, the crawler will stop processing links beyond
                this depth. The crawl depth starts at 0 for initial requests and increases with each subsequent level
@@ -324,6 +337,9 @@ def __init__(
            respect_robots_txt_file: If set to `True`, the crawler will automatically try to fetch the robots.txt file
                for each domain, and skip those that are not allowed. This also prevents disallowed URLs to be added
                via `EnqueueLinksFunction`
+            status_message_logging_interval: Interval for logging the crawler status messages.
+            status_message_callback: Allows overriding the default status message. The default status message is
+                provided in the parameters. Returning `None` suppresses the status message.
            _context_pipeline: Enables extending the request lifecycle and modifying the crawling context.
                Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.
            _additional_context_managers: Additional context managers used throughout the crawler lifecycle.
@@ -368,6 +384,9 @@ def __init__(
        self._on_skipped_request: SkippedRequestCallback | None = None
        self._abort_on_error = abort_on_error

+        # Crawler callbacks
+        self._status_message_callback = status_message_callback
+
        # Context of each request with matching result of request handler.
        # Inheritors can use this to override the result of individual request handler runs in `_run_request_handler`.
        self._context_result_map = WeakKeyDictionary[BasicCrawlingContext, RequestHandlerRunResult]()
@@ -428,6 +447,10 @@ def __init__(
            is_task_ready_function=self.__is_task_ready_function,
            run_task_function=self.__run_task_function,
        )
+        self._crawler_state_rec_task = RecurringTask(
+            func=self._crawler_state_task, delay=status_message_logging_interval
+        )
+        self._previous_crawler_state: TStatisticsState | None = None

        # State flags
        self._keep_alive = keep_alive
@@ -632,6 +655,7 @@ def sigint_handler() -> None:
        except CancelledError:
            pass
        finally:
+            await self._crawler_state_rec_task.stop()
            if threading.current_thread() is threading.main_thread():
                with suppress(NotImplementedError):
                    asyncio.get_running_loop().remove_signal_handler(signal.SIGINT)
@@ -663,6 +687,8 @@ def sigint_handler() -> None:
    async def _run_crawler(self) -> None:
        event_manager = service_locator.get_event_manager()

+        self._crawler_state_rec_task.start()
+
        # Collect the context managers to be entered. Context managers that are already active are excluded,
        # as they were likely entered by the caller, who will also be responsible for exiting them.
        contexts_to_enter = [
@@ -1481,3 +1507,53 @@ async def _find_txt_file_for_url(self, url: str) -> RobotsTxtFile:
            url: The URL whose domain will be used to locate and fetch the corresponding robots.txt file.
        """
        return await RobotsTxtFile.find(url, self._http_client)
+
+    def _log_status_message(self, message: str, level: LogLevel = 'DEBUG') -> None:
+        """Log a status message for the crawler.
+
+        Args:
+            message: The status message to log.
+            level: The logging level for the message.
+        """
+        log_level = string_to_log_level(level)
+        self.log.log(log_level, message)
+
+    async def _crawler_state_task(self) -> None:
+        """Log the current crawler status and emit it as a `CRAWLER_STATUS` event."""
+        event_manager = service_locator.get_event_manager()
+
+        current_state = self.statistics.state
+
+        if (
+            failed_requests := (
+                current_state.requests_failed - (self._previous_crawler_state or current_state).requests_failed
+            )
+        ) > 0:
+            message = f'Experiencing problems, {failed_requests} failed requests since last status update.'
+        else:
+            request_manager = await self.get_request_manager()
+            total_count = await request_manager.get_total_count()
+            if total_count is not None and total_count > 0:
+                pages_info = f'{self._statistics.state.requests_finished}/{total_count}'
+            else:
+                pages_info = str(self._statistics.state.requests_finished)
+
+            message = (
+                f'Crawled {pages_info} pages, {self._statistics.state.requests_failed} failed requests, '
+                f'desired concurrency {self._autoscaled_pool.desired_concurrency}.'
+            )
+
+        if self._status_message_callback:
+            new_message = await self._status_message_callback(current_state, self._previous_crawler_state, message)
+            if new_message:
+                message = new_message
+                self._log_status_message(message, level='INFO')
+        else:
+            self._log_status_message(message, level='INFO')
+
+        event_manager.emit(
+            event=Event.CRAWLER_STATUS, event_data=EventCrawlerStatusData(message=message, crawler_id=id(self))
+        )
+
+        self._previous_crawler_state = current_state
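Besides the log line, every update is also emitted as a `CRAWLER_STATUS` event carrying the message and the crawler id, so it can be consumed outside the crawler. A rough sketch of a listener, assuming an `EventManager.on(event=..., listener=...)` registration API and reusing the private `crawlee.events._types` import path that this diff itself uses:

from crawlee import service_locator
from crawlee.events._types import Event, EventCrawlerStatusData  # same (private) import path used in the diff


def on_crawler_status(event_data: EventCrawlerStatusData) -> None:
    # Forward the periodic status message elsewhere, e.g. a dashboard or alerting hook.
    print(f'[crawler {event_data.crawler_id}] {event_data.message}')


event_manager = service_locator.get_event_manager()
event_manager.on(event=Event.CRAWLER_STATUS, listener=on_crawler_status)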