Skip to content

Commit 603aa2b

Browse files
authored
feat: Add retire_browser_after_page_count parameter for BrowserPool (#1266)
### Description - Add `retire_browser_after_page_count` parameter for `BrowserPool`
1 parent a90a6a9 commit 603aa2b

File tree

4 files changed

+55
-1
lines changed

4 files changed

+55
-1
lines changed

src/crawlee/browsers/_browser_controller.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,11 @@ class BrowserController(ABC):
2626
def pages(self) -> list[Page]:
2727
"""Return the list of opened pages."""
2828

29+
@property
30+
@abstractmethod
31+
def total_opened_pages(self) -> int:
32+
"""Return the total number of pages opened since the browser was launched."""
33+
2934
@property
3035
@abstractmethod
3136
def pages_count(self) -> int:

src/crawlee/browsers/_browser_pool.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ def __init__(
5353
browser_inactive_threshold: timedelta = timedelta(seconds=10),
5454
identify_inactive_browsers_interval: timedelta = timedelta(seconds=20),
5555
close_inactive_browsers_interval: timedelta = timedelta(seconds=30),
56+
retire_browser_after_page_count: int = 100,
5657
) -> None:
5758
"""Initialize a new instance.
5859
@@ -67,7 +68,10 @@ def __init__(
6768
as retired.
6869
close_inactive_browsers_interval: The interval at which the pool checks for inactive browsers
6970
and closes them. The browser is considered as inactive if it has no active pages and has been idle
70-
for the specified period.
71+
for the specified period. A browser is retired (moved to the inactive list) as soon as the total number
72+
of pages it has opened reaches `retire_browser_after_page_count`.
73+
retire_browser_after_page_count: The total number of pages a browser may open; once this count is reached,
74+
the browser is retired.
7175
"""
7276
self._plugins = plugins or [PlaywrightBrowserPlugin()]
7377
self._operation_timeout = operation_timeout
@@ -91,6 +95,7 @@ def __init__(
9195
)
9296

9397
self._total_pages_count = 0
98+
self._retire_browser_after_page_count = retire_browser_after_page_count
9499
self._pages = WeakValueDictionary[str, CrawleePage]() # Track the pages in the pool
95100
self._plugins_cycle = itertools.cycle(self._plugins) # Cycle through the plugins
96101

@@ -305,6 +310,9 @@ async def _get_new_page(
305310
except RuntimeError as exc:
306311
raise RuntimeError('Browser pool is not initialized.') from exc
307312

313+
if browser_controller.total_opened_pages >= self._retire_browser_after_page_count:
314+
self._retire_browser(browser_controller)
315+
308316
crawlee_page = CrawleePage(id=page_id, page=page, browser_type=plugin.browser_type)
309317
self._pages[page_id] = crawlee_page
310318
self._total_pages_count += 1
@@ -321,6 +329,12 @@ def _pick_browser_with_free_capacity(
321329

322330
return None
323331

332+
def _retire_browser(self, browser: BrowserController) -> None:
333+
"""Retire a browser by moving it to the inactive list."""
334+
if browser in self._active_browsers:
335+
self._active_browsers.remove(browser)
336+
self._inactive_browsers.append(browser)
337+
324338
async def _launch_new_browser(self, plugin: BrowserPlugin) -> BrowserController:
325339
"""Launch a new browser instance using the specified plugin."""
326340
browser = await plugin.new_browser()

src/crawlee/browsers/_playwright_browser_controller.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,11 +74,18 @@ def __init__(
7474
self._pages = list[Page]()
7575
self._last_page_opened_at = datetime.now(timezone.utc)
7676

77+
self._total_opened_pages = 0
78+
7779
@property
7880
@override
7981
def pages(self) -> list[Page]:
8082
return self._pages
8183

84+
@property
85+
@override
86+
def total_opened_pages(self) -> int:
87+
return self._total_opened_pages
88+
8289
@property
8390
@override
8491
def pages_count(self) -> int:
@@ -160,6 +167,8 @@ async def new_page(
160167
self._pages.append(page)
161168
self._last_page_opened_at = datetime.now(timezone.utc)
162169

170+
self._total_opened_pages += 1
171+
163172
return page
164173

165174
@override

tests/unit/browsers/test_browser_pool.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,3 +160,29 @@ async def test_with_plugin_contains_page_options(server_url: URL) -> None:
160160
await test_page.page.goto(str(server_url / 'user-agent'))
161161
assert 'My Best User-Agent' in await test_page.page.content()
162162
await test_page.page.close()
163+
164+
165+
@pytest.mark.parametrize(
166+
('retire_after_page_count', 'expect_equal_browsers'),
167+
[
168+
pytest.param(2, True, id='Two pages opened in the same browser'),
169+
pytest.param(1, False, id='Each page opened in a new browser.'),
170+
],
171+
)
172+
async def test_browser_pool_retire_browser_after_page_count(
173+
retire_after_page_count: int, *, expect_equal_browsers: bool
174+
) -> None:
175+
async with BrowserPool(retire_browser_after_page_count=retire_after_page_count) as browser_pool:
176+
test_page = await browser_pool.new_page()
177+
first_browser = test_page.page.context
178+
await test_page.page.close()
179+
180+
test_page = await browser_pool.new_page()
181+
second_browser = test_page.page.context
182+
183+
await test_page.page.close()
184+
185+
if expect_equal_browsers:
186+
assert first_browser is second_browser
187+
else:
188+
assert first_browser is not second_browser

0 commit comments

Comments
 (0)