Skip to content

Commit 5e4c3eb

Browse files
move wait for dom to cdp from js (#120)
* move wait for dom to cdp from js
* formatting
* Update stagehand/page.py (thx Sean!)
  Co-authored-by: Sean McGuire <75873287+seanmcguire12@users.noreply.github.com>
* formatting
* update test
---------
Co-authored-by: Sean McGuire <75873287+seanmcguire12@users.noreply.github.com>
Co-authored-by: Sean McGuire <seanmcguire1@outlook.com>
1 parent e450463 commit 5e4c3eb

File tree

5 files changed

+386
-76
lines changed

5 files changed

+386
-76
lines changed

stagehand/domScripts.js

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -291,21 +291,6 @@
291291
};
292292

293293
// lib/dom/utils.ts
294-
async function waitForDomSettle() {
295-
return new Promise((resolve) => {
296-
const createTimeout = () => {
297-
return setTimeout(() => {
298-
resolve();
299-
}, 2e3);
300-
};
301-
let timeout = createTimeout();
302-
const observer = new MutationObserver(() => {
303-
clearTimeout(timeout);
304-
timeout = createTimeout();
305-
});
306-
observer.observe(window.document.body, { childList: true, subtree: true });
307-
});
308-
}
309294
function calculateViewportHeight() {
310295
return Math.ceil(window.innerHeight * 0.75);
311296
}
@@ -1046,7 +1031,6 @@
10461031
}
10471032
return boundingBoxes;
10481033
}
1049-
window.waitForDomSettle = waitForDomSettle;
10501034
window.processDom = processDom;
10511035
window.processAllOfDom = processAllOfDom;
10521036
window.storeDOM = storeDOM;

stagehand/page.py

Lines changed: 194 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -402,8 +402,6 @@ async def send_cdp(self, method: str, params: Optional[dict] = None) -> dict:
402402
self._stagehand.logger.debug(
403403
f"CDP command '{method}' failed: {e}. Attempting to reconnect..."
404404
)
405-
# Try to reconnect
406-
await self._ensure_cdp_session()
407405
# Handle specific errors if needed (e.g., session closed)
408406
if "Target closed" in str(e) or "Session closed" in str(e):
409407
# Attempt to reset the client if the session closed unexpectedly
@@ -446,70 +444,212 @@ async def _wait_for_settled_dom(self, timeout_ms: int = None):
446444
"""
447445
Wait for the DOM to settle (stop changing) before proceeding.
448446
447+
**Definition of "settled"**
448+
• No in-flight network requests (except WebSocket / Server-Sent-Events).
449+
• That idle state lasts for at least **500 ms** (the "quiet-window").
450+
451+
**How it works**
452+
1. Subscribes to CDP Network and Page events for the main target and all
453+
out-of-process iframes (via `Target.setAutoAttach { flatten:true }`).
454+
2. Every time `Network.requestWillBeSent` fires, the request ID is added
455+
to an **`inflight`** set.
456+
3. When the request finishes—`loadingFinished`, `loadingFailed`,
457+
`requestServedFromCache`, or a *data:* response—the request ID is
458+
removed.
459+
4. *Document* requests are also mapped **frameId → requestId**; when
460+
`Page.frameStoppedLoading` fires the corresponding Document request is
461+
removed immediately (covers iframes whose network events never close).
462+
5. A **stalled-request sweep task** runs every 500 ms. Any tracked request
463+
(not only *Document* requests) that has been open for more than 2 s is forcibly removed; this prevents
464+
ad/analytics iframes from blocking the wait forever.
465+
6. When `inflight` becomes empty the helper starts a 500 ms timer.
466+
If no new request appears before the timer fires, the promise
467+
resolves → **DOM is considered settled**.
468+
7. A global guard (`timeout_ms` or `stagehand.dom_settle_timeout_ms`,
469+
default 30 000 ms) ensures we always resolve; if it fires while requests
470+
are still in flight we log how many were outstanding.
471+
449472
Args:
450473
timeout_ms (int, optional): Maximum time to wait in milliseconds.
451474
If None, uses the stagehand client's dom_settle_timeout_ms.
452475
"""
453-
try:
454-
timeout = timeout_ms or getattr(
455-
self._stagehand, "dom_settle_timeout_ms", 30000
456-
)
457-
import asyncio
458-
459-
# Wait for domcontentloaded first
460-
await self._page.wait_for_load_state("domcontentloaded")
461-
462-
# Create a timeout promise that resolves after the specified time
463-
timeout_task = asyncio.create_task(asyncio.sleep(timeout / 1000))
476+
import asyncio
477+
import time
464478

465-
# Try to check if the DOM has settled
466-
try:
467-
# Create a task for evaluating the DOM settling
468-
eval_task = asyncio.create_task(
469-
self._page.evaluate(
470-
"""
471-
() => {
472-
return new Promise((resolve) => {
473-
if (typeof window.waitForDomSettle === 'function') {
474-
window.waitForDomSettle().then(resolve);
475-
} else {
476-
console.warn('waitForDomSettle is not defined, considering DOM as settled');
477-
resolve();
478-
}
479-
});
480-
}
481-
"""
482-
)
483-
)
484-
485-
# Create tasks for other ways to determine page readiness
486-
dom_task = asyncio.create_task(
487-
self._page.wait_for_load_state("domcontentloaded")
488-
)
489-
body_task = asyncio.create_task(self._page.wait_for_selector("body"))
479+
timeout = timeout_ms or getattr(self._stagehand, "dom_settle_timeout_ms", 30000)
480+
client = await self.get_cdp_client()
490481

491-
# Wait for the first task to complete
492-
done, pending = await asyncio.wait(
493-
[eval_task, dom_task, body_task, timeout_task],
494-
return_when=asyncio.FIRST_COMPLETED,
495-
)
482+
# Check if document exists
483+
try:
484+
await self._page.title()
485+
except Exception:
486+
await self._page.wait_for_load_state("domcontentloaded")
496487

497-
# Cancel any pending tasks
498-
for task in pending:
499-
task.cancel()
488+
# Enable CDP domains
489+
await client.send("Network.enable")
490+
await client.send("Page.enable")
491+
await client.send(
492+
"Target.setAutoAttach",
493+
{
494+
"autoAttach": True,
495+
"waitForDebuggerOnStart": False,
496+
"flatten": True,
497+
"filter": [
498+
{"type": "worker", "exclude": True},
499+
{"type": "shared_worker", "exclude": True},
500+
],
501+
},
502+
)
500503

501-
# If the timeout was hit, log a warning
502-
if timeout_task in done:
504+
# Set up tracking structures
505+
inflight = set() # Set of request IDs
506+
meta = {} # Dict of request ID -> {"url": str, "start": float}
507+
doc_by_frame = {} # Dict of frame ID -> request ID
508+
509+
# Event tracking
510+
quiet_timer = None
511+
stalled_request_sweep_task = None
512+
loop = asyncio.get_event_loop()
513+
done_event = asyncio.Event()
514+
515+
def clear_quiet():
516+
nonlocal quiet_timer
517+
if quiet_timer:
518+
quiet_timer.cancel()
519+
quiet_timer = None
520+
521+
def resolve_done():
522+
"""Cleanup and mark as done"""
523+
clear_quiet()
524+
if stalled_request_sweep_task and not stalled_request_sweep_task.done():
525+
stalled_request_sweep_task.cancel()
526+
done_event.set()
527+
528+
def maybe_quiet():
529+
"""Start quiet timer if no requests are in flight"""
530+
nonlocal quiet_timer
531+
if len(inflight) == 0 and not quiet_timer:
532+
quiet_timer = loop.call_later(0.5, resolve_done)
533+
534+
def finish_req(request_id: str):
535+
"""Mark a request as finished"""
536+
if request_id not in inflight:
537+
return
538+
inflight.remove(request_id)
539+
meta.pop(request_id, None)
540+
# Remove from frame mapping
541+
for fid, rid in list(doc_by_frame.items()):
542+
if rid == request_id:
543+
doc_by_frame.pop(fid)
544+
clear_quiet()
545+
maybe_quiet()
546+
547+
# Event handlers
548+
def on_request(params):
549+
"""Handle Network.requestWillBeSent"""
550+
if params.get("type") in ["WebSocket", "EventSource"]:
551+
return
552+
553+
request_id = params["requestId"]
554+
inflight.add(request_id)
555+
meta[request_id] = {"url": params["request"]["url"], "start": time.time()}
556+
557+
if params.get("type") == "Document" and params.get("frameId"):
558+
doc_by_frame[params["frameId"]] = request_id
559+
560+
clear_quiet()
561+
562+
def on_finish(params):
563+
"""Handle Network.loadingFinished"""
564+
finish_req(params["requestId"])
565+
566+
def on_failed(params):
567+
"""Handle Network.loadingFailed"""
568+
finish_req(params["requestId"])
569+
570+
def on_cached(params):
571+
"""Handle Network.requestServedFromCache"""
572+
finish_req(params["requestId"])
573+
574+
def on_data_url(params):
575+
"""Handle Network.responseReceived for data: URLs"""
576+
if params.get("response", {}).get("url", "").startswith("data:"):
577+
finish_req(params["requestId"])
578+
579+
def on_frame_stop(params):
580+
"""Handle Page.frameStoppedLoading"""
581+
frame_id = params["frameId"]
582+
if frame_id in doc_by_frame:
583+
finish_req(doc_by_frame[frame_id])
584+
585+
# Register event handlers
586+
client.on("Network.requestWillBeSent", on_request)
587+
client.on("Network.loadingFinished", on_finish)
588+
client.on("Network.loadingFailed", on_failed)
589+
client.on("Network.requestServedFromCache", on_cached)
590+
client.on("Network.responseReceived", on_data_url)
591+
client.on("Page.frameStoppedLoading", on_frame_stop)
592+
593+
async def sweep_stalled_requests():
594+
"""Remove stalled document requests after 2 seconds"""
595+
while not done_event.is_set():
596+
await asyncio.sleep(0.5)
597+
now = time.time()
598+
for request_id, request_meta in list(meta.items()):
599+
if now - request_meta["start"] > 2.0:
600+
inflight.discard(request_id)
601+
meta.pop(request_id, None)
602+
self._stagehand.logger.debug(
603+
"⏳ forcing completion of stalled iframe document",
604+
extra={"url": request_meta["url"][:120]},
605+
)
606+
maybe_quiet()
607+
608+
# Start stalled request sweeper
609+
stalled_request_sweep_task = asyncio.create_task(sweep_stalled_requests())
610+
611+
# Set up timeout guard
612+
async def timeout_guard():
613+
await asyncio.sleep(timeout / 1000)
614+
if not done_event.is_set():
615+
if len(inflight) > 0:
503616
self._stagehand.logger.debug(
504-
"DOM settle timeout exceeded, continuing anyway",
505-
extra={"timeout_ms": timeout},
617+
"⚠️ DOM-settle timeout reached – network requests still pending",
618+
extra={"count": len(inflight)},
506619
)
620+
resolve_done()
507621

508-
except Exception as e:
509-
self._stagehand.logger.debug(f"Error waiting for DOM to settle: {e}")
622+
timeout_task = asyncio.create_task(timeout_guard())
510623

511-
except Exception as e:
512-
self._stagehand.logger.error(f"Error in _wait_for_settled_dom: {e}")
624+
# Initial check
625+
maybe_quiet()
626+
627+
try:
628+
# Wait for completion
629+
await done_event.wait()
630+
finally:
631+
# Cleanup
632+
client.remove_listener("Network.requestWillBeSent", on_request)
633+
client.remove_listener("Network.loadingFinished", on_finish)
634+
client.remove_listener("Network.loadingFailed", on_failed)
635+
client.remove_listener("Network.requestServedFromCache", on_cached)
636+
client.remove_listener("Network.responseReceived", on_data_url)
637+
client.remove_listener("Page.frameStoppedLoading", on_frame_stop)
638+
639+
if quiet_timer:
640+
quiet_timer.cancel()
641+
if stalled_request_sweep_task and not stalled_request_sweep_task.done():
642+
stalled_request_sweep_task.cancel()
643+
try:
644+
await stalled_request_sweep_task
645+
except asyncio.CancelledError:
646+
pass
647+
if timeout_task and not timeout_task.done():
648+
timeout_task.cancel()
649+
try:
650+
await timeout_task
651+
except asyncio.CancelledError:
652+
pass
513653

514654
# Forward other Page methods to underlying Playwright page
515655
def __getattr__(self, name):

tests/conftest.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -395,10 +395,6 @@ def mock_dom_scripts():
395395
return ['//body', '//div[@class="content"]'];
396396
};
397397
398-
window.waitForDomSettle = function() {
399-
return Promise.resolve();
400-
};
401-
402398
window.getElementInfo = function(selector) {
403399
return {
404400
selector: selector,

tests/mocks/mock_browser.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,8 +72,6 @@ async def evaluate(self, script: str, *args):
7272
# Return different results based on script content
7373
if "getScrollableElementXpaths" in script:
7474
return ["//body", "//div[@class='content']"]
75-
elif "waitForDomSettle" in script:
76-
return True
7775
elif "getElementInfo" in script:
7876
return {
7977
"selector": args[0] if args else "#test",

0 commit comments

Comments
 (0)