Skip to content

fixed deepLocator() pathing bug #880

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 15 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions examples/external_clients/google_vertex.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import { Stagehand } from "@browserbasehq/stagehand";
import { z } from "zod";

/**
* Example of using Google Vertex AI directly (not through AI SDK).
* When you provide `vertexai: true` in the client options,
* the system will route to GoogleVertexClient instead of using AI SDK.
*/
/**
 * Runs the Vertex AI example end-to-end: boots Stagehand with Vertex
 * routing enabled, navigates to the Stagehand docs, extracts the main
 * heading, logs it, and shuts down.
 */
async function main() {
  // Vertex AI specific configuration - bypasses AI SDK.
  // Setting `vertexai: true` makes the system route to GoogleVertexClient.
  const vertexClientOptions = {
    vertexai: true,
    project: "your-gcp-project-id",
    location: "us-central1",
    // Optional: API key if not using default auth
    // apiKey: process.env.GOOGLE_API_KEY,
  };

  const stagehand = new Stagehand({
    env: "LOCAL",
    enableCaching: false,
    // Google model with slash notation
    modelName: "google/gemini-1.5-pro",
    modelClientOptions: vertexClientOptions,
  });

  await stagehand.init();
  await stagehand.page.goto("https://docs.stagehand.dev");

  // Extract some text using Vertex AI (not AI SDK)
  const headingSchema = z.object({ heading: z.string() });
  const result = await stagehand.page.extract({
    instruction: "extract the main heading of this page",
    schema: headingSchema,
  });

  console.log("Extracted:", result);
  await stagehand.close();
}

main().catch(console.error);
42 changes: 33 additions & 9 deletions lib/StagehandPage.ts
Original file line number Diff line number Diff line change
Expand Up @@ -652,9 +652,17 @@ ${scriptContent} \
}

const requestId = Math.random().toString(36).substring(2);
const llmClient: LLMClient = modelName
? this.stagehand.llmProvider.getClient(modelName, modelClientOptions)
: this.llmClient;

// Use provided modelName if available AND if modelClientOptions has an API key, otherwise use the configured llmClient
const llmClient =
modelName && modelClientOptions?.apiKey
? this.stagehand.llmProvider.getClient(modelName, modelClientOptions)
: this.stagehand.llmClient;

// Add null check for llmClient before accessing modelName
if (!llmClient) {
throw new MissingLLMConfigurationError();
}

this.stagehand.log({
category: "act",
Expand Down Expand Up @@ -746,9 +754,17 @@ ${scriptContent} \
}

const requestId = Math.random().toString(36).substring(2);
const llmClient = modelName
? this.stagehand.llmProvider.getClient(modelName, modelClientOptions)
: this.llmClient;

// Use provided modelName if available AND if modelClientOptions has an API key, otherwise use the configured llmClient
const llmClient =
modelName && modelClientOptions?.apiKey
? this.stagehand.llmProvider.getClient(modelName, modelClientOptions)
: this.stagehand.llmClient;

// Add null check for llmClient before accessing modelName
if (!llmClient) {
throw new MissingLLMConfigurationError();
}

this.stagehand.log({
category: "extract",
Expand Down Expand Up @@ -850,9 +866,17 @@ ${scriptContent} \
}

const requestId = Math.random().toString(36).substring(2);
const llmClient = modelName
? this.stagehand.llmProvider.getClient(modelName, modelClientOptions)
: this.llmClient;

// Use provided modelName if available AND if modelClientOptions has an API key, otherwise use the configured llmClient
const llmClient =
modelName && modelClientOptions?.apiKey
? this.stagehand.llmProvider.getClient(modelName, modelClientOptions)
: this.stagehand.llmClient;

// Add null check for llmClient before accessing modelName
if (!llmClient) {
throw new MissingLLMConfigurationError();
}

this.stagehand.log({
category: "observe",
Expand Down
45 changes: 32 additions & 13 deletions lib/handlers/handlerUtils/actHandlerUtils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,32 +8,51 @@ import { StagehandClickError } from "@/types/stagehandErrors";

const IFRAME_STEP_RE = /^iframe(\[[^\]]+])?$/i;

export function deepLocator(root: Page | FrameLocator, xpath: string): Locator {
// 1 ─ prepend with slash if not already included
if (!xpath.startsWith("/")) xpath = "/" + xpath;
export function deepLocator(
root: Page | FrameLocator,
rawXPath: string,
): Locator {
// 1 ─ strip optional 'xpath=' prefix and whitespace
const xpath = rawXPath.replace(/^xpath=/i, "").trim();

// Split the path by sequences of slashes, but keep the slashes as
// separate elements in the array. This preserves separators like '//'.
// e.g., '//a/b' becomes ['//', 'a', '/', 'b']
const parts = xpath.split(/(\/+)/).filter(Boolean);

// 2 ─ split into steps, accumulate until we hit an iframe step
const steps = xpath.split("/").filter(Boolean); // tokens
let ctx: Page | FrameLocator = root;
let buffer: string[] = [];

const flushIntoFrame = () => {
if (buffer.length === 0) return;
const selector = "xpath=/" + buffer.join("/");

// Join the buffered parts to form the selector for the iframe.
// .join('') is used because the separators are already in the buffer.
const selector = "xpath=" + buffer.join("");
ctx = (ctx as Page | FrameLocator).frameLocator(selector);
buffer = [];
buffer = []; // Reset buffer for the next path segment.
};

for (const step of steps) {
buffer.push(step);
if (IFRAME_STEP_RE.test(step)) {
// we've included the <iframe> element in buffer ⇒ descend
// Iterate through all parts (which include both steps and their separators).
for (const part of parts) {
buffer.push(part);

// A "step" is a part of the path that is NOT a separator.
// We test if the current step (a non-slash part) is an iframe locator.
const isStep = !/^\/+$/.test(part);
if (isStep && IFRAME_STEP_RE.test(part)) {
flushIntoFrame();
}
}

// 3 ─ whatever is left in buffer addresses the target *inside* the last ctx
const finalSelector = "xpath=/" + buffer.join("/");
// If the XPath ended with an iframe, the buffer will be empty. In this case,
// we return a locator for the root element ('*') within the final frame.
if (buffer.length === 0) {
return (ctx as Page | FrameLocator).locator("xpath=/*");
}

// Join the remaining parts for the final locator.
const finalSelector = "xpath=" + buffer.join("");
return (ctx as Page | FrameLocator).locator(finalSelector);
}

Expand Down
17 changes: 12 additions & 5 deletions lib/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -583,23 +583,30 @@ export class Stagehand {

let modelApiKey: string | undefined;

if (!modelClientOptions?.apiKey) {
const usingVertexAI =
"vertexai" in modelClientOptions && modelClientOptions.vertexai;
const provider = LLMProvider.getModelProvider(
this.modelName,
usingVertexAI,
);

if (!modelClientOptions?.apiKey && !usingVertexAI) {
// If no API key is provided, try to load it from the environment
if (LLMProvider.getModelProvider(this.modelName) === "aisdk") {
if (provider === "aisdk") {
modelApiKey = loadApiKeyFromEnv(
this.modelName.split("/")[0],
this.logger,
);
} else {
// Temporary add for legacy providers
modelApiKey =
LLMProvider.getModelProvider(this.modelName) === "openai"
provider === "openai"
? process.env.OPENAI_API_KEY ||
this.llmClient?.clientOptions?.apiKey
: LLMProvider.getModelProvider(this.modelName) === "anthropic"
: provider === "anthropic"
? process.env.ANTHROPIC_API_KEY ||
this.llmClient?.clientOptions?.apiKey
: LLMProvider.getModelProvider(this.modelName) === "google"
: provider === "google"
? process.env.GOOGLE_API_KEY ||
this.llmClient?.clientOptions?.apiKey
: undefined;
Expand Down
39 changes: 22 additions & 17 deletions lib/llm/GoogleClient.ts
Original file line number Diff line number Diff line change
@@ -1,32 +1,33 @@
import {
Content,
FunctionCall,
GoogleGenAI,
HarmCategory,
GoogleGenAIOptions,
HarmBlockThreshold,
Content,
HarmCategory,
Part,
Tool,
FunctionCall,
Schema,
Tool,
Type,
} from "@google/genai";
import zodToJsonSchema from "zod-to-json-schema";

import {
CreateChatCompletionResponseError,
StagehandError,
} from "@/types/stagehandErrors";
import { LogLine } from "../../types/log";
import { AvailableModel, ClientOptions } from "../../types/model";
import { LLMCache } from "../cache/LLMCache";
import { validateZodSchema, toGeminiSchema, loadApiKeyFromEnv } from "../utils";
import { toGeminiSchema, validateZodSchema } from "../utils";
import {
AnnotatedScreenshotText,
ChatCompletionOptions,
ChatMessage,
CreateChatCompletionOptions,
LLMClient,
LLMResponse,
AnnotatedScreenshotText,
} from "./LLMClient";
import {
CreateChatCompletionResponseError,
StagehandError,
} from "@/types/stagehandErrors";

// Mapping from generic roles to Gemini roles
const roleMap: { [key in ChatMessage["role"]]: string } = {
Expand Down Expand Up @@ -60,7 +61,7 @@ export class GoogleClient extends LLMClient {
private client: GoogleGenAI;
private cache: LLMCache | undefined;
private enableCaching: boolean;
public clientOptions: ClientOptions;
public clientOptions: GoogleGenAIOptions;
public hasVision: boolean;
private logger: (message: LogLine) => void;

Expand All @@ -78,12 +79,16 @@ export class GoogleClient extends LLMClient {
clientOptions?: ClientOptions; // Expecting { apiKey: string } here
}) {
super(modelName);
if (!clientOptions?.apiKey) {
// Try to get the API key from the environment variable GOOGLE_API_KEY
clientOptions.apiKey = loadApiKeyFromEnv("google_legacy", logger);
}
this.clientOptions = clientOptions;
this.client = new GoogleGenAI({ apiKey: clientOptions.apiKey });
this.clientOptions = clientOptions as GoogleGenAIOptions;
this.client = new GoogleGenAI({
apiKey: this.clientOptions.apiKey,
vertexai: this.clientOptions.vertexai,
project: this.clientOptions.project,
location: this.clientOptions.location,
apiVersion: this.clientOptions.apiVersion,
googleAuthOptions: this.clientOptions.googleAuthOptions,
httpOptions: this.clientOptions.httpOptions,
});
this.cache = cache;
this.enableCaching = enableCaching;
this.modelName = modelName;
Expand Down
50 changes: 50 additions & 0 deletions lib/llm/GoogleVertexClient.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import { GoogleGenAIOptions } from "@google/genai";
import { LogLine } from "../../types/log";
import { AvailableModel, ClientOptions } from "../../types/model";
import { LLMCache } from "../cache/LLMCache";
import { GoogleClient } from "./GoogleClient";

/**
 * Client options required to route Gemini requests through Google
 * Vertex AI. Extends the SDK's GoogleGenAIOptions with the fields the
 * Vertex client validates at construction time.
 */
export interface GoogleVertexClientOptions extends GoogleGenAIOptions {
  /** Must be truthy to opt into Vertex AI routing. */
  vertexai: boolean;
  /** GCP project ID the Vertex AI requests are billed against. */
  project: string;
  /** GCP region for the Vertex AI endpoint, e.g. "us-central1". */
  location: string;
}

/**
 * LLM client that routes requests through Google Vertex AI. Thin wrapper
 * around GoogleClient: it validates the Vertex-specific options
 * (`vertexai`, `project`, `location`) up front, then delegates to the
 * base class with `vertexai` forced on.
 */
export class GoogleVertexClient extends GoogleClient {
  constructor({
    logger,
    enableCaching = false,
    cache,
    modelName,
    clientOptions,
  }: {
    logger: (message: LogLine) => void;
    enableCaching?: boolean;
    cache?: LLMCache;
    modelName: AvailableModel;
    clientOptions?: ClientOptions;
  }) {
    const options = clientOptions as GoogleVertexClientOptions;

    // Fail fast when the Vertex-specific configuration is incomplete.
    const ensure = (ok: unknown, message: string): void => {
      if (!ok) throw new Error(message);
    };
    ensure(
      options?.vertexai,
      "GoogleVertexClient requires vertexai option to be true",
    );
    ensure(options?.project, "GoogleVertexClient requires project configuration");
    ensure(
      options?.location,
      "GoogleVertexClient requires location configuration",
    );

    super({
      logger,
      enableCaching,
      cache,
      modelName,
      // Spread first, then pin vertexai so the flag cannot be overridden.
      clientOptions: { ...options, vertexai: true },
    });
  }
}
Loading