23 changes: 22 additions & 1 deletion README.md
@@ -132,6 +132,14 @@ const result = await zerox({
});
```

For GPT-5 models, you can control the reasoning effort. Allowed values are `"minimal"`, `"low"`, `"medium"`, and `"high"`. For example:
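```ts
const result = await zerox({
  // ...
  model: ModelOptions.OPENAI_GPT_5,
  reasoning_effort: "medium",
  // ...
});
```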

The `maintainFormat` option tries to return the markdown in a consistent format by passing the output of a prior page in as additional context for the next page. This requires the requests to run synchronously, so it is a lot slower, but it is valuable if your documents have a lot of tabular data or frequently have tables that cross pages.

```
@@ -213,13 +221,19 @@ Zerox supports a wide range of models across different providers:
- GPT-4 Vision Mini (gpt-4o-mini)
- GPT-4.1 (gpt-4.1)
- GPT-4.1 Mini (gpt-4.1-mini)
- GPT-5 (gpt-5)
- GPT-5 Mini (gpt-5-mini)
- GPT-5 Nano (gpt-5-nano)

- **OpenAI**

- GPT-4 Vision (gpt-4o)
- GPT-4 Vision Mini (gpt-4o-mini)
- GPT-4.1 (gpt-4.1)
- GPT-4.1 Mini (gpt-4.1-mini)
- GPT-5 (gpt-5)
- GPT-5 Mini (gpt-5-mini)
- GPT-5 Nano (gpt-5-nano)

- **AWS Bedrock**

@@ -380,7 +394,6 @@ print(result)

### Parameters

```python
async def zerox(
cleanup: bool = True,
concurrency: int = 10,
@@ -396,6 +409,12 @@ async def zerox(
...
```

For GPT-5 models, you can control the reasoning effort. Allowed values are `"minimal"`, `"low"`, `"medium"`, and `"high"`. For example:
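```python
result = await zerox(file_path=..., model="gpt-5", reasoning_effort="medium")
```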


Parameters

- **cleanup** (bool, optional):
@@ -417,6 +436,8 @@ Parameters
The system prompt to use for the model; this overrides the default system prompt of Zerox. Generally it is not required unless you want some specific behavior. Defaults to None.
- **select_pages** (Optional[Union[int, Iterable[int]]], optional):
Pages to process; can be a single page number or an iterable of page numbers. Defaults to None.
- **reasoning_effort** (str, optional, GPT-5 only):
Controls the reasoning effort for GPT-5 models. Allowed values: "minimal", "low", "medium", "high". Defaults to None.
- **kwargs** (dict, optional):
Additional keyword arguments to pass to the litellm.completion method.
Refer to the LiteLLM Documentation and Completion Input for details.
5 changes: 5 additions & 0 deletions node-zerox/src/index.ts
@@ -64,6 +64,7 @@ export const zerox = async ({
imageFormat = "png",
imageHeight,
llmParams = {},
reasoning_effort,
maintainFormat = false,
maxImageSize = 15,
maxRetries = 1,
@@ -79,6 +80,10 @@
trimEdges = true,
}: ZeroxArgs): Promise<ZeroxOutput> => {
let extracted: Record<string, unknown> | null = null;
// If reasoning_effort is provided, merge it into llmParams
if (reasoning_effort) {
llmParams = { ...llmParams, reasoning_effort };
}
let extractedLogprobs: LogprobPage[] = [];
let inputTokenCount: number = 0;
let outputTokenCount: number = 0;
17 changes: 11 additions & 6 deletions node-zerox/src/models/openAI.ts
@@ -96,7 +96,7 @@ export default class OpenAIModel implements ModelInterface {
priorPage,
prompt,
}: CompletionArgs): Promise<CompletionResponse> {
const systemPrompt = prompt || SYSTEM_PROMPT_BASE;
const systemPrompt = prompt || SYSTEM_PROMPT_BASE;

// Default system message
const messages: any = [{ role: "system", content: systemPrompt }];
@@ -120,13 +120,18 @@
messages.push({ role: "user", content: imageContents });

try {
// If model is GPT-5 and reasoning_effort is provided, add it to payload
let payload: any = {
messages,
model: this.model,
...convertKeysToSnakeCase(this.llmParams ?? null),
};
if (this.model?.startsWith("gpt-5") && this.llmParams?.reasoning_effort) {
payload.reasoning_effort = this.llmParams.reasoning_effort;
}
const response = await axios.post(
"https://api.openai.com/v1/chat/completions",
{
messages,
model: this.model,
...convertKeysToSnakeCase(this.llmParams ?? null),
},
payload,
{
headers: {
Authorization: `Bearer ${this.apiKey}`,
7 changes: 7 additions & 0 deletions node-zerox/src/types.ts
@@ -28,6 +28,7 @@ export interface ZeroxArgs {
imageHeight?: number;
imageFormat?: "png" | "jpeg";
llmParams?: Partial<LLMParams>;
reasoning_effort?: "minimal" | "low" | "medium" | "high";
maintainFormat?: boolean;
maxImageSize?: number;
maxRetries?: number;
@@ -95,6 +96,11 @@ export enum ModelOptions {
OPENAI_GPT_4O = "gpt-4o",
OPENAI_GPT_4O_MINI = "gpt-4o-mini",

// OpenAI GPT-5 Models
OPENAI_GPT_5 = "gpt-5",
OPENAI_GPT_5_MINI = "gpt-5-mini",
OPENAI_GPT_5_NANO = "gpt-5-nano",

// Google Gemini Models
GOOGLE_GEMINI_1_5_FLASH = "gemini-1.5-flash",
GOOGLE_GEMINI_1_5_FLASH_8B = "gemini-1.5-flash-8b",
@@ -222,6 +228,7 @@ export interface GoogleLLMParams extends BaseLLMParams {
export interface OpenAILLMParams extends BaseLLMParams {
logprobs: boolean;
maxTokens: number;
reasoning_effort?: "minimal" | "low" | "medium" | "high";
}

// Union type of all provider params
2 changes: 1 addition & 1 deletion package.json
@@ -23,7 +23,7 @@
"heic-convert": "^2.1.0",
"libreoffice-convert": "^1.6.0",
"mime-types": "^2.1.35",
"openai": "^4.82.0",
"openai": "^5.15.0",
"os": "^0.1.2",
"p-limit": "^3.1.0",
"path": "^0.12.7",