37 changes: 37 additions & 0 deletions packages/types/src/providers/chutes.ts
@@ -6,6 +6,8 @@ export type ChutesModelId =
| "deepseek-ai/DeepSeek-R1"
| "deepseek-ai/DeepSeek-V3"
| "deepseek-ai/DeepSeek-V3.1"
| "deepseek-ai/DeepSeek-V3.1-Terminus"
| "deepseek-ai/DeepSeek-V3.1-turbo"

[P2] Potential breaking change: Renaming the model id from "DeepSeek-V3.1-Turbo" to "DeepSeek-V3.1-turbo" will break users who have existing configs referencing the old id. Consider adding a temporary alias/back-compat mapping (accept both ids) or a migration to remap the old value to the new one before lookup to avoid surprising failures.
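A minimal sketch of the alias approach this comment suggests (the `LEGACY_CHUTES_MODEL_ALIASES` table and `resolveChutesModelId` helper are illustrative names, not part of this PR):

// Hypothetical back-compat mapping: accept the old id and remap it to the
// new one before the model lookup, so existing configs keep working.
const LEGACY_CHUTES_MODEL_ALIASES: Record<string, ChutesModelId> = {
    "deepseek-ai/DeepSeek-V3.1-Turbo": "deepseek-ai/DeepSeek-V3.1-turbo",
}

function resolveChutesModelId(rawId: string): ChutesModelId {
    // Fall back to the configured id when no alias applies.
    return (LEGACY_CHUTES_MODEL_ALIASES[rawId] ?? rawId) as ChutesModelId
}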

Author:

The correct model id on Chutes for this model is "deepseek-ai/DeepSeek-V3.1-turbo".

| "unsloth/Llama-3.3-70B-Instruct"
| "chutesai/Llama-4-Scout-17B-16E-Instruct"
| "unsloth/Mistral-Nemo-Instruct-2407"
@@ -29,6 +31,7 @@ export type ChutesModelId =
| "tngtech/DeepSeek-R1T-Chimera"
| "zai-org/GLM-4.5-Air"
| "zai-org/GLM-4.5-FP8"
| "zai-org/GLM-4.6-FP8"
| "moonshotai/Kimi-K2-Instruct-75k"
| "moonshotai/Kimi-K2-Instruct-0905"
| "Qwen/Qwen3-235B-A22B-Thinking-2507"
@@ -70,10 +73,31 @@ export const chutesModels = {
contextWindow: 163840,
supportsImages: false,
supportsPromptCache: false,
supportsReasoningEffort: true,
inputPrice: 0,
outputPrice: 0,
description: "DeepSeek V3.1 model.",
},
"deepseek-ai/DeepSeek-V3.1-Terminus": {
maxTokens: 32768,
contextWindow: 163840,
supportsImages: false,
supportsPromptCache: false,
supportsReasoningEffort: true,
inputPrice: 0,
outputPrice: 0,
description: "DeepSeek V3.1 Terminus variant - optimized for complex reasoning and extended context.",
},
"deepseek-ai/DeepSeek-V3.1-turbo": {
maxTokens: 32768,
contextWindow: 163840,
supportsImages: false,
supportsPromptCache: false,
supportsReasoningEffort: true,
inputPrice: 0,
outputPrice: 0,
description: "DeepSeek V3.1 Turbo variant - faster inference with maintained quality.",
},
"unsloth/Llama-3.3-70B-Instruct": {
maxTokens: 32768, // From Groq
contextWindow: 131072, // From Groq
@@ -259,6 +283,7 @@ export const chutesModels = {
contextWindow: 151329,
supportsImages: false,
supportsPromptCache: false,
supportsReasoningEffort: true,
inputPrice: 0,
outputPrice: 0,
description:
@@ -269,11 +294,23 @@ export const chutesModels = {
contextWindow: 131072,
supportsImages: false,
supportsPromptCache: false,
supportsReasoningEffort: true,
inputPrice: 0,
outputPrice: 0,
description:
"GLM-4.5-FP8 model with 128k token context window, optimized for agent-based applications with MoE architecture.",
},
"zai-org/GLM-4.6-FP8": {
maxTokens: 32768,
contextWindow: 204800,
supportsImages: false,
supportsPromptCache: false,
supportsReasoningEffort: true,
inputPrice: 0,
outputPrice: 0,
description:
"GLM-4.6-FP8 model with 200K context window, FP8 precision for efficient inference. Improved reasoning, coding, and agent capabilities.",
},
"Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8": {
maxTokens: 32768,
contextWindow: 262144,
203 changes: 203 additions & 0 deletions src/api/providers/__tests__/chutes.spec.ts
@@ -253,6 +253,30 @@ describe("ChutesHandler", () => {
)
})

it("should return zai-org/GLM-4.6-FP8 model with correct configuration", () => {
const testModelId: ChutesModelId = "zai-org/GLM-4.6-FP8"
const handlerWithModel = new ChutesHandler({
apiModelId: testModelId,
chutesApiKey: "test-chutes-api-key",
})
const model = handlerWithModel.getModel()
expect(model.id).toBe(testModelId)
expect(model.info).toEqual(
expect.objectContaining({
maxTokens: 32768,
contextWindow: 204800,
supportsImages: false,
supportsPromptCache: false,
supportsReasoningEffort: true,
inputPrice: 0,
outputPrice: 0,
description:
"GLM-4.6-FP8 model with 200K context window, FP8 precision for efficient inference. Improved reasoning, coding, and agent capabilities.",
temperature: 0.5, // Default temperature for non-DeepSeek models
}),
)
})

it("should return Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8 model with correct configuration", () => {
const testModelId: ChutesModelId = "Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8"
const handlerWithModel = new ChutesHandler({
@@ -297,6 +321,52 @@ describe("ChutesHandler", () => {
)
})

it("should return DeepSeek V3.1 Terminus model with correct configuration", () => {
const testModelId: ChutesModelId = "deepseek-ai/DeepSeek-V3.1-Terminus"
const handlerWithModel = new ChutesHandler({
apiModelId: testModelId,
chutesApiKey: "test-chutes-api-key",
})
const model = handlerWithModel.getModel()
expect(model.id).toBe(testModelId)
expect(model.info).toEqual(
expect.objectContaining({
maxTokens: 32768,
contextWindow: 163840,
supportsImages: false,
supportsPromptCache: false,
supportsReasoningEffort: true,
inputPrice: 0,
outputPrice: 0,
description: "DeepSeek V3.1 Terminus variant - optimized for complex reasoning and extended context.",
temperature: 0.5, // Default temperature for non-R1 DeepSeek models
}),
)
})

it("should return DeepSeek V3.1 turbo model with correct configuration", () => {
const testModelId: ChutesModelId = "deepseek-ai/DeepSeek-V3.1-turbo"
const handlerWithModel = new ChutesHandler({
apiModelId: testModelId,
chutesApiKey: "test-chutes-api-key",
})
const model = handlerWithModel.getModel()
expect(model.id).toBe(testModelId)
expect(model.info).toEqual(
expect.objectContaining({
maxTokens: 32768,
contextWindow: 163840,
supportsImages: false,
supportsPromptCache: false,
supportsReasoningEffort: true,
inputPrice: 0,
outputPrice: 0,
description: "DeepSeek V3.1 Turbo variant - faster inference with maintained quality.",
temperature: 0.5, // Default temperature for non-R1 DeepSeek models
}),
)
})

it("should return moonshotai/Kimi-K2-Instruct-0905 model with correct configuration", () => {
const testModelId: ChutesModelId = "moonshotai/Kimi-K2-Instruct-0905"
const handlerWithModel = new ChutesHandler({
@@ -470,4 +540,137 @@ describe("ChutesHandler", () => {
const model = handlerWithModel.getModel()
expect(model.info.temperature).toBe(0.5)
})

it("should enable reasoning for DeepSeek V3.1 models when enableReasoningEffort is true", async () => {
const modelId: ChutesModelId = "deepseek-ai/DeepSeek-V3.1"
const handlerWithModel = new ChutesHandler({
apiModelId: modelId,
chutesApiKey: "test-chutes-api-key",
enableReasoningEffort: true,
})

mockCreate.mockImplementationOnce(async () => ({
[Symbol.asyncIterator]: async function* () {
// First yield reasoning content
yield {
choices: [{ delta: { reasoning_content: "Let me think about this..." } }],
}
// Then yield regular content
yield {
choices: [{ delta: { content: "Here's my response." } }],
}
// Finally yield usage
yield {
choices: [],
usage: { prompt_tokens: 100, completion_tokens: 50 },
}
},
}))

const systemPrompt = "You are a helpful assistant"
const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "Hello" }]

const stream = handlerWithModel.createMessage(systemPrompt, messages)
const chunks = []
for await (const chunk of stream) {
chunks.push(chunk)
}

// Should parse reasoning content and regular content separately
expect(chunks).toContainEqual({ type: "reasoning", text: "Let me think about this..." })
expect(chunks).toContainEqual({ type: "text", text: "Here's my response." })
expect(chunks).toContainEqual({ type: "usage", inputTokens: 100, outputTokens: 50 })

// Verify that the API was called with reasoning enabled
expect(mockCreate).toHaveBeenCalledWith(
expect.objectContaining({
chat_template_kwargs: {
thinking: true,
},
}),
)
})

it("should enable reasoning for GLM-4.5 models when enableReasoningEffort is true", async () => {
const modelId: ChutesModelId = "zai-org/GLM-4.5-Air"
const handlerWithModel = new ChutesHandler({
apiModelId: modelId,
chutesApiKey: "test-chutes-api-key",
enableReasoningEffort: true,
})

mockCreate.mockImplementationOnce(async () => ({
[Symbol.asyncIterator]: async function* () {
// First yield reasoning content
yield {
choices: [{ delta: { reasoning_content: "GLM reasoning process..." } }],
}
// Then yield regular content
yield {
choices: [{ delta: { content: "GLM response" } }],
}
// Finally yield usage
yield {
choices: [],
usage: { prompt_tokens: 100, completion_tokens: 50 },
}
},
}))

const systemPrompt = "You are a helpful assistant"
const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "Hello" }]

const stream = handlerWithModel.createMessage(systemPrompt, messages)
const chunks = []
for await (const chunk of stream) {
chunks.push(chunk)
}

// Should parse reasoning content separately
expect(chunks).toContainEqual({ type: "reasoning", text: "GLM reasoning process..." })
expect(chunks).toContainEqual({ type: "text", text: "GLM response" })

// Verify that the API was called with reasoning enabled
expect(mockCreate).toHaveBeenCalledWith(
expect.objectContaining({
chat_template_kwargs: {
thinking: true,
},
}),
)
})

it.skip("should disable reasoning for DeepSeek V3.1 models when enableReasoningEffort is false", async () => {
const modelId: ChutesModelId = "deepseek-ai/DeepSeek-V3.1"
const handlerWithModel = new ChutesHandler({
apiModelId: modelId,
chutesApiKey: "test-chutes-api-key",
enableReasoningEffort: false,
})

mockCreate.mockImplementationOnce(async () => ({
[Symbol.asyncIterator]: async function* () {
yield {
choices: [{ delta: { content: "<think>Reasoning content</think>Regular content" } }],
}
yield {
choices: [],
usage: { prompt_tokens: 100, completion_tokens: 50 },
}
},
}))

const systemPrompt = "You are a helpful assistant"
const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "Hello" }]

const stream = handlerWithModel.createMessage(systemPrompt, messages)
const chunks = []
for await (const chunk of stream) {
chunks.push(chunk)
}

// Should NOT parse reasoning content when disabled
expect(chunks).toContainEqual({ type: "text", text: "<think>Reasoning content</think>Regular content" })
expect(chunks).not.toContainEqual({ type: "reasoning", text: "Reasoning content" })
})
})
55 changes: 54 additions & 1 deletion src/api/providers/chutes.ts
@@ -3,6 +3,7 @@ import { Anthropic } from "@anthropic-ai/sdk"
import OpenAI from "openai"

import type { ApiHandlerOptions } from "../../shared/api"
import { shouldUseReasoningEffort } from "../../shared/api"
import { XmlMatcher } from "../../utils/xml-matcher"
import { convertToR1Format } from "../transform/r1-format"
import { convertToOpenAiMessages } from "../transform/openai-format"
@@ -26,6 +27,7 @@ export class ChutesHandler extends BaseOpenAiCompatibleProvider<ChutesModelId> {
private getCompletionParams(
systemPrompt: string,
messages: Anthropic.Messages.MessageParam[],
enableReasoning: boolean = false,
): OpenAI.Chat.Completions.ChatCompletionCreateParamsStreaming {
const {
id: model,
@@ -34,19 +36,29 @@ export class ChutesHandler extends BaseOpenAiCompatibleProvider<ChutesModelId> {

const temperature = this.options.modelTemperature ?? this.getModel().info.temperature

return {
const params: any = {

[P3] Typing: Avoid any here; you can return the exact type to improve maintainability and catch mistakes earlier.

Suggested change (replace the first line with the second):
const params: any = {
const params: OpenAI.Chat.Completions.ChatCompletionCreateParamsStreaming = {

model,
max_tokens,
temperature,
messages: [{ role: "system", content: systemPrompt }, ...convertToOpenAiMessages(messages)],
stream: true,
stream_options: { include_usage: true },
}

// Add reasoning support for DeepSeek V3.1, GLM-4.5, and GLM-4.6 models
if (enableReasoning) {
params.chat_template_kwargs = {
thinking: true,
}
}

return params
}

override async *createMessage(systemPrompt: string, messages: Anthropic.Messages.MessageParam[]): ApiStream {
const model = this.getModel()

// Handle DeepSeek R1 models with XML tag parsing
if (model.id.includes("DeepSeek-R1")) {
const stream = await this.client.chat.completions.create({
...this.getCompletionParams(systemPrompt, messages),
@@ -84,7 +96,48 @@
for (const processedChunk of matcher.final()) {
yield processedChunk
}
return
}

// Handle DeepSeek V3.1, GLM-4.5, and GLM-4.6 models with reasoning_content parsing
const isHybridReasoningModel =
model.id.includes("DeepSeek-V3.1") || model.id.includes("GLM-4.5") || model.id.includes("GLM-4.6")
const reasoningEnabled = this.options.enableReasoningEffort === true

[P2] Consistency with reasoning toggle: This direct check (=== true) bypasses the shared helper and may diverge from global defaults or future logic. Prefer using the existing shouldUseReasoningEffort helper so provider behavior stays consistent across backends. Also remove the unused import if you decide to keep the direct check.
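A sketch of what routing the decision through the shared helper could look like (the call shape here is a guess; the helper's actual signature is not shown in this diff):

// Hypothetical call shape — centralizes the reasoning on/off decision
// instead of reading the flag inline with `=== true`.
const reasoningEnabled = shouldUseReasoningEffort({
    model: model.info,
    settings: this.options,
})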


if (isHybridReasoningModel && reasoningEnabled) {
const stream = await this.client.chat.completions.create(
this.getCompletionParams(systemPrompt, messages, true),
)

for await (const chunk of stream) {
const delta = chunk.choices[0]?.delta

// Handle reasoning content from the response
if ((delta as any)?.reasoning_content) {
yield {
type: "reasoning",
text: (delta as any).reasoning_content,
}
}

// Handle regular text content
if (delta?.content) {
yield {
type: "text",
text: delta.content,
}
}

if (chunk.usage) {
yield {
type: "usage",
inputTokens: chunk.usage.prompt_tokens || 0,
outputTokens: chunk.usage.completion_tokens || 0,
}
}
}
} else {
// For non-reasoning models or when reasoning is disabled, use the base implementation
yield* super.createMessage(systemPrompt, messages)
}
}