Skip to content

Commit c9c258b

Browse files
committed
feat: support gpt-4o-audio-preview
1 parent 6d066bb commit c9c258b

File tree

7 files changed

+444
-82
lines changed

7 files changed

+444
-82
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
# Test binary, built with `go test -c`
99
*.test
10+
test.mp3
1011

1112
# Output of the go coverage tool, specifically when used with LiteIDE
1213
*.out

chat.go

+98-45
Original file line numberDiff line numberDiff line change
@@ -78,17 +78,63 @@ type ChatMessageImageURL struct {
7878
Detail ImageURLDetail `json:"detail,omitempty"`
7979
}
8080

81+
// AudioVoice is the string type used for AudioOutput.Voice, i.e. the voice the
// model uses when it responds with audio.
type AudioVoice string
82+
83+
// Voice names declared for use as AudioOutput.Voice.
const (
84+
AudioVoiceAlloy AudioVoice = "alloy"
85+
AudioVoiceAsh AudioVoice = "ash"
86+
AudioVoiceBallad AudioVoice = "ballad"
87+
AudioVoiceCoral AudioVoice = "coral"
88+
AudioVoiceEcho AudioVoice = "echo"
89+
AudioVoiceSage AudioVoice = "sage"
90+
AudioVoiceShimmer AudioVoice = "shimmer"
91+
AudioVoiceVerse AudioVoice = "verse"
92+
)
93+
94+
// AudioFormat is the string type naming an audio encoding. It is used both for
// input audio (ChatMessageAudio.Format) and for requested output audio
// (AudioOutput.Format).
type AudioFormat string
95+
96+
// Audio encodings declared for AudioFormat. Per the field comments in this
// file, input audio currently supports only "wav" and "mp3", while output may
// be any of the five.
const (
97+
AudioFormatWAV AudioFormat = "wav"
98+
AudioFormatMP3 AudioFormat = "mp3"
99+
AudioFormatFLAC AudioFormat = "flac"
100+
AudioFormatOPUS AudioFormat = "opus"
101+
AudioFormatPCM16 AudioFormat = "pcm16"
102+
)
103+
104+
// ChatMessageAudio holds audio supplied as a message content part; it is the
// type of ChatMessagePart.InputAudio. Both fields are tagged omitempty, so
// empty values are left out of the encoded JSON.
type ChatMessageAudio struct {
105+
// Base64 encoded audio data.
106+
Data string `json:"data,omitempty"`
107+
// The format of the encoded audio data. Currently supports "wav" and "mp3".
108+
Format AudioFormat `json:"format,omitempty"`
109+
}
110+
111+
// Modality is the string type for an output type requested from the model; it
// is the element type of ChatCompletionRequest.Modalities.
type Modality string
112+
113+
// Output modalities declared for ChatCompletionRequest.Modalities.
const (
114+
ModalityAudio Modality = "audio"
115+
ModalityText Modality = "text"
116+
)
117+
118+
// AudioOutput holds the parameters for audio output; it is the type of
// ChatCompletionRequest.Audio. Neither field is tagged omitempty, so both are
// always present in the encoded JSON.
type AudioOutput struct {
119+
// The voice the model uses to respond. Supported voices are alloy, ash, ballad, coral, echo, sage, shimmer, and verse.
120+
Voice AudioVoice `json:"voice"`
121+
// Specifies the output audio format. Must be one of wav, mp3, flac, opus, or pcm16.
122+
Format AudioFormat `json:"format"`
123+
}
124+
81125
// ChatMessagePartType is the string type stored in ChatMessagePart.Type.
type ChatMessagePartType string
82126

83127
// Part types declared for ChatMessagePart.Type. This commit adds
// ChatMessagePartTypeInputAudio alongside the existing text and image_url values.
const (
84-
ChatMessagePartTypeText ChatMessagePartType = "text"
85-
ChatMessagePartTypeImageURL ChatMessagePartType = "image_url"
128+
ChatMessagePartTypeText ChatMessagePartType = "text"
129+
ChatMessagePartTypeImageURL ChatMessagePartType = "image_url"
130+
ChatMessagePartTypeInputAudio ChatMessagePartType = "input_audio"
86131
)
87132

88133
// ChatMessagePart is one element of a multi-part message content list (see
// the MultiContent fields in this file). Every field is tagged omitempty.
// NOTE(review): presumably Type selects which of Text, ImageURL or InputAudio
// is populated — confirm against callers.
type ChatMessagePart struct {
89-
Type ChatMessagePartType `json:"type,omitempty"`
90-
Text string `json:"text,omitempty"`
91-
ImageURL *ChatMessageImageURL `json:"image_url,omitempty"`
134+
Type ChatMessagePartType `json:"type,omitempty"`
135+
Text string `json:"text,omitempty"`
136+
ImageURL *ChatMessageImageURL `json:"image_url,omitempty"`
137+
// InputAudio is encoded under the "input_audio" key.
InputAudio *ChatMessageAudio `json:"input_audio,omitempty"`
92138
}
93139

94140
type ChatCompletionMessage struct {
@@ -110,72 +156,74 @@ type ChatCompletionMessage struct {
110156

111157
// For Role=tool prompts this should be set to the ID given in the assistant's prior request to call a tool.
112158
ToolCallID string `json:"tool_call_id,omitempty"`
159+
160+
// If the audio output modality is requested, this object contains data about the audio response from the model.
161+
Audio *ChatCompletionAudio `json:"audio,omitempty"`
162+
}
163+
164+
// chatCompletionMessageMultiContent is the JSON shape used when a message's
// content is a list of parts: MultiContent is encoded under the "content" key
// and the plain Content string is excluded (json:"-"). ChatCompletionMessage is
// converted to and from this type in its MarshalJSON and UnmarshalJSON methods,
// so the field list must stay convertible with ChatCompletionMessage.
type chatCompletionMessageMultiContent struct {
165+
Role string `json:"role"`
166+
Content string `json:"-"`
167+
Refusal string `json:"refusal,omitempty"`
168+
MultiContent []ChatMessagePart `json:"content,omitempty"`
169+
Name string `json:"name,omitempty"`
170+
FunctionCall *FunctionCall `json:"function_call,omitempty"`
171+
ToolCalls []ToolCall `json:"tool_calls,omitempty"`
172+
ToolCallID string `json:"tool_call_id,omitempty"`
173+
Audio *ChatCompletionAudio `json:"audio,omitempty"`
174+
}
175+
176+
// chatCompletionMessageSingleContent is the JSON shape used when a message's
// content is a plain string: Content is encoded under the "content" key
// (without omitempty, so it is always emitted) and MultiContent is excluded
// (json:"-"). ChatCompletionMessage is converted to and from this type in its
// MarshalJSON and UnmarshalJSON methods, so the field list must stay
// convertible with ChatCompletionMessage.
type chatCompletionMessageSingleContent struct {
177+
Role string `json:"role"`
178+
Content string `json:"content"`
179+
Refusal string `json:"refusal,omitempty"`
180+
MultiContent []ChatMessagePart `json:"-"`
181+
Name string `json:"name,omitempty"`
182+
FunctionCall *FunctionCall `json:"function_call,omitempty"`
183+
ToolCalls []ToolCall `json:"tool_calls,omitempty"`
184+
ToolCallID string `json:"tool_call_id,omitempty"`
185+
Audio *ChatCompletionAudio `json:"audio,omitempty"`
113186
}
114187

115188
// MarshalJSON encodes the message with exactly one representation of its
// content under the "content" key.
//
// It returns ErrContentFieldsMisused when Content is non-empty and
// MultiContent is non-nil at the same time. When MultiContent has at least one
// element the message is encoded through the multi-content shape (content as a
// list of parts); otherwise it is encoded through the single-content shape
// (content as a string). This commit replaces the two inline anonymous structs
// with the named types chatCompletionMessageMultiContent and
// chatCompletionMessageSingleContent.
func (m ChatCompletionMessage) MarshalJSON() ([]byte, error) {
116189
if m.Content != "" && m.MultiContent != nil {
117190
return nil, ErrContentFieldsMisused
118191
}
119192
if len(m.MultiContent) > 0 {
120-
msg := struct {
121-
Role string `json:"role"`
122-
Content string `json:"-"`
123-
Refusal string `json:"refusal,omitempty"`
124-
MultiContent []ChatMessagePart `json:"content,omitempty"`
125-
Name string `json:"name,omitempty"`
126-
FunctionCall *FunctionCall `json:"function_call,omitempty"`
127-
ToolCalls []ToolCall `json:"tool_calls,omitempty"`
128-
ToolCallID string `json:"tool_call_id,omitempty"`
129-
}(m)
193+
// Same fields as m; only the struct tags differ, so a plain conversion suffices.
msg := chatCompletionMessageMultiContent(m)
130194
return json.Marshal(msg)
131195
}
132196

133-
msg := struct {
134-
Role string `json:"role"`
135-
Content string `json:"content"`
136-
Refusal string `json:"refusal,omitempty"`
137-
MultiContent []ChatMessagePart `json:"-"`
138-
Name string `json:"name,omitempty"`
139-
FunctionCall *FunctionCall `json:"function_call,omitempty"`
140-
ToolCalls []ToolCall `json:"tool_calls,omitempty"`
141-
ToolCallID string `json:"tool_call_id,omitempty"`
142-
}(m)
197+
// No content parts: encode Content as a plain string.
msg := chatCompletionMessageSingleContent(m)
143198
return json.Marshal(msg)
144199
}
145200

146201
// UnmarshalJSON decodes a message whose "content" may be either a string or a
// list of parts.
//
// It first decodes into the single-content shape (content as a string); if
// that succeeds, *m is overwritten with the result and nil is returned. If it
// fails, the same bytes are decoded into the multi-content shape (content as a
// list of ChatMessagePart); an error from this second attempt is returned
// as-is, and the error from the first attempt is discarded.
// NOTE(review): the removed inline structs left MultiContent (first attempt)
// and Content (second attempt) untagged, whereas the named types tag them
// json:"-" — confirm no caller relied on the untagged keys being decoded.
func (m *ChatCompletionMessage) UnmarshalJSON(bs []byte) error {
147-
msg := struct {
148-
Role string `json:"role"`
149-
Content string `json:"content"`
150-
Refusal string `json:"refusal,omitempty"`
151-
MultiContent []ChatMessagePart
152-
Name string `json:"name,omitempty"`
153-
FunctionCall *FunctionCall `json:"function_call,omitempty"`
154-
ToolCalls []ToolCall `json:"tool_calls,omitempty"`
155-
ToolCallID string `json:"tool_call_id,omitempty"`
156-
}{}
202+
msg := chatCompletionMessageSingleContent{}
157203

158204
if err := json.Unmarshal(bs, &msg); err == nil {
159205
*m = ChatCompletionMessage(msg)
160206
return nil
161207
}
162-
multiMsg := struct {
163-
Role string `json:"role"`
164-
Content string
165-
Refusal string `json:"refusal,omitempty"`
166-
MultiContent []ChatMessagePart `json:"content"`
167-
Name string `json:"name,omitempty"`
168-
FunctionCall *FunctionCall `json:"function_call,omitempty"`
169-
ToolCalls []ToolCall `json:"tool_calls,omitempty"`
170-
ToolCallID string `json:"tool_call_id,omitempty"`
171-
}{}
208+
// Fallback: retry with "content" mapped to the list-of-parts field.
multiMsg := chatCompletionMessageMultiContent{}
172209
if err := json.Unmarshal(bs, &multiMsg); err != nil {
173210
return err
174211
}
175212
*m = ChatCompletionMessage(multiMsg)
176213
return nil
177214
}
178215

216+
// ChatCompletionAudio describes the audio response attached to a message; it
// is the type of ChatCompletionMessage.Audio, which is populated when the
// audio output modality is requested. No field is tagged omitempty, so all
// four keys are always present in the encoded JSON.
type ChatCompletionAudio struct {
217+
// Unique identifier for this audio response.
218+
ID string `json:"id"`
219+
// The Unix timestamp (in seconds) for when this audio response will no longer be accessible on the server for use in multi-turn conversations.
220+
ExpiresAt int64 `json:"expires_at"`
221+
// Base64 encoded audio bytes generated by the model, in the format specified in the request.
222+
Data string `json:"data"`
223+
// Transcript of the audio generated by the model.
224+
Transcript string `json:"transcript"`
225+
}
226+
179227
type ToolCall struct {
180228
// Index is not nil only in chat completion chunk object
181229
Index *int `json:"index,omitempty"`
@@ -260,6 +308,11 @@ type ChatCompletionRequest struct {
260308
Store bool `json:"store,omitempty"`
261309
// Metadata to store with the completion.
262310
Metadata map[string]string `json:"metadata,omitempty"`
311+
// Output types that you would like the model to generate for this request. Most models are capable of generating text, which is the default: ["text"]
312+
// The gpt-4o-audio-preview model can also be used to generate audio. To request that this model generate both text and audio responses, you can use: ["text", "audio"]
313+
Modalities []Modality `json:"modalities,omitempty"`
314+
// Parameters for audio output. Required when audio output is requested with modalities: ["audio"]
315+
Audio *AudioOutput `json:"audio,omitempty"`
263316
}
264317

265318
type StreamOptions struct {

chat_stream.go

+12-5
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,19 @@ import (
55
"net/http"
66
)
77

8+
// ChatCompletionStreamChoiceDeltaAudio holds the audio fields of a stream
// choice delta; it is the type of ChatCompletionStreamChoiceDelta.Audio. All
// three fields are tagged omitempty.
// NOTE(review): ChatCompletionAudio in chat.go also declares an expires_at
// field; none is declared here, so such a key in a streamed chunk would be
// dropped on decode — confirm whether that is intended.
type ChatCompletionStreamChoiceDeltaAudio struct {
9+
ID string `json:"id,omitempty"`
10+
Transcript string `json:"transcript,omitempty"`
11+
Data string `json:"data,omitempty"`
12+
}
13+
814
// ChatCompletionStreamChoiceDelta declares the fields of a stream choice
// delta. Every field is tagged omitempty. This commit adds the Audio field;
// the other five fields are unchanged apart from realignment.
type ChatCompletionStreamChoiceDelta struct {
9-
Content string `json:"content,omitempty"`
10-
Role string `json:"role,omitempty"`
11-
FunctionCall *FunctionCall `json:"function_call,omitempty"`
12-
ToolCalls []ToolCall `json:"tool_calls,omitempty"`
13-
Refusal string `json:"refusal,omitempty"`
15+
Content string `json:"content,omitempty"`
16+
Role string `json:"role,omitempty"`
17+
FunctionCall *FunctionCall `json:"function_call,omitempty"`
18+
ToolCalls []ToolCall `json:"tool_calls,omitempty"`
19+
Refusal string `json:"refusal,omitempty"`
20+
// Audio is nil when the "audio" key is absent from the decoded JSON.
Audio *ChatCompletionStreamChoiceDeltaAudio `json:"audio,omitempty"`
1421
}
1522

1623
type ChatCompletionStreamChoiceLogprobs struct {

0 commit comments

Comments
 (0)