@@ -175,16 +175,6 @@ func (o *openAIToGCPVertexAITranslatorV1ChatCompletion) handleStreamingResponse(
175175 // Convert GCP chunk to OpenAI chunk.
176176 openAIChunk := o .convertGCPChunkToOpenAI (chunk )
177177
178- // Extract token usage if present in this chunk (typically in the last chunk).
179- if chunk .UsageMetadata != nil {
180- tokenUsage = LLMTokenUsage {
181- InputTokens : uint32 (chunk .UsageMetadata .PromptTokenCount ), //nolint:gosec
182- OutputTokens : uint32 (chunk .UsageMetadata .CandidatesTokenCount ), //nolint:gosec
183- TotalTokens : uint32 (chunk .UsageMetadata .TotalTokenCount ), //nolint:gosec
184- CachedInputTokens : uint32 (chunk .UsageMetadata .CachedContentTokenCount ), //nolint:gosec
185- }
186- }
187-
188178 // Serialize to SSE format as expected by OpenAI API.
189179 var chunkBytes []byte
190180 chunkBytes , err = json .Marshal (openAIChunk )
@@ -198,6 +188,40 @@ func (o *openAIToGCPVertexAITranslatorV1ChatCompletion) handleStreamingResponse(
198188 if span != nil {
199189 span .RecordResponseChunk (openAIChunk )
200190 }
191+
192+ // Extract token usage only in the last chunk.
193+ if chunk .UsageMetadata != nil && chunk .UsageMetadata .PromptTokenCount > 0 {
194+ // Convert usage to pointer if available.
195+ usage := ptr .To (geminiUsageToOpenAIUsage (chunk .UsageMetadata ))
196+
197+ usageChunk := openai.ChatCompletionResponseChunk {
198+ Object : "chat.completion.chunk" ,
199+ Choices : []openai.ChatCompletionResponseChunkChoice {},
200+ // usage is nil for all chunks other than the last chunk
201+ Usage : usage ,
202+ }
203+
204+ // Serialize to SSE format as expected by OpenAI API.
205+ var chunkBytes []byte
206+ chunkBytes , err = json .Marshal (usageChunk )
207+ if err != nil {
208+ return nil , nil , LLMTokenUsage {}, "" , fmt .Errorf ("error marshaling OpenAI chunk: %w" , err )
209+ }
210+ sseChunkBuf .WriteString ("data: " )
211+ sseChunkBuf .Write (chunkBytes )
212+ sseChunkBuf .WriteString ("\n \n " )
213+
214+ if span != nil {
215+ span .RecordResponseChunk (openAIChunk )
216+ }
217+
218+ tokenUsage = LLMTokenUsage {
219+ InputTokens : uint32 (chunk .UsageMetadata .PromptTokenCount ), //nolint:gosec
220+ OutputTokens : uint32 (chunk .UsageMetadata .CandidatesTokenCount ), //nolint:gosec
221+ TotalTokens : uint32 (chunk .UsageMetadata .TotalTokenCount ), //nolint:gosec
222+ CachedInputTokens : uint32 (chunk .UsageMetadata .CachedContentTokenCount ), //nolint:gosec
223+ }
224+ }
201225 }
202226 mut := & extprocv3.BodyMutation_Body {
203227 Body : sseChunkBuf .Bytes (),
@@ -251,16 +275,11 @@ func (o *openAIToGCPVertexAITranslatorV1ChatCompletion) convertGCPChunkToOpenAI(
251275 choices = []openai.ChatCompletionResponseChunkChoice {}
252276 }
253277
254- // Convert usage to pointer if available.
255- var usage * openai.Usage
256- if chunk .UsageMetadata != nil {
257- usage = ptr .To (geminiUsageToOpenAIUsage (chunk .UsageMetadata ))
258- }
259-
260278 return & openai.ChatCompletionResponseChunk {
261279 Object : "chat.completion.chunk" ,
262280 Choices : choices ,
263- Usage : usage ,
281+ // usage is nil for all chunks other than the last chunk
282+ Usage : nil ,
264283 }
265284}
266285
0 commit comments