From 1eb5e759724a10fea90a2f8e9ab7c292e7287191 Mon Sep 17 00:00:00 2001 From: Devon Rifkin Date: Thu, 11 Dec 2025 15:37:10 -0800 Subject: [PATCH] openai: add v1/responses support (#13351) Only supporting the stateless part of the API. Doc updates to come once this is shipped. Closes: #9659 --- middleware/openai.go | 108 +++ openai/openai.go | 53 +- openai/responses.go | 1004 +++++++++++++++++++++++++ openai/responses_test.go | 1543 ++++++++++++++++++++++++++++++++++++++ server/routes.go | 2 + 5 files changed, 2688 insertions(+), 22 deletions(-) create mode 100644 openai/responses.go create mode 100644 openai/responses_test.go diff --git a/middleware/openai.go b/middleware/openai.go index b2e43f165..5e526416e 100644 --- a/middleware/openai.go +++ b/middleware/openai.go @@ -433,3 +433,111 @@ func ChatMiddleware() gin.HandlerFunc { c.Next() } } + +type ResponsesWriter struct { + BaseWriter + converter *openai.ResponsesStreamConverter + model string + stream bool + responseID string + itemID string +} + +func (w *ResponsesWriter) writeEvent(eventType string, data any) error { + d, err := json.Marshal(data) + if err != nil { + return err + } + _, err = w.ResponseWriter.Write([]byte(fmt.Sprintf("event: %s\ndata: %s\n\n", eventType, d))) + if err != nil { + return err + } + if f, ok := w.ResponseWriter.(http.Flusher); ok { + f.Flush() + } + return nil +} + +func (w *ResponsesWriter) writeResponse(data []byte) (int, error) { + var chatResponse api.ChatResponse + if err := json.Unmarshal(data, &chatResponse); err != nil { + return 0, err + } + + if w.stream { + w.ResponseWriter.Header().Set("Content-Type", "text/event-stream") + + events := w.converter.Process(chatResponse) + for _, event := range events { + if err := w.writeEvent(event.Event, event.Data); err != nil { + return 0, err + } + } + return len(data), nil + } + + // Non-streaming response + w.ResponseWriter.Header().Set("Content-Type", "application/json") + response := openai.ToResponse(w.model, w.responseID, w.itemID, chatResponse) + return len(data), json.NewEncoder(w.ResponseWriter).Encode(response) +} + +func (w *ResponsesWriter) Write(data []byte) (int, error) { + code := w.ResponseWriter.Status() + if code != http.StatusOK { + return w.writeError(data) + } + return w.writeResponse(data) +} + +func ResponsesMiddleware() gin.HandlerFunc { + return func(c *gin.Context) { + var req openai.ResponsesRequest + if err := c.ShouldBindJSON(&req); err != nil { + c.AbortWithStatusJSON(http.StatusBadRequest, openai.NewError(http.StatusBadRequest, err.Error())) + return + } + + chatReq, err := openai.FromResponsesRequest(req) + if err != nil { + c.AbortWithStatusJSON(http.StatusBadRequest, openai.NewError(http.StatusBadRequest, err.Error())) + return + } + + // Check if client requested streaming (defaults to false) + streamRequested := req.Stream != nil && *req.Stream + + // Pass streaming preference to the underlying chat request + chatReq.Stream = &streamRequested + + var b bytes.Buffer + if err := json.NewEncoder(&b).Encode(chatReq); err != nil { + c.AbortWithStatusJSON(http.StatusInternalServerError, openai.NewError(http.StatusInternalServerError, err.Error())) + return + } + + c.Request.Body = io.NopCloser(&b) + + responseID := fmt.Sprintf("resp_%d", rand.Intn(999999)) + itemID := fmt.Sprintf("msg_%d", rand.Intn(999999)) + + w := &ResponsesWriter{ + BaseWriter: BaseWriter{ResponseWriter: c.Writer}, + converter: openai.NewResponsesStreamConverter(responseID, itemID, req.Model), + model: req.Model, + stream: streamRequested, + responseID: 
responseID, + itemID: itemID, + } + + // Set headers based on streaming mode + if streamRequested { + c.Writer.Header().Set("Content-Type", "text/event-stream") + c.Writer.Header().Set("Cache-Control", "no-cache") + c.Writer.Header().Set("Connection", "keep-alive") + } + + c.Writer = w + c.Next() + } +} diff --git a/openai/openai.go b/openai/openai.go index 4713d481b..9dcba3000 100644 --- a/openai/openai.go +++ b/openai/openai.go @@ -487,29 +487,9 @@ func FromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) { } } - types := []string{"jpeg", "jpg", "png", "webp"} - valid := false - // support blank mime type to match api/chat taking just unadorned base64 - if strings.HasPrefix(url, "data:;base64,") { - url = strings.TrimPrefix(url, "data:;base64,") - valid = true - } - for _, t := range types { - prefix := "data:image/" + t + ";base64," - if strings.HasPrefix(url, prefix) { - url = strings.TrimPrefix(url, prefix) - valid = true - break - } - } - - if !valid { - return nil, errors.New("invalid image input") - } - - img, err := base64.StdEncoding.DecodeString(url) + img, err := decodeImageURL(url) if err != nil { - return nil, errors.New("invalid message format") + return nil, err } messages = append(messages, api.Message{Role: msg.Role, Images: []api.ImageData{img}}) @@ -648,6 +628,35 @@ func nameFromToolCallID(messages []Message, toolCallID string) string { return "" } +// decodeImageURL decodes a base64 data URI into raw image bytes. +func decodeImageURL(url string) (api.ImageData, error) { + types := []string{"jpeg", "jpg", "png", "webp"} + + // Support blank mime type to match /api/chat's behavior of taking just unadorned base64 + if strings.HasPrefix(url, "data:;base64,") { + url = strings.TrimPrefix(url, "data:;base64,") + } else { + valid := false + for _, t := range types { + prefix := "data:image/" + t + ";base64," + if strings.HasPrefix(url, prefix) { + url = strings.TrimPrefix(url, prefix) + valid = true + break + } + } + if !valid { + return nil, errors.New("invalid image input") + } + } + + img, err := base64.StdEncoding.DecodeString(url) + if err != nil { + return nil, errors.New("invalid image input") + } + return img, nil +} + // FromCompletionToolCall converts OpenAI ToolCall format to api.ToolCall func FromCompletionToolCall(toolCalls []ToolCall) ([]api.ToolCall, error) { apiToolCalls := make([]api.ToolCall, len(toolCalls)) diff --git a/openai/responses.go b/openai/responses.go new file mode 100644 index 000000000..8f6b1d94c --- /dev/null +++ b/openai/responses.go @@ -0,0 +1,1004 @@ +package openai + +import ( + "encoding/json" + "fmt" + "math/rand" + + "github.com/ollama/ollama/api" +) + +// ResponsesContent is a discriminated union for input content types. +// Concrete types: ResponsesTextContent, ResponsesImageContent +type ResponsesContent interface { + responsesContent() // unexported marker method +} + +type ResponsesTextContent struct { + Type string `json:"type"` // always "input_text" + Text string `json:"text"` +} + +func (ResponsesTextContent) responsesContent() {} + +type ResponsesImageContent struct { + Type string `json:"type"` // always "input_image" + // TODO(drifkin): is this really required? 
that seems verbose and a default is specified in the docs + Detail string `json:"detail"` // required + FileID string `json:"file_id,omitempty"` // optional + ImageURL string `json:"image_url,omitempty"` // optional +} + +func (ResponsesImageContent) responsesContent() {} + +// ResponsesOutputTextContent represents output text from a previous assistant response +// that is being passed back as part of the conversation history. +type ResponsesOutputTextContent struct { + Type string `json:"type"` // always "output_text" + Text string `json:"text"` +} + +func (ResponsesOutputTextContent) responsesContent() {} + +type ResponsesInputMessage struct { + Type string `json:"type"` // always "message" + Role string `json:"role"` // one of `user`, `system`, `developer` + Content []ResponsesContent `json:"content,omitempty"` +} + +func (m *ResponsesInputMessage) UnmarshalJSON(data []byte) error { + var aux struct { + Type string `json:"type"` + Role string `json:"role"` + Content json.RawMessage `json:"content"` + } + + if err := json.Unmarshal(data, &aux); err != nil { + return err + } + + m.Type = aux.Type + m.Role = aux.Role + + if len(aux.Content) == 0 { + return nil + } + + // Try to parse content as a string first (shorthand format) + var contentStr string + if err := json.Unmarshal(aux.Content, &contentStr); err == nil { + m.Content = []ResponsesContent{ + ResponsesTextContent{Type: "input_text", Text: contentStr}, + } + return nil + } + + // Otherwise, parse as an array of content items + var rawItems []json.RawMessage + if err := json.Unmarshal(aux.Content, &rawItems); err != nil { + return fmt.Errorf("content must be a string or array: %w", err) + } + + m.Content = make([]ResponsesContent, 0, len(rawItems)) + for i, raw := range rawItems { + // Peek at the type field to determine which concrete type to use + var typeField struct { + Type string `json:"type"` + } + if err := json.Unmarshal(raw, &typeField); err != nil { + return fmt.Errorf("content[%d]: %w", i, err) + } + + switch typeField.Type { + case "input_text": + var content ResponsesTextContent + if err := json.Unmarshal(raw, &content); err != nil { + return fmt.Errorf("content[%d]: %w", i, err) + } + m.Content = append(m.Content, content) + case "input_image": + var content ResponsesImageContent + if err := json.Unmarshal(raw, &content); err != nil { + return fmt.Errorf("content[%d]: %w", i, err) + } + m.Content = append(m.Content, content) + case "output_text": + var content ResponsesOutputTextContent + if err := json.Unmarshal(raw, &content); err != nil { + return fmt.Errorf("content[%d]: %w", i, err) + } + m.Content = append(m.Content, content) + default: + return fmt.Errorf("content[%d]: unknown content type: %s", i, typeField.Type) + } + } + + return nil +} + +type ResponsesOutputMessage struct{} + +// ResponsesInputItem is a discriminated union for input items. +// Concrete types: ResponsesInputMessage (more to come) +type ResponsesInputItem interface { + responsesInputItem() // unexported marker method +} + +func (ResponsesInputMessage) responsesInputItem() {} + +// ResponsesFunctionCall represents an assistant's function call in conversation history. 
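+// An illustrative wire-format example (the same shape exercised in
+// responses_test.go):
+//
+//	{"type": "function_call", "call_id": "call_abc123", "name": "get_weather", "arguments": "{\"city\":\"Paris\"}"}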
+type ResponsesFunctionCall struct { + ID string `json:"id,omitempty"` // item ID + Type string `json:"type"` // always "function_call" + CallID string `json:"call_id"` // the tool call ID + Name string `json:"name"` // function name + Arguments string `json:"arguments"` // JSON arguments string +} + +func (ResponsesFunctionCall) responsesInputItem() {} + +// ResponsesFunctionCallOutput represents a function call result from the client. +type ResponsesFunctionCallOutput struct { + Type string `json:"type"` // always "function_call_output" + CallID string `json:"call_id"` // links to the original function call + Output string `json:"output"` // the function result +} + +func (ResponsesFunctionCallOutput) responsesInputItem() {} + +// ResponsesReasoningInput represents a reasoning item passed back as input. +// This is used when the client sends previous reasoning back for context. +type ResponsesReasoningInput struct { + ID string `json:"id,omitempty"` + Type string `json:"type"` // always "reasoning" + Summary []ResponsesReasoningSummary `json:"summary,omitempty"` + EncryptedContent string `json:"encrypted_content,omitempty"` +} + +func (ResponsesReasoningInput) responsesInputItem() {} + +// unmarshalResponsesInputItem unmarshals a single input item from JSON. +func unmarshalResponsesInputItem(data []byte) (ResponsesInputItem, error) { + var typeField struct { + Type string `json:"type"` + Role string `json:"role"` + } + if err := json.Unmarshal(data, &typeField); err != nil { + return nil, err + } + + // Handle shorthand message format: {"role": "...", "content": "..."} + // When type is empty but role is present, treat as a message + itemType := typeField.Type + if itemType == "" && typeField.Role != "" { + itemType = "message" + } + + switch itemType { + case "message": + var msg ResponsesInputMessage + if err := json.Unmarshal(data, &msg); err != nil { + return nil, err + } + return msg, nil + case "function_call": + var fc ResponsesFunctionCall + if err := json.Unmarshal(data, &fc); err != nil { + return nil, err + } + return fc, nil + case "function_call_output": + var output ResponsesFunctionCallOutput + if err := json.Unmarshal(data, &output); err != nil { + return nil, err + } + return output, nil + case "reasoning": + var reasoning ResponsesReasoningInput + if err := json.Unmarshal(data, &reasoning); err != nil { + return nil, err + } + return reasoning, nil + default: + return nil, fmt.Errorf("unknown input item type: %s", typeField.Type) + } +} + +// ResponsesInput can be either: +// - a string (equivalent to a text input with the user role) +// - an array of input items (see ResponsesInputItem) +type ResponsesInput struct { + Text string // set if input was a plain string + Items []ResponsesInputItem // set if input was an array +} + +func (r *ResponsesInput) UnmarshalJSON(data []byte) error { + // Try string first + var s string + if err := json.Unmarshal(data, &s); err == nil { + r.Text = s + return nil + } + + // Otherwise, try array of input items + var rawItems []json.RawMessage + if err := json.Unmarshal(data, &rawItems); err != nil { + return fmt.Errorf("input must be a string or array: %w", err) + } + + r.Items = make([]ResponsesInputItem, 0, len(rawItems)) + for i, raw := range rawItems { + item, err := unmarshalResponsesInputItem(raw) + if err != nil { + return fmt.Errorf("input[%d]: %w", i, err) + } + r.Items = append(r.Items, item) + } + + return nil +} + +type ResponsesReasoning struct { + // originally: optional, default is per-model + Effort string 
`json:"effort,omitempty"` + + // originally: deprecated, use `summary` instead. One of `auto`, `concise`, `detailed` + GenerateSummary string `json:"generate_summary,omitempty"` + + // originally: optional, one of `auto`, `concise`, `detailed` + Summary string `json:"summary,omitempty"` +} + +type ResponsesTextFormat struct { + Type string `json:"type"` // "text", "json_schema" + Name string `json:"name,omitempty"` // for json_schema + Schema json.RawMessage `json:"schema,omitempty"` // for json_schema + Strict *bool `json:"strict,omitempty"` // for json_schema +} + +type ResponsesText struct { + Format *ResponsesTextFormat `json:"format,omitempty"` +} + +// ResponsesTool represents a tool in the Responses API format. +// Note: This differs from api.Tool which nests fields under "function". +type ResponsesTool struct { + Type string `json:"type"` // "function" + Name string `json:"name"` + Description string `json:"description,omitempty"` + Strict bool `json:"strict,omitempty"` + Parameters map[string]any `json:"parameters,omitempty"` +} + +type ResponsesRequest struct { + Model string `json:"model"` + + // originally: optional, default is false + // for us: not supported + Background bool `json:"background"` + + // originally: optional `string | {id: string}` + // for us: not supported + Conversation json.RawMessage `json:"conversation"` + + // originally: string[] + // for us: ignored + Include []string `json:"include"` + + Input ResponsesInput `json:"input"` + + // optional, inserts a system message at the start of the conversation + Instructions string `json:"instructions,omitempty"` + + // optional, maps to num_predict + MaxOutputTokens *int `json:"max_output_tokens,omitempty"` + + Reasoning ResponsesReasoning `json:"reasoning"` + + // optional, default is 1.0 + Temperature *float64 `json:"temperature"` + + // optional, controls output format (e.g. json_schema) + Text *ResponsesText `json:"text,omitempty"` + + // optional, default is 1.0 + TopP *float64 `json:"top_p"` + + // optional, default is `"disabled"` + Truncation *string `json:"truncation"` + + Tools []ResponsesTool `json:"tools,omitempty"` + + // TODO(drifkin): tool_choice is not supported. We could support "none" by not + // passing tools, but the other controls like `"required"` cannot be generally + // supported. 
+ + // optional, default is false + Stream *bool `json:"stream,omitempty"` +} + +// FromResponsesRequest converts a ResponsesRequest to api.ChatRequest +func FromResponsesRequest(r ResponsesRequest) (*api.ChatRequest, error) { + var messages []api.Message + + // Add instructions as system message if present + if r.Instructions != "" { + messages = append(messages, api.Message{ + Role: "system", + Content: r.Instructions, + }) + } + + // Handle simple string input + if r.Input.Text != "" { + messages = append(messages, api.Message{ + Role: "user", + Content: r.Input.Text, + }) + } + + // Handle array of input items + // Track pending reasoning to merge with the next assistant message + var pendingThinking string + + for _, item := range r.Input.Items { + switch v := item.(type) { + case ResponsesReasoningInput: + // Store thinking to merge with the next assistant message + pendingThinking = v.EncryptedContent + case ResponsesInputMessage: + msg, err := convertInputMessage(v) + if err != nil { + return nil, err + } + // If this is an assistant message, attach pending thinking + if msg.Role == "assistant" && pendingThinking != "" { + msg.Thinking = pendingThinking + pendingThinking = "" + } + messages = append(messages, msg) + case ResponsesFunctionCall: + // Convert function call to assistant message with tool calls + var args api.ToolCallFunctionArguments + if v.Arguments != "" { + if err := json.Unmarshal([]byte(v.Arguments), &args); err != nil { + return nil, fmt.Errorf("failed to parse function call arguments: %w", err) + } + } + msg := api.Message{ + Role: "assistant", + ToolCalls: []api.ToolCall{{ + ID: v.CallID, + Function: api.ToolCallFunction{ + Name: v.Name, + Arguments: args, + }, + }}, + } + // Attach pending thinking + if pendingThinking != "" { + msg.Thinking = pendingThinking + pendingThinking = "" + } + messages = append(messages, msg) + case ResponsesFunctionCallOutput: + messages = append(messages, api.Message{ + Role: "tool", + Content: v.Output, + ToolCallID: v.CallID, + }) + } + } + + // If there's trailing reasoning without a following message, emit it + if pendingThinking != "" { + messages = append(messages, api.Message{ + Role: "assistant", + Thinking: pendingThinking, + }) + } + + options := make(map[string]any) + + if r.Temperature != nil { + options["temperature"] = *r.Temperature + } else { + options["temperature"] = 1.0 + } + + if r.TopP != nil { + options["top_p"] = *r.TopP + } else { //nolint:staticcheck // SA9003: empty branch + // TODO(drifkin): OpenAI defaults to 1.0 here, but we don't follow that here + // in case the model has a different default. It would be best if we + // understood whether there was a model-specific default and if not, we + // should also default to 1.0, but that will require some additional + // plumbing + } + + if r.MaxOutputTokens != nil { + options["num_predict"] = *r.MaxOutputTokens + } + + // Convert tools from Responses API format to api.Tool format + var tools []api.Tool + for _, t := range r.Tools { + tool, err := convertTool(t) + if err != nil { + return nil, err + } + tools = append(tools, tool) + } + + // Handle text format (e.g. 
json_schema)
+	var format json.RawMessage
+	if r.Text != nil && r.Text.Format != nil {
+		switch r.Text.Format.Type {
+		case "json_schema":
+			if r.Text.Format.Schema != nil {
+				format = r.Text.Format.Schema
+			}
+		}
+	}
+
+	return &api.ChatRequest{
+		Model:    r.Model,
+		Messages: messages,
+		Options:  options,
+		Tools:    tools,
+		Format:   format,
+	}, nil
+}
+
+func convertTool(t ResponsesTool) (api.Tool, error) {
+	// Convert parameters from map[string]any to api.ToolFunctionParameters
+	var params api.ToolFunctionParameters
+	if t.Parameters != nil {
+		// Marshal and unmarshal to convert
+		b, err := json.Marshal(t.Parameters)
+		if err != nil {
+			return api.Tool{}, fmt.Errorf("failed to marshal tool parameters: %w", err)
+		}
+		if err := json.Unmarshal(b, &params); err != nil {
+			return api.Tool{}, fmt.Errorf("failed to unmarshal tool parameters: %w", err)
+		}
+	}
+
+	return api.Tool{
+		Type: t.Type,
+		Function: api.ToolFunction{
+			Name:        t.Name,
+			Description: t.Description,
+			Parameters:  params,
+		},
+	}, nil
+}
+
+func convertInputMessage(m ResponsesInputMessage) (api.Message, error) {
+	var content string
+	var images []api.ImageData
+
+	for _, c := range m.Content {
+		switch v := c.(type) {
+		case ResponsesTextContent:
+			content += v.Text
+		case ResponsesOutputTextContent:
+			content += v.Text
+		case ResponsesImageContent:
+			if v.ImageURL == "" {
+				continue // Skip if no URL (FileID not supported)
+			}
+			img, err := decodeImageURL(v.ImageURL)
+			if err != nil {
+				return api.Message{}, err
+			}
+			images = append(images, img)
+		}
+	}
+
+	return api.Message{
+		Role:    m.Role,
+		Content: content,
+		Images:  images,
+	}, nil
+}
+
+// Response types for the Responses API
+
+type ResponsesResponse struct {
+	ID        string                `json:"id"`
+	Object    string                `json:"object"`
+	CreatedAt int64                 `json:"created_at"`
+	Status    string                `json:"status"`
+	Model     string                `json:"model"`
+	Output    []ResponsesOutputItem `json:"output"`
+	Usage     *ResponsesUsage       `json:"usage,omitempty"`
+	// TODO(drifkin): add `temperature` and `top_p` to the response, but this
+	// requires additional plumbing to find the effective values since the
+	// defaults can come from the model or the request
+}
+
+type ResponsesOutputItem struct {
+	ID        string                   `json:"id"`
+	Type      string                   `json:"type"` // "message", "function_call", or "reasoning"
+	Status    string                   `json:"status,omitempty"`
+	Role      string                   `json:"role,omitempty"`      // for message
+	Content   []ResponsesOutputContent `json:"content,omitempty"`   // for message
+	CallID    string                   `json:"call_id,omitempty"`   // for function_call
+	Name      string                   `json:"name,omitempty"`      // for function_call
+	Arguments string                   `json:"arguments,omitempty"` // for function_call
+
+	// Reasoning fields
+	Summary          []ResponsesReasoningSummary `json:"summary,omitempty"`           // for reasoning
+	EncryptedContent string                      `json:"encrypted_content,omitempty"` // for reasoning
+}
+
+type ResponsesReasoningSummary struct {
+	Type string `json:"type"` // "summary_text"
+	Text string `json:"text"`
+}
+
+type ResponsesOutputContent struct {
+	Type string `json:"type"` // "output_text"
+	Text string `json:"text"`
+}
+
+type ResponsesUsage struct {
+	InputTokens  int `json:"input_tokens"`
+	OutputTokens int `json:"output_tokens"`
+	TotalTokens  int `json:"total_tokens"`
+}
+
+// ToResponse converts an api.ChatResponse to a Responses API response
+func ToResponse(model, responseID, itemID string, chatResponse api.ChatResponse) ResponsesResponse {
+	var output []ResponsesOutputItem
+
+	// Add reasoning item if thinking is present
+	if chatResponse.Message.Thinking != "" {
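+		// Thinking is emitted both as a human-readable summary and as
+		// encrypted_content; FromResponsesRequest reads encrypted_content back
+		// into Message.Thinking when a client replays the conversation.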
+ output = append(output, ResponsesOutputItem{ + ID: fmt.Sprintf("rs_%s", responseID), + Type: "reasoning", + Summary: []ResponsesReasoningSummary{ + { + Type: "summary_text", + Text: chatResponse.Message.Thinking, + }, + }, + EncryptedContent: chatResponse.Message.Thinking, // Plain text for now + }) + } + + if len(chatResponse.Message.ToolCalls) > 0 { + toolCalls := ToToolCalls(chatResponse.Message.ToolCalls) + for i, tc := range toolCalls { + output = append(output, ResponsesOutputItem{ + ID: fmt.Sprintf("fc_%s_%d", responseID, i), + Type: "function_call", + CallID: tc.ID, + Name: tc.Function.Name, + Arguments: tc.Function.Arguments, + }) + } + } else { + output = append(output, ResponsesOutputItem{ + ID: itemID, + Type: "message", + Status: "completed", + Role: "assistant", + Content: []ResponsesOutputContent{ + { + Type: "output_text", + Text: chatResponse.Message.Content, + }, + }, + }) + } + + return ResponsesResponse{ + ID: responseID, + Object: "response", + CreatedAt: chatResponse.CreatedAt.Unix(), + Status: "completed", + Model: model, + Output: output, + Usage: &ResponsesUsage{ + InputTokens: chatResponse.PromptEvalCount, + OutputTokens: chatResponse.EvalCount, + TotalTokens: chatResponse.PromptEvalCount + chatResponse.EvalCount, + }, + } +} + +// Streaming events: + +// ResponsesStreamEvent represents a single Server-Sent Event for the Responses API. +type ResponsesStreamEvent struct { + Event string // The event type (e.g., "response.created") + Data any // The event payload (will be JSON-marshaled) +} + +// ResponsesStreamConverter converts api.ChatResponse objects to Responses API +// streaming events. It maintains state across multiple calls to handle the +// streaming event sequence correctly. +type ResponsesStreamConverter struct { + // Configuration (immutable after creation) + responseID string + itemID string + model string + + // State tracking (mutated across Process calls) + firstWrite bool + outputIndex int + contentIndex int + contentStarted bool + toolCallsSent bool + accumulatedText string + sequenceNumber int + + // Reasoning/thinking state + accumulatedThinking string + reasoningItemID string + reasoningStarted bool + reasoningDone bool + + // Tool calls state (for final output) + toolCallItems []map[string]any +} + +// newEvent creates a ResponsesStreamEvent with the sequence number included in the data. +func (c *ResponsesStreamConverter) newEvent(eventType string, data map[string]any) ResponsesStreamEvent { + data["type"] = eventType + data["sequence_number"] = c.sequenceNumber + c.sequenceNumber++ + return ResponsesStreamEvent{ + Event: eventType, + Data: data, + } +} + +// NewResponsesStreamConverter creates a new converter with the given configuration. +func NewResponsesStreamConverter(responseID, itemID, model string) *ResponsesStreamConverter { + return &ResponsesStreamConverter{ + responseID: responseID, + itemID: itemID, + model: model, + firstWrite: true, + } +} + +// Process takes a ChatResponse and returns the events that should be emitted. +// Events are returned in order. The caller is responsible for serializing +// and sending these events. 
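+//
+// A minimal usage sketch (writeSSE stands in for the caller's serializer;
+// ResponsesWriter.writeEvent in middleware/openai.go plays this role):
+//
+//	conv := NewResponsesStreamConverter("resp_123", "msg_456", "gpt-oss:20b")
+//	for _, ev := range conv.Process(chunk) {
+//		writeSSE(ev.Event, ev.Data)
+//	}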
+func (c *ResponsesStreamConverter) Process(r api.ChatResponse) []ResponsesStreamEvent { + var events []ResponsesStreamEvent + + hasToolCalls := len(r.Message.ToolCalls) > 0 + hasThinking := r.Message.Thinking != "" + + // First chunk - emit initial events + if c.firstWrite { + c.firstWrite = false + events = append(events, c.createResponseCreatedEvent()) + events = append(events, c.createResponseInProgressEvent()) + } + + // Handle reasoning/thinking (before other content) + if hasThinking { + events = append(events, c.processThinking(r.Message.Thinking)...) + } + + // Handle tool calls + if hasToolCalls { + events = append(events, c.processToolCalls(r.Message.ToolCalls)...) + c.toolCallsSent = true + } + + // Handle text content (only if no tool calls) + if !hasToolCalls && !c.toolCallsSent && r.Message.Content != "" { + events = append(events, c.processTextContent(r.Message.Content)...) + } + + // Done - emit closing events + if r.Done { + events = append(events, c.processCompletion(r)...) + } + + return events +} + +func (c *ResponsesStreamConverter) createResponseCreatedEvent() ResponsesStreamEvent { + return c.newEvent("response.created", map[string]any{ + "response": map[string]any{ + "id": c.responseID, + "object": "response", + "status": "in_progress", + "output": []any{}, + }, + }) +} + +func (c *ResponsesStreamConverter) createResponseInProgressEvent() ResponsesStreamEvent { + return c.newEvent("response.in_progress", map[string]any{ + "response": map[string]any{ + "id": c.responseID, + "object": "response", + "status": "in_progress", + "output": []any{}, + }, + }) +} + +func (c *ResponsesStreamConverter) processThinking(thinking string) []ResponsesStreamEvent { + var events []ResponsesStreamEvent + + // Start reasoning item if not started + if !c.reasoningStarted { + c.reasoningStarted = true + c.reasoningItemID = fmt.Sprintf("rs_%d", rand.Intn(999999)) + + events = append(events, c.newEvent("response.output_item.added", map[string]any{ + "output_index": c.outputIndex, + "item": map[string]any{ + "id": c.reasoningItemID, + "type": "reasoning", + "summary": []any{}, + }, + })) + } + + // Accumulate thinking + c.accumulatedThinking += thinking + + // Emit delta + events = append(events, c.newEvent("response.reasoning_summary_text.delta", map[string]any{ + "item_id": c.reasoningItemID, + "output_index": c.outputIndex, + "delta": thinking, + })) + + // TODO(drifkin): consider adding + // [`response.reasoning_text.delta`](https://platform.openai.com/docs/api-reference/responses-streaming/response/reasoning_text/delta), + // but need to do additional research to understand how it's used and how + // widely supported it is + + return events +} + +func (c *ResponsesStreamConverter) finishReasoning() []ResponsesStreamEvent { + if !c.reasoningStarted || c.reasoningDone { + return nil + } + c.reasoningDone = true + + events := []ResponsesStreamEvent{ + c.newEvent("response.reasoning_summary_text.done", map[string]any{ + "item_id": c.reasoningItemID, + "output_index": c.outputIndex, + "text": c.accumulatedThinking, + }), + c.newEvent("response.output_item.done", map[string]any{ + "output_index": c.outputIndex, + "item": map[string]any{ + "id": c.reasoningItemID, + "type": "reasoning", + "summary": []map[string]any{{"type": "summary_text", "text": c.accumulatedThinking}}, + "encrypted_content": c.accumulatedThinking, // Plain text for now + }, + }), + } + + c.outputIndex++ + return events +} + +func (c *ResponsesStreamConverter) processToolCalls(toolCalls []api.ToolCall) 
[]ResponsesStreamEvent { + var events []ResponsesStreamEvent + + // Finish reasoning first if it was started + events = append(events, c.finishReasoning()...) + + converted := ToToolCalls(toolCalls) + + for i, tc := range converted { + fcItemID := fmt.Sprintf("fc_%d_%d", rand.Intn(999999), i) + + // Store for final output (with status: completed) + toolCallItem := map[string]any{ + "id": fcItemID, + "type": "function_call", + "status": "completed", + "call_id": tc.ID, + "name": tc.Function.Name, + "arguments": tc.Function.Arguments, + } + c.toolCallItems = append(c.toolCallItems, toolCallItem) + + // response.output_item.added for function call + events = append(events, c.newEvent("response.output_item.added", map[string]any{ + "output_index": c.outputIndex + i, + "item": map[string]any{ + "id": fcItemID, + "type": "function_call", + "status": "in_progress", + "call_id": tc.ID, + "name": tc.Function.Name, + "arguments": "", + }, + })) + + // response.function_call_arguments.delta + if tc.Function.Arguments != "" { + events = append(events, c.newEvent("response.function_call_arguments.delta", map[string]any{ + "item_id": fcItemID, + "output_index": c.outputIndex + i, + "delta": tc.Function.Arguments, + })) + } + + // response.function_call_arguments.done + events = append(events, c.newEvent("response.function_call_arguments.done", map[string]any{ + "item_id": fcItemID, + "output_index": c.outputIndex + i, + "arguments": tc.Function.Arguments, + })) + + // response.output_item.done for function call + events = append(events, c.newEvent("response.output_item.done", map[string]any{ + "output_index": c.outputIndex + i, + "item": map[string]any{ + "id": fcItemID, + "type": "function_call", + "status": "completed", + "call_id": tc.ID, + "name": tc.Function.Name, + "arguments": tc.Function.Arguments, + }, + })) + } + + return events +} + +func (c *ResponsesStreamConverter) processTextContent(content string) []ResponsesStreamEvent { + var events []ResponsesStreamEvent + + // Finish reasoning first if it was started + events = append(events, c.finishReasoning()...) 
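+	// (finishReasoning advances c.outputIndex, so the message item emitted
+	// below lands at the slot after any reasoning item.)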
+ + // Emit output item and content part for first text content + if !c.contentStarted { + c.contentStarted = true + + // response.output_item.added + events = append(events, c.newEvent("response.output_item.added", map[string]any{ + "output_index": c.outputIndex, + "item": map[string]any{ + "id": c.itemID, + "type": "message", + "status": "in_progress", + "role": "assistant", + "content": []any{}, + }, + })) + + // response.content_part.added + events = append(events, c.newEvent("response.content_part.added", map[string]any{ + "item_id": c.itemID, + "output_index": c.outputIndex, + "content_index": c.contentIndex, + "part": map[string]any{ + "type": "output_text", + "text": "", + }, + })) + } + + // Accumulate text + c.accumulatedText += content + + // Emit content delta + events = append(events, c.newEvent("response.output_text.delta", map[string]any{ + "item_id": c.itemID, + "output_index": c.outputIndex, + "content_index": 0, + "delta": content, + })) + + return events +} + +func (c *ResponsesStreamConverter) buildFinalOutput() []any { + var output []any + + // Add reasoning item if present + if c.reasoningStarted { + output = append(output, map[string]any{ + "id": c.reasoningItemID, + "type": "reasoning", + "summary": []map[string]any{{"type": "summary_text", "text": c.accumulatedThinking}}, + "encrypted_content": c.accumulatedThinking, + }) + } + + // Add tool calls if present + if len(c.toolCallItems) > 0 { + for _, item := range c.toolCallItems { + output = append(output, item) + } + } else if c.contentStarted { + // Add message item if we had text content + output = append(output, map[string]any{ + "id": c.itemID, + "type": "message", + "status": "completed", + "role": "assistant", + "content": []map[string]any{{ + "type": "output_text", + "text": c.accumulatedText, + }}, + }) + } + + return output +} + +func (c *ResponsesStreamConverter) processCompletion(r api.ChatResponse) []ResponsesStreamEvent { + var events []ResponsesStreamEvent + + // Finish reasoning if not done + events = append(events, c.finishReasoning()...) 
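+	// Tool-call items already emitted their *.done events in processToolCalls,
+	// so only a trailing text message (if any) still needs to be closed here
+	// before buildFinalOutput assembles the response.completed payload.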
+ + // Emit text completion events if we had text content + if !c.toolCallsSent && c.contentStarted { + // response.output_text.done + events = append(events, c.newEvent("response.output_text.done", map[string]any{ + "item_id": c.itemID, + "output_index": c.outputIndex, + "content_index": 0, + "text": c.accumulatedText, + })) + + // response.content_part.done + events = append(events, c.newEvent("response.content_part.done", map[string]any{ + "item_id": c.itemID, + "output_index": c.outputIndex, + "content_index": 0, + "part": map[string]any{ + "type": "output_text", + "text": c.accumulatedText, + }, + })) + + // response.output_item.done + events = append(events, c.newEvent("response.output_item.done", map[string]any{ + "output_index": c.outputIndex, + "item": map[string]any{ + "id": c.itemID, + "type": "message", + "status": "completed", + "role": "assistant", + "content": []map[string]any{{ + "type": "output_text", + "text": c.accumulatedText, + }}, + }, + })) + } + + // response.completed + events = append(events, c.newEvent("response.completed", map[string]any{ + "response": map[string]any{ + "id": c.responseID, + "object": "response", + "status": "completed", + "output": c.buildFinalOutput(), + "usage": map[string]any{ + "input_tokens": r.PromptEvalCount, + "output_tokens": r.EvalCount, + "total_tokens": r.PromptEvalCount + r.EvalCount, + }, + }, + })) + + return events +} diff --git a/openai/responses_test.go b/openai/responses_test.go new file mode 100644 index 000000000..50fbfdc57 --- /dev/null +++ b/openai/responses_test.go @@ -0,0 +1,1543 @@ +package openai + +import ( + "encoding/json" + "testing" + "time" + + "github.com/ollama/ollama/api" +) + +func TestResponsesInputMessage_UnmarshalJSON(t *testing.T) { + tests := []struct { + name string + json string + want ResponsesInputMessage + wantErr bool + }{ + { + name: "text content", + json: `{"type": "message", "role": "user", "content": [{"type": "input_text", "text": "hello"}]}`, + want: ResponsesInputMessage{ + Type: "message", + Role: "user", + Content: []ResponsesContent{ResponsesTextContent{Type: "input_text", Text: "hello"}}, + }, + }, + { + name: "image content", + json: `{"type": "message", "role": "user", "content": [{"type": "input_image", "detail": "auto", "image_url": "https://example.com/img.png"}]}`, + want: ResponsesInputMessage{ + Type: "message", + Role: "user", + Content: []ResponsesContent{ResponsesImageContent{ + Type: "input_image", + Detail: "auto", + ImageURL: "https://example.com/img.png", + }}, + }, + }, + { + name: "multiple content items", + json: `{"type": "message", "role": "user", "content": [{"type": "input_text", "text": "hello"}, {"type": "input_text", "text": "world"}]}`, + want: ResponsesInputMessage{ + Type: "message", + Role: "user", + Content: []ResponsesContent{ + ResponsesTextContent{Type: "input_text", Text: "hello"}, + ResponsesTextContent{Type: "input_text", Text: "world"}, + }, + }, + }, + { + name: "unknown content type", + json: `{"type": "message", "role": "user", "content": [{"type": "unknown"}]}`, + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + var got ResponsesInputMessage + err := json.Unmarshal([]byte(tt.json), &got) + + if tt.wantErr { + if err == nil { + t.Error("expected error, got nil") + } + return + } + + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if got.Type != tt.want.Type { + t.Errorf("Type = %q, want %q", got.Type, tt.want.Type) + } + + if got.Role != tt.want.Role { + t.Errorf("Role = %q, want %q", 
got.Role, tt.want.Role) + } + + if len(got.Content) != len(tt.want.Content) { + t.Fatalf("len(Content) = %d, want %d", len(got.Content), len(tt.want.Content)) + } + + for i := range tt.want.Content { + switch wantContent := tt.want.Content[i].(type) { + case ResponsesTextContent: + gotContent, ok := got.Content[i].(ResponsesTextContent) + if !ok { + t.Fatalf("Content[%d] type = %T, want ResponsesTextContent", i, got.Content[i]) + } + if gotContent != wantContent { + t.Errorf("Content[%d] = %+v, want %+v", i, gotContent, wantContent) + } + case ResponsesImageContent: + gotContent, ok := got.Content[i].(ResponsesImageContent) + if !ok { + t.Fatalf("Content[%d] type = %T, want ResponsesImageContent", i, got.Content[i]) + } + if gotContent != wantContent { + t.Errorf("Content[%d] = %+v, want %+v", i, gotContent, wantContent) + } + } + } + }) + } +} + +func TestResponsesInput_UnmarshalJSON(t *testing.T) { + tests := []struct { + name string + json string + wantText string + wantItems int + wantErr bool + }{ + { + name: "plain string", + json: `"hello world"`, + wantText: "hello world", + }, + { + name: "array with one message", + json: `[{"type": "message", "role": "user", "content": [{"type": "input_text", "text": "hello"}]}]`, + wantItems: 1, + }, + { + name: "array with multiple messages", + json: `[{"type": "message", "role": "system", "content": [{"type": "input_text", "text": "you are helpful"}]}, {"type": "message", "role": "user", "content": [{"type": "input_text", "text": "hello"}]}]`, + wantItems: 2, + }, + { + name: "invalid input", + json: `123`, + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + var got ResponsesInput + err := json.Unmarshal([]byte(tt.json), &got) + + if tt.wantErr { + if err == nil { + t.Error("expected error, got nil") + } + return + } + + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if got.Text != tt.wantText { + t.Errorf("Text = %q, want %q", got.Text, tt.wantText) + } + + if len(got.Items) != tt.wantItems { + t.Errorf("len(Items) = %d, want %d", len(got.Items), tt.wantItems) + } + }) + } +} + +func TestUnmarshalResponsesInputItem(t *testing.T) { + t.Run("message item", func(t *testing.T) { + got, err := unmarshalResponsesInputItem([]byte(`{"type": "message", "role": "user", "content": [{"type": "input_text", "text": "hello"}]}`)) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + msg, ok := got.(ResponsesInputMessage) + if !ok { + t.Fatalf("got type %T, want ResponsesInputMessage", got) + } + + if msg.Role != "user" { + t.Errorf("Role = %q, want %q", msg.Role, "user") + } + }) + + t.Run("function_call item", func(t *testing.T) { + got, err := unmarshalResponsesInputItem([]byte(`{"type": "function_call", "call_id": "call_abc123", "name": "get_weather", "arguments": "{\"city\":\"Paris\"}"}`)) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + fc, ok := got.(ResponsesFunctionCall) + if !ok { + t.Fatalf("got type %T, want ResponsesFunctionCall", got) + } + + if fc.Type != "function_call" { + t.Errorf("Type = %q, want %q", fc.Type, "function_call") + } + if fc.CallID != "call_abc123" { + t.Errorf("CallID = %q, want %q", fc.CallID, "call_abc123") + } + if fc.Name != "get_weather" { + t.Errorf("Name = %q, want %q", fc.Name, "get_weather") + } + }) + + t.Run("function_call_output item", func(t *testing.T) { + got, err := unmarshalResponsesInputItem([]byte(`{"type": "function_call_output", "call_id": "call_abc123", "output": "the result"}`)) + if err != nil { + 
t.Fatalf("unexpected error: %v", err) + } + + output, ok := got.(ResponsesFunctionCallOutput) + if !ok { + t.Fatalf("got type %T, want ResponsesFunctionCallOutput", got) + } + + if output.Type != "function_call_output" { + t.Errorf("Type = %q, want %q", output.Type, "function_call_output") + } + if output.CallID != "call_abc123" { + t.Errorf("CallID = %q, want %q", output.CallID, "call_abc123") + } + if output.Output != "the result" { + t.Errorf("Output = %q, want %q", output.Output, "the result") + } + }) + + t.Run("unknown item type", func(t *testing.T) { + _, err := unmarshalResponsesInputItem([]byte(`{"type": "unknown_type"}`)) + if err == nil { + t.Error("expected error, got nil") + } + }) +} + +func TestResponsesRequest_UnmarshalJSON(t *testing.T) { + tests := []struct { + name string + json string + check func(t *testing.T, req ResponsesRequest) + wantErr bool + }{ + { + name: "simple string input", + json: `{"model": "gpt-oss:20b", "input": "hello"}`, + check: func(t *testing.T, req ResponsesRequest) { + if req.Model != "gpt-oss:20b" { + t.Errorf("Model = %q, want %q", req.Model, "gpt-oss:20b") + } + if req.Input.Text != "hello" { + t.Errorf("Input.Text = %q, want %q", req.Input.Text, "hello") + } + }, + }, + { + name: "array input with messages", + json: `{"model": "gpt-oss:20b", "input": [{"type": "message", "role": "user", "content": [{"type": "input_text", "text": "hello"}]}]}`, + check: func(t *testing.T, req ResponsesRequest) { + if len(req.Input.Items) != 1 { + t.Fatalf("len(Input.Items) = %d, want 1", len(req.Input.Items)) + } + msg, ok := req.Input.Items[0].(ResponsesInputMessage) + if !ok { + t.Fatalf("Input.Items[0] type = %T, want ResponsesInputMessage", req.Input.Items[0]) + } + if msg.Role != "user" { + t.Errorf("Role = %q, want %q", msg.Role, "user") + } + }, + }, + { + name: "with temperature", + json: `{"model": "gpt-oss:20b", "input": "hello", "temperature": 0.5}`, + check: func(t *testing.T, req ResponsesRequest) { + if req.Temperature == nil || *req.Temperature != 0.5 { + t.Errorf("Temperature = %v, want 0.5", req.Temperature) + } + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + var got ResponsesRequest + err := json.Unmarshal([]byte(tt.json), &got) + + if tt.wantErr { + if err == nil { + t.Error("expected error, got nil") + } + return + } + + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if tt.check != nil { + tt.check(t, got) + } + }) + } +} + +func TestFromResponsesRequest_Tools(t *testing.T) { + reqJSON := `{ + "model": "gpt-oss:20b", + "input": "hello", + "tools": [ + { + "type": "function", + "name": "shell", + "description": "Runs a shell command", + "strict": false, + "parameters": { + "type": "object", + "properties": { + "command": { + "type": "array", + "items": {"type": "string"}, + "description": "The command to execute" + } + }, + "required": ["command"] + } + } + ] + }` + + var req ResponsesRequest + if err := json.Unmarshal([]byte(reqJSON), &req); err != nil { + t.Fatalf("failed to unmarshal request: %v", err) + } + + // Check that tools were parsed + if len(req.Tools) != 1 { + t.Fatalf("expected 1 tool, got %d", len(req.Tools)) + } + + if req.Tools[0].Name != "shell" { + t.Errorf("expected tool name 'shell', got %q", req.Tools[0].Name) + } + + // Convert and check + chatReq, err := FromResponsesRequest(req) + if err != nil { + t.Fatalf("failed to convert request: %v", err) + } + + if len(chatReq.Tools) != 1 { + t.Fatalf("expected 1 converted tool, got %d", len(chatReq.Tools)) + } + + tool := 
chatReq.Tools[0] + if tool.Type != "function" { + t.Errorf("expected tool type 'function', got %q", tool.Type) + } + if tool.Function.Name != "shell" { + t.Errorf("expected function name 'shell', got %q", tool.Function.Name) + } + if tool.Function.Description != "Runs a shell command" { + t.Errorf("expected function description 'Runs a shell command', got %q", tool.Function.Description) + } + if tool.Function.Parameters.Type != "object" { + t.Errorf("expected parameters type 'object', got %q", tool.Function.Parameters.Type) + } + if len(tool.Function.Parameters.Required) != 1 || tool.Function.Parameters.Required[0] != "command" { + t.Errorf("expected required ['command'], got %v", tool.Function.Parameters.Required) + } +} + +func TestFromResponsesRequest_FunctionCallOutput(t *testing.T) { + // Test a complete tool call round-trip: + // 1. User message asking about weather + // 2. Assistant's function call (from previous response) + // 3. Function call output (the tool result) + reqJSON := `{ + "model": "gpt-oss:20b", + "input": [ + {"type": "message", "role": "user", "content": [{"type": "input_text", "text": "what is the weather?"}]}, + {"type": "function_call", "call_id": "call_abc123", "name": "get_weather", "arguments": "{\"city\":\"Paris\"}"}, + {"type": "function_call_output", "call_id": "call_abc123", "output": "sunny, 72F"} + ] + }` + + var req ResponsesRequest + if err := json.Unmarshal([]byte(reqJSON), &req); err != nil { + t.Fatalf("failed to unmarshal request: %v", err) + } + + // Check that input items were parsed + if len(req.Input.Items) != 3 { + t.Fatalf("expected 3 input items, got %d", len(req.Input.Items)) + } + + // Verify the function_call item + fc, ok := req.Input.Items[1].(ResponsesFunctionCall) + if !ok { + t.Fatalf("Input.Items[1] type = %T, want ResponsesFunctionCall", req.Input.Items[1]) + } + if fc.Name != "get_weather" { + t.Errorf("Name = %q, want %q", fc.Name, "get_weather") + } + + // Verify the function_call_output item + fcOutput, ok := req.Input.Items[2].(ResponsesFunctionCallOutput) + if !ok { + t.Fatalf("Input.Items[2] type = %T, want ResponsesFunctionCallOutput", req.Input.Items[2]) + } + if fcOutput.CallID != "call_abc123" { + t.Errorf("CallID = %q, want %q", fcOutput.CallID, "call_abc123") + } + + // Convert and check + chatReq, err := FromResponsesRequest(req) + if err != nil { + t.Fatalf("failed to convert request: %v", err) + } + + if len(chatReq.Messages) != 3 { + t.Fatalf("expected 3 messages, got %d", len(chatReq.Messages)) + } + + // Check the user message + userMsg := chatReq.Messages[0] + if userMsg.Role != "user" { + t.Errorf("expected role 'user', got %q", userMsg.Role) + } + + // Check the assistant message with tool call + assistantMsg := chatReq.Messages[1] + if assistantMsg.Role != "assistant" { + t.Errorf("expected role 'assistant', got %q", assistantMsg.Role) + } + if len(assistantMsg.ToolCalls) != 1 { + t.Fatalf("expected 1 tool call, got %d", len(assistantMsg.ToolCalls)) + } + if assistantMsg.ToolCalls[0].ID != "call_abc123" { + t.Errorf("expected tool call ID 'call_abc123', got %q", assistantMsg.ToolCalls[0].ID) + } + if assistantMsg.ToolCalls[0].Function.Name != "get_weather" { + t.Errorf("expected function name 'get_weather', got %q", assistantMsg.ToolCalls[0].Function.Name) + } + + // Check the tool response message + toolMsg := chatReq.Messages[2] + if toolMsg.Role != "tool" { + t.Errorf("expected role 'tool', got %q", toolMsg.Role) + } + if toolMsg.Content != "sunny, 72F" { + t.Errorf("expected content 'sunny, 72F', got %q", 
toolMsg.Content) + } + if toolMsg.ToolCallID != "call_abc123" { + t.Errorf("expected ToolCallID 'call_abc123', got %q", toolMsg.ToolCallID) + } +} + +func TestDecodeImageURL(t *testing.T) { + // Valid PNG base64 (1x1 red pixel) + validPNG := "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8z8DwHwAFBQIAX8jx0gAAAABJRU5ErkJggg==" + + t.Run("valid png", func(t *testing.T) { + img, err := decodeImageURL(validPNG) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if len(img) == 0 { + t.Error("expected non-empty image data") + } + }) + + t.Run("valid jpeg", func(t *testing.T) { + // Just test the prefix validation with minimal base64 + _, err := decodeImageURL("data:image/jpeg;base64,/9j/4AAQSkZJRg==") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + }) + + t.Run("blank mime type", func(t *testing.T) { + _, err := decodeImageURL("data:;base64,dGVzdA==") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + }) + + t.Run("invalid mime type", func(t *testing.T) { + _, err := decodeImageURL("data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7") + if err == nil { + t.Error("expected error for unsupported mime type") + } + }) + + t.Run("invalid base64", func(t *testing.T) { + _, err := decodeImageURL("data:image/png;base64,not-valid-base64!") + if err == nil { + t.Error("expected error for invalid base64") + } + }) + + t.Run("not a data url", func(t *testing.T) { + _, err := decodeImageURL("https://example.com/image.png") + if err == nil { + t.Error("expected error for non-data URL") + } + }) +} + +func TestFromResponsesRequest_Images(t *testing.T) { + // 1x1 red PNG pixel + pngBase64 := "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8z8DwHwAFBQIAX8jx0gAAAABJRU5ErkJggg==" + + reqJSON := `{ + "model": "llava", + "input": [ + {"type": "message", "role": "user", "content": [ + {"type": "input_text", "text": "What is in this image?"}, + {"type": "input_image", "detail": "auto", "image_url": "data:image/png;base64,` + pngBase64 + `"} + ]} + ] + }` + + var req ResponsesRequest + if err := json.Unmarshal([]byte(reqJSON), &req); err != nil { + t.Fatalf("failed to unmarshal request: %v", err) + } + + chatReq, err := FromResponsesRequest(req) + if err != nil { + t.Fatalf("failed to convert request: %v", err) + } + + if len(chatReq.Messages) != 1 { + t.Fatalf("expected 1 message, got %d", len(chatReq.Messages)) + } + + msg := chatReq.Messages[0] + if msg.Role != "user" { + t.Errorf("expected role 'user', got %q", msg.Role) + } + if msg.Content != "What is in this image?" 
{ + t.Errorf("expected content 'What is in this image?', got %q", msg.Content) + } + if len(msg.Images) != 1 { + t.Fatalf("expected 1 image, got %d", len(msg.Images)) + } + if len(msg.Images[0]) == 0 { + t.Error("expected non-empty image data") + } +} + +func TestResponsesStreamConverter_TextOnly(t *testing.T) { + converter := NewResponsesStreamConverter("resp_123", "msg_456", "gpt-oss:20b") + + // First chunk with content + events := converter.Process(api.ChatResponse{ + Message: api.Message{ + Content: "Hello", + }, + }) + + // Should have: response.created, response.in_progress, output_item.added, content_part.added, output_text.delta + if len(events) != 5 { + t.Fatalf("expected 5 events, got %d", len(events)) + } + + if events[0].Event != "response.created" { + t.Errorf("events[0].Event = %q, want %q", events[0].Event, "response.created") + } + if events[1].Event != "response.in_progress" { + t.Errorf("events[1].Event = %q, want %q", events[1].Event, "response.in_progress") + } + if events[2].Event != "response.output_item.added" { + t.Errorf("events[2].Event = %q, want %q", events[2].Event, "response.output_item.added") + } + if events[3].Event != "response.content_part.added" { + t.Errorf("events[3].Event = %q, want %q", events[3].Event, "response.content_part.added") + } + if events[4].Event != "response.output_text.delta" { + t.Errorf("events[4].Event = %q, want %q", events[4].Event, "response.output_text.delta") + } + + // Second chunk with more content + events = converter.Process(api.ChatResponse{ + Message: api.Message{ + Content: " World", + }, + }) + + // Should only have output_text.delta (no more created/in_progress/added) + if len(events) != 1 { + t.Fatalf("expected 1 event, got %d", len(events)) + } + if events[0].Event != "response.output_text.delta" { + t.Errorf("events[0].Event = %q, want %q", events[0].Event, "response.output_text.delta") + } + + // Final chunk + events = converter.Process(api.ChatResponse{ + Message: api.Message{}, + Done: true, + }) + + // Should have: output_text.done, content_part.done, output_item.done, response.completed + if len(events) != 4 { + t.Fatalf("expected 4 events, got %d", len(events)) + } + if events[0].Event != "response.output_text.done" { + t.Errorf("events[0].Event = %q, want %q", events[0].Event, "response.output_text.done") + } + // Check that accumulated text is present + data := events[0].Data.(map[string]any) + if data["text"] != "Hello World" { + t.Errorf("accumulated text = %q, want %q", data["text"], "Hello World") + } +} + +func TestResponsesStreamConverter_ToolCalls(t *testing.T) { + converter := NewResponsesStreamConverter("resp_123", "msg_456", "gpt-oss:20b") + + events := converter.Process(api.ChatResponse{ + Message: api.Message{ + ToolCalls: []api.ToolCall{ + { + ID: "call_abc", + Function: api.ToolCallFunction{ + Name: "get_weather", + Arguments: api.ToolCallFunctionArguments{"city": "Paris"}, + }, + }, + }, + }, + }) + + // Should have: created, in_progress, output_item.added, arguments.delta, arguments.done, output_item.done + if len(events) != 6 { + t.Fatalf("expected 6 events, got %d", len(events)) + } + + if events[2].Event != "response.output_item.added" { + t.Errorf("events[2].Event = %q, want %q", events[2].Event, "response.output_item.added") + } + if events[3].Event != "response.function_call_arguments.delta" { + t.Errorf("events[3].Event = %q, want %q", events[3].Event, "response.function_call_arguments.delta") + } + if events[4].Event != "response.function_call_arguments.done" { + 
t.Errorf("events[4].Event = %q, want %q", events[4].Event, "response.function_call_arguments.done") + } + if events[5].Event != "response.output_item.done" { + t.Errorf("events[5].Event = %q, want %q", events[5].Event, "response.output_item.done") + } +} + +func TestResponsesStreamConverter_Reasoning(t *testing.T) { + converter := NewResponsesStreamConverter("resp_123", "msg_456", "gpt-oss:20b") + + // First chunk with thinking + events := converter.Process(api.ChatResponse{ + Message: api.Message{ + Thinking: "Let me think...", + }, + }) + + // Should have: created, in_progress, output_item.added (reasoning), reasoning_summary_text.delta + if len(events) != 4 { + t.Fatalf("expected 4 events, got %d", len(events)) + } + + if events[2].Event != "response.output_item.added" { + t.Errorf("events[2].Event = %q, want %q", events[2].Event, "response.output_item.added") + } + // Check it's a reasoning item + data := events[2].Data.(map[string]any) + item := data["item"].(map[string]any) + if item["type"] != "reasoning" { + t.Errorf("item type = %q, want %q", item["type"], "reasoning") + } + + if events[3].Event != "response.reasoning_summary_text.delta" { + t.Errorf("events[3].Event = %q, want %q", events[3].Event, "response.reasoning_summary_text.delta") + } + + // Second chunk with text content (reasoning should close first) + events = converter.Process(api.ChatResponse{ + Message: api.Message{ + Content: "The answer is 42", + }, + }) + + // Should have: reasoning_summary_text.done, output_item.done (reasoning), output_item.added (message), content_part.added, output_text.delta + if len(events) != 5 { + t.Fatalf("expected 5 events, got %d", len(events)) + } + + if events[0].Event != "response.reasoning_summary_text.done" { + t.Errorf("events[0].Event = %q, want %q", events[0].Event, "response.reasoning_summary_text.done") + } + if events[1].Event != "response.output_item.done" { + t.Errorf("events[1].Event = %q, want %q", events[1].Event, "response.output_item.done") + } + // Check the reasoning done item has encrypted_content + doneData := events[1].Data.(map[string]any) + doneItem := doneData["item"].(map[string]any) + if doneItem["encrypted_content"] != "Let me think..." 
{ + t.Errorf("encrypted_content = %q, want %q", doneItem["encrypted_content"], "Let me think...") + } +} + +func TestFromResponsesRequest_ReasoningMerge(t *testing.T) { + t.Run("reasoning merged with following message", func(t *testing.T) { + reqJSON := `{ + "model": "qwen3", + "input": [ + {"type": "message", "role": "user", "content": [{"type": "input_text", "text": "solve 2+2"}]}, + {"type": "reasoning", "id": "rs_123", "encrypted_content": "Let me think about this math problem...", "summary": [{"type": "summary_text", "text": "Thinking about math"}]}, + {"type": "message", "role": "assistant", "content": [{"type": "input_text", "text": "The answer is 4"}]} + ] + }` + + var req ResponsesRequest + if err := json.Unmarshal([]byte(reqJSON), &req); err != nil { + t.Fatalf("failed to unmarshal request: %v", err) + } + + chatReq, err := FromResponsesRequest(req) + if err != nil { + t.Fatalf("failed to convert request: %v", err) + } + + // Should have 2 messages: user and assistant (with thinking merged) + if len(chatReq.Messages) != 2 { + t.Fatalf("expected 2 messages, got %d", len(chatReq.Messages)) + } + + // Check user message + if chatReq.Messages[0].Role != "user" { + t.Errorf("Messages[0].Role = %q, want %q", chatReq.Messages[0].Role, "user") + } + + // Check assistant message has both content and thinking + assistantMsg := chatReq.Messages[1] + if assistantMsg.Role != "assistant" { + t.Errorf("Messages[1].Role = %q, want %q", assistantMsg.Role, "assistant") + } + if assistantMsg.Content != "The answer is 4" { + t.Errorf("Messages[1].Content = %q, want %q", assistantMsg.Content, "The answer is 4") + } + if assistantMsg.Thinking != "Let me think about this math problem..." { + t.Errorf("Messages[1].Thinking = %q, want %q", assistantMsg.Thinking, "Let me think about this math problem...") + } + }) + + t.Run("reasoning merged with following function call", func(t *testing.T) { + reqJSON := `{ + "model": "qwen3", + "input": [ + {"type": "message", "role": "user", "content": [{"type": "input_text", "text": "what is the weather?"}]}, + {"type": "reasoning", "id": "rs_123", "encrypted_content": "I need to call a tool for this...", "summary": []}, + {"type": "function_call", "call_id": "call_abc", "name": "get_weather", "arguments": "{\"city\":\"Paris\"}"} + ] + }` + + var req ResponsesRequest + if err := json.Unmarshal([]byte(reqJSON), &req); err != nil { + t.Fatalf("failed to unmarshal request: %v", err) + } + + chatReq, err := FromResponsesRequest(req) + if err != nil { + t.Fatalf("failed to convert request: %v", err) + } + + // Should have 2 messages: user and assistant (with thinking + tool call) + if len(chatReq.Messages) != 2 { + t.Fatalf("expected 2 messages, got %d", len(chatReq.Messages)) + } + + // Check assistant message has both tool call and thinking + assistantMsg := chatReq.Messages[1] + if assistantMsg.Role != "assistant" { + t.Errorf("Messages[1].Role = %q, want %q", assistantMsg.Role, "assistant") + } + if assistantMsg.Thinking != "I need to call a tool for this..." 
+ t.Errorf("Messages[1].Thinking = %q, want %q", assistantMsg.Thinking, "I need to call a tool for this...")
+ }
+ if len(assistantMsg.ToolCalls) != 1 {
+ t.Fatalf("expected 1 tool call, got %d", len(assistantMsg.ToolCalls))
+ }
+ if assistantMsg.ToolCalls[0].Function.Name != "get_weather" {
+ t.Errorf("ToolCalls[0].Function.Name = %q, want %q", assistantMsg.ToolCalls[0].Function.Name, "get_weather")
+ }
+ })
+
+ t.Run("multi-turn conversation with reasoning", func(t *testing.T) {
+ // Simulates: user asks -> model thinks + responds -> user follows up
+ reqJSON := `{
+ "model": "qwen3",
+ "input": [
+ {"type": "message", "role": "user", "content": [{"type": "input_text", "text": "What is 2+2?"}]},
+ {"type": "reasoning", "id": "rs_001", "encrypted_content": "This is a simple arithmetic problem. 2+2=4.", "summary": [{"type": "summary_text", "text": "Calculating 2+2"}]},
+ {"type": "message", "role": "assistant", "content": [{"type": "input_text", "text": "The answer is 4."}]},
+ {"type": "message", "role": "user", "content": [{"type": "input_text", "text": "Now multiply that by 3"}]}
+ ]
+ }`
+
+ var req ResponsesRequest
+ if err := json.Unmarshal([]byte(reqJSON), &req); err != nil {
+ t.Fatalf("failed to unmarshal request: %v", err)
+ }
+
+ chatReq, err := FromResponsesRequest(req)
+ if err != nil {
+ t.Fatalf("failed to convert request: %v", err)
+ }
+
+ // Should have 3 messages:
+ // 1. user: "What is 2+2?"
+ // 2. assistant: thinking + "The answer is 4."
+ // 3. user: "Now multiply that by 3"
+ if len(chatReq.Messages) != 3 {
+ t.Fatalf("expected 3 messages, got %d", len(chatReq.Messages))
+ }
+
+ // Check first user message
+ if chatReq.Messages[0].Role != "user" || chatReq.Messages[0].Content != "What is 2+2?" {
+ t.Errorf("Messages[0] = {Role: %q, Content: %q}, want {Role: \"user\", Content: \"What is 2+2?\"}",
+ chatReq.Messages[0].Role, chatReq.Messages[0].Content)
+ }
+
+ // Check assistant message has merged thinking + content
+ if chatReq.Messages[1].Role != "assistant" {
+ t.Errorf("Messages[1].Role = %q, want \"assistant\"", chatReq.Messages[1].Role)
+ }
+ if chatReq.Messages[1].Content != "The answer is 4." {
+ t.Errorf("Messages[1].Content = %q, want \"The answer is 4.\"", chatReq.Messages[1].Content)
+ }
+ if chatReq.Messages[1].Thinking != "This is a simple arithmetic problem. 2+2=4." {
+ t.Errorf("Messages[1].Thinking = %q, want \"This is a simple arithmetic problem. 2+2=4.\"",
+ chatReq.Messages[1].Thinking)
+ }
+
+ // Check second user message
+ if chatReq.Messages[2].Role != "user" || chatReq.Messages[2].Content != "Now multiply that by 3" {
+ t.Errorf("Messages[2] = {Role: %q, Content: %q}, want {Role: \"user\", Content: \"Now multiply that by 3\"}",
+ chatReq.Messages[2].Role, chatReq.Messages[2].Content)
+ }
+ })
+
+ t.Run("multi-turn with tool calls and reasoning", func(t *testing.T) {
+ // Simulates: user asks -> model thinks + calls tool -> tool responds -> model thinks + responds -> user follows up
+ reqJSON := `{
+ "model": "qwen3",
+ "input": [
+ {"type": "message", "role": "user", "content": [{"type": "input_text", "text": "What is the weather in Paris?"}]},
+ {"type": "reasoning", "id": "rs_001", "encrypted_content": "I need to call the weather API for Paris.", "summary": []},
+ {"type": "function_call", "call_id": "call_abc", "name": "get_weather", "arguments": "{\"city\":\"Paris\"}"},
+ {"type": "function_call_output", "call_id": "call_abc", "output": "Sunny, 72°F"},
+ {"type": "reasoning", "id": "rs_002", "encrypted_content": "The weather API returned sunny and 72°F. I should format this nicely.", "summary": []},
+ {"type": "message", "role": "assistant", "content": [{"type": "input_text", "text": "It's sunny and 72°F in Paris!"}]},
+ {"type": "message", "role": "user", "content": [{"type": "input_text", "text": "What about London?"}]}
+ ]
+ }`
+
+ var req ResponsesRequest
+ if err := json.Unmarshal([]byte(reqJSON), &req); err != nil {
+ t.Fatalf("failed to unmarshal request: %v", err)
+ }
+
+ chatReq, err := FromResponsesRequest(req)
+ if err != nil {
+ t.Fatalf("failed to convert request: %v", err)
+ }
+
+ // Should have 5 messages:
+ // 1. user: "What is the weather in Paris?"
+ // 2. assistant: thinking + tool call
+ // 3. tool: "Sunny, 72°F"
+ // 4. assistant: thinking + "It's sunny and 72°F in Paris!"
+ // 5. user: "What about London?"
+ if len(chatReq.Messages) != 5 {
+ t.Fatalf("expected 5 messages, got %d", len(chatReq.Messages))
+ }
+
+ // Message 1: user
+ if chatReq.Messages[0].Role != "user" {
+ t.Errorf("Messages[0].Role = %q, want \"user\"", chatReq.Messages[0].Role)
+ }
+
+ // Message 2: assistant with thinking + tool call
+ if chatReq.Messages[1].Role != "assistant" {
+ t.Errorf("Messages[1].Role = %q, want \"assistant\"", chatReq.Messages[1].Role)
+ }
+ if chatReq.Messages[1].Thinking != "I need to call the weather API for Paris." {
+ t.Errorf("Messages[1].Thinking = %q, want \"I need to call the weather API for Paris.\"", chatReq.Messages[1].Thinking)
+ }
+ if len(chatReq.Messages[1].ToolCalls) != 1 || chatReq.Messages[1].ToolCalls[0].Function.Name != "get_weather" {
+ t.Errorf("Messages[1].ToolCalls = %+v, want exactly one call to get_weather", chatReq.Messages[1].ToolCalls)
+ }
+
+ // Message 3: tool response
+ if chatReq.Messages[2].Role != "tool" || chatReq.Messages[2].Content != "Sunny, 72°F" {
+ t.Errorf("Messages[2] = {Role: %q, Content: %q}, want {Role: \"tool\", Content: \"Sunny, 72°F\"}",
+ chatReq.Messages[2].Role, chatReq.Messages[2].Content)
+ }
+
+ // Message 4: assistant with thinking + content
+ if chatReq.Messages[3].Role != "assistant" {
+ t.Errorf("Messages[3].Role = %q, want \"assistant\"", chatReq.Messages[3].Role)
+ }
+ if chatReq.Messages[3].Thinking != "The weather API returned sunny and 72°F. I should format this nicely." {
+ t.Errorf("Messages[3].Thinking = %q, want %q", chatReq.Messages[3].Thinking, "The weather API returned sunny and 72°F. I should format this nicely.")
+ }
+ if chatReq.Messages[3].Content != "It's sunny and 72°F in Paris!" {
+ t.Errorf("Messages[3].Content = %q, want \"It's sunny and 72°F in Paris!\"", chatReq.Messages[3].Content)
+ }
+
+ // Message 5: user follow-up
+ if chatReq.Messages[4].Role != "user" || chatReq.Messages[4].Content != "What about London?" {
+ t.Errorf("Messages[4] = {Role: %q, Content: %q}, want {Role: \"user\", Content: \"What about London?\"}",
+ chatReq.Messages[4].Role, chatReq.Messages[4].Content)
+ }
+ })
+
+ t.Run("trailing reasoning creates separate message", func(t *testing.T) {
+ reqJSON := `{
+ "model": "qwen3",
+ "input": [
+ {"type": "message", "role": "user", "content": [{"type": "input_text", "text": "think about this"}]},
+ {"type": "reasoning", "id": "rs_123", "encrypted_content": "Still thinking...", "summary": []}
+ ]
+ }`
+
+ var req ResponsesRequest
+ if err := json.Unmarshal([]byte(reqJSON), &req); err != nil {
+ t.Fatalf("failed to unmarshal request: %v", err)
+ }
+
+ chatReq, err := FromResponsesRequest(req)
+ if err != nil {
+ t.Fatalf("failed to convert request: %v", err)
+ }
+
+ // Should have 2 messages: user and assistant (thinking only)
+ if len(chatReq.Messages) != 2 {
+ t.Fatalf("expected 2 messages, got %d", len(chatReq.Messages))
+ }
+
+ // Check assistant message has only thinking
+ assistantMsg := chatReq.Messages[1]
+ if assistantMsg.Role != "assistant" {
+ t.Errorf("Messages[1].Role = %q, want %q", assistantMsg.Role, "assistant")
+ }
+ if assistantMsg.Thinking != "Still thinking..." {
+ t.Errorf("Messages[1].Thinking = %q, want %q", assistantMsg.Thinking, "Still thinking...")
+ }
+ if assistantMsg.Content != "" {
+ t.Errorf("Messages[1].Content = %q, want empty", assistantMsg.Content)
+ }
+ })
+}
+
+func TestToResponse_WithReasoning(t *testing.T) {
+ response := ToResponse("gpt-oss:20b", "resp_123", "msg_456", api.ChatResponse{
+ CreatedAt: time.Now(),
+ Message: api.Message{
+ Thinking: "Analyzing the question...",
+ Content: "The answer is 42",
+ },
+ Done: true,
+ })
+
+ // Should have 2 output items: reasoning + message
+ if len(response.Output) != 2 {
+ t.Fatalf("expected 2 output items, got %d", len(response.Output))
+ }
+
+ // First item should be reasoning
+ if response.Output[0].Type != "reasoning" {
+ t.Errorf("Output[0].Type = %q, want %q", response.Output[0].Type, "reasoning")
+ }
+ if len(response.Output[0].Summary) != 1 {
+ t.Fatalf("expected 1 summary item, got %d", len(response.Output[0].Summary))
+ }
+ if response.Output[0].Summary[0].Text != "Analyzing the question..." {
+ t.Errorf("Summary[0].Text = %q, want %q", response.Output[0].Summary[0].Text, "Analyzing the question...")
+ }
+ if response.Output[0].EncryptedContent != "Analyzing the question..." {
+ t.Errorf("EncryptedContent = %q, want %q", response.Output[0].EncryptedContent, "Analyzing the question...")
+ }
+
+ // Second item should be message
+ if response.Output[1].Type != "message" {
+ t.Errorf("Output[1].Type = %q, want %q", response.Output[1].Type, "message")
+ }
+ if response.Output[1].Content[0].Text != "The answer is 42" {
+ t.Errorf("Content[0].Text = %q, want %q", response.Output[1].Content[0].Text, "The answer is 42")
+ }
+}
+
+func TestFromResponsesRequest_Instructions(t *testing.T) {
+ reqJSON := `{
+ "model": "gpt-oss:20b",
+ "instructions": "You are a helpful pirate. Always respond in pirate speak.",
+ "input": "Hello"
+ }`
+
+ var req ResponsesRequest
+ if err := json.Unmarshal([]byte(reqJSON), &req); err != nil {
+ t.Fatalf("failed to unmarshal request: %v", err)
+ }
+
+ chatReq, err := FromResponsesRequest(req)
+ if err != nil {
+ t.Fatalf("failed to convert request: %v", err)
+ }
+
+ // Should have 2 messages: system (instructions) + user
+ if len(chatReq.Messages) != 2 {
+ t.Fatalf("expected 2 messages, got %d", len(chatReq.Messages))
+ }
+
+ // First message should be system with instructions
+ if chatReq.Messages[0].Role != "system" {
+ t.Errorf("Messages[0].Role = %q, want %q", chatReq.Messages[0].Role, "system")
+ }
+ if chatReq.Messages[0].Content != "You are a helpful pirate. Always respond in pirate speak." {
+ t.Errorf("Messages[0].Content = %q, want instructions", chatReq.Messages[0].Content)
+ }
+
+ // Second message should be user
+ if chatReq.Messages[1].Role != "user" {
+ t.Errorf("Messages[1].Role = %q, want %q", chatReq.Messages[1].Role, "user")
+ }
+ if chatReq.Messages[1].Content != "Hello" {
+ t.Errorf("Messages[1].Content = %q, want %q", chatReq.Messages[1].Content, "Hello")
+ }
+}
+
+func TestFromResponsesRequest_MaxOutputTokens(t *testing.T) {
+ reqJSON := `{
+ "model": "gpt-oss:20b",
+ "input": "Write a story",
+ "max_output_tokens": 100
+ }`
+
+ var req ResponsesRequest
+ if err := json.Unmarshal([]byte(reqJSON), &req); err != nil {
+ t.Fatalf("failed to unmarshal request: %v", err)
+ }
+
+ chatReq, err := FromResponsesRequest(req)
+ if err != nil {
+ t.Fatalf("failed to convert request: %v", err)
+ }
+
+ // Check that num_predict is set in options
+ numPredict, ok := chatReq.Options["num_predict"]
+ if !ok {
+ t.Fatal("expected num_predict in options")
+ }
+ if numPredict != 100 {
+ t.Errorf("num_predict = %v, want 100", numPredict)
+ }
+}
+
+func TestFromResponsesRequest_TextFormatJsonSchema(t *testing.T) {
+ reqJSON := `{
+ "model": "gpt-oss:20b",
+ "input": "Give me info about John who is 30",
+ "text": {
+ "format": {
+ "type": "json_schema",
+ "name": "person",
+ "strict": true,
+ "schema": {
+ "type": "object",
+ "properties": {
+ "name": {"type": "string"},
+ "age": {"type": "integer"}
+ },
+ "required": ["name", "age"]
+ }
+ }
+ }
+ }`
+
+ var req ResponsesRequest
+ if err := json.Unmarshal([]byte(reqJSON), &req); err != nil {
+ t.Fatalf("failed to unmarshal request: %v", err)
+ }
+
+ // Verify the text format was parsed
+ if req.Text == nil || req.Text.Format == nil {
+ t.Fatal("expected Text.Format to be set")
+ }
+ if req.Text.Format.Type != "json_schema" {
+ t.Errorf("Text.Format.Type = %q, want %q", req.Text.Format.Type, "json_schema")
+ }
+
+ chatReq, err := FromResponsesRequest(req)
+ if err != nil {
+ t.Fatalf("failed to convert request: %v", err)
+ }
+
+ // Check that Format is set
+ if chatReq.Format == nil {
+ t.Fatal("expected Format to be set")
+ }
+
+ // Verify the schema is passed through
+ var schema map[string]any
+ if err := json.Unmarshal(chatReq.Format, &schema); err != nil {
+ t.Fatalf("failed to unmarshal format: %v", err)
+ }
+ if schema["type"] != "object" {
+ t.Errorf("schema type = %v, want %q", schema["type"], "object")
+ }
+ props, ok := schema["properties"].(map[string]any)
+ if !ok {
+ t.Fatal("expected properties in schema")
+ }
+ if _, ok := props["name"]; !ok {
+ t.Error("expected 'name' in schema properties")
+ }
+ if _, ok := props["age"]; !ok {
+ t.Error("expected 'age' in schema properties")
+ }
+}
+
+func TestFromResponsesRequest_TextFormatText(t *testing.T) {
+ // When format type is "text", Format should be nil (no constraint)
type is "text", Format should be nil (no constraint) + reqJSON := `{ + "model": "gpt-oss:20b", + "input": "Hello", + "text": { + "format": { + "type": "text" + } + } + }` + + var req ResponsesRequest + if err := json.Unmarshal([]byte(reqJSON), &req); err != nil { + t.Fatalf("failed to unmarshal request: %v", err) + } + + chatReq, err := FromResponsesRequest(req) + if err != nil { + t.Fatalf("failed to convert request: %v", err) + } + + // Format should be nil for "text" type + if chatReq.Format != nil { + t.Errorf("expected Format to be nil for text type, got %s", string(chatReq.Format)) + } +} + +func TestResponsesInputMessage_ShorthandFormats(t *testing.T) { + t.Run("string content shorthand", func(t *testing.T) { + // Content can be a plain string instead of an array of content items + jsonStr := `{"type": "message", "role": "user", "content": "Hello world"}` + + var msg ResponsesInputMessage + if err := json.Unmarshal([]byte(jsonStr), &msg); err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if msg.Role != "user" { + t.Errorf("Role = %q, want %q", msg.Role, "user") + } + if len(msg.Content) != 1 { + t.Fatalf("len(Content) = %d, want 1", len(msg.Content)) + } + + textContent, ok := msg.Content[0].(ResponsesTextContent) + if !ok { + t.Fatalf("Content[0] type = %T, want ResponsesTextContent", msg.Content[0]) + } + if textContent.Text != "Hello world" { + t.Errorf("Content[0].Text = %q, want %q", textContent.Text, "Hello world") + } + if textContent.Type != "input_text" { + t.Errorf("Content[0].Type = %q, want %q", textContent.Type, "input_text") + } + }) + + t.Run("output_text content type", func(t *testing.T) { + // Previous assistant responses come back with output_text content type + jsonStr := `{"type": "message", "role": "assistant", "content": [{"type": "output_text", "text": "I am an assistant"}]}` + + var msg ResponsesInputMessage + if err := json.Unmarshal([]byte(jsonStr), &msg); err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if msg.Role != "assistant" { + t.Errorf("Role = %q, want %q", msg.Role, "assistant") + } + if len(msg.Content) != 1 { + t.Fatalf("len(Content) = %d, want 1", len(msg.Content)) + } + + outputContent, ok := msg.Content[0].(ResponsesOutputTextContent) + if !ok { + t.Fatalf("Content[0] type = %T, want ResponsesOutputTextContent", msg.Content[0]) + } + if outputContent.Text != "I am an assistant" { + t.Errorf("Content[0].Text = %q, want %q", outputContent.Text, "I am an assistant") + } + }) +} + +func TestUnmarshalResponsesInputItem_ShorthandMessage(t *testing.T) { + t.Run("message without type field", func(t *testing.T) { + // When type is omitted but role is present, treat as message + jsonStr := `{"role": "user", "content": "Hello"}` + + item, err := unmarshalResponsesInputItem([]byte(jsonStr)) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + msg, ok := item.(ResponsesInputMessage) + if !ok { + t.Fatalf("got type %T, want ResponsesInputMessage", item) + } + if msg.Role != "user" { + t.Errorf("Role = %q, want %q", msg.Role, "user") + } + if len(msg.Content) != 1 { + t.Fatalf("len(Content) = %d, want 1", len(msg.Content)) + } + }) + + t.Run("message with both type and role", func(t *testing.T) { + // Explicit type should still work + jsonStr := `{"type": "message", "role": "system", "content": "You are helpful"}` + + item, err := unmarshalResponsesInputItem([]byte(jsonStr)) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + msg, ok := item.(ResponsesInputMessage) + if !ok { + t.Fatalf("got type %T, want 
ResponsesInputMessage", item) + } + if msg.Role != "system" { + t.Errorf("Role = %q, want %q", msg.Role, "system") + } + }) +} + +func TestFromResponsesRequest_ShorthandFormats(t *testing.T) { + t.Run("shorthand message without type", func(t *testing.T) { + // Real-world format from OpenAI SDK + reqJSON := `{ + "model": "gpt-4.1", + "input": [ + {"role": "user", "content": "What is the weather in Tokyo?"} + ] + }` + + var req ResponsesRequest + if err := json.Unmarshal([]byte(reqJSON), &req); err != nil { + t.Fatalf("failed to unmarshal request: %v", err) + } + + if len(req.Input.Items) != 1 { + t.Fatalf("expected 1 input item, got %d", len(req.Input.Items)) + } + + msg, ok := req.Input.Items[0].(ResponsesInputMessage) + if !ok { + t.Fatalf("Input.Items[0] type = %T, want ResponsesInputMessage", req.Input.Items[0]) + } + if msg.Role != "user" { + t.Errorf("Role = %q, want %q", msg.Role, "user") + } + + chatReq, err := FromResponsesRequest(req) + if err != nil { + t.Fatalf("failed to convert request: %v", err) + } + + if len(chatReq.Messages) != 1 { + t.Fatalf("expected 1 message, got %d", len(chatReq.Messages)) + } + if chatReq.Messages[0].Content != "What is the weather in Tokyo?" { + t.Errorf("Content = %q, want %q", chatReq.Messages[0].Content, "What is the weather in Tokyo?") + } + }) + + t.Run("conversation with output_text from previous response", func(t *testing.T) { + // Simulates a multi-turn conversation where previous assistant response is sent back + reqJSON := `{ + "model": "gpt-4.1", + "input": [ + {"role": "user", "content": "Hello"}, + {"type": "message", "role": "assistant", "content": [{"type": "output_text", "text": "Hi there!"}]}, + {"role": "user", "content": "How are you?"} + ] + }` + + var req ResponsesRequest + if err := json.Unmarshal([]byte(reqJSON), &req); err != nil { + t.Fatalf("failed to unmarshal request: %v", err) + } + + chatReq, err := FromResponsesRequest(req) + if err != nil { + t.Fatalf("failed to convert request: %v", err) + } + + if len(chatReq.Messages) != 3 { + t.Fatalf("expected 3 messages, got %d", len(chatReq.Messages)) + } + + // Check first user message + if chatReq.Messages[0].Role != "user" || chatReq.Messages[0].Content != "Hello" { + t.Errorf("Messages[0] = {Role: %q, Content: %q}, want {Role: \"user\", Content: \"Hello\"}", + chatReq.Messages[0].Role, chatReq.Messages[0].Content) + } + + // Check assistant message (output_text should be converted to content) + if chatReq.Messages[1].Role != "assistant" || chatReq.Messages[1].Content != "Hi there!" { + t.Errorf("Messages[1] = {Role: %q, Content: %q}, want {Role: \"assistant\", Content: \"Hi there!\"}", + chatReq.Messages[1].Role, chatReq.Messages[1].Content) + } + + // Check second user message + if chatReq.Messages[2].Role != "user" || chatReq.Messages[2].Content != "How are you?" 
+ t.Errorf("Messages[2] = {Role: %q, Content: %q}, want {Role: \"user\", Content: \"How are you?\"}",
+ chatReq.Messages[2].Role, chatReq.Messages[2].Content)
+ }
+ })
+}
+
+func TestResponsesStreamConverter_OutputIncludesContent(t *testing.T) {
+ // Verify that response.output_item.done includes content field for messages
+ converter := NewResponsesStreamConverter("resp_123", "msg_456", "gpt-oss:20b")
+
+ // First chunk
+ converter.Process(api.ChatResponse{
+ Message: api.Message{Content: "Hello World"},
+ })
+
+ // Final chunk
+ events := converter.Process(api.ChatResponse{
+ Message: api.Message{},
+ Done: true,
+ })
+
+ // Find the output_item.done event
+ var outputItemDone map[string]any
+ for _, event := range events {
+ if event.Event == "response.output_item.done" {
+ outputItemDone = event.Data.(map[string]any)
+ break
+ }
+ }
+
+ if outputItemDone == nil {
+ t.Fatal("expected response.output_item.done event")
+ }
+
+ item := outputItemDone["item"].(map[string]any)
+ if item["type"] != "message" {
+ t.Errorf("item.type = %q, want %q", item["type"], "message")
+ }
+
+ content, ok := item["content"].([]map[string]any)
+ if !ok {
+ t.Fatalf("item.content type = %T, want []map[string]any", item["content"])
+ }
+ if len(content) != 1 {
+ t.Fatalf("len(content) = %d, want 1", len(content))
+ }
+ if content[0]["type"] != "output_text" {
+ t.Errorf("content[0].type = %q, want %q", content[0]["type"], "output_text")
+ }
+ if content[0]["text"] != "Hello World" {
+ t.Errorf("content[0].text = %q, want %q", content[0]["text"], "Hello World")
+ }
+}
+
+func TestResponsesStreamConverter_ResponseCompletedIncludesOutput(t *testing.T) {
+ // Verify that response.completed includes the output array
+ converter := NewResponsesStreamConverter("resp_123", "msg_456", "gpt-oss:20b")
+
+ // Process some content
+ converter.Process(api.ChatResponse{
+ Message: api.Message{Content: "Test response"},
+ })
+
+ // Final chunk
+ events := converter.Process(api.ChatResponse{
+ Message: api.Message{},
+ Done: true,
+ })
+
+ // Find the response.completed event
+ var responseCompleted map[string]any
+ for _, event := range events {
+ if event.Event == "response.completed" {
+ responseCompleted = event.Data.(map[string]any)
+ break
+ }
+ }
+
+ if responseCompleted == nil {
+ t.Fatal("expected response.completed event")
+ }
+
+ response := responseCompleted["response"].(map[string]any)
+ output, ok := response["output"].([]any)
+ if !ok {
+ t.Fatalf("response.output type = %T, want []any", response["output"])
+ }
+
+ if len(output) != 1 {
+ t.Fatalf("len(output) = %d, want 1", len(output))
+ }
+
+ item := output[0].(map[string]any)
+ if item["type"] != "message" {
+ t.Errorf("output[0].type = %q, want %q", item["type"], "message")
+ }
+}
+
+func TestResponsesStreamConverter_ResponseCreatedIncludesOutput(t *testing.T) {
+ // Verify that response.created includes an empty output array
+ converter := NewResponsesStreamConverter("resp_123", "msg_456", "gpt-oss:20b")
+
+ events := converter.Process(api.ChatResponse{
+ Message: api.Message{Content: "Hi"},
+ })
+
+ // First event should be response.created
+ if events[0].Event != "response.created" {
+ t.Fatalf("events[0].Event = %q, want %q", events[0].Event, "response.created")
+ }
+
+ data := events[0].Data.(map[string]any)
+ response := data["response"].(map[string]any)
+
+ output, ok := response["output"].([]any)
+ if !ok {
+ t.Fatalf("response.output type = %T, want []any", response["output"])
+ }
+
+ // Should be empty array initially
+ if len(output) != 0 {
+ t.Errorf("len(output) = %d, want 0", len(output))
+ }
+}
+
+func TestResponsesStreamConverter_SequenceNumbers(t *testing.T) {
+ // Verify that events include incrementing sequence numbers
+ converter := NewResponsesStreamConverter("resp_123", "msg_456", "gpt-oss:20b")
+
+ events := converter.Process(api.ChatResponse{
+ Message: api.Message{Content: "Hello"},
+ })
+
+ for i, event := range events {
+ data := event.Data.(map[string]any)
+ seqNum, ok := data["sequence_number"].(int)
+ if !ok {
+ t.Fatalf("events[%d] missing sequence_number", i)
+ }
+ if seqNum != i {
+ t.Errorf("events[%d].sequence_number = %d, want %d", i, seqNum, i)
+ }
+ }
+
+ // Process more content, sequence should continue
+ moreEvents := converter.Process(api.ChatResponse{
+ Message: api.Message{Content: " World"},
+ })
+
+ expectedSeq := len(events)
+ for i, event := range moreEvents {
+ data := event.Data.(map[string]any)
+ seqNum := data["sequence_number"].(int)
+ if seqNum != expectedSeq+i {
+ t.Errorf("moreEvents[%d].sequence_number = %d, want %d", i, seqNum, expectedSeq+i)
+ }
+ }
+}
+
+func TestResponsesStreamConverter_FunctionCallStatus(t *testing.T) {
+ // Verify that function call items include status field
+ converter := NewResponsesStreamConverter("resp_123", "msg_456", "gpt-oss:20b")
+
+ events := converter.Process(api.ChatResponse{
+ Message: api.Message{
+ ToolCalls: []api.ToolCall{
+ {
+ ID: "call_abc",
+ Function: api.ToolCallFunction{
+ Name: "get_weather",
+ Arguments: api.ToolCallFunctionArguments{"city": "Paris"},
+ },
+ },
+ },
+ },
+ })
+
+ // Find output_item.added event
+ var addedItem map[string]any
+ var doneItem map[string]any
+ for _, event := range events {
+ data := event.Data.(map[string]any)
+ if data["type"] == "response.output_item.added" {
+ item := data["item"].(map[string]any)
+ if item["type"] == "function_call" {
+ addedItem = item
+ }
+ }
+ if data["type"] == "response.output_item.done" {
+ item := data["item"].(map[string]any)
+ if item["type"] == "function_call" {
+ doneItem = item
+ }
+ }
+ }
+
+ if addedItem == nil {
+ t.Fatal("expected function_call output_item.added event")
+ }
+ if addedItem["status"] != "in_progress" {
+ t.Errorf("output_item.added status = %q, want %q", addedItem["status"], "in_progress")
+ }
+
+ if doneItem == nil {
+ t.Fatal("expected function_call output_item.done event")
+ }
+ if doneItem["status"] != "completed" {
+ t.Errorf("output_item.done status = %q, want %q", doneItem["status"], "completed")
+ }
+}
diff --git a/server/routes.go b/server/routes.go
index bbf6b9b90..54f23d5d5 100644
--- a/server/routes.go
+++ b/server/routes.go
@@ -1532,6 +1532,7 @@ func (s *Server) GenerateRoutes(rc *ollama.Registry) (http.Handler, error) {
 r.POST("/v1/embeddings", middleware.EmbeddingsMiddleware(), s.EmbedHandler)
 r.GET("/v1/models", middleware.ListMiddleware(), s.ListHandler)
 r.GET("/v1/models/:model", middleware.RetrieveMiddleware(), s.ShowHandler)
+ r.POST("/v1/responses", middleware.ResponsesMiddleware(), s.ChatHandler)

 if rc != nil {
 // wrap old with new
@@ -2393,3 +2394,4 @@ func filterThinkTags(msgs []api.Message, m *Model) []api.Message {
 }
 return msgs
}
+