server: add logprobs and top_logprobs support to Ollama's API (#12899)

Adds logprobs support to Ollama's API, including Ollama's OpenAI-compatible API.
When the new 'logprobs' boolean parameter is set, Ollama returns the log
probability of each generated token. An integer 'top_logprobs' parameter (0-20)
can also be specified; when set, the API additionally returns that many of the
most likely tokens, each with its log probability, at every token position.

Co-authored-by: Baptiste Jamin <baptiste@crisp.chat>
Baptiste Jamin 2025-11-11 17:49:50 +01:00 committed by GitHub
parent 6df4208836
commit 59241c5bee
13 changed files with 1367 additions and 47 deletions
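
To illustrate the new parameters end to end, here is a minimal sketch of a Go client call using the fields added in this change (the model name, prompt, and output formatting are placeholders; error handling is trimmed):

package main

import (
	"context"
	"fmt"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		panic(err)
	}
	stream := false
	req := &api.GenerateRequest{
		Model:       "llama3.2", // placeholder model name
		Prompt:      "Why is the sky blue?",
		Stream:      &stream,
		Logprobs:    true, // return the selected token's log probability
		TopLogprobs: 5,    // also return the 5 most likely alternatives per position
	}
	_ = client.Generate(context.Background(), req, func(resp api.GenerateResponse) error {
		for _, lp := range resp.Logprobs {
			fmt.Printf("%q %.3f (alternatives: %d)\n", lp.Token, lp.Logprob, len(lp.TopLogprobs))
		}
		return nil
	})
}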


@ -117,6 +117,14 @@ type GenerateRequest struct {
// DebugRenderOnly is a debug option that, when set to true, returns the rendered
// template instead of calling the model.
DebugRenderOnly bool `json:"_debug_render_only,omitempty"`
// Logprobs specifies whether to return log probabilities of the output tokens.
Logprobs bool `json:"logprobs,omitempty"`
// TopLogprobs is the number of most likely tokens to return at each token position,
// each with an associated log probability. Only applies when Logprobs is true.
// Valid values are 0-20. Default is 0 (only return the selected token's logprob).
TopLogprobs int `json:"top_logprobs,omitempty"`
}
// ChatRequest describes a request sent by [Client.Chat].
@ -159,6 +167,14 @@ type ChatRequest struct {
// DebugRenderOnly is a debug option that, when set to true, returns the rendered
// template instead of calling the model.
DebugRenderOnly bool `json:"_debug_render_only,omitempty"`
// Logprobs specifies whether to return log probabilities of the output tokens.
Logprobs bool `json:"logprobs,omitempty"`
// TopLogprobs is the number of most likely tokens to return at each token position,
// each with an associated log probability. Only applies when Logprobs is true.
// Valid values are 0-20. Default is 0 (only return the selected token's logprob).
TopLogprobs int `json:"top_logprobs,omitempty"`
}
type Tools []Tool
@ -343,6 +359,24 @@ func (t *ToolFunction) String() string {
return string(bts)
}
// TokenLogprob represents log probability information for a single token alternative.
type TokenLogprob struct {
// Token is the text representation of the token.
Token string `json:"token"`
// Logprob is the log probability of this token.
Logprob float64 `json:"logprob"`
}
// Logprob contains log probability information for a generated token.
type Logprob struct {
TokenLogprob
// TopLogprobs contains the most likely tokens and their log probabilities
// at this position, if requested via TopLogprobs parameter.
TopLogprobs []TokenLogprob `json:"top_logprobs,omitempty"`
}
// ChatResponse is the response returned by [Client.Chat]. Its fields are
// similar to [GenerateResponse].
type ChatResponse struct {
@ -369,6 +403,10 @@ type ChatResponse struct {
DebugInfo *DebugInfo `json:"_debug_info,omitempty"`
// Logprobs contains log probability information for the generated tokens,
// if requested via the Logprobs parameter.
Logprobs []Logprob `json:"logprobs,omitempty"`
Metrics
}
@ -677,6 +715,10 @@ type GenerateResponse struct {
ToolCalls []ToolCall `json:"tool_calls,omitempty"`
DebugInfo *DebugInfo `json:"_debug_info,omitempty"`
// Logprobs contains log probability information for the generated tokens,
// if requested via the Logprobs parameter.
Logprobs []Logprob `json:"logprobs,omitempty"`
}
// ModelDetails provides details about a model.
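
Given the struct tags above, each entry in the new Logprobs field serializes with the embedded token/logprob fields promoted, plus an optional top_logprobs array. A small sketch of the resulting JSON shape (all values are made up for illustration):

package main

import (
	"encoding/json"
	"fmt"

	"github.com/ollama/ollama/api"
)

func main() {
	// Illustrative values: one generated token plus two alternatives.
	lp := api.Logprob{
		TokenLogprob: api.TokenLogprob{Token: " blue", Logprob: -0.12},
		TopLogprobs: []api.TokenLogprob{
			{Token: " blue", Logprob: -0.12},
			{Token: " clear", Logprob: -2.47},
		},
	}
	b, _ := json.Marshal(lp)
	fmt.Println(string(b))
	// {"token":" blue","logprob":-0.12,"top_logprobs":[{"token":" blue","logprob":-0.12},{"token":" clear","logprob":-2.47}]}
}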


@ -381,3 +381,174 @@ func TestAPIShowModel(t *testing.T) {
t.Errorf("%s missing modified_at: %#v", modelName, resp) t.Errorf("%s missing modified_at: %#v", modelName, resp)
} }
} }
func TestAPIGenerateLogprobs(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
if err := PullIfMissing(ctx, client, smol); err != nil {
t.Fatalf("pull failed %s", err)
}
enableLogprobs := true
noStream := false
tests := []struct {
name string
logprobs *bool
topLogprobs int
expectCount int
}{
{
name: "no_logprobs",
logprobs: nil,
topLogprobs: 0,
expectCount: 0,
},
{
name: "logprobs_only",
logprobs: &enableLogprobs,
topLogprobs: 0,
expectCount: 1,
},
{
name: "logprobs_with_top_5",
logprobs: &enableLogprobs,
topLogprobs: 5,
expectCount: 1,
},
}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
req := api.GenerateRequest{
Model: smol,
Prompt: "Why is the sky blue?",
Stream: &noStream,
Logprobs: test.logprobs != nil && *test.logprobs,
TopLogprobs: test.topLogprobs,
Options: map[string]interface{}{
"temperature": 0,
"seed": 123,
"num_predict": 10,
},
}
var response api.GenerateResponse
err := client.Generate(ctx, &req, func(resp api.GenerateResponse) error {
if resp.Done {
response = resp
}
return nil
})
if err != nil {
t.Fatalf("generate failed: %s", err)
}
// Check logprobs based on expectation
if test.expectCount == 0 {
if len(response.Logprobs) > 0 {
t.Errorf("expected no logprobs but got %d", len(response.Logprobs))
}
} else {
if len(response.Logprobs) == 0 {
t.Errorf("expected logprobs but got none")
}
// Validate each logprob entry
for i, lp := range response.Logprobs {
if lp.Token == "" {
t.Errorf("logprob[%d] has empty token", i)
}
if lp.Logprob > 0 {
t.Errorf("logprob[%d] has positive logprob %f (should be <= 0)", i, lp.Logprob)
}
// Check top_logprobs if requested
if test.topLogprobs > 0 {
if len(lp.TopLogprobs) == 0 {
t.Errorf("logprob[%d] expected top_logprobs but got none", i)
}
if len(lp.TopLogprobs) > test.topLogprobs {
t.Errorf("logprob[%d] has %d top_logprobs, expected max %d", i, len(lp.TopLogprobs), test.topLogprobs)
}
// Verify top_logprobs are sorted by probability (descending)
for j := 1; j < len(lp.TopLogprobs); j++ {
if lp.TopLogprobs[j-1].Logprob < lp.TopLogprobs[j].Logprob {
t.Errorf("logprob[%d].top_logprobs not sorted: %f < %f", i, lp.TopLogprobs[j-1].Logprob, lp.TopLogprobs[j].Logprob)
}
}
} else if len(lp.TopLogprobs) > 0 {
t.Errorf("logprob[%d] has top_logprobs but none were requested", i)
}
}
}
})
}
}
func TestAPIChatLogprobs(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
client, _, cleanup := InitServerConnection(ctx, t)
defer cleanup()
if err := PullIfMissing(ctx, client, smol); err != nil {
t.Fatalf("pull failed %s", err)
}
enableLogprobs := true
noStream := false
req := api.ChatRequest{
Model: smol,
Messages: []api.Message{
{Role: "user", Content: "Say hello in one word"},
},
Stream: &noStream,
Logprobs: enableLogprobs,
TopLogprobs: 3,
Options: map[string]interface{}{
"temperature": 0,
"seed": 123,
"num_predict": 5,
},
}
var response api.ChatResponse
err := client.Chat(ctx, &req, func(resp api.ChatResponse) error {
if resp.Done {
response = resp
}
return nil
})
if err != nil {
t.Fatalf("chat failed: %s", err)
}
if len(response.Logprobs) == 0 {
t.Fatal("expected logprobs in response but got none")
}
t.Logf("received %d logprobs for chat response", len(response.Logprobs))
for i, lp := range response.Logprobs {
if lp.Token == "" {
t.Errorf("logprob[%d] has empty token", i)
}
if lp.Logprob > 0 {
t.Errorf("logprob[%d] has positive logprob %f", i, lp.Logprob)
}
if len(lp.TopLogprobs) == 0 {
t.Errorf("logprob[%d] expected top_logprobs but got none", i)
}
if len(lp.TopLogprobs) > 3 {
t.Errorf("logprob[%d] has %d top_logprobs, expected max 3", i, len(lp.TopLogprobs))
}
}
}


@ -217,6 +217,19 @@ func (c *Context) GetEmbeddingsIth(i int) []float32 {
return embeddings
}
// GetLogitsIth gets the logits for the ith token
func (c *Context) GetLogitsIth(i int) []float32 {
logits := unsafe.Pointer(C.llama_get_logits_ith(c.c, C.int32_t(i)))
if logits == nil {
return nil
}
vocabSize := c.Model().NumVocab()
result := make([]float32, vocabSize)
_ = copy(result, unsafe.Slice((*float32)(logits), vocabSize))
return result
}
type ModelParams struct {
NumGpuLayers int
MainGpu int


@ -1362,6 +1362,12 @@ type CompletionRequest struct {
Grammar string // set before sending the request to the subprocess
Shift bool
Truncate bool
// Logprobs specifies whether to include log probabilities in the response
Logprobs bool
// TopLogprobs specifies the number of most likely alternative tokens to return (0-20)
TopLogprobs int
}
// DoneReason represents the reason why a completion response is done
@ -1387,6 +1393,18 @@ func (d DoneReason) String() string {
}
}
// TokenLogprob represents log probability information for a single token alternative.
type TokenLogprob struct {
Token string `json:"token"`
Logprob float64 `json:"logprob"`
}
// Logprob contains log probability information for a generated token.
type Logprob struct {
TokenLogprob
TopLogprobs []TokenLogprob `json:"top_logprobs,omitempty"`
}
type CompletionResponse struct {
Content string `json:"content"`
DoneReason DoneReason `json:"done_reason"`
@ -1395,6 +1413,9 @@ type CompletionResponse struct {
PromptEvalDuration time.Duration `json:"prompt_eval_duration"`
EvalCount int `json:"eval_count"`
EvalDuration time.Duration `json:"eval_duration"`
// Logprobs contains log probability information if requested
Logprobs []Logprob `json:"logprobs,omitempty"`
}
func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
@ -1530,7 +1551,8 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
if c.Content != "" {
fn(CompletionResponse{
Content: c.Content,
Logprobs: c.Logprobs,
})
}


@ -40,22 +40,29 @@ type Message struct {
ToolCallID string `json:"tool_call_id,omitempty"`
}
type ChoiceLogprobs struct {
Content []api.Logprob `json:"content"`
}
type Choice struct {
Index int `json:"index"`
Message Message `json:"message"`
FinishReason *string `json:"finish_reason"`
Logprobs *ChoiceLogprobs `json:"logprobs,omitempty"`
}
type ChunkChoice struct {
Index int `json:"index"`
Delta Message `json:"delta"`
FinishReason *string `json:"finish_reason"`
Logprobs *ChoiceLogprobs `json:"logprobs,omitempty"`
}
type CompleteChunkChoice struct {
Text string `json:"text"`
Index int `json:"index"`
FinishReason *string `json:"finish_reason"`
Logprobs *ChoiceLogprobs `json:"logprobs,omitempty"`
}
type Usage struct {
@ -104,6 +111,8 @@ type ChatCompletionRequest struct {
Tools []api.Tool `json:"tools"`
Reasoning *Reasoning `json:"reasoning,omitempty"`
ReasoningEffort *string `json:"reasoning_effort,omitempty"`
Logprobs *bool `json:"logprobs"`
TopLogprobs int `json:"top_logprobs"`
DebugRenderOnly bool `json:"_debug_render_only"`
}
@ -142,6 +151,7 @@ type CompletionRequest struct {
Temperature *float32 `json:"temperature"`
TopP float32 `json:"top_p"`
Suffix string `json:"suffix"`
Logprobs *int `json:"logprobs"`
DebugRenderOnly bool `json:"_debug_render_only"`
}
@ -251,6 +261,12 @@ func ToToolCalls(tc []api.ToolCall) []ToolCall {
// ToChatCompletion converts an api.ChatResponse to ChatCompletion
func ToChatCompletion(id string, r api.ChatResponse) ChatCompletion {
toolCalls := ToToolCalls(r.Message.ToolCalls)
var logprobs *ChoiceLogprobs
if len(r.Logprobs) > 0 {
logprobs = &ChoiceLogprobs{Content: r.Logprobs}
}
return ChatCompletion{
Id: id,
Object: "chat.completion",
@ -269,6 +285,7 @@ func ToChatCompletion(id string, r api.ChatResponse) ChatCompletion {
}
return nil
}(r.DoneReason),
Logprobs: logprobs,
}}, Usage: ToUsage(r),
DebugInfo: r.DebugInfo,
}
@ -277,6 +294,12 @@ func ToChatCompletion(id string, r api.ChatResponse) ChatCompletion {
// ToChunk converts an api.ChatResponse to ChatCompletionChunk
func ToChunk(id string, r api.ChatResponse, toolCallSent bool) ChatCompletionChunk {
toolCalls := ToToolCalls(r.Message.ToolCalls)
var logprobs *ChoiceLogprobs
if len(r.Logprobs) > 0 {
logprobs = &ChoiceLogprobs{Content: r.Logprobs}
}
return ChatCompletionChunk{
Id: id,
Object: "chat.completion.chunk",
@ -295,6 +318,7 @@ func ToChunk(id string, r api.ChatResponse, toolCallSent bool) ChatCompletionChu
}
return nil
}(r.DoneReason),
Logprobs: logprobs,
}},
}
}
@ -604,6 +628,8 @@ func FromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
Stream: &r.Stream, Stream: &r.Stream,
Tools: r.Tools, Tools: r.Tools,
Think: think, Think: think,
Logprobs: r.Logprobs != nil && *r.Logprobs,
TopLogprobs: r.TopLogprobs,
DebugRenderOnly: r.DebugRenderOnly, DebugRenderOnly: r.DebugRenderOnly,
}, nil }, nil
} }
@ -680,12 +706,21 @@ func FromCompleteRequest(r CompletionRequest) (api.GenerateRequest, error) {
options["top_p"] = 1.0 options["top_p"] = 1.0
} }
var logprobs bool
var topLogprobs int
if r.Logprobs != nil && *r.Logprobs > 0 {
logprobs = true
topLogprobs = *r.Logprobs
}
return api.GenerateRequest{
Model: r.Model,
Prompt: r.Prompt,
Options: options,
Stream: &r.Stream,
Suffix: r.Suffix,
Logprobs: logprobs,
TopLogprobs: topLogprobs,
DebugRenderOnly: r.DebugRenderOnly,
}, nil
}
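
For reference, a short sketch of how the OpenAI-compatible request fields map onto the native request through FromChatRequest above (the model name and values are placeholders, not part of this change):

package main

import (
	"fmt"

	"github.com/ollama/ollama/openai"
)

func main() {
	enabled := true
	req := openai.ChatCompletionRequest{
		Model:       "llama3.2", // placeholder model name
		Messages:    []openai.Message{{Role: "user", Content: "Hello"}},
		Logprobs:    &enabled,
		TopLogprobs: 4,
	}
	native, err := openai.FromChatRequest(req)
	if err != nil {
		panic(err)
	}
	// The boolean and count carry over to the native api.ChatRequest.
	fmt.Println(native.Logprobs, native.TopLogprobs) // true 4
}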


@ -3,6 +3,7 @@ package openai
import (
"encoding/base64"
"testing"
"time"
"github.com/google/go-cmp/cmp"
@ -218,3 +219,218 @@ func TestToToolCallsPreservesIDs(t *testing.T) {
t.Errorf("input tool calls mutated (-want +got):\n%s", diff)
}
}
func TestFromChatRequest_WithLogprobs(t *testing.T) {
trueVal := true
req := ChatCompletionRequest{
Model: "test-model",
Messages: []Message{
{Role: "user", Content: "Hello"},
},
Logprobs: &trueVal,
TopLogprobs: 5,
}
result, err := FromChatRequest(req)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if !result.Logprobs {
t.Error("expected Logprobs to be true")
}
if result.TopLogprobs != 5 {
t.Errorf("expected TopLogprobs to be 5, got %d", result.TopLogprobs)
}
}
func TestFromChatRequest_LogprobsDefault(t *testing.T) {
req := ChatCompletionRequest{
Model: "test-model",
Messages: []Message{
{Role: "user", Content: "Hello"},
},
}
result, err := FromChatRequest(req)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if result.Logprobs {
t.Error("expected Logprobs to be false by default")
}
if result.TopLogprobs != 0 {
t.Errorf("expected TopLogprobs to be 0 by default, got %d", result.TopLogprobs)
}
}
func TestFromCompleteRequest_WithLogprobs(t *testing.T) {
logprobsVal := 5
req := CompletionRequest{
Model: "test-model",
Prompt: "Hello",
Logprobs: &logprobsVal,
}
result, err := FromCompleteRequest(req)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if !result.Logprobs {
t.Error("expected Logprobs to be true")
}
if result.TopLogprobs != 5 {
t.Errorf("expected TopLogprobs to be 5, got %d", result.TopLogprobs)
}
}
func TestToChatCompletion_WithLogprobs(t *testing.T) {
createdAt := time.Unix(1234567890, 0)
resp := api.ChatResponse{
Model: "test-model",
CreatedAt: createdAt,
Message: api.Message{Role: "assistant", Content: "Hello there"},
Logprobs: []api.Logprob{
{
TokenLogprob: api.TokenLogprob{
Token: "Hello",
Logprob: -0.5,
},
TopLogprobs: []api.TokenLogprob{
{Token: "Hello", Logprob: -0.5},
{Token: "Hi", Logprob: -1.2},
},
},
{
TokenLogprob: api.TokenLogprob{
Token: " there",
Logprob: -0.3,
},
TopLogprobs: []api.TokenLogprob{
{Token: " there", Logprob: -0.3},
{Token: " world", Logprob: -1.5},
},
},
},
Done: true,
Metrics: api.Metrics{
PromptEvalCount: 5,
EvalCount: 2,
},
}
id := "test-id"
result := ToChatCompletion(id, resp)
if result.Id != id {
t.Errorf("expected Id %q, got %q", id, result.Id)
}
if result.Created != 1234567890 {
t.Errorf("expected Created %d, got %d", int64(1234567890), result.Created)
}
if len(result.Choices) != 1 {
t.Fatalf("expected 1 choice, got %d", len(result.Choices))
}
choice := result.Choices[0]
if choice.Message.Content != "Hello there" {
t.Errorf("expected content %q, got %q", "Hello there", choice.Message.Content)
}
if choice.Logprobs == nil {
t.Fatal("expected Logprobs to be present")
}
if len(choice.Logprobs.Content) != 2 {
t.Fatalf("expected 2 logprobs, got %d", len(choice.Logprobs.Content))
}
// Verify first logprob
if choice.Logprobs.Content[0].Token != "Hello" {
t.Errorf("expected first token %q, got %q", "Hello", choice.Logprobs.Content[0].Token)
}
if choice.Logprobs.Content[0].Logprob != -0.5 {
t.Errorf("expected first logprob -0.5, got %f", choice.Logprobs.Content[0].Logprob)
}
if len(choice.Logprobs.Content[0].TopLogprobs) != 2 {
t.Errorf("expected 2 top_logprobs, got %d", len(choice.Logprobs.Content[0].TopLogprobs))
}
// Verify second logprob
if choice.Logprobs.Content[1].Token != " there" {
t.Errorf("expected second token %q, got %q", " there", choice.Logprobs.Content[1].Token)
}
}
func TestToChatCompletion_WithoutLogprobs(t *testing.T) {
createdAt := time.Unix(1234567890, 0)
resp := api.ChatResponse{
Model: "test-model",
CreatedAt: createdAt,
Message: api.Message{Role: "assistant", Content: "Hello"},
Done: true,
Metrics: api.Metrics{
PromptEvalCount: 5,
EvalCount: 1,
},
}
id := "test-id"
result := ToChatCompletion(id, resp)
if len(result.Choices) != 1 {
t.Fatalf("expected 1 choice, got %d", len(result.Choices))
}
// When no logprobs, Logprobs should be nil
if result.Choices[0].Logprobs != nil {
t.Error("expected Logprobs to be nil when not requested")
}
}
func TestFromChatRequest_TopLogprobsRange(t *testing.T) {
tests := []struct {
name string
topLogprobs int
expectValid bool
}{
{name: "valid: 0", topLogprobs: 0, expectValid: true},
{name: "valid: 1", topLogprobs: 1, expectValid: true},
{name: "valid: 10", topLogprobs: 10, expectValid: true},
{name: "valid: 20", topLogprobs: 20, expectValid: true},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
trueVal := true
req := ChatCompletionRequest{
Model: "test-model",
Messages: []Message{
{Role: "user", Content: "Hello"},
},
Logprobs: &trueVal,
TopLogprobs: tt.topLogprobs,
}
result, err := FromChatRequest(req)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if result.TopLogprobs != tt.topLogprobs {
t.Errorf("expected TopLogprobs %d, got %d", tt.topLogprobs, result.TopLogprobs)
}
})
}
}

runner/common/logprob.go (new file, 79 lines)

@ -0,0 +1,79 @@
package common
import (
"math"
"sort"
"github.com/ollama/ollama/llm"
)
// TokenDecoderFunc is a function that converts token IDs to text.
type TokenDecoderFunc func(tokenID int) string
// CalculateLogprobs converts raw logits to log probabilities and finds top K tokens.
// It uses numerically stable softmax to compute log probabilities.
func CalculateLogprobs(logits []float32, selectedToken int, topK int, decoder TokenDecoderFunc) []llm.Logprob {
if len(logits) == 0 {
return nil
}
// Step 1: Convert logits to log probabilities using numerically stable softmax
maxLogit := logits[0]
for _, logit := range logits[1:] {
if logit > maxLogit {
maxLogit = logit
}
}
var sumExp float64
for _, logit := range logits {
sumExp += math.Exp(float64(logit - maxLogit))
}
logSumExp := float32(math.Log(sumExp))
logProbs := make([]float32, len(logits))
for i, logit := range logits {
logProbs[i] = (logit - maxLogit) - logSumExp
}
// Step 2: Get selected token's information
selectedLogprob := logProbs[selectedToken]
selectedText := decoder(selectedToken)
result := llm.Logprob{
TokenLogprob: llm.TokenLogprob{
Token: selectedText,
Logprob: float64(selectedLogprob),
},
}
// Step 3: If topK requested, find the top K tokens
if topK > 0 {
type tokenLogprobPair struct {
tokenID int
logprob float32
}
pairs := make([]tokenLogprobPair, len(logProbs))
for i, lp := range logProbs {
pairs[i] = tokenLogprobPair{tokenID: i, logprob: lp}
}
sort.Slice(pairs, func(i, j int) bool {
return pairs[i].logprob > pairs[j].logprob
})
k := min(topK, len(pairs))
topLogprobs := make([]llm.TokenLogprob, k)
for i := range k {
tokenText := decoder(pairs[i].tokenID)
topLogprobs[i] = llm.TokenLogprob{
Token: tokenText,
Logprob: float64(pairs[i].logprob),
}
}
result.TopLogprobs = topLogprobs
}
return []llm.Logprob{result}
}
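
A quick usage sketch for the helper above, with a toy three-token vocabulary: for logits [2, 1, 0] the selected token's log probability is 2 − log(e^2 + e^1 + e^0) ≈ −0.408, and the requested alternatives come back in descending order.

package main

import (
	"fmt"

	"github.com/ollama/ollama/runner/common"
)

func main() {
	// Toy decoder mapping token IDs to strings (illustrative only).
	vocab := []string{"cat", "dog", "fish"}
	decoder := func(id int) string { return vocab[id] }

	// Selected token 0 ("cat"), requesting the top 2 alternatives.
	result := common.CalculateLogprobs([]float32{2.0, 1.0, 0.0}, 0, 2, decoder)

	fmt.Printf("%s %.3f\n", result[0].Token, result[0].Logprob) // cat -0.408
	for _, alt := range result[0].TopLogprobs {
		fmt.Printf("  %s %.3f\n", alt.Token, alt.Logprob) // cat -0.408, then dog -1.408
	}
}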


@ -0,0 +1,498 @@
package common
import (
"math"
"testing"
"github.com/ollama/ollama/llm"
)
func TestCalculateLogprobs(t *testing.T) {
tokens := map[int]string{
0: "hello",
1: "hi",
2: "hey",
3: "world",
}
decoder := func(tokenID int) string {
if text, ok := tokens[tokenID]; ok {
return text
}
return ""
}
tests := []struct {
name string
logits []float32
selectedToken int
topK int
wantLen int
wantToken string
}{
{
name: "Empty logits",
logits: []float32{},
selectedToken: 0,
topK: 0,
wantLen: 0,
},
{
name: "Single token without top logprobs",
logits: []float32{1.0, 0.5, 0.3, 0.1},
selectedToken: 0,
topK: 0,
wantLen: 1,
wantToken: "hello",
},
{
name: "Single token with top logprobs",
logits: []float32{1.0, 0.5, 0.3, 0.1},
selectedToken: 0,
topK: 3,
wantLen: 1,
wantToken: "hello",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := CalculateLogprobs(tt.logits, tt.selectedToken, tt.topK, decoder)
if len(result) != tt.wantLen {
t.Errorf("CalculateLogprobs() returned %d results, want %d", len(result), tt.wantLen)
}
if tt.wantLen > 0 && result[0].Token != tt.wantToken {
t.Errorf("CalculateLogprobs() token = %s, want %s", result[0].Token, tt.wantToken)
}
if tt.topK > 0 && len(result) > 0 {
if len(result[0].TopLogprobs) != tt.topK {
t.Errorf("CalculateLogprobs() top logprobs count = %d, want %d", len(result[0].TopLogprobs), tt.topK)
}
}
})
}
}
func TestCalculateLogprobsNumericalStability(t *testing.T) {
tokens := map[int]string{
0: "a",
1: "b",
2: "c",
}
decoder := func(tokenID int) string {
if text, ok := tokens[tokenID]; ok {
return text
}
return ""
}
// Test with very large logits to ensure numerical stability
logits := []float32{1000.0, 999.0, 998.0}
result := CalculateLogprobs(logits, 0, 3, decoder)
if len(result) != 1 {
t.Fatalf("Expected 1 result, got %d", len(result))
}
// Check that log probabilities are finite and reasonable
if math.IsInf(result[0].Logprob, 0) || math.IsNaN(result[0].Logprob) {
t.Errorf("Selected token logprob is not finite: %f", result[0].Logprob)
}
for i, tlp := range result[0].TopLogprobs {
if math.IsInf(tlp.Logprob, 0) || math.IsNaN(tlp.Logprob) {
t.Errorf("Top logprob[%d] is not finite: %f", i, tlp.Logprob)
}
}
// Top logprobs should be in descending order
for i := 1; i < len(result[0].TopLogprobs); i++ {
if result[0].TopLogprobs[i].Logprob > result[0].TopLogprobs[i-1].Logprob {
t.Errorf("Top logprobs not in descending order: %f > %f",
result[0].TopLogprobs[i].Logprob, result[0].TopLogprobs[i-1].Logprob)
}
}
}
func TestCalculateLogprobsProbabilityCorrectness(t *testing.T) {
tokens := map[int]string{
0: "hello",
1: "world",
2: "foo",
3: "bar",
}
decoder := func(tokenID int) string {
if text, ok := tokens[tokenID]; ok {
return text
}
return ""
}
tests := []struct {
name string
logits []float32
selectedToken int
topK int
}{
{
name: "Uniform logits",
logits: []float32{1.0, 1.0, 1.0, 1.0},
selectedToken: 0,
topK: 4,
},
{
name: "Different logits",
logits: []float32{2.0, 1.0, 0.5, 0.1},
selectedToken: 0,
topK: 4,
},
{
name: "Negative logits",
logits: []float32{-1.0, -2.0, -3.0, -4.0},
selectedToken: 0,
topK: 4,
},
{
name: "Mixed logits",
logits: []float32{5.0, -5.0, 0.0, 2.5},
selectedToken: 0,
topK: 4,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := CalculateLogprobs(tt.logits, tt.selectedToken, tt.topK, decoder)
if len(result) != 1 {
t.Fatalf("Expected 1 result, got %d", len(result))
}
// Verify all probabilities are non-positive (log probabilities should be <= 0)
if result[0].Logprob > 0 {
t.Errorf("Selected token logprob should be <= 0, got %f", result[0].Logprob)
}
for i, tlp := range result[0].TopLogprobs {
if tlp.Logprob > 0 {
t.Errorf("Top logprob[%d] should be <= 0, got %f", i, tlp.Logprob)
}
}
// Verify that probabilities sum to approximately 1
// Sum of exp(logprob) for all tokens should equal 1
var probSum float64
for _, lp := range result[0].TopLogprobs {
probSum += math.Exp(lp.Logprob)
}
// For uniform logits, each probability should be 1/n
if tt.name == "Uniform logits" {
expectedProb := 1.0 / float64(len(tt.logits))
actualProb := math.Exp(result[0].Logprob)
if math.Abs(actualProb-expectedProb) > 1e-6 {
t.Errorf("For uniform logits, expected probability %f, got %f",
expectedProb, actualProb)
}
}
// Verify top logprobs are sorted in descending order
for i := 1; i < len(result[0].TopLogprobs); i++ {
if result[0].TopLogprobs[i].Logprob > result[0].TopLogprobs[i-1].Logprob {
t.Errorf("Top logprobs not sorted: position %d (%f) > position %d (%f)",
i, result[0].TopLogprobs[i].Logprob,
i-1, result[0].TopLogprobs[i-1].Logprob)
}
}
// Verify the selected token appears in top logprobs
selectedText := decoder(tt.selectedToken)
found := false
for _, tlp := range result[0].TopLogprobs {
if tlp.Token == selectedText {
found = true
// The logprob in top logprobs should match the selected token's logprob
if math.Abs(tlp.Logprob-result[0].Logprob) > 1e-6 {
t.Errorf("Selected token logprob mismatch: main=%f, in top=%f",
result[0].Logprob, tlp.Logprob)
}
break
}
}
if !found {
t.Errorf("Selected token %q not found in top logprobs", selectedText)
}
})
}
}
func TestCalculateLogprobsSoftmaxCorrectness(t *testing.T) {
// Test that softmax calculation is correct by verifying probabilities sum to 1
decoder := func(tokenID int) string {
return string(rune('A' + tokenID))
}
tests := []struct {
name string
logits []float32
}{
{
name: "Small vocabulary",
logits: []float32{1.0, 2.0, 3.0},
},
{
name: "Large differences",
logits: []float32{10.0, 0.0, -10.0},
},
{
name: "All equal",
logits: []float32{5.0, 5.0, 5.0, 5.0, 5.0},
},
{
name: "Very large values",
logits: []float32{500.0, 499.0, 498.0},
},
{
name: "Very small values",
logits: []float32{-500.0, -499.0, -498.0},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
// Calculate logprobs for all tokens
var totalProb float64
for i := range tt.logits {
result := CalculateLogprobs(tt.logits, i, 0, decoder)
if len(result) != 1 {
t.Fatalf("Expected 1 result, got %d", len(result))
}
prob := math.Exp(result[0].Logprob)
totalProb += prob
// Verify each probability is between 0 and 1
if prob < 0 || prob > 1 {
t.Errorf("Token %d probability %f is out of range [0, 1]", i, prob)
}
}
// Total probability should be very close to 1.0 (allowing for floating point errors)
if math.Abs(totalProb-1.0) > 1e-5 {
t.Errorf("Total probability sum is %f, expected 1.0", totalProb)
}
})
}
}
func TestCalculateLogprobsSelectedTokenCorrectness(t *testing.T) {
decoder := func(tokenID int) string {
return string(rune('A' + tokenID))
}
logits := []float32{3.0, 1.0, 2.0, 0.5}
// Test that selecting different tokens gives the correct probabilities
// and that the highest logit has the highest probability
maxLogitIndex := 0
maxLogitValue := logits[0]
for i, logit := range logits[1:] {
if logit > maxLogitValue {
maxLogitValue = logit
maxLogitIndex = i + 1
}
}
var maxProb float64
var maxProbIndex int
for i := range logits {
result := CalculateLogprobs(logits, i, 0, decoder)
prob := math.Exp(result[0].Logprob)
if prob > maxProb {
maxProb = prob
maxProbIndex = i
}
// Verify the token matches
expectedToken := decoder(i)
if result[0].Token != expectedToken {
t.Errorf("Token %d: expected token %q, got %q", i, expectedToken, result[0].Token)
}
}
// The token with the highest logit should have the highest probability
if maxProbIndex != maxLogitIndex {
t.Errorf("Token with highest probability (%d) doesn't match token with highest logit (%d)",
maxProbIndex, maxLogitIndex)
}
}
func TestCalculateLogprobsTopKOrdering(t *testing.T) {
tokens := map[int]string{
0: "first",
1: "second",
2: "third",
3: "fourth",
4: "fifth",
}
decoder := func(tokenID int) string {
return tokens[tokenID]
}
// Logits in non-sorted order
logits := []float32{2.0, 5.0, 1.0, 4.0, 3.0}
// Expected order by probability: 1 (5.0), 3 (4.0), 4 (3.0), 0 (2.0), 2 (1.0)
expectedOrder := []string{"second", "fourth", "fifth", "first", "third"}
result := CalculateLogprobs(logits, 0, 5, decoder)
if len(result) != 1 {
t.Fatalf("Expected 1 result, got %d", len(result))
}
if len(result[0].TopLogprobs) != 5 {
t.Fatalf("Expected 5 top logprobs, got %d", len(result[0].TopLogprobs))
}
// Verify ordering matches expected
for i, tlp := range result[0].TopLogprobs {
if tlp.Token != expectedOrder[i] {
t.Errorf("Position %d: expected token %q, got %q", i, expectedOrder[i], tlp.Token)
}
}
// Verify probabilities are in descending order
for i := 1; i < len(result[0].TopLogprobs); i++ {
if result[0].TopLogprobs[i].Logprob > result[0].TopLogprobs[i-1].Logprob {
t.Errorf("Probabilities not in descending order at position %d: %f > %f",
i, result[0].TopLogprobs[i].Logprob, result[0].TopLogprobs[i-1].Logprob)
}
}
}
func TestLogprobsWithStopSequences(t *testing.T) {
tests := []struct {
name string
pendingResponses []string
pendingLogprobs []llm.Logprob
stop string
expectedResponses []string
expectedLogprobs int
}{
{
name: "Single token stop",
pendingResponses: []string{"Hello", " world", "!"},
pendingLogprobs: []llm.Logprob{
{TokenLogprob: llm.TokenLogprob{Token: "Hello", Logprob: -0.1}},
{TokenLogprob: llm.TokenLogprob{Token: " world", Logprob: -0.2}},
{TokenLogprob: llm.TokenLogprob{Token: "!", Logprob: -0.3}},
},
stop: "!",
expectedResponses: []string{"Hello", " world"},
expectedLogprobs: 2,
},
{
name: "Multi-token stop sequence",
pendingResponses: []string{"Hello", " ", "there", "STOP"},
pendingLogprobs: []llm.Logprob{
{TokenLogprob: llm.TokenLogprob{Token: "Hello", Logprob: -0.1}},
{TokenLogprob: llm.TokenLogprob{Token: " ", Logprob: -0.2}},
{TokenLogprob: llm.TokenLogprob{Token: "there", Logprob: -0.3}},
{TokenLogprob: llm.TokenLogprob{Token: "STOP", Logprob: -0.4}},
},
stop: "STOP",
expectedResponses: []string{"Hello", " ", "there"},
expectedLogprobs: 3,
},
{
name: "Partial token stop",
pendingResponses: []string{"Hello", " the", "re!"},
pendingLogprobs: []llm.Logprob{
{TokenLogprob: llm.TokenLogprob{Token: "Hello", Logprob: -0.1}},
{TokenLogprob: llm.TokenLogprob{Token: " the", Logprob: -0.2}},
{TokenLogprob: llm.TokenLogprob{Token: "re!", Logprob: -0.3}},
},
stop: "there!",
expectedResponses: []string{"Hello", " "},
expectedLogprobs: 2,
},
{
name: "Stop at beginning of last token",
pendingResponses: []string{"Hello", " world", "END"},
pendingLogprobs: []llm.Logprob{
{TokenLogprob: llm.TokenLogprob{Token: "Hello", Logprob: -0.1}},
{TokenLogprob: llm.TokenLogprob{Token: " world", Logprob: -0.2}},
{TokenLogprob: llm.TokenLogprob{Token: "END", Logprob: -0.3}},
},
stop: "END",
expectedResponses: []string{"Hello", " world"},
expectedLogprobs: 2,
},
{
name: "Multi-token stop across tokens",
pendingResponses: []string{"Text", " ", "with", " ", "stop", " ", "word"},
pendingLogprobs: []llm.Logprob{
{TokenLogprob: llm.TokenLogprob{Token: "Text", Logprob: -0.1}},
{TokenLogprob: llm.TokenLogprob{Token: " ", Logprob: -0.2}},
{TokenLogprob: llm.TokenLogprob{Token: "with", Logprob: -0.3}},
{TokenLogprob: llm.TokenLogprob{Token: " ", Logprob: -0.4}},
{TokenLogprob: llm.TokenLogprob{Token: "stop", Logprob: -0.5}},
{TokenLogprob: llm.TokenLogprob{Token: " ", Logprob: -0.6}},
{TokenLogprob: llm.TokenLogprob{Token: "word", Logprob: -0.7}},
},
stop: "stop word",
expectedResponses: []string{"Text", " ", "with", " "},
expectedLogprobs: 4,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
// Simulate the stop sequence detection and truncation
origLen := len(tt.pendingResponses)
responses, tokenTruncated := TruncateStop(tt.pendingResponses, tt.stop)
newLen := len(responses)
// Simulate logprobs truncation
logprobs := make([]llm.Logprob, len(tt.pendingLogprobs))
copy(logprobs, tt.pendingLogprobs)
origLogprobsLen := len(logprobs)
numTokensRemoved := origLen - newLen
newLogprobsLen := origLogprobsLen - numTokensRemoved
if newLogprobsLen < 0 {
newLogprobsLen = 0
}
logprobs = logprobs[:newLogprobsLen]
// Verify responses were truncated correctly
if len(responses) != len(tt.expectedResponses) {
t.Errorf("Expected %d responses, got %d", len(tt.expectedResponses), len(responses))
}
// Verify logprobs count matches truncated responses
if len(logprobs) != tt.expectedLogprobs {
t.Errorf("Expected %d logprobs after truncation, got %d", tt.expectedLogprobs, len(logprobs))
}
// Verify logprobs count matches response count
if len(logprobs) != len(responses) {
t.Errorf("Logprobs count (%d) doesn't match responses count (%d)", len(logprobs), len(responses))
}
// Verify the correct logprobs were kept (skip last token if it was truncated)
// When tokenTruncated is true, the last response token may not match the logprob token
checkLen := len(logprobs)
if tokenTruncated && checkLen > 0 {
checkLen-- // Skip checking the last token when it was partially truncated
}
for i := range checkLen {
if i < len(responses) && logprobs[i].Token != responses[i] {
t.Errorf("Logprob[%d] token %q doesn't match response[%d] %q",
i, logprobs[i].Token, i, responses[i])
}
}
})
}
}


@ -28,6 +28,12 @@ import (
"github.com/ollama/ollama/runner/common" "github.com/ollama/ollama/runner/common"
) )
// response contains a piece of generated text along with optional logprobs
type response struct {
content string
logprobs []llm.Logprob
}
// input is an element of the prompt to process, either
// a token or an image embedding (generated from a vision projector)
type input struct {
@ -53,11 +59,14 @@ type Sequence struct {
// tokens that have been generated but not returned yet (e.g. for stop sequences)
pendingResponses []string
// logprobs for tokens that haven't been returned yet
pendingLogprobs []llm.Logprob
// input cache being used by this sequence
cache *InputCacheSlot
// channel to send responses over
responses chan response
// channel to stop decoding (such as if the remote connection is closed)
quit chan bool
@ -84,6 +93,10 @@ type Sequence struct {
doneReason llm.DoneReason
// logprobs configuration
logprobs bool
topLogprobs int
// Metrics
processingDuration time.Duration
generationDuration time.Duration
@ -99,6 +112,8 @@ type NewSequenceParams struct {
embedding bool
shift bool
truncate bool
logprobs bool
topLogprobs int
}
var errorInputTooLong = errors.New("the input length exceeds the context length")
@ -155,7 +170,7 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe
numPromptInputs: len(inputs),
numPredict: params.numPredict,
pendingResponses: make([]string, 0),
responses: make(chan response, 100),
quit: make(chan bool, 1),
embedding: make(chan []float32, 1),
samplingCtx: sc,
@ -163,9 +178,16 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe
stop: params.stop,
numKeep: params.numKeep,
shift: params.shift,
logprobs: params.logprobs,
topLogprobs: params.topLogprobs,
}, nil
}
// calculateLogprobsLlama converts raw logits to log probabilities and finds top K tokens
func calculateLogprobsLlama(logits []float32, selectedToken int, topK int, model *llama.Model) []llm.Logprob {
return common.CalculateLogprobs(logits, selectedToken, topK, model.TokenToPiece)
}
// inputs processes the prompt and images into a list of inputs
// by splitting the prompt on [img-<n>] tags, tokenizing text and
// generating image embeddings for each image
@ -294,7 +316,9 @@ func (s *Server) allNil() bool {
func flushPending(seq *Sequence) bool {
joined := strings.Join(seq.pendingResponses, "")
logprobs := seq.pendingLogprobs
seq.pendingResponses = []string{}
seq.pendingLogprobs = []llm.Logprob{}
// Check if there are any partial UTF-8 characters remaining.
// We already check and queue as we are generating but some may
@ -311,7 +335,7 @@ func flushPending(seq *Sequence) bool {
}
select {
case seq.responses <- response{content: joined, logprobs: logprobs}:
return true
case <-seq.quit:
return false
@ -526,6 +550,15 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
continue
}
// Calculate logprobs if requested (after EOS check to avoid logprobs for EOS tokens)
if seq.logprobs {
logits := s.lc.GetLogitsIth(seq.iBatch)
if logits != nil {
logprobs := calculateLogprobsLlama(logits, token, seq.topLogprobs, s.model)
seq.pendingLogprobs = append(seq.pendingLogprobs, logprobs...)
}
}
seq.inputs = []input{{token: token}}
seq.pendingResponses = append(seq.pendingResponses, piece)
@ -539,6 +572,17 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
seq.pendingResponses, tokenTruncated = common.TruncateStop(seq.pendingResponses, stop)
newLen := len(seq.pendingResponses)
// Truncate logprobs to match the truncated responses
if seq.logprobs {
origLogprobsLen := len(seq.pendingLogprobs)
numTokensRemoved := origLen - newLen
newLogprobsLen := origLogprobsLen - numTokensRemoved
if newLogprobsLen < 0 {
newLogprobsLen = 0
}
seq.pendingLogprobs = seq.pendingLogprobs[:newLogprobsLen]
}
// Update the cache based on the tokens that will be returned:
// - We have 1 token more than is currently in the cache because
// the last one generated wasn't submitted to Decode
@ -618,6 +662,8 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
embedding: false,
shift: req.Shift,
truncate: req.Truncate,
logprobs: req.Logprobs,
topLogprobs: req.TopLogprobs,
})
if err != nil {
if errors.Is(err, errorInputTooLong) {
@ -669,10 +715,11 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
case <-r.Context().Done():
close(seq.quit)
return
case resp, ok := <-seq.responses:
if ok {
if err := json.NewEncoder(w).Encode(&llm.CompletionResponse{
Content: resp.content,
Logprobs: resp.logprobs,
}); err != nil {
http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
close(seq.quit)


@ -41,6 +41,12 @@ import (
_ "github.com/ollama/ollama/model/models" _ "github.com/ollama/ollama/model/models"
) )
// response contains a piece of generated text along with optional logprobs
type response struct {
content string
logprobs []llm.Logprob
}
type Sequence struct {
// ctxs are used for allocating tensors that last the lifetime of the sequence, such as
// multimodal embeddings
@ -61,11 +67,14 @@ type Sequence struct {
// tokens that have been generated but not returned yet (e.g. for stop sequences)
pendingResponses []string
// logprobs for tokens that haven't been returned yet
pendingLogprobs []llm.Logprob
// input cache being used by this sequence
cache *InputCacheSlot
// channel to send responses over
responses chan response
// channel to stop decoding (such as if the remote connection is closed)
quit chan bool
@ -93,6 +102,10 @@ type Sequence struct {
doneReason llm.DoneReason
// logprobs configuration
logprobs bool
topLogprobs int
// Metrics
startedAt, lastUpdatedAt time.Time
processingDuration time.Duration
@ -102,13 +115,15 @@ type Sequence struct {
}
type NewSequenceParams struct {
numPredict int
stop []string
numKeep int32
sampler sample.Sampler
embedding bool
shift bool
truncate bool
logprobs bool
topLogprobs int
}
var errorInputTooLong = errors.New("the input length exceeds the context length")
@ -181,7 +196,7 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe
numPromptInputs: len(inputs),
numPredict: params.numPredict,
pendingResponses: make([]string, 0),
responses: make(chan response, 100),
quit: make(chan bool, 1),
embedding: make(chan []float32, 1),
sampler: params.sampler,
@ -189,9 +204,20 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe
stop: params.stop,
numKeep: params.numKeep,
shift: params.shift,
logprobs: params.logprobs,
topLogprobs: params.topLogprobs,
}, nil
}
// calculateLogprobs converts raw logits to log probabilities and finds top K tokens
func calculateLogprobs(logits []float32, selectedToken int32, topK int, textProcessor model.TextProcessor) []llm.Logprob {
decoder := func(tokenID int) string {
text, _ := textProcessor.Decode([]int32{int32(tokenID)})
return text
}
return common.CalculateLogprobs(logits, int(selectedToken), topK, decoder)
}
// inputs processes the prompt and images into a list of inputs
// by splitting the prompt on [img-<n>] tags, tokenizing text and
// decoding images
@ -371,7 +397,9 @@ func (s *Server) allNil() bool {
func flushPending(seq *Sequence) bool {
joined := strings.Join(seq.pendingResponses, "")
logprobs := seq.pendingLogprobs
seq.pendingResponses = []string{}
seq.pendingLogprobs = []llm.Logprob{}
// Check if there are any partial UTF-8 characters remaining.
// We already check and queue as we are generating but some may
@ -388,7 +416,7 @@ func flushPending(seq *Sequence) bool {
}
select {
case seq.responses <- response{content: joined, logprobs: logprobs}:
return true
case <-seq.quit:
return false
@ -729,7 +757,8 @@ func (s *Server) computeBatch(activeBatch batchState) {
// sample a token
vocabSize := len(outputs) / activeBatch.batch.Outputs.Dim(0)
logutil.Trace("computeBatch: vocab details", "batchID", activeBatch.id, "seqIdx", i, "len(logits)", len(outputs), "len(activeBatch.batch.Outputs)", activeBatch.batch.Outputs.Dim(0), "vocabSize", vocabSize, "iBatches", iBatches)
logits := outputs[iBatches[i]*vocabSize : (iBatches[i]+1)*vocabSize]
token, err := seq.sampler.Sample(logits)
if err != nil {
panic("failed to sample token")
}
@ -751,6 +780,12 @@ func (s *Server) computeBatch(activeBatch batchState) {
panic("failed to decode token") panic("failed to decode token")
} }
// Calculate logprobs if requested (after EOS check to avoid logprobs for EOS tokens)
if seq.logprobs {
logprobs := calculateLogprobs(logits, token, seq.topLogprobs, s.model.(model.TextProcessor))
seq.pendingLogprobs = append(seq.pendingLogprobs, logprobs...)
}
seq.pendingResponses = append(seq.pendingResponses, piece)
sequence := strings.Join(seq.pendingResponses, "")
@ -762,6 +797,17 @@ func (s *Server) computeBatch(activeBatch batchState) {
seq.pendingResponses, tokenTruncated = common.TruncateStop(seq.pendingResponses, stop)
newLen := len(seq.pendingResponses)
// Truncate logprobs to match the truncated responses
if seq.logprobs {
origLogprobsLen := len(seq.pendingLogprobs)
numTokensRemoved := origLen - newLen
newLogprobsLen := origLogprobsLen - numTokensRemoved
if newLogprobsLen < 0 {
newLogprobsLen = 0
}
seq.pendingLogprobs = seq.pendingLogprobs[:newLogprobsLen]
}
// Update the cache based on the tokens that will be returned:
// - We have 1 token more than is currently in the cache because
// the last one generated wasn't submitted to Decode
@ -845,13 +891,15 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
)
seq, err := s.NewSequence(req.Prompt, req.Images, NewSequenceParams{
numPredict: req.Options.NumPredict,
stop: req.Options.Stop,
numKeep: int32(req.Options.NumKeep),
sampler: sampler,
embedding: false,
shift: req.Shift,
truncate: req.Truncate,
logprobs: req.Logprobs,
topLogprobs: req.TopLogprobs,
})
if err != nil {
if errors.Is(err, errorInputTooLong) {
@ -903,10 +951,11 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
case <-r.Context().Done():
close(seq.quit)
return
case resp, ok := <-seq.responses:
if ok {
if err := json.NewEncoder(w).Encode(&llm.CompletionResponse{
Content: resp.content,
Logprobs: resp.logprobs,
}); err != nil {
http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
close(seq.quit)

server/logprob.go (new file, 29 lines)

@ -0,0 +1,29 @@
package server
import (
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/llm"
)
// toAPILogprobs converts llm.Logprobs to api.Logprobs
func toAPILogprobs(logprobs []llm.Logprob) []api.Logprob {
result := make([]api.Logprob, len(logprobs))
for i, lp := range logprobs {
result[i] = api.Logprob{
TokenLogprob: api.TokenLogprob{
Token: lp.Token,
Logprob: lp.Logprob,
},
}
if len(lp.TopLogprobs) > 0 {
result[i].TopLogprobs = make([]api.TokenLogprob, len(lp.TopLogprobs))
for j, tlp := range lp.TopLogprobs {
result[i].TopLogprobs[j] = api.TokenLogprob{
Token: tlp.Token,
Logprob: tlp.Logprob,
}
}
}
}
return result
}


@ -183,6 +183,11 @@ func (s *Server) GenerateHandler(c *gin.Context) {
return
}
if req.TopLogprobs < 0 || req.TopLogprobs > 20 {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "top_logprobs must be between 0 and 20"})
return
}
name := model.ParseName(req.Model)
if !name.IsValid() {
// Ideally this is "invalid model name" but we're keeping with
@ -212,6 +217,11 @@ func (s *Server) GenerateHandler(c *gin.Context) {
return
}
if req.TopLogprobs < 0 || req.TopLogprobs > 20 {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "top_logprobs must be between 0 and 20"})
return
}
if m.Config.RemoteHost != "" && m.Config.RemoteModel != "" {
origModel := req.Model
@ -502,12 +512,14 @@ func (s *Server) GenerateHandler(c *gin.Context) {
var sb strings.Builder
defer close(ch)
if err := r.Completion(c.Request.Context(), llm.CompletionRequest{
Prompt: prompt,
Images: images,
Format: req.Format,
Options: opts,
Shift: req.Shift == nil || *req.Shift,
Truncate: req.Truncate == nil || *req.Truncate,
Logprobs: req.Logprobs,
TopLogprobs: req.TopLogprobs,
}, func(cr llm.CompletionResponse) {
res := api.GenerateResponse{
Model: req.Model,
@ -520,6 +532,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
EvalCount: cr.EvalCount,
EvalDuration: cr.EvalDuration,
},
Logprobs: toAPILogprobs(cr.Logprobs),
}
if builtinParser != nil {
@ -580,6 +593,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
if req.Stream != nil && !*req.Stream {
var r api.GenerateResponse
var allLogprobs []api.Logprob
var sbThinking strings.Builder
var sbContent strings.Builder
for rr := range ch {
@ -588,6 +602,10 @@ func (s *Server) GenerateHandler(c *gin.Context) {
sbThinking.WriteString(t.Thinking)
sbContent.WriteString(t.Response)
r = t
// Accumulate logprobs from all chunks for non-streaming response
if len(t.Logprobs) > 0 {
allLogprobs = append(allLogprobs, t.Logprobs...)
}
case gin.H:
msg, ok := t["error"].(string)
if !ok {
@ -609,6 +627,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
r.Thinking = sbThinking.String()
r.Response = sbContent.String()
r.Logprobs = allLogprobs
c.JSON(http.StatusOK, r)
return
@ -1834,6 +1853,11 @@ func (s *Server) ChatHandler(c *gin.Context) {
return
}
if req.TopLogprobs < 0 || req.TopLogprobs > 20 {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "top_logprobs must be between 0 and 20"})
return
}
name := model.ParseName(req.Model)
if !name.IsValid() {
c.JSON(http.StatusBadRequest, gin.H{"error": "model is required"})
@ -1859,6 +1883,11 @@ func (s *Server) ChatHandler(c *gin.Context) {
return
}
if req.TopLogprobs < 0 || req.TopLogprobs > 20 {
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "top_logprobs must be between 0 and 20"})
return
}
// expire the runner
if len(req.Messages) == 0 && req.KeepAlive != nil && req.KeepAlive.Duration == 0 {
s.sched.expireRunner(m)
@ -2104,12 +2133,14 @@ func (s *Server) ChatHandler(c *gin.Context) {
// sets up new context given parent context per request
ctx, cancel := context.WithCancel(c.Request.Context())
err := r.Completion(ctx, llm.CompletionRequest{
Prompt: prompt,
Images: images,
Format: currentFormat,
Options: opts,
Shift: req.Shift == nil || *req.Shift,
Truncate: truncate,
Logprobs: req.Logprobs,
TopLogprobs: req.TopLogprobs,
}, func(r llm.CompletionResponse) {
res := api.ChatResponse{
Model: req.Model,
@ -2122,7 +2153,9 @@ func (s *Server) ChatHandler(c *gin.Context) {
EvalCount: r.EvalCount,
EvalDuration: r.EvalDuration,
},
Logprobs: toAPILogprobs(r.Logprobs),
}
if r.Done {
res.DoneReason = r.DoneReason.String()
res.TotalDuration = time.Since(checkpointStart)
@ -2251,6 +2284,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
if req.Stream != nil && !*req.Stream {
var resp api.ChatResponse
var toolCalls []api.ToolCall
var allLogprobs []api.Logprob
var sbThinking strings.Builder
var sbContent strings.Builder
for rr := range ch {
@ -2262,6 +2296,10 @@ func (s *Server) ChatHandler(c *gin.Context) {
if len(req.Tools) > 0 {
toolCalls = append(toolCalls, t.Message.ToolCalls...)
} }
// Accumulate logprobs from all chunks for non-streaming response
if len(t.Logprobs) > 0 {
allLogprobs = append(allLogprobs, t.Logprobs...)
}
case gin.H:
msg, ok := t["error"].(string)
if !ok {
@ -2283,6 +2321,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
resp.Message.Content = sbContent.String()
resp.Message.Thinking = sbThinking.String()
resp.Logprobs = allLogprobs
if len(toolCalls) > 0 {
resp.Message.ToolCalls = toolCalls


@ -1184,6 +1184,86 @@ func TestGenerate(t *testing.T) {
})
}
func TestGenerateLogprobs(t *testing.T) {
t.Run("invalid top_logprobs negative", func(t *testing.T) {
gin.SetMode(gin.TestMode)
s := Server{}
w := createRequest(t, s.GenerateHandler, api.GenerateRequest{
Model: "test",
Prompt: "Hello",
TopLogprobs: -1,
})
if w.Code != http.StatusBadRequest {
t.Errorf("expected status 400, got %d", w.Code)
}
if diff := cmp.Diff(w.Body.String(), `{"error":"top_logprobs must be between 0 and 20"}`); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
})
t.Run("invalid top_logprobs too high", func(t *testing.T) {
gin.SetMode(gin.TestMode)
s := Server{}
w := createRequest(t, s.GenerateHandler, api.GenerateRequest{
Model: "test",
Prompt: "Hello",
TopLogprobs: 21,
})
if w.Code != http.StatusBadRequest {
t.Errorf("expected status 400, got %d", w.Code)
}
if diff := cmp.Diff(w.Body.String(), `{"error":"top_logprobs must be between 0 and 20"}`); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
})
}
func TestChatLogprobs(t *testing.T) {
t.Run("invalid top_logprobs negative", func(t *testing.T) {
gin.SetMode(gin.TestMode)
s := Server{}
w := createRequest(t, s.ChatHandler, api.ChatRequest{
Model: "test",
Messages: []api.Message{
{Role: "user", Content: "Hello"},
},
TopLogprobs: -1,
})
if w.Code != http.StatusBadRequest {
t.Errorf("expected status 400, got %d", w.Code)
}
if diff := cmp.Diff(w.Body.String(), `{"error":"top_logprobs must be between 0 and 20"}`); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
})
t.Run("invalid top_logprobs too high", func(t *testing.T) {
gin.SetMode(gin.TestMode)
s := Server{}
w := createRequest(t, s.ChatHandler, api.ChatRequest{
Model: "test",
Messages: []api.Message{
{Role: "user", Content: "Hello"},
},
TopLogprobs: 21,
})
if w.Code != http.StatusBadRequest {
t.Errorf("expected status 400, got %d", w.Code)
}
if diff := cmp.Diff(w.Body.String(), `{"error":"top_logprobs must be between 0 and 20"}`); diff != "" {
t.Errorf("mismatch (-got +want):\n%s", diff)
}
})
}
func TestChatWithPromptEndingInThinkTag(t *testing.T) {
gin.SetMode(gin.TestMode)