mirror of https://github.com/ollama/ollama
Merge e860834e00 into 903b1fc97f
Commit 02deb45532

docs/gpu.mdx

@@ -163,4 +163,34 @@ To select specific Vulkan GPU(s), you can set the environment variable
`GGML_VK_VISIBLE_DEVICES` to one or more numeric IDs on the Ollama server as
described in the [FAQ](faq.md#how-do-i-configure-ollama-server). If you
encounter any problems with Vulkan-based GPUs, you can disable all Vulkan GPUs
by setting `GGML_VK_VISIBLE_DEVICES=-1`.

## Advanced: Manually overriding multi-GPU layer split

By default, Ollama decides how many layers to offload to each GPU based on free
memory and other heuristics. For some large models and specific hardware mixes you may
prefer a manual split.

Set the environment variable `OLLAMA_OVERRIDE_CONFIG` to the path of an INI file (or place
one at `~/.ollama.ini`) and add a section for the model's short name with a `tensor-split`
value:

```ini
[llama3:70b]
tensor-split=18,21,21,21
```

The list represents proportions for each visible GPU, in order. The sum of the values
determines how many of the last layers will be offloaded to GPU VRAM
(`n_gpu_layers = sum(tensor-split)`), and the assignment is proportional across devices:
larger numbers get more of those last layers. For example, `tensor-split=18,21,21,21`
offloads the last 81 layers, placing 18 of them on the first GPU and 21 on each of the
other three.

**Constraints**

- The number of values must be ≤ the number of visible GPUs.
- All values must be non-negative integers.
- If the override is invalid or not present for the model, Ollama falls back to its heuristics.

This feature is intended for expert tuning when the automatic split under-utilizes
your GPUs for a given model/context configuration.
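
As a rough standalone sketch of that rule (illustrative only — the server's actual assignment is implemented by `buildGPULayersFromOverride` in `llm/server.go`, shown later in this diff; the function name `splitLayers` and the 90-layer model below are made up for the example):

```go
package main

import "fmt"

// splitLayers sketches the documented rule: the sum of the tensor-split values is
// the number of layers offloaded (n_gpu_layers), and those last layers are handed
// out proportionally. Since the offloaded span equals the sum, GPU i simply
// receives split[i] consecutive layers from the tail of the model.
func splitLayers(totalLayers int, split []int) [][]int {
    sum := 0
    for _, v := range split {
        sum += v
    }
    if sum == 0 || sum > totalLayers {
        return nil // invalid override: the server would fall back to its heuristics
    }
    assign := make([][]int, len(split))
    layer := totalLayers - sum // first offloaded layer; earlier layers stay on the CPU
    for gpu, count := range split {
        for j := 0; j < count; j++ {
            assign[gpu] = append(assign[gpu], layer)
            layer++
        }
    }
    return assign
}

func main() {
    // tensor-split=18,21,21,21 on a hypothetical 90-layer model: the last 81
    // layers are offloaded, 18 of them to GPU 0 and 21 to each of GPUs 1-3.
    for gpu, layers := range splitLayers(90, []int{18, 21, 21, 21}) {
        if len(layers) == 0 {
            continue
        }
        fmt.Printf("gpu%d: %d layers (%d..%d)\n", gpu, len(layers), layers[0], layers[len(layers)-1])
    }
}
```
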

@@ -206,6 +206,9 @@ var (
    UseAuth = Bool("OLLAMA_AUTH")
    // Enable Vulkan backend
    EnableVulkan = Bool("OLLAMA_VULKAN")
    // Optional path to per-model override file (INI).
    // If unset, defaults to ~/.ollama.ini
    OverrideConfigPath = String("OLLAMA_OVERRIDE_CONFIG")
)

func String(s string) func() string {

@@ -293,6 +296,7 @@ func AsMap() map[string]EnvVar {
        "OLLAMA_CONTEXT_LENGTH": {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 4096)"},
        "OLLAMA_NEW_ENGINE": {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"},
        "OLLAMA_REMOTES": {"OLLAMA_REMOTES", Remotes(), "Allowed hosts for remote models (default \"ollama.com\")"},
        "OLLAMA_OVERRIDE_CONFIG": {"OLLAMA_OVERRIDE_CONFIG", OverrideConfigPath(), "Path to model override config (default: ~/.ollama.ini)"},

        // Informational
        "HTTP_PROXY": {"HTTP_PROXY", String("HTTP_PROXY")(), "HTTP proxy"},

@@ -0,0 +1,116 @@
package envconfig

import (
    "bufio"
    "os"
    "path/filepath"
    "strings"
)

type Override struct {
    ModelName    string
    NumGPULayers int   // -1 means unset
    TensorSplit  []int // nil means unset
}

// LoadOverride loads overrides for the given model section name (e.g. "llama3.2-vision:90b").
// The INI format is:
//	[model-name:params]
//	tensor-split=<int[,int,...]>
// Note: n-gpu-layers is not read from the file; it is always derived as the sum of tensor-split.
// Returns nil if no file or no matching section.
func LoadOverride(model string) *Override {
    // Resolve config path
    path := OverrideConfigPath()
    if path == "" {
        home, _ := os.UserHomeDir()
        if home == "" {
            return nil
        }
        path = filepath.Join(home, ".ollama.ini")
    }
    f, err := os.Open(path)
    if err != nil {
        return nil
    }
    defer f.Close()

    sectionHdr := "[" + model + "]"
    var inSection bool
    ovr := &Override{ModelName: model, NumGPULayers: -1}

    sc := bufio.NewScanner(f)
    for sc.Scan() {
        line := strings.TrimSpace(sc.Text())
        if line == "" || strings.HasPrefix(line, "#") || strings.HasPrefix(line, ";") {
            continue
        }
        if strings.HasPrefix(line, "[") && strings.HasSuffix(line, "]") {
            inSection = (line == sectionHdr)
            continue
        }
        if !inSection {
            continue
        }
        kv := strings.SplitN(line, "=", 2)
        if len(kv) != 2 {
            continue
        }
        k := strings.TrimSpace(strings.ToLower(kv[0]))
        v := strings.TrimSpace(kv[1])

        switch k {
        case "tensor-split":
            if arr := parseUintList(v); len(arr) > 0 {
                ovr.TensorSplit = arr
            }
        }
    }

    // If a tensor-split is provided, NumGPULayers is always the sum of entries.
    if len(ovr.TensorSplit) > 0 {
        total := 0
        for _, n := range ovr.TensorSplit {
            total += n
        }
        ovr.NumGPULayers = total
    }

    // If nothing set, return nil
    if ovr.NumGPULayers < 0 && len(ovr.TensorSplit) == 0 {
        return nil
    }
    return ovr
}

func parseUint(s string) int {
    s = strings.TrimSpace(s)
    if s == "" {
        return -1
    }
    var n int
    for _, r := range s {
        if r < '0' || r > '9' {
            return -1
        }
        n = n*10 + int(r-'0')
    }
    return n
}

func parseUintList(s string) []int {
    s = strings.TrimSpace(s)
    if s == "" {
        return nil
    }
    parts := strings.Split(s, ",")
    out := make([]int, 0, len(parts))
    for _, p := range parts {
        n := parseUint(strings.TrimSpace(p))
        if n < 0 {
            return nil
        }
        out = append(out, n)
    }
    return out
}
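
For reference, a hypothetical override file that this parser accepts: comment lines may start with `#` or `;`, section headers use the model's short name, keys are lower-cased before matching, and only `tensor-split` is read (`n_gpu_layers` is always derived as its sum). The model names here are just examples taken from elsewhere in this change:

```ini
# proportions are per visible GPU, in order
[llama3:70b]
tensor-split=18,21,21,21

; a second model section in the same file
[llama3.2-vision:90b]
tensor-split=10,20,30
```
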

@@ -0,0 +1,140 @@
package envconfig

import (
    "os"
    "path/filepath"
    "testing"
)

func TestParseUint(t *testing.T) {
    tests := []struct {
        in   string
        want int
    }{
        {"0", 0},
        {"1", 1},
        {"123", 123},
        {"", -1},
        {" 42 ", 42},
        {"-1", -1},
        {"abc", -1},
        {"12x", -1},
    }
    for _, tt := range tests {
        if got := parseUint(tt.in); got != tt.want {
            t.Fatalf("parseUint(%q) = %d; want %d", tt.in, got, tt.want)
        }
    }
}

func TestParseUintList(t *testing.T) {
    tests := []struct {
        in   string
        want []int
    }{
        {"", nil},
        {"1", []int{1}},
        {"1,2,3", []int{1, 2, 3}},
        {" 4 , 5 , 6 ", []int{4, 5, 6}},
        {"1, -2", nil}, // invalid -> whole list rejected
        {"a,b", nil},
    }
    for _, tt := range tests {
        got := parseUintList(tt.in)
        if (got == nil) != (tt.want == nil) {
            t.Fatalf("parseUintList(%q) = %v; want %v", tt.in, got, tt.want)
        }
        if got == nil {
            continue
        }
        if len(got) != len(tt.want) {
            t.Fatalf("parseUintList(%q) len=%d; want %d", tt.in, len(got), len(tt.want))
        }
        for i := range got {
            if got[i] != tt.want[i] {
                t.Fatalf("parseUintList(%q)[%d]=%d; want %d", tt.in, i, got[i], tt.want[i])
            }
        }
    }
}

func TestLoadOverride_Basic(t *testing.T) {
    dir := t.TempDir()
    cfg := filepath.Join(dir, "over.ini")
    content := `
; comment
[llama3.2-vision:90b]
n-gpu-layers=33
tensor-split=10,20,30

[other]
n-gpu-layers=1
`
    if err := os.WriteFile(cfg, []byte(content), 0o600); err != nil {
        t.Fatal(err)
    }
    t.Setenv("OLLAMA_OVERRIDE_CONFIG", cfg)

    ovr := LoadOverride("llama3.2-vision:90b")
    if ovr == nil {
        t.Fatalf("LoadOverride returned nil")
    }
    if ovr.ModelName != "llama3.2-vision:90b" {
        t.Fatalf("ModelName=%q; want %q", ovr.ModelName, "llama3.2-vision:90b")
    }
    // n-gpu-layers must be the sum of tensor-split entries (10+20+30=60).
    if ovr.NumGPULayers != 60 {
        t.Fatalf("NumGPULayers=%d; want %d", ovr.NumGPULayers, 60)
    }
    wantSplit := []int{10, 20, 30}
    if len(ovr.TensorSplit) != len(wantSplit) {
        t.Fatalf("TensorSplit len=%d; want %d", len(ovr.TensorSplit), len(wantSplit))
    }
    for i := range wantSplit {
        if ovr.TensorSplit[i] != wantSplit[i] {
            t.Fatalf("TensorSplit[%d]=%d; want %d", i, ovr.TensorSplit[i], wantSplit[i])
        }
    }
}

func TestLoadOverride_NoMatchOrEmpty(t *testing.T) {
    dir := t.TempDir()
    cfg := filepath.Join(dir, "over.ini")
    content := `
[some-model]
n-gpu-layers=7
`
    if err := os.WriteFile(cfg, []byte(content), 0o600); err != nil {
        t.Fatal(err)
    }
    t.Setenv("OLLAMA_OVERRIDE_CONFIG", cfg)

    // Section exists but different model -> nil
    if got := LoadOverride("another-model"); got != nil {
        t.Fatalf("expected nil for unmatched section, got %#v", got)
    }

    // File missing -> nil
    t.Setenv("OLLAMA_OVERRIDE_CONFIG", filepath.Join(dir, "missing.ini"))
    if got := LoadOverride("some-model"); got != nil {
        t.Fatalf("expected nil for missing file, got %#v", got)
    }
}

func TestLoadOverride_BadValuesIgnored(t *testing.T) {
    dir := t.TempDir()
    cfg := filepath.Join(dir, "over.ini")
    content := `
[m]
n-gpu-layers=abc
tensor-split=1,2,x
`
    if err := os.WriteFile(cfg, []byte(content), 0o600); err != nil {
        t.Fatal(err)
    }
    t.Setenv("OLLAMA_OVERRIDE_CONFIG", cfg)

    if got := LoadOverride("m"); got != nil {
        t.Fatalf("expected nil when no valid keys parsed, got %#v", got)
    }
}

llm/server.go

@@ -103,7 +103,8 @@ type llmServer struct {
    loadStart    time.Time // Record how long it took the model to load
    loadProgress float32

    sem      *semaphore.Weighted
    override *envconfig.Override
}

type llamaServer struct {

@@ -118,6 +119,80 @@ type ollamaServer struct {
    textProcessor model.TextProcessor // textProcessor handles text encoding/decoding
}

// buildGPULayersFromOverride constructs an explicit ml.GPULayersList from a
// per-model override configuration.
//
// It takes:
//   - totalLayers: the total number of layers in the model (i.e. block_count+1,
//     including the output layer).
//   - gpus: the visible GPUs, whose order is used to map tensor-split entries to
//     device IDs (tensor-split index i -> gpus[i]).
//   - override: the parsed override, which provides:
//     * NumGPULayers: how many of the last model layers to offload, and
//     * TensorSplit: integer weights describing how to distribute those layers
//       across the GPUs (proportional split).
//
// The function assigns the last NumGPULayers layers, beginning at
// max(0, blocks-NumGPULayers), to GPUs according to the cumulative proportions
// derived from TensorSplit. If TensorSplit has more entries than visible GPUs,
// any required value is non-positive, the proportional total is zero, or the
// computed span is empty, the function returns nil to signal "no override".
//
// On success it returns a non-empty GPULayersList; otherwise it returns nil.
func buildGPULayersFromOverride(totalLayers int, gpus []ml.DeviceInfo, override *envconfig.Override) ml.GPULayersList {
    if totalLayers <= 0 || len(gpus) == 0 || override == nil {
        return nil
    }
    if len(override.TensorSplit) > len(gpus) {
        return nil
    }
    // cumulative proportions
    var total int
    for _, v := range override.TensorSplit {
        total += v
    }
    if total <= 0 {
        return nil
    }
    cum := make([]float32, len(override.TensorSplit))
    var run float32
    for i, v := range override.TensorSplit {
        run += float32(v) / float32(total)
        cum[i] = run
    }

    // totalLayers = blocks + 1
    blocks := totalLayers - 1
    start := max(0, blocks-override.NumGPULayers)
    stop := min(start+override.NumGPULayers, blocks+1)

    gl := make(ml.GPULayersList, len(gpus))
    for i := range gpus {
        gl[i].DeviceID = gpus[i].DeviceID
    }

    span := float32(stop - start)
    if span <= 0 {
        return nil
    }
    for layer := start; layer < stop; layer++ {
        ratio := float32(layer-start) / span
        idx := 0
        for i := range cum {
            if ratio < cum[i] {
                idx = i
                break
            }
        }
        gl[idx].Layers = append(gl[idx].Layers, layer)
    }
    if gl.Sum() == 0 {
        return nil
    }
    return gl
}

// LoadModel will load a model from disk. The model must be in the GGML format.
//
// It collects array values for arrays with a size less than or equal to

@@ -139,7 +214,7 @@ func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) {
}

// NewLlamaServer will run a server for the given GPUs
func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath string, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath string, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int, override *envconfig.Override) (LlamaServer, error) {
    var llamaModel *llama.Model
    var textProcessor model.TextProcessor
    var err error

@@ -280,6 +355,7 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st
        totalLayers: f.KV().BlockCount() + 1,
        loadStart: time.Now(),
        done: make(chan error, 1),
        override: override,
    }

    if err != nil {

@@ -492,6 +568,75 @@ type LoadResponse struct {

var ErrLoadRequiredFull = errors.New("unable to load full model on GPU")

// maybeApplyOverride attempts to replace a heuristic GPU layer layout with a
// per-model override read from OLLAMA_OVERRIDE_CONFIG.
//
// Inputs:
//   - gpus: the set of visible GPUs (their order is used to map tensor-split
//     entries onto devices: tensor-split index i -> gpus[i]).
//   - gpuLayers: the current, heuristic ml.GPULayersList that would be used if
//     no override is applied.
//
// Behavior:
//   - If no override is configured, or it is incomplete (missing NumGPULayers
//     or TensorSplit), the function returns the original gpuLayers and false.
//   - If TensorSplit has more entries than visible GPUs, or NumGPULayers
//     exceeds the model's total layers (block_count+1), the override is ignored
//     and the function returns the original gpuLayers and false (with a log
//     warning).
//   - Otherwise, it builds a replacement assignment via buildGPULayersFromOverride.
//     On success, it logs the application, updates s.options.NumGPU to match
//     the override's NumGPULayers (so downstream logging/heuristics see a
//     consistent value), and returns (override, true). If the mapping produces
//     no layers, the heuristic layout is kept and false is returned.
func (s *llmServer) maybeApplyOverride(gpus []ml.DeviceInfo, gpuLayers ml.GPULayersList) (ml.GPULayersList, bool) {
    // If no override is loaded, or it is incomplete, bail out
    if s.override == nil || s.override.NumGPULayers <= 0 || len(s.override.TensorSplit) == 0 {
        return gpuLayers, false
    }

    // Too many split entries for the visible GPUs? Warn and fall back.
    if len(s.override.TensorSplit) > len(gpus) {
        slog.Warn(
            "Override ignored: tensor-split override has more entries than visible GPUs; using heuristic split instead",
            "model", s.override.ModelName,
            "tensor_split_entries", len(s.override.TensorSplit),
            "visible_gpus", len(gpus),
        )
        return gpuLayers, false
    }

    // Reject overrides larger than the model (totalLayers == block_count + 1)
    maxLayers := int(s.totalLayers)
    if s.override.NumGPULayers > maxLayers {
        slog.Warn(
            "Override ignored: n_gpu_layers is larger than the maximum supported; using heuristic split instead",
            "model", s.override.ModelName,
            "max_layers", maxLayers,
            "n_gpu_layers", s.override.NumGPULayers,
        )
        return gpuLayers, false
    }

    override := buildGPULayersFromOverride(int(s.totalLayers), gpus, s.override)
    if override == nil || override.Sum() == 0 {
        slog.Warn("Override ignored: override mapping produced no layers; using heuristic layout instead")
        return gpuLayers, false
    }

    slog.Info(
        "Applying override from OLLAMA_OVERRIDE_CONFIG",
        "model", s.override.ModelName,
        "n_gpu_layers", s.override.NumGPULayers,
        "tensor_split", s.override.TensorSplit,
        "layers_offloaded", override.Sum(),
    )
    // Align NumGPU with the override for downstream logging / heuristics that read it
    s.options.NumGPU = s.override.NumGPULayers
    return override, true
}

func (s *llamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, systemGPUs []ml.DeviceInfo, requireFull bool) ([]ml.DeviceID, error) {
    slog.Info("loading model", "model layers", s.totalLayers, "requested", s.options.NumGPU)

@@ -626,6 +771,11 @@ func (s *llamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, system
        }
    }

    // Apply per-model override
    if newLayers, ok := s.maybeApplyOverride(gpus, gpuLayers); ok {
        gpuLayers = newLayers
    }

    // This maintains the historical assignment of graph sizes, though it isn't fully accurate
    graphSize := graphFullOffload
    if gpuLayers.Sum() < int(s.totalLayers) {

@@ -761,6 +911,11 @@ nextOperation:
    for operation := LoadOperationFit; operation < LoadOperationCommit; operation++ {
    nextLoad:
        for {
            // Apply per-model override if present
            if newLayers, ok := s.maybeApplyOverride(gpus, gpuLayers); ok {
                gpuLayers = newLayers
            }

            s.loadRequest.GPULayers = gpuLayers
            resp, err := s.initModel(ctx, s.loadRequest, operation)
            if err != nil {

@@ -9,6 +9,7 @@ import (

    "github.com/ollama/ollama/api"
    "github.com/ollama/ollama/envconfig"
    "github.com/ollama/ollama/format"
    "github.com/ollama/ollama/ml"
    "golang.org/x/sync/semaphore"
)

@@ -279,3 +280,125 @@ func TestLLMServerCompletionFormat(t *testing.T) {
    }, nil)
    checkValid(err)
}

func TestBuildGPULayersFromOverride_Basic(t *testing.T) {
    // totalLayers = blocks + 1. With totalLayers=5 -> blocks in [0..4].
    totalLayers := 5
    gpus := []ml.DeviceInfo{
        {DeviceID: ml.DeviceID{ID: "gpu0"}},
        {DeviceID: ml.DeviceID{ID: "gpu1"}},
    }
    ov := &envconfig.Override{
        ModelName: "dummy",
        NumGPULayers: 4, // offload the last 4 layers: indices 0..3
        TensorSplit: []int{1, 1}, // even split across 2 GPUs
    }

    gl := buildGPULayersFromOverride(totalLayers, gpus, ov)
    if gl == nil || gl.Sum() == 0 {
        t.Fatalf("expected non-empty GPULayersList, got %#v", gl)
    }

    // Expect gpu0 to get the first half (layers 0,1) and gpu1 to get (2,3)
    want := ml.GPULayersList{
        {DeviceID: gpus[0].DeviceID, Layers: []int{0, 1}},
        {DeviceID: gpus[1].DeviceID, Layers: []int{2, 3}},
    }
    if gl.Hash() != want.Hash() {
        t.Errorf("override mapping = %v, want %v", gl, want)
    }
}

func TestBuildGPULayersFromOverride_TooManySplits(t *testing.T) {
    totalLayers := 5
    gpus := []ml.DeviceInfo{
        {DeviceID: ml.DeviceID{ID: "gpu0"}},
        {DeviceID: ml.DeviceID{ID: "gpu1"}},
    }
    ov := &envconfig.Override{
        ModelName: "dummy",
        NumGPULayers: 4,
        TensorSplit: []int{1, 1, 1}, // 3 entries, only 2 GPUs
    }
    gl := buildGPULayersFromOverride(totalLayers, gpus, ov)
    if gl != nil {
        t.Fatalf("expected nil due to too many tensor-split entries, got %v", gl)
    }
}

func TestBuildGPULayersFromOverride_ZeroTotalSplit(t *testing.T) {
    totalLayers := 5
    gpus := []ml.DeviceInfo{
        {DeviceID: ml.DeviceID{ID: "gpu0"}},
        {DeviceID: ml.DeviceID{ID: "gpu1"}},
    }
    ov := &envconfig.Override{
        ModelName: "dummy",
        NumGPULayers: 4,
        TensorSplit: []int{0, 0}, // totals to zero
    }
    gl := buildGPULayersFromOverride(totalLayers, gpus, ov)
    if gl != nil {
        t.Fatalf("expected nil due to zero/invalid tensor-split total, got %v", gl)
    }
}

func TestMaybeApplyOverride_Applies(t *testing.T) {
    // Model with 5 total layers (blocks 0..4).
    s := &llmServer{
        totalLayers: 5,
        options: api.Options{},
        override: &envconfig.Override{
            ModelName: "dummy",
            NumGPULayers: 4,
            TensorSplit: []int{1, 1},
        },
    }
    gpus := []ml.DeviceInfo{
        {DeviceID: ml.DeviceID{ID: "gpu0"}},
        {DeviceID: ml.DeviceID{ID: "gpu1"}},
    }
    // Heuristic layout (will be replaced)
    heuristic := ml.GPULayersList{
        {DeviceID: gpus[1].DeviceID, Layers: []int{0, 1}},
    }
    got, ok := s.maybeApplyOverride(gpus, heuristic)
    if !ok {
        t.Fatalf("expected override to be applied")
    }
    // Expect the override mapping (even split)
    want := ml.GPULayersList{
        {DeviceID: gpus[0].DeviceID, Layers: []int{0, 1}},
        {DeviceID: gpus[1].DeviceID, Layers: []int{2, 3}},
    }
    if got.Hash() != want.Hash() {
        t.Errorf("maybeApplyOverride = %v, want %v", got, want)
    }
    // options.NumGPU should align with override.NumGPULayers
    if s.options.NumGPU != s.override.NumGPULayers {
        t.Errorf("options.NumGPU = %d, want %d", s.options.NumGPU, s.override.NumGPULayers)
    }
}

func TestMaybeApplyOverride_RejectsTooManySplits(t *testing.T) {
    s := &llmServer{
        totalLayers: 5,
        options: api.Options{},
        override: &envconfig.Override{
            ModelName: "dummy",
            NumGPULayers: 4,
            TensorSplit: []int{1, 1, 1}, // 3 entries, 2 GPUs -> reject
        },
    }
    gpus := []ml.DeviceInfo{
        {DeviceID: ml.DeviceID{ID: "gpu0"}},
        {DeviceID: ml.DeviceID{ID: "gpu1"}},
    }
    heuristic := ml.GPULayersList{
        {DeviceID: gpus[1].DeviceID, Layers: []int{0, 1}},
    }
    got, ok := s.maybeApplyOverride(gpus, heuristic)
    if ok || got.Hash() != heuristic.Hash() {
        t.Fatalf("expected override to be ignored and heuristic preserved; got=%v ok=%v", got, ok)
    }
}

@@ -17,6 +17,7 @@ import (
    "github.com/google/go-cmp/cmp"

    "github.com/ollama/ollama/api"
    "github.com/ollama/ollama/envconfig"
    "github.com/ollama/ollama/fs/ggml"
    "github.com/ollama/ollama/llm"
    "github.com/ollama/ollama/ml"

@@ -48,8 +49,8 @@ func (mockRunner) Tokenize(_ context.Context, s string) (tokens []int, err error)
    return
}

func newMockServer(mock *mockRunner) func(ml.SystemInfo, []ml.DeviceInfo, string, *ggml.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
    return func(_ ml.SystemInfo, _ []ml.DeviceInfo, _ string, _ *ggml.GGML, _, _ []string, _ api.Options, _ int) (llm.LlamaServer, error) {
func newMockServer(mock *mockRunner) func(ml.SystemInfo, []ml.DeviceInfo, string, *ggml.GGML, []string, []string, api.Options, int, *envconfig.Override) (llm.LlamaServer, error) {
    return func(_ ml.SystemInfo, _ []ml.DeviceInfo, _ string, _ *ggml.GGML, _, _ []string, _ api.Options, _ int, _ *envconfig.Override) (llm.LlamaServer, error) {
        return mock, nil
    }
}

@@ -50,7 +50,7 @@ type Scheduler struct {
    loaded map[string]*runnerRef

    loadFn          func(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) bool
    newServerFn     func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
    newServerFn     func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int, override *envconfig.Override) (llm.LlamaServer, error)
    getGpuFn        func(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo
    getSystemInfoFn func() ml.SystemInfo
    waitForRecovery time.Duration

@@ -414,7 +414,9 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo

    if llama == nil {
        var err error
        llama, err = s.newServerFn(systemInfo, gpus, req.model.ModelPath, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
        // Load per-model override (by short name)
        override := envconfig.LoadOverride(req.model.ShortName)
        llama, err = s.newServerFn(systemInfo, gpus, req.model.ModelPath, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel, override)
        if err != nil {
            // some older models are not compatible with newer versions of llama.cpp
            // show a generalized compatibility error until there is a better way to

@@ -9,6 +9,7 @@ import (
    "testing"
    "time"

    "github.com/ollama/ollama/envconfig"
    "github.com/stretchr/testify/require"

    "github.com/ollama/ollama/api"

@@ -49,7 +50,7 @@ func TestSchedLoad(t *testing.T) {
        sessionDuration: &api.Duration{Duration: 2 * time.Second},
    }
    // Fail to load model first
    s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
    s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int, override *envconfig.Override) (llm.LlamaServer, error) {
        return nil, errors.New("something failed to load model blah")
    }
    gpus := []ml.DeviceInfo{}

@@ -64,7 +65,7 @@ func TestSchedLoad(t *testing.T) {
    require.Contains(t, err.Error(), "this model may be incompatible")

    server := &mockLlm{vramSize: 10, vramByGPU: map[ml.DeviceID]uint64{}}
    s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
    s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int, override *envconfig.Override) (llm.LlamaServer, error) {
        server.modelPath = model
        return server, nil
    }

@@ -106,7 +107,7 @@ type reqBundle struct {
    f *ggml.GGML
}

func (scenario *reqBundle) newServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
func (scenario *reqBundle) newServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int, override *envconfig.Override) (llm.LlamaServer, error) {
    scenario.srv.modelPath = model
    return scenario.srv, nil
}

@@ -466,7 +467,7 @@ func TestSchedExpireRunner(t *testing.T) {
    gpus := []ml.DeviceInfo{}
    systemInfo := ml.SystemInfo{}
    server := &mockLlm{vramSize: 10, vramByGPU: map[ml.DeviceID]uint64{}}
    s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
    s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int, override *envconfig.Override) (llm.LlamaServer, error) {
        server.modelPath = model
        return server, nil
    }