mirror of https://github.com/ollama/ollama
llm: Don't always evict models on CPU-only systems
Model eviction happens when at least one other model is loaded and we are unable to load all layers into VRAM. However, on CPU-only systems we can never load layers into VRAM, so this check constantly triggered eviction.

Fixes #13227
parent d771043e88
commit 5317202c38
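In short: with requireFull set, a load that cannot place every requested layer in VRAM asks the scheduler to evict another model, but that only helps when a GPU actually exists to free up. Below is a minimal, self-contained sketch of the new guard; the helper name shouldEvict and its flat parameters are illustrative only, not the real llm/server.go API (the actual check lives in verifyLayout, shown in the diff that follows).

package main

import "fmt"

// shouldEvict mirrors the shape of the guarded check: requesting eviction to
// free VRAM only makes sense if the system actually has GPUs.
func shouldEvict(numGPUs, offloadedLayers, totalLayers, requestedGPULayers int, requireFull bool) bool {
	if !requireFull {
		return false
	}
	// On a CPU-only system (numGPUs == 0) no layers can ever be offloaded,
	// so evicting another model cannot create usable VRAM.
	return numGPUs > 0 && offloadedLayers < totalLayers &&
		(requestedGPULayers < 0 || offloadedLayers < requestedGPULayers)
}

func main() {
	fmt.Println(shouldEvict(0, 0, 33, -1, true))  // false: CPU-only, never evict
	fmt.Println(shouldEvict(1, 20, 33, -1, true)) // true: GPU present, model doesn't fully fit
}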
@@ -874,7 +874,7 @@ func (s *llmServer) createLayout(systemInfo ml.SystemInfo, systemGPUs []ml.Devic
 		}}
 	}
 	gpuLayers, layers := s.buildLayout(systemGPUs, memory, requireFull, backoff)
-	err := s.verifyLayout(systemInfo, memory, requireFull, gpuLayers, layers)
+	err := s.verifyLayout(systemInfo, systemGPUs, memory, requireFull, gpuLayers, layers)
 	if err != nil {
 		return nil, err
 	}
@@ -943,7 +943,7 @@ func (s *llmServer) buildLayout(systemGPUs []ml.DeviceInfo, memory *ml.BackendMe
 }

 // verifyLayout ensures that we don't exceed limits, such as requirements about partial offloading or system memory
-func (s *llmServer) verifyLayout(systemInfo ml.SystemInfo, memory *ml.BackendMemory, requireFull bool, gpuLayers ml.GPULayersList, layers []uint64) error {
+func (s *llmServer) verifyLayout(systemInfo ml.SystemInfo, systemGPUs []ml.DeviceInfo, memory *ml.BackendMemory, requireFull bool, gpuLayers ml.GPULayersList, layers []uint64) error {
 	// These sizes will only increase as we go through additional iterations and get additional information.
 	cpuSize := memory.InputWeights + memory.CPU.Graph
 	var vramSize uint64
@@ -970,8 +970,8 @@ nextLayer:
 	}

 	if requireFull {
-		if gpuLayers.Sum() < len(layers) && (s.options.NumGPU < 0 || gpuLayers.Sum() < s.options.NumGPU) {
-			slog.Info("model requires more memory than is currently available, evicting a model to make space", "loaded layers", gpuLayers.Sum())
+		if len(systemGPUs) > 0 && gpuLayers.Sum() < len(layers) && (s.options.NumGPU < 0 || gpuLayers.Sum() < s.options.NumGPU) {
+			slog.Info("model requires more gpu memory than is currently available, evicting a model to make space", "loaded layers", gpuLayers.Sum())
 			return ErrLoadRequiredFull
 		}

@@ -998,7 +998,7 @@ nextLayer:
 		}
 	}

-	if gpuLayers.Sum() == 0 {
+	if len(systemGPUs) > 0 && gpuLayers.Sum() == 0 {
 		slog.Debug("insufficient VRAM to load any model layers")
 	}

@@ -30,6 +30,7 @@ func TestLLMServerFitGPU(t *testing.T) {
 			layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
 			numGPU: -1,
 			expected: ml.GPULayersList{},
+			requireFull: true, // Should not try to evict even though we can't load any layers
 		},
 		{
 			name: "Full single GPU",