llm: Don't always evict models on CPU-only systems

Model eviction happens when we have at least one other model
loaded and are unable to load all layers into VRAM. However, on
CPU-only systems we can never load layers into VRAM, so this
check constantly triggered eviction.

Fixes #13227
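
To make the change easier to follow, here is a minimal, self-contained sketch of the condition being modified. shouldEvict and its integer parameters are hypothetical stand-ins, not functions in the codebase; the real check lives inline in verifyLayout, as the diff below shows. The idea: eviction is only worth requesting when the system actually has GPUs, because on a CPU-only machine freeing another model's VRAM cannot help.

package main

import "fmt"

// shouldEvict is a hypothetical condensation of the requireFull check in
// verifyLayout: only request eviction of another model if at least one GPU
// exists and the requested layers still did not fit into VRAM.
func shouldEvict(numSystemGPUs, offloadedLayers, totalLayers, requestedGPULayers int, requireFull bool) bool {
	if !requireFull || numSystemGPUs == 0 {
		// CPU-only systems can never place layers in VRAM, so evicting
		// another model cannot help; this is the behavior the fix adds.
		return false
	}
	return offloadedLayers < totalLayers &&
		(requestedGPULayers < 0 || offloadedLayers < requestedGPULayers)
}

func main() {
	// CPU-only system: previously reported as needing eviction, now it does not.
	fmt.Println(shouldEvict(0, 0, 50, -1, true)) // false
	// One GPU that fit only 10 of 50 layers: eviction is still requested.
	fmt.Println(shouldEvict(1, 10, 50, -1, true)) // true
}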
Jesse Gross 2025-11-25 14:51:02 -08:00 committed by Jesse Gross
parent d771043e88
commit 5317202c38
2 changed files with 10 additions and 9 deletions


@@ -874,7 +874,7 @@ func (s *llmServer) createLayout(systemInfo ml.SystemInfo, systemGPUs []ml.DeviceInfo
 		}}
 	}
 	gpuLayers, layers := s.buildLayout(systemGPUs, memory, requireFull, backoff)
-	err := s.verifyLayout(systemInfo, memory, requireFull, gpuLayers, layers)
+	err := s.verifyLayout(systemInfo, systemGPUs, memory, requireFull, gpuLayers, layers)
 	if err != nil {
 		return nil, err
 	}
@@ -943,7 +943,7 @@ func (s *llmServer) buildLayout(systemGPUs []ml.DeviceInfo, memory *ml.BackendMemory
 }

 // verifyLayout ensures that we don't exceed limits, such as requirements about partial offloading or system memory
-func (s *llmServer) verifyLayout(systemInfo ml.SystemInfo, memory *ml.BackendMemory, requireFull bool, gpuLayers ml.GPULayersList, layers []uint64) error {
+func (s *llmServer) verifyLayout(systemInfo ml.SystemInfo, systemGPUs []ml.DeviceInfo, memory *ml.BackendMemory, requireFull bool, gpuLayers ml.GPULayersList, layers []uint64) error {
 	// These sizes will only increase as we go through additional iterations and get additional information.
 	cpuSize := memory.InputWeights + memory.CPU.Graph
 	var vramSize uint64
@@ -970,8 +970,8 @@ nextLayer:
 	}

 	if requireFull {
-		if gpuLayers.Sum() < len(layers) && (s.options.NumGPU < 0 || gpuLayers.Sum() < s.options.NumGPU) {
-			slog.Info("model requires more memory than is currently available, evicting a model to make space", "loaded layers", gpuLayers.Sum())
+		if len(systemGPUs) > 0 && gpuLayers.Sum() < len(layers) && (s.options.NumGPU < 0 || gpuLayers.Sum() < s.options.NumGPU) {
+			slog.Info("model requires more gpu memory than is currently available, evicting a model to make space", "loaded layers", gpuLayers.Sum())
 			return ErrLoadRequiredFull
 		}
@@ -998,7 +998,7 @@ nextLayer:
 		}
 	}
-	if gpuLayers.Sum() == 0 {
+	if len(systemGPUs) > 0 && gpuLayers.Sum() == 0 {
 		slog.Debug("insufficient VRAM to load any model layers")
 	}


@@ -26,10 +26,11 @@ func TestLLMServerFitGPU(t *testing.T) {
 		expectedErr error
 	}{
 		{
-			name:     "No GPU",
-			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
-			numGPU:   -1,
-			expected: ml.GPULayersList{},
+			name:        "No GPU",
+			layers:      []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
+			numGPU:      -1,
+			expected:    ml.GPULayersList{},
+			requireFull: true, // Should not try to evict even though we can't load any layers
 		},
 		{
 			name: "Full single GPU",