mirror of https://github.com/ollama/ollama
llm: Don't always evict models on CPU-only systems
Model eviction happens when at least one other model is loaded and we are unable to load all layers into VRAM. However, on CPU-only systems we can never load layers into VRAM, so this check constantly triggered eviction.

Fixes #13227
parent d771043e88
commit 5317202c38
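In short: with requireFull set, a load that cannot place every requested layer in VRAM asks the scheduler to evict another model, but that only helps when a GPU actually exists to free up. Below is a minimal, self-contained sketch of the new guard; the helper name shouldEvict and its flat parameters are illustrative only, not the real llm/server.go API (the actual check lives in verifyLayout, shown in the diff that follows).

package main

import "fmt"

// shouldEvict mirrors the shape of the guarded check: requesting eviction to
// free VRAM only makes sense if the system actually has GPUs.
func shouldEvict(numGPUs, offloadedLayers, totalLayers, requestedGPULayers int, requireFull bool) bool {
	if !requireFull {
		return false
	}
	// On a CPU-only system (numGPUs == 0) no layers can ever be offloaded,
	// so evicting another model cannot create usable VRAM.
	return numGPUs > 0 && offloadedLayers < totalLayers &&
		(requestedGPULayers < 0 || offloadedLayers < requestedGPULayers)
}

func main() {
	fmt.Println(shouldEvict(0, 0, 33, -1, true))  // false: CPU-only, never evict
	fmt.Println(shouldEvict(1, 20, 33, -1, true)) // true: GPU present, model doesn't fully fit
}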
@@ -874,7 +874,7 @@ func (s *llmServer) createLayout(systemInfo ml.SystemInfo, systemGPUs []ml.Devic
 		}}
 	}
 	gpuLayers, layers := s.buildLayout(systemGPUs, memory, requireFull, backoff)
-	err := s.verifyLayout(systemInfo, memory, requireFull, gpuLayers, layers)
+	err := s.verifyLayout(systemInfo, systemGPUs, memory, requireFull, gpuLayers, layers)
 	if err != nil {
 		return nil, err
 	}
@@ -943,7 +943,7 @@ func (s *llmServer) buildLayout(systemGPUs []ml.DeviceInfo, memory *ml.BackendMe
 }

 // verifyLayout ensures that we don't exceed limits, such as requirements about partial offloading or system memory
-func (s *llmServer) verifyLayout(systemInfo ml.SystemInfo, memory *ml.BackendMemory, requireFull bool, gpuLayers ml.GPULayersList, layers []uint64) error {
+func (s *llmServer) verifyLayout(systemInfo ml.SystemInfo, systemGPUs []ml.DeviceInfo, memory *ml.BackendMemory, requireFull bool, gpuLayers ml.GPULayersList, layers []uint64) error {
 	// These sizes will only increase as we go through additional iterations and get additional information.
 	cpuSize := memory.InputWeights + memory.CPU.Graph
 	var vramSize uint64
@@ -970,8 +970,8 @@ nextLayer:
 	}

 	if requireFull {
-		if gpuLayers.Sum() < len(layers) && (s.options.NumGPU < 0 || gpuLayers.Sum() < s.options.NumGPU) {
-			slog.Info("model requires more memory than is currently available, evicting a model to make space", "loaded layers", gpuLayers.Sum())
+		if len(systemGPUs) > 0 && gpuLayers.Sum() < len(layers) && (s.options.NumGPU < 0 || gpuLayers.Sum() < s.options.NumGPU) {
+			slog.Info("model requires more gpu memory than is currently available, evicting a model to make space", "loaded layers", gpuLayers.Sum())
 			return ErrLoadRequiredFull
 		}

@@ -998,7 +998,7 @@ nextLayer:
 		}
 	}

-	if gpuLayers.Sum() == 0 {
+	if len(systemGPUs) > 0 && gpuLayers.Sum() == 0 {
 		slog.Debug("insufficient VRAM to load any model layers")
 	}

@@ -30,6 +30,7 @@ func TestLLMServerFitGPU(t *testing.T) {
 			layers: []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
 			numGPU: -1,
 			expected: ml.GPULayersList{},
+			requireFull: true, // Should not try to evict even though we can't load any layers
 		},
 		{
 			name: "Full single GPU",