diff --git a/llm/server.go b/llm/server.go
index 4eaa88df0..fa4e438d3 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -874,7 +874,7 @@ func (s *llmServer) createLayout(systemInfo ml.SystemInfo, systemGPUs []ml.Devic
 		}}
 	}
 	gpuLayers, layers := s.buildLayout(systemGPUs, memory, requireFull, backoff)
-	err := s.verifyLayout(systemInfo, memory, requireFull, gpuLayers, layers)
+	err := s.verifyLayout(systemInfo, systemGPUs, memory, requireFull, gpuLayers, layers)
 	if err != nil {
 		return nil, err
 	}
@@ -943,7 +943,7 @@ func (s *llmServer) buildLayout(systemGPUs []ml.DeviceInfo, memory *ml.BackendMe
 }
 
 // verifyLayout ensures that we don't exceed limits, such as requirements about partial offloading or system memory
-func (s *llmServer) verifyLayout(systemInfo ml.SystemInfo, memory *ml.BackendMemory, requireFull bool, gpuLayers ml.GPULayersList, layers []uint64) error {
+func (s *llmServer) verifyLayout(systemInfo ml.SystemInfo, systemGPUs []ml.DeviceInfo, memory *ml.BackendMemory, requireFull bool, gpuLayers ml.GPULayersList, layers []uint64) error {
 	// These sizes will only increase as we go through additional iterations and get additional information.
 	cpuSize := memory.InputWeights + memory.CPU.Graph
 	var vramSize uint64
@@ -970,8 +970,8 @@ nextLayer:
 	}
 
 	if requireFull {
-		if gpuLayers.Sum() < len(layers) && (s.options.NumGPU < 0 || gpuLayers.Sum() < s.options.NumGPU) {
-			slog.Info("model requires more memory than is currently available, evicting a model to make space", "loaded layers", gpuLayers.Sum())
+		if len(systemGPUs) > 0 && gpuLayers.Sum() < len(layers) && (s.options.NumGPU < 0 || gpuLayers.Sum() < s.options.NumGPU) {
+			slog.Info("model requires more gpu memory than is currently available, evicting a model to make space", "loaded layers", gpuLayers.Sum())
 			return ErrLoadRequiredFull
 		}
 
@@ -998,7 +998,7 @@ nextLayer:
 		}
 	}
 
-	if gpuLayers.Sum() == 0 {
+	if len(systemGPUs) > 0 && gpuLayers.Sum() == 0 {
 		slog.Debug("insufficient VRAM to load any model layers")
 	}
 
diff --git a/llm/server_test.go b/llm/server_test.go
index 1f5d5cda3..5dc0aa9bc 100644
--- a/llm/server_test.go
+++ b/llm/server_test.go
@@ -26,10 +26,11 @@ func TestLLMServerFitGPU(t *testing.T) {
 		expectedErr error
 	}{
 		{
-			name:     "No GPU",
-			layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
-			numGPU:   -1,
-			expected: ml.GPULayersList{},
+			name:        "No GPU",
+			layers:      []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
+			numGPU:      -1,
+			expected:    ml.GPULayersList{},
+			requireFull: true, // Should not try to evict even though we can't load any layers
 		},
 		{
 			name: "Full single GPU",
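
For context, a minimal standalone Go sketch of the decision this change guards; the function name shouldEvict and its int parameters are illustrative inventions, not part of the ollama codebase. The point of the patch is that evicting another model to free VRAM only makes sense when the system actually has GPUs; on a CPU-only host, placing zero layers on a GPU is expected and must not trigger ErrLoadRequiredFull.

package main

import "fmt"

// shouldEvict is a hypothetical distillation of the patched condition in
// verifyLayout: only consider evicting another loaded model when the system
// has at least one GPU, the model did not fully fit, and the user either
// requested automatic layer placement (numGPU < 0) or asked for more layers
// than were actually placed.
func shouldEvict(numSystemGPUs, placedLayers, totalLayers, numGPU int) bool {
	return numSystemGPUs > 0 &&
		placedLayers < totalLayers &&
		(numGPU < 0 || placedLayers < numGPU)
}

func main() {
	// CPU-only host: never evict, even though zero layers were placed.
	fmt.Println(shouldEvict(0, 0, 50, -1)) // false

	// One GPU, partial fit with auto placement: eviction is worth trying.
	fmt.Println(shouldEvict(1, 30, 50, -1)) // true
}

The new "No GPU" test case in server_test.go exercises exactly the first scenario: requireFull is set so the old code would have returned ErrLoadRequiredFull and pointlessly evicted, while the patched check returns the empty ml.GPULayersList{} without error.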