From 3f3083673496adcc0429ff213dabb0c4fcbe21a2 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Wed, 3 Dec 2025 12:58:16 -0800
Subject: [PATCH] CUDA: filter devices on secondary discovery (#13317)

We now do a deeper probe of CUDA devices to verify the library version
has the correct compute capability coverage for the device. Because
ROCm also interprets the CUDA env var to filter AMD devices, we
normally try to avoid setting it, as doing so leads to problems on
mixed vendor systems. However, without setting it for this deeper
probe, each CUDA library subprocess discovers all CUDA GPUs, and on
systems with many GPUs this can lead to hitting timeouts. The fix is to
turn on the CUDA visibility env var just for this deeper probe
use-case.
---
 discover/runner.go |  5 +++--
 llm/server.go      |  2 +-
 ml/device.go       | 16 ++++++++++++----
 3 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/discover/runner.go b/discover/runner.go
index 44737aa22..c963de6f8 100644
--- a/discover/runner.go
+++ b/discover/runner.go
@@ -147,7 +147,7 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
 		wg.Add(1)
 		go func(i int) {
 			defer wg.Done()
-			extraEnvs := ml.GetVisibleDevicesEnv(devices[i : i+1])
+			extraEnvs := ml.GetVisibleDevicesEnv(devices[i:i+1], true)
 			devices[i].AddInitValidation(extraEnvs)
 			if len(bootstrapDevices(ctx2ndPass, devices[i].LibraryPath, extraEnvs)) == 0 {
 				slog.Debug("filtering device which didn't fully initialize",
@@ -333,7 +333,8 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
 		defer cancel()
 
 		// Apply any dev filters to avoid re-discovering unsupported devices, and get IDs correct
-		devFilter := ml.GetVisibleDevicesEnv(devices)
+		// We avoid CUDA filters here to keep ROCm from failing to discover GPUs in a mixed environment
+		devFilter := ml.GetVisibleDevicesEnv(devices, false)
 
 		for dir := range libDirs {
 			updatedDevices := bootstrapDevices(ctx, []string{ml.LibOllamaPath, dir}, devFilter)
diff --git a/llm/server.go b/llm/server.go
index fa4e438d3..e9d0a030f 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -227,7 +227,7 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st
 		modelPath,
 		gpuLibs,
 		status,
-		ml.GetVisibleDevicesEnv(gpus),
+		ml.GetVisibleDevicesEnv(gpus, false),
 	)
 
 	s := llmServer{
diff --git a/ml/device.go b/ml/device.go
index 7d86dfddb..f892b512d 100644
--- a/ml/device.go
+++ b/ml/device.go
@@ -494,13 +494,14 @@ func FlashAttentionSupported(l []DeviceInfo) bool {
 
 // Given the list of GPUs this instantiation is targeted for,
 // figure out the visible devices environment variables
-func GetVisibleDevicesEnv(l []DeviceInfo) map[string]string {
+// Set mustFilter true to enable filtering of CUDA devices
+func GetVisibleDevicesEnv(l []DeviceInfo, mustFilter bool) map[string]string {
 	if len(l) == 0 {
 		return nil
 	}
 	env := map[string]string{}
 	for _, d := range l {
-		d.updateVisibleDevicesEnv(env)
+		d.updateVisibleDevicesEnv(env, mustFilter)
 	}
 	return env
 }
@@ -532,7 +533,7 @@ func (d DeviceInfo) PreferredLibrary(other DeviceInfo) bool {
 	return false
 }
 
-func (d DeviceInfo) updateVisibleDevicesEnv(env map[string]string) {
+func (d DeviceInfo) updateVisibleDevicesEnv(env map[string]string, mustFilter bool) {
 	var envVar string
 	switch d.Library {
 	case "ROCm":
@@ -541,8 +542,15 @@
 		if runtime.GOOS != "linux" {
 			envVar = "HIP_VISIBLE_DEVICES"
 		}
+	case "CUDA":
+		if !mustFilter {
+			// By default we try to avoid filtering CUDA devices because ROCm also
+			// looks at the CUDA env var, and gets confused in mixed vendor environments.
+			return
+		}
+		envVar = "CUDA_VISIBLE_DEVICES"
 	default:
-		// CUDA and Vulkan are not filtered via env var, but via scheduling decisions
+		// Vulkan is not filtered via env var, but via scheduling decisions
 		return
 	}
 	v, existing := env[envVar]
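For readers who want to see the effect of the new mustFilter flag in
isolation, below is a minimal, self-contained sketch of the env var
selection logic from the ml/device.go hunk above. The deviceInfo struct,
its ID field, and the comma-joining of IDs are simplified stand-ins for
ml.DeviceInfo, not the actual implementation; only the switch mirrors
the patched updateVisibleDevicesEnv.

package main

import (
	"fmt"
	"runtime"
)

// deviceInfo is a pared-down, hypothetical stand-in for ml.DeviceInfo;
// only the fields needed to illustrate the env var selection are kept.
type deviceInfo struct {
	Library string // "CUDA", "ROCm", or "Vulkan"
	ID      string // vendor-specific device index
}

// visibleDevicesEnv mirrors the patched switch: ROCm devices are always
// filtered, CUDA devices only when mustFilter is set, and Vulkan is
// never filtered via env var (scheduling decisions handle it instead).
func visibleDevicesEnv(devices []deviceInfo, mustFilter bool) map[string]string {
	env := map[string]string{}
	for _, d := range devices {
		var envVar string
		switch d.Library {
		case "ROCm":
			envVar = "ROCR_VISIBLE_DEVICES"
			if runtime.GOOS != "linux" {
				envVar = "HIP_VISIBLE_DEVICES"
			}
		case "CUDA":
			if !mustFilter {
				// Default path: leave CUDA unfiltered so ROCm, which also
				// reads CUDA_VISIBLE_DEVICES, still sees its GPUs.
				continue
			}
			envVar = "CUDA_VISIBLE_DEVICES"
		default:
			continue
		}
		// Accumulate IDs per env var as a comma-separated list.
		if v, ok := env[envVar]; ok {
			env[envVar] = v + "," + d.ID
		} else {
			env[envVar] = d.ID
		}
	}
	return env
}

func main() {
	devs := []deviceInfo{{Library: "CUDA", ID: "0"}, {Library: "ROCm", ID: "1"}}

	// Deep-probe path (discover/runner.go second pass): CUDA filtering on,
	// so each probe subprocess only initializes its own GPU. On Linux:
	// map[CUDA_VISIBLE_DEVICES:0 ROCR_VISIBLE_DEVICES:1]
	fmt.Println(visibleDevicesEnv(devs, true))

	// Scheduling path (llm/server.go): CUDA filtering off to avoid
	// confusing ROCm on mixed vendor systems. On Linux:
	// map[ROCR_VISIBLE_DEVICES:1]
	fmt.Println(visibleDevicesEnv(devs, false))
}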