Marx 2025-12-17 06:54:52 +02:00 committed by GitHub
commit c0fb854ee4
9 changed files with 872 additions and 3 deletions


@ -24,6 +24,9 @@ curl -fsSL https://ollama.com/install.sh | sh
[Manual install instructions](https://docs.ollama.com/linux#manual-install)
> [!TIP]
> Running ROCm on UMA-only AMD APUs (e.g., Radeon 760M) requires staging additional ROCm runtime files and enabling experimental discovery flags. See [docs/rocm-apu.md](docs/rocm-apu.md) for the exact build and runtime steps used on this branch.
### Docker
The official [Ollama Docker image](https://hub.docker.com/r/ollama/ollama) `ollama/ollama` is available on Docker Hub.
@ -32,6 +35,7 @@ The official [Ollama Docker image](https://hub.docker.com/r/ollama/ollama) `olla
- [ollama-python](https://github.com/ollama/ollama-python)
- [ollama-js](https://github.com/ollama/ollama-js)
- [Experimental ROCm iGPU Guide](docs/rocm-apu.md)
### Community


@ -0,0 +1,329 @@
# Distributed under the OSI-approved BSD 3-Clause License. See accompanying
# file Copyright.txt or https://cmake.org/licensing for details.
include(${CMAKE_ROOT}/Modules/CMakeDetermineCompiler.cmake)
include(${CMAKE_ROOT}/Modules/CMakeParseImplicitLinkInfo.cmake)
include(${CMAKE_ROOT}/Modules/CMakeParseLibraryArchitecture.cmake)
if(NOT ((CMAKE_GENERATOR MATCHES "Make") OR
(CMAKE_GENERATOR MATCHES "Ninja")))
message(FATAL_ERROR "HIP language not currently supported by \"${CMAKE_GENERATOR}\" generator")
endif()
if(NOT CMAKE_HIP_PLATFORM)
execute_process(COMMAND hipconfig --platform
OUTPUT_VARIABLE _CMAKE_HIPCONFIG_PLATFORM OUTPUT_STRIP_TRAILING_WHITESPACE
RESULT_VARIABLE _CMAKE_HIPCONFIG_RESULT
)
if(_CMAKE_HIPCONFIG_RESULT EQUAL 0 AND _CMAKE_HIPCONFIG_PLATFORM MATCHES "^(nvidia|nvcc)$")
set(CMAKE_HIP_PLATFORM "nvidia" CACHE STRING "HIP platform" FORCE)
else()
set(CMAKE_HIP_PLATFORM "amd" CACHE STRING "HIP platform" FORCE)
endif()
endif()
if(NOT CMAKE_HIP_PLATFORM MATCHES "^(amd|nvidia)$")
message(FATAL_ERROR
"The CMAKE_HIP_PLATFORM has unsupported value:\n"
" '${CMAKE_HIP_PLATFORM}'\n"
"It must be 'amd' or 'nvidia'."
)
endif()
if(NOT CMAKE_HIP_COMPILER)
set(CMAKE_HIP_COMPILER_INIT NOTFOUND)
# prefer the environment variable HIPCXX
if(NOT $ENV{HIPCXX} STREQUAL "")
get_filename_component(CMAKE_HIP_COMPILER_INIT $ENV{HIPCXX} PROGRAM PROGRAM_ARGS CMAKE_HIP_FLAGS_ENV_INIT)
if(CMAKE_HIP_FLAGS_ENV_INIT)
set(CMAKE_HIP_COMPILER_ARG1 "${CMAKE_HIP_FLAGS_ENV_INIT}" CACHE STRING "Arguments to CXX compiler")
endif()
if(NOT EXISTS ${CMAKE_HIP_COMPILER_INIT})
message(FATAL_ERROR "Could not find compiler set in environment variable HIPCXX:\n$ENV{HIPCXX}.\n${CMAKE_HIP_COMPILER_INIT}")
endif()
endif()
# finally list compilers to try
if(NOT CMAKE_HIP_COMPILER_INIT)
if(CMAKE_HIP_PLATFORM STREQUAL "nvidia")
set(CMAKE_HIP_COMPILER_LIST nvcc)
elseif(CMAKE_HIP_PLATFORM STREQUAL "amd")
set(CMAKE_HIP_COMPILER_LIST clang++)
# Look for the Clang coming with ROCm to support HIP.
execute_process(COMMAND hipconfig --hipclangpath
OUTPUT_VARIABLE _CMAKE_HIPCONFIG_CLANGPATH
RESULT_VARIABLE _CMAKE_HIPCONFIG_RESULT
)
if(_CMAKE_HIPCONFIG_RESULT EQUAL 0 AND EXISTS "${_CMAKE_HIPCONFIG_CLANGPATH}")
set(CMAKE_HIP_COMPILER_HINTS "${_CMAKE_HIPCONFIG_CLANGPATH}")
endif()
endif()
endif()
_cmake_find_compiler(HIP)
else()
_cmake_find_compiler_path(HIP)
endif()
mark_as_advanced(CMAKE_HIP_COMPILER)
# Build a small source file to identify the compiler.
if(NOT CMAKE_HIP_COMPILER_ID_RUN)
set(CMAKE_HIP_COMPILER_ID_RUN 1)
include(${CMAKE_ROOT}/Modules/CMakeDetermineCompilerId.cmake)
# We determine the vendor to use the right flags for detection right away.
# The main compiler identification is still needed below to extract other information.
list(APPEND CMAKE_HIP_COMPILER_ID_VENDORS NVIDIA Clang)
set(CMAKE_HIP_COMPILER_ID_VENDOR_REGEX_NVIDIA "nvcc: NVIDIA \\(R\\) Cuda compiler driver")
set(CMAKE_HIP_COMPILER_ID_VENDOR_REGEX_Clang "(clang version)")
CMAKE_DETERMINE_COMPILER_ID_VENDOR(HIP "--version")
if(CMAKE_HIP_COMPILER_ID STREQUAL "NVIDIA")
# Find the CUDA toolkit to get:
# - CMAKE_HIP_COMPILER_CUDA_TOOLKIT_VERSION
# - CMAKE_HIP_COMPILER_CUDA_TOOLKIT_ROOT
# - CMAKE_HIP_COMPILER_CUDA_LIBRARY_ROOT
# We save them in CMakeHIPCompiler.cmake.
# Match arguments with cmake_cuda_architectures_all call.
include(Internal/CMakeCUDAFindToolkit)
cmake_cuda_find_toolkit(HIP CMAKE_HIP_COMPILER_CUDA_)
# If the user set CMAKE_HIP_ARCHITECTURES, validate its value.
include(Internal/CMakeCUDAArchitecturesValidate)
cmake_cuda_architectures_validate(HIP)
if(NOT CMAKE_HIP_HOST_COMPILER AND NOT $ENV{HIPHOSTCXX} STREQUAL "")
get_filename_component(CMAKE_HIP_HOST_COMPILER $ENV{HIPHOSTCXX} PROGRAM)
if(NOT EXISTS "${CMAKE_HIP_HOST_COMPILER}")
message(FATAL_ERROR "Could not find compiler set in environment variable HIPHOSTCXX:\n$ENV{HIPHOSTCXX}.\n${CMAKE_HIP_HOST_COMPILER}")
endif()
endif()
endif()
if(CMAKE_HIP_COMPILER_ID STREQUAL "Clang")
list(APPEND CMAKE_HIP_COMPILER_ID_TEST_FLAGS_FIRST "-v")
elseif(CMAKE_HIP_COMPILER_ID STREQUAL "NVIDIA")
# Tell nvcc to treat .hip files as CUDA sources.
list(APPEND CMAKE_HIP_COMPILER_ID_TEST_FLAGS_FIRST "-x cu -v")
if(CMAKE_HIP_HOST_COMPILER)
string(APPEND CMAKE_HIP_COMPILER_ID_TEST_FLAGS_FIRST " -ccbin=\"${CMAKE_HIP_HOST_COMPILER}\"")
endif()
endif()
# We perform compiler identification for a second time to extract implicit linking info.
# We need to unset the compiler ID otherwise CMAKE_DETERMINE_COMPILER_ID() doesn't work.
set(CMAKE_HIP_COMPILER_ID)
set(CMAKE_HIP_PLATFORM_ID)
file(READ ${CMAKE_ROOT}/Modules/CMakePlatformId.h.in
CMAKE_HIP_COMPILER_ID_PLATFORM_CONTENT)
CMAKE_DETERMINE_COMPILER_ID(HIP HIPFLAGS CMakeHIPCompilerId.hip)
if(CMAKE_HIP_COMPILER_ID STREQUAL "NVIDIA")
include(Internal/CMakeCUDAArchitecturesAll)
# From CMAKE_HIP_COMPILER_CUDA_TOOLKIT_VERSION and CMAKE_HIP_COMPILER_{ID,VERSION}, get:
# - CMAKE_HIP_ARCHITECTURES_ALL
# - CMAKE_HIP_ARCHITECTURES_ALL_MAJOR
# Match arguments with cmake_cuda_find_toolkit call.
cmake_cuda_architectures_all(HIP CMAKE_HIP_COMPILER_CUDA_)
endif()
_cmake_find_compiler_sysroot(HIP)
endif()
if(NOT CMAKE_HIP_COMPILER_ROCM_ROOT AND CMAKE_HIP_COMPILER_ID STREQUAL "Clang")
execute_process(COMMAND "${CMAKE_HIP_COMPILER}" -v -print-targets
OUTPUT_STRIP_TRAILING_WHITESPACE
RESULT_VARIABLE _CMAKE_HIP_COMPILER_RESULT
OUTPUT_VARIABLE _CMAKE_HIP_COMPILER_STDOUT
ERROR_VARIABLE _CMAKE_HIP_COMPILER_STDERR
)
if(_CMAKE_HIP_COMPILER_RESULT EQUAL 0 AND _CMAKE_HIP_COMPILER_STDERR MATCHES "Found HIP installation: *([^,]*)[,\n]")
set(CMAKE_HIP_COMPILER_ROCM_ROOT "${CMAKE_MATCH_1}")
file(TO_CMAKE_PATH "${CMAKE_HIP_COMPILER_ROCM_ROOT}" CMAKE_HIP_COMPILER_ROCM_ROOT)
endif()
endif()
if(NOT CMAKE_HIP_COMPILER_ROCM_ROOT)
execute_process(
COMMAND hipconfig --rocmpath
OUTPUT_VARIABLE _CMAKE_HIPCONFIG_ROCMPATH
RESULT_VARIABLE _CMAKE_HIPCONFIG_RESULT
)
if(_CMAKE_HIPCONFIG_RESULT EQUAL 0 AND EXISTS "${_CMAKE_HIPCONFIG_ROCMPATH}")
set(CMAKE_HIP_COMPILER_ROCM_ROOT "${_CMAKE_HIPCONFIG_ROCMPATH}")
endif()
endif()
if(NOT CMAKE_HIP_COMPILER_ROCM_ROOT)
message(FATAL_ERROR "Failed to find ROCm root directory.")
endif()
if(CMAKE_HIP_PLATFORM STREQUAL "amd")
# For this platform we need the hip-lang cmake package.
# Normally implicit link information is not detected until ABI detection,
# but we need to populate CMAKE_HIP_LIBRARY_ARCHITECTURE to find hip-lang.
cmake_parse_implicit_link_info("${CMAKE_HIP_COMPILER_PRODUCED_OUTPUT}"
_CMAKE_HIP_COMPILER_ID_IMPLICIT_LIBS
_CMAKE_HIP_COMPILER_ID_IMPLICIT_DIRS
_CMAKE_HIP_COMPILER_ID_IMPLICIT_FWKS
_CMAKE_HIP_COMPILER_ID_IMPLICIT_LOG
"" LANGUAGE HIP)
message(CONFIGURE_LOG
"Parsed HIP implicit link information from compiler id output:\n${_CMAKE_HIP_COMPILER_ID_IMPLICIT_LOG}\n\n")
cmake_parse_library_architecture(HIP "${_CMAKE_HIP_COMPILER_ID_IMPLICIT_DIRS}" "" CMAKE_HIP_LIBRARY_ARCHITECTURE)
if(CMAKE_HIP_LIBRARY_ARCHITECTURE)
message(CONFIGURE_LOG
"Parsed HIP library architecture from compiler id output: ${CMAKE_HIP_LIBRARY_ARCHITECTURE}\n")
endif()
unset(_CMAKE_HIP_COMPILER_ID_IMPLICIT_LIBS)
unset(_CMAKE_HIP_COMPILER_ID_IMPLICIT_DIRS)
unset(_CMAKE_HIP_COMPILER_ID_IMPLICIT_FWKS)
unset(_CMAKE_HIP_COMPILER_ID_IMPLICIT_LOG)
if(NOT CMAKE_HIP_COMPILER_ROCM_LIB)
set(_CMAKE_HIP_COMPILER_ROCM_LIB_DIRS
"${CMAKE_HIP_COMPILER_ROCM_ROOT}/lib"
"${CMAKE_HIP_COMPILER_ROCM_ROOT}/lib64"
)
if(CMAKE_HIP_LIBRARY_ARCHITECTURE)
list(APPEND _CMAKE_HIP_COMPILER_ROCM_LIB_DIRS "${CMAKE_HIP_COMPILER_ROCM_ROOT}/lib/${CMAKE_HIP_LIBRARY_ARCHITECTURE}")
endif()
foreach(dir IN LISTS _CMAKE_HIP_COMPILER_ROCM_LIB_DIRS)
if(EXISTS "${dir}/cmake/hip-lang/hip-lang-config.cmake")
set(CMAKE_HIP_COMPILER_ROCM_LIB "${dir}")
break()
endif()
endforeach()
if(NOT CMAKE_HIP_COMPILER_ROCM_LIB)
list(TRANSFORM _CMAKE_HIP_COMPILER_ROCM_LIB_DIRS APPEND "/cmake/hip-lang/hip-lang-config.cmake")
string(REPLACE ";" "\n " _CMAKE_HIP_COMPILER_ROCM_LIB_DIRS "${_CMAKE_HIP_COMPILER_ROCM_LIB_DIRS}")
message(FATAL_ERROR
"The ROCm root directory:\n"
" ${CMAKE_HIP_COMPILER_ROCM_ROOT}\n"
"does not contain the HIP runtime CMake package, expected at one of:\n"
" ${_CMAKE_HIP_COMPILER_ROCM_LIB_DIRS}\n"
)
endif()
unset(_CMAKE_HIP_COMPILER_ROCM_LIB_DIRS)
endif()
if(NOT DEFINED CMAKE_SIZEOF_VOID_P)
# We have not yet determined the target ABI but we need 'find_package' to
# search lib64 directories to find hip-lang CMake package dependencies.
# This will be replaced by ABI detection later.
set(CMAKE_HIP_SIZEOF_DATA_PTR 8)
endif()
endif()
if (NOT _CMAKE_TOOLCHAIN_LOCATION)
get_filename_component(_CMAKE_TOOLCHAIN_LOCATION "${CMAKE_HIP_COMPILER}" PATH)
endif ()
set(_CMAKE_PROCESSING_LANGUAGE "HIP")
include(CMakeFindBinUtils)
include(Compiler/${CMAKE_HIP_COMPILER_ID}-FindBinUtils OPTIONAL)
unset(_CMAKE_PROCESSING_LANGUAGE)
if(CMAKE_HIP_COMPILER_ID STREQUAL "Clang")
set(CMAKE_HIP_RUNTIME_LIBRARY_DEFAULT "SHARED")
elseif(CMAKE_HIP_COMPILER_ID STREQUAL "NVIDIA")
include(Internal/CMakeNVCCParseImplicitInfo)
# Parse CMAKE_HIP_COMPILER_PRODUCED_OUTPUT to get:
# - CMAKE_HIP_ARCHITECTURES_DEFAULT
# - CMAKE_HIP_HOST_IMPLICIT_LINK_DIRECTORIES
# - CMAKE_HIP_HOST_IMPLICIT_LINK_FRAMEWORK_DIRECTORIES
# - CMAKE_HIP_HOST_IMPLICIT_LINK_LIBRARIES
# - CMAKE_HIP_HOST_LINK_LAUNCHER
# - CMAKE_HIP_RUNTIME_LIBRARY_DEFAULT
# - CMAKE_HIP_CUDA_TOOLKIT_INCLUDE_DIRECTORIES
# Match arguments with cmake_nvcc_filter_implicit_info call in CMakeTestHIPCompiler.
cmake_nvcc_parse_implicit_info(HIP CMAKE_HIP_CUDA_)
include(Internal/CMakeCUDAFilterImplicitLibs)
# Filter out implicit link libraries that should not be passed unconditionally.
cmake_cuda_filter_implicit_libs(CMAKE_HIP_HOST_IMPLICIT_LINK_LIBRARIES)
endif()
if(CMAKE_HIP_COMPILER_SYSROOT)
string(CONCAT _SET_CMAKE_HIP_COMPILER_SYSROOT
"set(CMAKE_HIP_COMPILER_SYSROOT \"${CMAKE_HIP_COMPILER_SYSROOT}\")\n"
"set(CMAKE_COMPILER_SYSROOT \"${CMAKE_HIP_COMPILER_SYSROOT}\")")
else()
set(_SET_CMAKE_HIP_COMPILER_SYSROOT "")
endif()
if(CMAKE_HIP_COMPILER_ARCHITECTURE_ID)
set(_SET_CMAKE_HIP_COMPILER_ARCHITECTURE_ID
"set(CMAKE_HIP_COMPILER_ARCHITECTURE_ID ${CMAKE_HIP_COMPILER_ARCHITECTURE_ID})")
else()
set(_SET_CMAKE_HIP_COMPILER_ARCHITECTURE_ID "")
endif()
if(MSVC_HIP_ARCHITECTURE_ID)
set(SET_MSVC_HIP_ARCHITECTURE_ID
"set(MSVC_HIP_ARCHITECTURE_ID ${MSVC_HIP_ARCHITECTURE_ID})")
endif()
if(CMAKE_HIP_COMPILER_ID STREQUAL "NVIDIA")
if(NOT "$ENV{CUDAARCHS}" STREQUAL "")
set(CMAKE_HIP_ARCHITECTURES "$ENV{CUDAARCHS}" CACHE STRING "CUDA architectures")
endif()
# If the user did not set CMAKE_HIP_ARCHITECTURES, use the compiler's default.
if("${CMAKE_HIP_ARCHITECTURES}" STREQUAL "")
set(CMAKE_HIP_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES_DEFAULT}" CACHE STRING "HIP architectures" FORCE)
if(NOT CMAKE_HIP_ARCHITECTURES)
message(FATAL_ERROR "Failed to detect a default HIP architecture.\n\nCompiler output:\n${CMAKE_HIP_COMPILER_PRODUCED_OUTPUT}")
endif()
endif()
unset(CMAKE_HIP_ARCHITECTURES_DEFAULT)
elseif(NOT DEFINED CMAKE_HIP_ARCHITECTURES)
# Use 'rocm_agent_enumerator' to get the current GPU architecture.
set(_CMAKE_HIP_ARCHITECTURES)
find_program(_CMAKE_HIP_ROCM_AGENT_ENUMERATOR
NAMES rocm_agent_enumerator
HINTS "${CMAKE_HIP_COMPILER_ROCM_ROOT}/bin"
NO_CACHE)
if(_CMAKE_HIP_ROCM_AGENT_ENUMERATOR)
execute_process(COMMAND "${_CMAKE_HIP_ROCM_AGENT_ENUMERATOR}" -t GPU
RESULT_VARIABLE _CMAKE_ROCM_AGENT_ENUMERATOR_RESULT
OUTPUT_VARIABLE _CMAKE_ROCM_AGENT_ENUMERATOR_STDOUT
ERROR_VARIABLE _CMAKE_ROCM_AGENT_ENUMERATOR_STDERR
)
if(_CMAKE_ROCM_AGENT_ENUMERATOR_RESULT EQUAL 0)
separate_arguments(_hip_archs NATIVE_COMMAND "${_CMAKE_ROCM_AGENT_ENUMERATOR_STDOUT}")
foreach(_hip_arch ${_hip_archs})
if(_hip_arch STREQUAL "gfx000")
continue()
endif()
string(FIND ${_hip_arch} ":" pos)
if(NOT pos STREQUAL "-1")
string(SUBSTRING ${_hip_arch} 0 ${pos} _hip_arch)
endif()
list(APPEND _CMAKE_HIP_ARCHITECTURES "${_hip_arch}")
endforeach()
endif()
unset(_CMAKE_ROCM_AGENT_ENUMERATOR_RESULT)
unset(_CMAKE_ROCM_AGENT_ENUMERATOR_STDOUT)
unset(_CMAKE_ROCM_AGENT_ENUMERATOR_STDERR)
endif()
unset(_CMAKE_HIP_ROCM_AGENT_ENUMERATOR)
if(_CMAKE_HIP_ARCHITECTURES)
set(CMAKE_HIP_ARCHITECTURES "${_CMAKE_HIP_ARCHITECTURES}" CACHE STRING "HIP architectures")
elseif(CMAKE_HIP_COMPILER_PRODUCED_OUTPUT MATCHES " -target-cpu ([a-z0-9]+) ")
set(CMAKE_HIP_ARCHITECTURES "${CMAKE_MATCH_1}" CACHE STRING "HIP architectures")
else()
message(FATAL_ERROR "Failed to find a default HIP architecture.")
endif()
unset(_CMAKE_HIP_ARCHITECTURES)
endif()
# configure variables set in this file for fast reload later on
configure_file(${CMAKE_ROOT}/Modules/CMakeHIPCompiler.cmake.in
${CMAKE_PLATFORM_INFO_DIR}/CMakeHIPCompiler.cmake
@ONLY
)
set(CMAKE_HIP_COMPILER_ENV_VAR "HIPCXX")

File diff suppressed because one or more lines are too long


@ -0,0 +1 @@
Error: listen tcp 127.0.0.1:11434: bind: address already in use

docs/rocm-apu.md Normal file

@ -0,0 +1,92 @@
# Experimental ROCm iGPU Support
This branch adds a ROCm backend path geared toward AMD APUs that only expose a small VRAM aperture but share a large UMA pool with the CPU. The steps below outline how to reproduce the build and how to run Ollama with the staged ROCm runtime.
> **Warning**
> Upstream ROCm does not officially support these APUs yet. Expect driver updates, kernel parameters, or environment variables such as `HSA_OVERRIDE_GFX_VERSION` to change between releases.
## 1. Stage the ROCm runtime
We avoid touching the system installation by unpacking the required RPMs into `build/rocm-stage`.
```bash
mkdir -p build/rocm-stage build/rpm-tmp
cd build/rpm-tmp
dnf download \
hipblas hipblas-devel hipblas-common-devel \
rocblas rocblas-devel \
rocsolver rocsolver-devel \
rocm-hip-devel rocm-device-libs rocm-comgr rocm-comgr-devel
cd ../rocm-stage
for rpm in ../rpm-tmp/*.rpm; do
echo "extracting ${rpm}"
rpm2cpio "${rpm}" | bsdtar -xf -
done
```
Important staged paths after extraction:
| Purpose | Location |
| ------------------------ | ----------------------------------------------- |
| HIP/rocBLAS libraries | `build/rocm-stage/lib64` |
| Tensile kernels (rocBLAS)| `build/rocm-stage/lib64/rocblas/library` |
| Headers (`hip`, `rocblas`) | `build/rocm-stage/include` |
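A quick sanity check of the staged tree (a minimal sketch assuming the layout above; adjust the names to the RPM set you actually downloaded):
```bash
# Confirm the staged rocBLAS library, its Tensile kernels, and the headers are present.
ls build/rocm-stage/lib64/librocblas.so*
ls build/rocm-stage/lib64/rocblas/library | head -n 5
ls build/rocm-stage/include
```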
## 2. Build the ROCm backend
Configure CMake with the preset that targets ROCm 6.x, point `CMAKE_PREFIX_PATH` at the staged runtime, and use the system `hipcc` as the HIP compiler:
```bash
cmake --preset "ROCm 6" -B build/rocm \
-DGGML_VULKAN=OFF \
-DCMAKE_INSTALL_PREFIX=/usr/local \
-DCMAKE_HIP_COMPILER=/usr/bin/hipcc \
-DCMAKE_PREFIX_PATH="$PWD/build/rocm-stage"
cmake --build build/rocm --target ggml-hip -j$(nproc)
```
Artifacts land in `build/lib/ollama/rocm` (mirrored in `dist/lib/ollama/rocm` when packaging). These include `libggml-hip.so`, the CPU fallback variants, the Vulkan backend when it is built, and `librocsolver.so`.
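To confirm the backend was actually produced, listing the output directory is enough (paths as above):
```bash
# libggml-hip.so should appear alongside the other ggml backend libraries.
ls -l build/lib/ollama/rocm
```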
## 3. Run Ollama on ROCm
The runner needs to see both the GGML plugins and the staged ROCm runtime. The following environment block works for an AMD Radeon 760M with a UMA carve-out:
```bash
export BASE=$HOME/ollama-gpu
export OLLAMA_LIBRARY_PATH=$BASE/build/lib/ollama/rocm:$BASE/build/lib/ollama
export LD_LIBRARY_PATH=$OLLAMA_LIBRARY_PATH:$BASE/build/rocm-stage/lib64:${LD_LIBRARY_PATH:-}
export ROCBLAS_TENSILE_LIBPATH=$BASE/build/rocm-stage/lib64/rocblas/library
export ROCBLAS_TENSILE_PATH=$ROCBLAS_TENSILE_LIBPATH
export HSA_OVERRIDE_GFX_VERSION=11.0.0 # spoof gfx1100 for Phoenix
export GGML_HIP_FORCE_GTT=1 # force GTT allocations for UMA memory
export OLLAMA_GPU_DRIVER=rocm
export OLLAMA_GPU=100 # opt into GPU-only scheduling
export OLLAMA_LLM_LIBRARY=rocm # skip CUDA/Vulkan discovery noise
export OLLAMA_VULKAN=0 # optional: suppress Vulkan backend
$BASE/build/ollama serve
```
On launch you should see log lines similar to:
```
library=ROCm compute=gfx1100 name=ROCm0 description="AMD Radeon 760M Graphics"
ggml_hip_get_device_memory using GTT memory for 0000:0e:00.0 (total=16352354304 free=15034097664)
```
If the runner crashes before enumerating devices:
- Double-check that `ROCBLAS_TENSILE_LIBPATH` points to the staged `rocblas/library`.
- Ensure no other `LD_LIBRARY_PATH` entries override `libamdhip64.so`.
- Try unsetting `HSA_OVERRIDE_GFX_VERSION` to confirm whether the override is still needed with your kernel and driver versions.
> Example discovery + run log: [`docs/logs/rocm-760m-run.log`](logs/rocm-760m-run.log). The matching `curl` response is saved as [`docs/logs/rocm-760m-run-response.json`](logs/rocm-760m-run-response.json).
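To reproduce a response like the one linked above, a request along these lines works once `ollama serve` is up; the model name is only a placeholder for whatever model you have pulled:
```bash
# Smoke test against the default API port; replace llama3.2 with a locally available model.
curl -s http://127.0.0.1:11434/api/generate -d '{
  "model": "llama3.2",
  "prompt": "Confirm the Radeon 760M is handling this request.",
  "stream": false
}'
```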
## 4. Sharing this build
- Keep the staged RPMs alongside the branch so others can reproduce the exact runtime.
- Include `/tmp/ollama_rocm_run.log` or similar discovery logs in issues/PRs to help maintainers understand the UMA setup.
- Mention any kernel parameters (e.g., large UMA buffer in firmware) when opening upstream tickets.


@ -360,6 +360,36 @@ func StartRunner(ollamaEngine bool, modelPath string, gpuLibs []string, out io.W
// Note: we always put our dependency paths first
// since these are the exact version we compiled/linked against
userLibs := []string{}
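// Honor user-supplied search paths from OLLAMA_LIBRARY_PATH (the experimental
// ROCm iGPU setup uses this to point the runner at a staged runtime).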
if existing, ok := os.LookupEnv("OLLAMA_LIBRARY_PATH"); ok && existing != "" {
userLibs = filepath.SplitList(existing)
}
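// Merge the user paths ahead of the bundled GPU library paths, dropping
// empty entries and duplicates while preserving order.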
if len(userLibs) != 0 {
seen := make(map[string]struct{}, len(userLibs)+len(gpuLibs))
merged := make([]string, 0, len(userLibs)+len(gpuLibs))
for _, dir := range userLibs {
if dir == "" {
continue
}
if _, ok := seen[dir]; ok {
continue
}
seen[dir] = struct{}{}
merged = append(merged, dir)
}
for _, dir := range gpuLibs {
if dir == "" {
continue
}
if _, ok := seen[dir]; ok {
continue
}
seen[dir] = struct{}{}
merged = append(merged, dir)
}
gpuLibs = merged
}
libraryPaths := append([]string{}, gpuLibs...)
if libraryPath, ok := os.LookupEnv(pathEnv); ok {
libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)


@ -1,5 +1,107 @@
#pragma once
#include <cmath>
#include <type_traits>
#if defined(__HIP_PLATFORM_AMD__)
#include <hip/amd_detail/device_library_decls.h>
#include <hip/amd_detail/math_fwd.h>
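// Host/device wrappers: on the device compilation pass these call the OCML
// intrinsics from the ROCm device library, on the host they fall back to the
// C++ standard library so the header compiles in both contexts.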
static __host__ __device__ inline float ggml_hip_max_f32(float a, float b) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_fmax_f32(a, b);
#else
return a > b ? a : b;
#endif
}
static __host__ __device__ inline double ggml_hip_max_f64(double a, double b) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_fmax_f64(a, b);
#else
return a > b ? a : b;
#endif
}
static __host__ __device__ inline float ggml_hip_min_f32(float a, float b) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_fmin_f32(a, b);
#else
return a < b ? a : b;
#endif
}
static __host__ __device__ inline double ggml_hip_min_f64(double a, double b) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_fmin_f64(a, b);
#else
return a < b ? a : b;
#endif
}
static __host__ __device__ inline float ggml_hip_pow_f32(float base, float exp) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_pow_f32(base, exp);
#else
return std::pow(base, exp);
#endif
}
static __host__ __device__ inline double ggml_hip_pow_f64(double base, double exp) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_pow_f64(base, exp);
#else
return std::pow(base, exp);
#endif
}
template <typename A, typename B,
typename std::enable_if<std::is_integral<A>::value && std::is_integral<B>::value, int>::type = 0>
__host__ __device__ inline typename std::common_type<A, B>::type ggml_hip_int_max(A a, B b) {
using R = typename std::common_type<A, B>::type;
const R aa = static_cast<R>(a);
const R bb = static_cast<R>(b);
return aa > bb ? aa : bb;
}
template <typename A, typename B,
typename std::enable_if<std::is_integral<A>::value && std::is_integral<B>::value, int>::type = 0>
__host__ __device__ inline typename std::common_type<A, B>::type ggml_hip_int_min(A a, B b) {
using R = typename std::common_type<A, B>::type;
const R aa = static_cast<R>(a);
const R bb = static_cast<R>(b);
return aa < bb ? aa : bb;
}
__host__ __device__ inline float max(float a, float b) {
return ggml_hip_max_f32(a, b);
}
__host__ __device__ inline double max(double a, double b) {
return ggml_hip_max_f64(a, b);
}
template <typename A, typename B,
typename std::enable_if<std::is_integral<A>::value && std::is_integral<B>::value, int>::type = 0>
__host__ __device__ inline typename std::common_type<A, B>::type max(A a, B b) {
return ggml_hip_int_max(a, b);
}
__host__ __device__ inline float min(float a, float b) {
return ggml_hip_min_f32(a, b);
}
__host__ __device__ inline double min(double a, double b) {
return ggml_hip_min_f64(a, b);
}
template <typename A, typename B,
typename std::enable_if<std::is_integral<A>::value && std::is_integral<B>::value, int>::type = 0>
__host__ __device__ inline typename std::common_type<A, B>::type min(A a, B b) {
return ggml_hip_int_min(a, b);
}
#endif // defined(__HIP_PLATFORM_AMD__)
#define HIP_DISABLE_WARP_SYNC_BUILTINS 1
#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>
@ -8,6 +110,232 @@
// for rocblas_initialize()
#include "rocblas/rocblas.h"
#if defined(__HIP_PLATFORM_AMD__)
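// Redirect the single-precision libm-style calls (fmaxf, expf, logf, ...) to
// host/device wrappers: OCML intrinsics when compiling for the device, the
// C++ standard library when compiling for the host.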
#undef fmaxf
#define fmaxf(a, b) ggml_hip_max_f32((a), (b))
#undef fminf
#define fminf(a, b) ggml_hip_min_f32((a), (b))
#undef powf
#define powf(a, b) ggml_hip_pow_f32((a), (b))
static __host__ __device__ inline float ggml_hip_expf(float x) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_exp_f32(x);
#else
using std::exp;
return static_cast<float>(exp(x));
#endif
}
#undef expf
#define expf(x) ggml_hip_expf((x))
static __host__ __device__ inline float ggml_hip_expm1f(float x) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_expm1_f32(x);
#else
using std::expm1;
return static_cast<float>(expm1(x));
#endif
}
#undef expm1f
#define expm1f(x) ggml_hip_expm1f((x))
static __host__ __device__ inline float ggml_hip_logf(float x) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_log_f32(x);
#else
using std::log;
return static_cast<float>(log(x));
#endif
}
#undef logf
#define logf(x) ggml_hip_logf((x))
static __host__ __device__ inline float ggml_hip_log1pf(float x) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_log1p_f32(x);
#else
using std::log1p;
return static_cast<float>(log1p(x));
#endif
}
#undef log1pf
#define log1pf(x) ggml_hip_log1pf((x))
static __host__ __device__ inline float ggml_hip_log2f(float x) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_log2_f32(x);
#else
using std::log2;
return static_cast<float>(log2(x));
#endif
}
#undef log2f
#define log2f(x) ggml_hip_log2f((x))
static __host__ __device__ inline float ggml_hip_tanhf(float x) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_tanh_f32(x);
#else
using std::tanh;
return static_cast<float>(tanh(x));
#endif
}
#undef tanhf
#define tanhf(x) ggml_hip_tanhf((x))
static __host__ __device__ inline float ggml_hip_sinf(float x) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_sin_f32(x);
#else
using std::sin;
return static_cast<float>(sin(x));
#endif
}
#undef sinf
#define sinf(x) ggml_hip_sinf((x))
static __host__ __device__ inline float ggml_hip_cosf(float x) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_cos_f32(x);
#else
using std::cos;
return static_cast<float>(cos(x));
#endif
}
#undef cosf
#define cosf(x) ggml_hip_cosf((x))
static __host__ __device__ inline float ggml_hip_erff(float x) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_erf_f32(x);
#else
using std::erf;
return static_cast<float>(erf(x));
#endif
}
#undef erff
#define erff(x) ggml_hip_erff((x))
static __host__ __device__ inline float ggml_hip_fabsf(float x) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_fabs_f32(x);
#else
using std::fabs;
return static_cast<float>(fabs(x));
#endif
}
#undef fabsf
#define fabsf(x) ggml_hip_fabsf((x))
static __host__ __device__ inline float ggml_hip_floorf(float x) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_floor_f32(x);
#else
using std::floor;
return static_cast<float>(floor(x));
#endif
}
#undef floorf
#define floorf(x) ggml_hip_floorf((x))
static __host__ __device__ inline float ggml_hip_ceilf(float x) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_ceil_f32(x);
#else
using std::ceil;
return static_cast<float>(ceil(x));
#endif
}
#undef ceilf
#define ceilf(x) ggml_hip_ceilf((x))
static __host__ __device__ inline float ggml_hip_roundf(float x) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_round_f32(x);
#else
using std::round;
return static_cast<float>(round(x));
#endif
}
#undef roundf
#define roundf(x) ggml_hip_roundf((x))
static __host__ __device__ inline float ggml_hip_round_scalar(float x) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_round_f32(x);
#else
using std::round;
return static_cast<float>(round(x));
#endif
}
static __host__ __device__ inline double ggml_hip_round_scalar(double x) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_round_f64(x);
#else
using std::round;
return round(x);
#endif
}
#undef round
#define round(x) ggml_hip_round_scalar((x))
static __host__ __device__ inline float ggml_hip_sqrtf(float x) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_sqrt_f32(x);
#else
using std::sqrt;
return static_cast<float>(sqrt(x));
#endif
}
#undef sqrtf
#define sqrtf(x) ggml_hip_sqrtf((x))
static __host__ __device__ inline float ggml_hip_rsqrtf(float x) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_rsqrt_f32(x);
#else
using std::sqrt;
return 1.0f / static_cast<float>(sqrt(x));
#endif
}
#undef rsqrtf
#define rsqrtf(x) ggml_hip_rsqrtf((x))
static __host__ __device__ inline float ggml_hip_trunc_scalar(float x) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_trunc_f32(x);
#else
using std::trunc;
return static_cast<float>(trunc(x));
#endif
}
static __host__ __device__ inline double ggml_hip_trunc_scalar(double x) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_trunc_f64(x);
#else
using std::trunc;
return trunc(x);
#endif
}
#undef trunc
#define trunc(x) ggml_hip_trunc_scalar((x))
static __host__ __device__ inline int ggml_hip_isinf(float x) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_isinf_f32(x);
#else
using std::isinf;
return static_cast<int>(isinf(x));
#endif
}
#undef isinf
#define isinf(x) ggml_hip_isinf((x))
#endif
#if defined(GGML_HIP_ROCWMMA_FATTN)
#include <rocwmma/rocwmma-version.hpp>
#endif // defined(GGML_HIP_ROCWMMA_FATTN)


@ -442,6 +442,8 @@ int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total) {
#include <string>
#include <vector>
#include <filesystem>
#include <cstdlib>
#include <strings.h>
#include <sys/stat.h>
#include <dirent.h>
@ -449,6 +451,60 @@ int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total) {
#include <glob.h>
namespace fs = std::filesystem;
namespace {
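// A set environment variable counts as enabled unless its value is "0",
// "false", or "off" (case-insensitive); an unset variable is disabled.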
static bool ggml_env_flag_enabled(const char *name) {
const char *value = getenv(name);
if (value == nullptr) {
return false;
}
if (*value == '\0') {
return true;
}
if ((value[0] == '0' && value[1] == '\0') ||
strcasecmp(value, "false") == 0 ||
strcasecmp(value, "off") == 0) {
return false;
}
return true;
}
static bool ggml_read_sysfs_value(const std::string &path, uint64_t &value) {
std::ifstream stream(path.c_str());
if (!stream.is_open()) {
GGML_LOG_DEBUG("%s unable to open sysfs node %s\n", __func__, path.c_str());
return false;
}
stream >> value;
if (stream.fail()) {
GGML_LOG_DEBUG("%s unable to parse sysfs node %s\n", __func__, path.c_str());
return false;
}
return true;
}
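// Decide whether to report GTT (shared system memory) instead of the VRAM
// aperture. GGML_HIP_DISABLE_GTT / GGML_HIP_FORCE_GTT override the heuristic;
// otherwise GTT is used when no dedicated VRAM is reported, or when the
// carve-out is at most 1 GiB and the GTT pool is larger (UMA-only APUs).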
static bool ggml_should_use_gtt(uint64_t vram_total, uint64_t gtt_total) {
if (ggml_env_flag_enabled("GGML_HIP_DISABLE_GTT")) {
return false;
}
if (ggml_env_flag_enabled("GGML_HIP_FORCE_GTT")) {
return gtt_total > 0;
}
if (gtt_total == 0) {
return false;
}
const uint64_t umaThreshold = 1024ull * 1024ull * 1024ull; // 1 GiB
if (vram_total == 0) {
return true;
}
return vram_total <= umaThreshold && gtt_total > vram_total;
}
} // namespace
extern "C" {
int ggml_hip_mgmt_init() {
@ -461,6 +517,8 @@ int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total) {
const std::string drmTotalMemoryFile = "mem_info_vram_total";
const std::string drmUsedMemoryFile = "mem_info_vram_used";
const std::string drmUeventPCISlotLabel = "PCI_SLOT_NAME=";
const std::string drmGttTotalFile = "mem_info_gtt_total";
const std::string drmGttUsedFile = "mem_info_gtt_used";
glob_t glob_result;
glob(drmDeviceGlob.c_str(), GLOB_NOSORT, NULL, &glob_result);
@ -495,7 +553,6 @@ int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total) {
uint64_t memory;
totalFileStream >> memory;
*total = memory;
std::string usedFile = dir + "/" + drmUsedMemoryFile;
std::ifstream usedFileStream(usedFile.c_str());
@ -508,7 +565,30 @@ int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total) {
uint64_t memoryUsed;
usedFileStream >> memoryUsed;
*free = memory - memoryUsed;
uint64_t gttTotal = 0;
uint64_t gttUsed = 0;
bool hasGttTotal = ggml_read_sysfs_value(dir + "/" + drmGttTotalFile, gttTotal);
bool hasGttUsed = ggml_read_sysfs_value(dir + "/" + drmGttUsedFile, gttUsed);
bool useGtt = ggml_should_use_gtt(memory, hasGttTotal ? gttTotal : 0);
if (useGtt && hasGttTotal) {
uint64_t freeGtt = gttTotal;
if (hasGttUsed && gttTotal > gttUsed) {
freeGtt = gttTotal - gttUsed;
}
*total = gttTotal;
*free = freeGtt;
GGML_LOG_INFO("%s using GTT memory for %s (total=%zu free=%zu)\n",
__func__, id, *total, *free);
} else {
*total = memory;
if (memory > memoryUsed) {
*free = memory - memoryUsed;
} else {
*free = 0;
}
}
file.close();
globfree(&glob_result);
@ -526,4 +606,4 @@ int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total) {
} // extern "C"
#endif // #ifdef _WIN32
#endif // #ifdef _WIN32


@ -17,6 +17,7 @@ import (
"strings"
"time"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/logutil"
)
@ -538,6 +539,9 @@ func GetVisibleDevicesEnv(l []DeviceInfo, mustFilter bool) map[string]string {
func (d DeviceInfo) NeedsInitValidation() bool {
// ROCm: rocblas will crash on unsupported devices.
// CUDA: verify CC is supported by the version of the library
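// An explicit HSA_OVERRIDE_GFX_VERSION means the user is overriding device
// detection, so skip init validation for ROCm devices in that case.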
if d.Library == "ROCm" && envconfig.HsaOverrideGfxVersion() != "" {
return false
}
return d.Library == "ROCm" || d.Library == "CUDA"
}