Marx 2025-12-17 06:54:52 +02:00 committed by GitHub
commit c0fb854ee4
9 changed files with 872 additions and 3 deletions


@ -24,6 +24,9 @@ curl -fsSL https://ollama.com/install.sh | sh
[Manual install instructions](https://docs.ollama.com/linux#manual-install)
> [!TIP]
> Running ROCm on UMA-only AMD APUs (e.g., Radeon 760M) requires staging additional ROCm runtime files and enabling experimental discovery flags. See [docs/rocm-apu.md](docs/rocm-apu.md) for the exact build and runtime steps used on this branch.
### Docker
The official [Ollama Docker image](https://hub.docker.com/r/ollama/ollama) `ollama/ollama` is available on Docker Hub.
@ -32,6 +35,7 @@ The official [Ollama Docker image](https://hub.docker.com/r/ollama/ollama) `olla
- [ollama-python](https://github.com/ollama/ollama-python)
- [ollama-js](https://github.com/ollama/ollama-js)
- [Experimental ROCm iGPU Guide](docs/rocm-apu.md)
### Community


@ -0,0 +1,329 @@
# Distributed under the OSI-approved BSD 3-Clause License. See accompanying
# file Copyright.txt or https://cmake.org/licensing for details.
include(${CMAKE_ROOT}/Modules/CMakeDetermineCompiler.cmake)
include(${CMAKE_ROOT}/Modules/CMakeParseImplicitLinkInfo.cmake)
include(${CMAKE_ROOT}/Modules/CMakeParseLibraryArchitecture.cmake)
if(NOT ((CMAKE_GENERATOR MATCHES "Make") OR
(CMAKE_GENERATOR MATCHES "Ninja")))
message(FATAL_ERROR "HIP language not currently supported by \"${CMAKE_GENERATOR}\" generator")
endif()
if(NOT CMAKE_HIP_PLATFORM)
execute_process(COMMAND hipconfig --platform
OUTPUT_VARIABLE _CMAKE_HIPCONFIG_PLATFORM OUTPUT_STRIP_TRAILING_WHITESPACE
RESULT_VARIABLE _CMAKE_HIPCONFIG_RESULT
)
if(_CMAKE_HIPCONFIG_RESULT EQUAL 0 AND _CMAKE_HIPCONFIG_PLATFORM MATCHES "^(nvidia|nvcc)$")
set(CMAKE_HIP_PLATFORM "nvidia" CACHE STRING "HIP platform" FORCE)
else()
set(CMAKE_HIP_PLATFORM "amd" CACHE STRING "HIP platform" FORCE)
endif()
endif()
if(NOT CMAKE_HIP_PLATFORM MATCHES "^(amd|nvidia)$")
message(FATAL_ERROR
"The CMAKE_HIP_PLATFORM has unsupported value:\n"
" '${CMAKE_HIP_PLATFORM}'\n"
"It must be 'amd' or 'nvidia'."
)
endif()
if(NOT CMAKE_HIP_COMPILER)
set(CMAKE_HIP_COMPILER_INIT NOTFOUND)
# prefer the environment variable HIPCXX
if(NOT $ENV{HIPCXX} STREQUAL "")
get_filename_component(CMAKE_HIP_COMPILER_INIT $ENV{HIPCXX} PROGRAM PROGRAM_ARGS CMAKE_HIP_FLAGS_ENV_INIT)
if(CMAKE_HIP_FLAGS_ENV_INIT)
set(CMAKE_HIP_COMPILER_ARG1 "${CMAKE_HIP_FLAGS_ENV_INIT}" CACHE STRING "Arguments to CXX compiler")
endif()
if(NOT EXISTS ${CMAKE_HIP_COMPILER_INIT})
message(FATAL_ERROR "Could not find compiler set in environment variable HIPCXX:\n$ENV{HIPCXX}.\n${CMAKE_HIP_COMPILER_INIT}")
endif()
endif()
# finally list compilers to try
if(NOT CMAKE_HIP_COMPILER_INIT)
if(CMAKE_HIP_PLATFORM STREQUAL "nvidia")
set(CMAKE_HIP_COMPILER_LIST nvcc)
elseif(CMAKE_HIP_PLATFORM STREQUAL "amd")
set(CMAKE_HIP_COMPILER_LIST clang++)
# Look for the Clang coming with ROCm to support HIP.
execute_process(COMMAND hipconfig --hipclangpath
OUTPUT_VARIABLE _CMAKE_HIPCONFIG_CLANGPATH
RESULT_VARIABLE _CMAKE_HIPCONFIG_RESULT
)
if(_CMAKE_HIPCONFIG_RESULT EQUAL 0 AND EXISTS "${_CMAKE_HIPCONFIG_CLANGPATH}")
set(CMAKE_HIP_COMPILER_HINTS "${_CMAKE_HIPCONFIG_CLANGPATH}")
endif()
endif()
endif()
_cmake_find_compiler(HIP)
else()
_cmake_find_compiler_path(HIP)
endif()
mark_as_advanced(CMAKE_HIP_COMPILER)
# Build a small source file to identify the compiler.
if(NOT CMAKE_HIP_COMPILER_ID_RUN)
set(CMAKE_HIP_COMPILER_ID_RUN 1)
include(${CMAKE_ROOT}/Modules/CMakeDetermineCompilerId.cmake)
# We determine the vendor to use the right flags for detection right away.
# The main compiler identification is still needed below to extract other information.
list(APPEND CMAKE_HIP_COMPILER_ID_VENDORS NVIDIA Clang)
set(CMAKE_HIP_COMPILER_ID_VENDOR_REGEX_NVIDIA "nvcc: NVIDIA \\(R\\) Cuda compiler driver")
set(CMAKE_HIP_COMPILER_ID_VENDOR_REGEX_Clang "(clang version)")
CMAKE_DETERMINE_COMPILER_ID_VENDOR(HIP "--version")
if(CMAKE_HIP_COMPILER_ID STREQUAL "NVIDIA")
# Find the CUDA toolkit to get:
# - CMAKE_HIP_COMPILER_CUDA_TOOLKIT_VERSION
# - CMAKE_HIP_COMPILER_CUDA_TOOLKIT_ROOT
# - CMAKE_HIP_COMPILER_CUDA_LIBRARY_ROOT
# We save them in CMakeHIPCompiler.cmake.
# Match arguments with cmake_cuda_architectures_all call.
include(Internal/CMakeCUDAFindToolkit)
cmake_cuda_find_toolkit(HIP CMAKE_HIP_COMPILER_CUDA_)
# If the user set CMAKE_HIP_ARCHITECTURES, validate its value.
include(Internal/CMakeCUDAArchitecturesValidate)
cmake_cuda_architectures_validate(HIP)
if(NOT CMAKE_HIP_HOST_COMPILER AND NOT $ENV{HIPHOSTCXX} STREQUAL "")
get_filename_component(CMAKE_HIP_HOST_COMPILER $ENV{HIPHOSTCXX} PROGRAM)
if(NOT EXISTS "${CMAKE_HIP_HOST_COMPILER}")
message(FATAL_ERROR "Could not find compiler set in environment variable HIPHOSTCXX:\n$ENV{HIPHOSTCXX}.\n${CMAKE_HIP_HOST_COMPILER}")
endif()
endif()
endif()
if(CMAKE_HIP_COMPILER_ID STREQUAL "Clang")
list(APPEND CMAKE_HIP_COMPILER_ID_TEST_FLAGS_FIRST "-v")
elseif(CMAKE_HIP_COMPILER_ID STREQUAL "NVIDIA")
# Tell nvcc to treat .hip files as CUDA sources.
list(APPEND CMAKE_HIP_COMPILER_ID_TEST_FLAGS_FIRST "-x cu -v")
if(CMAKE_HIP_HOST_COMPILER)
string(APPEND CMAKE_HIP_COMPILER_ID_TEST_FLAGS_FIRST " -ccbin=\"${CMAKE_HIP_HOST_COMPILER}\"")
endif()
endif()
# We perform compiler identification for a second time to extract implicit linking info.
# We need to unset the compiler ID otherwise CMAKE_DETERMINE_COMPILER_ID() doesn't work.
set(CMAKE_HIP_COMPILER_ID)
set(CMAKE_HIP_PLATFORM_ID)
file(READ ${CMAKE_ROOT}/Modules/CMakePlatformId.h.in
CMAKE_HIP_COMPILER_ID_PLATFORM_CONTENT)
CMAKE_DETERMINE_COMPILER_ID(HIP HIPFLAGS CMakeHIPCompilerId.hip)
if(CMAKE_HIP_COMPILER_ID STREQUAL "NVIDIA")
include(Internal/CMakeCUDAArchitecturesAll)
# From CMAKE_HIP_COMPILER_CUDA_TOOLKIT_VERSION and CMAKE_HIP_COMPILER_{ID,VERSION}, get:
# - CMAKE_HIP_ARCHITECTURES_ALL
# - CMAKE_HIP_ARCHITECTURES_ALL_MAJOR
# Match arguments with cmake_cuda_find_toolkit call.
cmake_cuda_architectures_all(HIP CMAKE_HIP_COMPILER_CUDA_)
endif()
_cmake_find_compiler_sysroot(HIP)
endif()
if(NOT CMAKE_HIP_COMPILER_ROCM_ROOT AND CMAKE_HIP_COMPILER_ID STREQUAL "Clang")
execute_process(COMMAND "${CMAKE_HIP_COMPILER}" -v -print-targets
OUTPUT_STRIP_TRAILING_WHITESPACE
RESULT_VARIABLE _CMAKE_HIP_COMPILER_RESULT
OUTPUT_VARIABLE _CMAKE_HIP_COMPILER_STDOUT
ERROR_VARIABLE _CMAKE_HIP_COMPILER_STDERR
)
if(_CMAKE_HIP_COMPILER_RESULT EQUAL 0 AND _CMAKE_HIP_COMPILER_STDERR MATCHES "Found HIP installation: *([^,]*)[,\n]")
set(CMAKE_HIP_COMPILER_ROCM_ROOT "${CMAKE_MATCH_1}")
file(TO_CMAKE_PATH "${CMAKE_HIP_COMPILER_ROCM_ROOT}" CMAKE_HIP_COMPILER_ROCM_ROOT)
endif()
endif()
if(NOT CMAKE_HIP_COMPILER_ROCM_ROOT)
execute_process(
COMMAND hipconfig --rocmpath
OUTPUT_VARIABLE _CMAKE_HIPCONFIG_ROCMPATH
RESULT_VARIABLE _CMAKE_HIPCONFIG_RESULT
)
if(_CMAKE_HIPCONFIG_RESULT EQUAL 0 AND EXISTS "${_CMAKE_HIPCONFIG_ROCMPATH}")
set(CMAKE_HIP_COMPILER_ROCM_ROOT "${_CMAKE_HIPCONFIG_ROCMPATH}")
endif()
endif()
if(NOT CMAKE_HIP_COMPILER_ROCM_ROOT)
message(FATAL_ERROR "Failed to find ROCm root directory.")
endif()
if(CMAKE_HIP_PLATFORM STREQUAL "amd")
# For this platform we need the hip-lang cmake package.
# Normally implicit link information is not detected until ABI detection,
# but we need to populate CMAKE_HIP_LIBRARY_ARCHITECTURE to find hip-lang.
cmake_parse_implicit_link_info("${CMAKE_HIP_COMPILER_PRODUCED_OUTPUT}"
_CMAKE_HIP_COMPILER_ID_IMPLICIT_LIBS
_CMAKE_HIP_COMPILER_ID_IMPLICIT_DIRS
_CMAKE_HIP_COMPILER_ID_IMPLICIT_FWKS
_CMAKE_HIP_COMPILER_ID_IMPLICIT_LOG
"" LANGUAGE HIP)
message(CONFIGURE_LOG
"Parsed HIP implicit link information from compiler id output:\n${_CMAKE_HIP_COMPILER_ID_IMPLICIT_LOG}\n\n")
cmake_parse_library_architecture(HIP "${_CMAKE_HIP_COMPILER_ID_IMPLICIT_DIRS}" "" CMAKE_HIP_LIBRARY_ARCHITECTURE)
if(CMAKE_HIP_LIBRARY_ARCHITECTURE)
message(CONFIGURE_LOG
"Parsed HIP library architecture from compiler id output: ${CMAKE_HIP_LIBRARY_ARCHITECTURE}\n")
endif()
unset(_CMAKE_HIP_COMPILER_ID_IMPLICIT_LIBS)
unset(_CMAKE_HIP_COMPILER_ID_IMPLICIT_DIRS)
unset(_CMAKE_HIP_COMPILER_ID_IMPLICIT_FWKS)
unset(_CMAKE_HIP_COMPILER_ID_IMPLICIT_LOG)
if(NOT CMAKE_HIP_COMPILER_ROCM_LIB)
set(_CMAKE_HIP_COMPILER_ROCM_LIB_DIRS
"${CMAKE_HIP_COMPILER_ROCM_ROOT}/lib"
"${CMAKE_HIP_COMPILER_ROCM_ROOT}/lib64"
)
if(CMAKE_HIP_LIBRARY_ARCHITECTURE)
list(APPEND _CMAKE_HIP_COMPILER_ROCM_LIB_DIRS "${CMAKE_HIP_COMPILER_ROCM_ROOT}/lib/${CMAKE_HIP_LIBRARY_ARCHITECTURE}")
endif()
foreach(dir IN LISTS _CMAKE_HIP_COMPILER_ROCM_LIB_DIRS)
if(EXISTS "${dir}/cmake/hip-lang/hip-lang-config.cmake")
set(CMAKE_HIP_COMPILER_ROCM_LIB "${dir}")
break()
endif()
endforeach()
if(NOT CMAKE_HIP_COMPILER_ROCM_LIB)
list(TRANSFORM _CMAKE_HIP_COMPILER_ROCM_LIB_DIRS APPEND "/cmake/hip-lang/hip-lang-config.cmake")
string(REPLACE ";" "\n " _CMAKE_HIP_COMPILER_ROCM_LIB_DIRS "${_CMAKE_HIP_COMPILER_ROCM_LIB_DIRS}")
message(FATAL_ERROR
"The ROCm root directory:\n"
" ${CMAKE_HIP_COMPILER_ROCM_ROOT}\n"
"does not contain the HIP runtime CMake package, expected at one of:\n"
" ${_CMAKE_HIP_COMPILER_ROCM_LIB_DIRS}\n"
)
endif()
unset(_CMAKE_HIP_COMPILER_ROCM_LIB_DIRS)
endif()
if(NOT DEFINED CMAKE_SIZEOF_VOID_P)
# We have not yet determined the target ABI but we need 'find_package' to
# search lib64 directories to find hip-lang CMake package dependencies.
# This will be replaced by ABI detection later.
set(CMAKE_HIP_SIZEOF_DATA_PTR 8)
endif()
endif()
if (NOT _CMAKE_TOOLCHAIN_LOCATION)
get_filename_component(_CMAKE_TOOLCHAIN_LOCATION "${CMAKE_HIP_COMPILER}" PATH)
endif ()
set(_CMAKE_PROCESSING_LANGUAGE "HIP")
include(CMakeFindBinUtils)
include(Compiler/${CMAKE_HIP_COMPILER_ID}-FindBinUtils OPTIONAL)
unset(_CMAKE_PROCESSING_LANGUAGE)
if(CMAKE_HIP_COMPILER_ID STREQUAL "Clang")
set(CMAKE_HIP_RUNTIME_LIBRARY_DEFAULT "SHARED")
elseif(CMAKE_HIP_COMPILER_ID STREQUAL "NVIDIA")
include(Internal/CMakeNVCCParseImplicitInfo)
# Parse CMAKE_HIP_COMPILER_PRODUCED_OUTPUT to get:
# - CMAKE_HIP_ARCHITECTURES_DEFAULT
# - CMAKE_HIP_HOST_IMPLICIT_LINK_DIRECTORIES
# - CMAKE_HIP_HOST_IMPLICIT_LINK_FRAMEWORK_DIRECTORIES
# - CMAKE_HIP_HOST_IMPLICIT_LINK_LIBRARIES
# - CMAKE_HIP_HOST_LINK_LAUNCHER
# - CMAKE_HIP_RUNTIME_LIBRARY_DEFAULT
# - CMAKE_HIP_CUDA_TOOLKIT_INCLUDE_DIRECTORIES
# Match arguments with cmake_nvcc_filter_implicit_info call in CMakeTestHIPCompiler.
cmake_nvcc_parse_implicit_info(HIP CMAKE_HIP_CUDA_)
include(Internal/CMakeCUDAFilterImplicitLibs)
# Filter out implicit link libraries that should not be passed unconditionally.
cmake_cuda_filter_implicit_libs(CMAKE_HIP_HOST_IMPLICIT_LINK_LIBRARIES)
endif()
if(CMAKE_HIP_COMPILER_SYSROOT)
string(CONCAT _SET_CMAKE_HIP_COMPILER_SYSROOT
"set(CMAKE_HIP_COMPILER_SYSROOT \"${CMAKE_HIP_COMPILER_SYSROOT}\")\n"
"set(CMAKE_COMPILER_SYSROOT \"${CMAKE_HIP_COMPILER_SYSROOT}\")")
else()
set(_SET_CMAKE_HIP_COMPILER_SYSROOT "")
endif()
if(CMAKE_HIP_COMPILER_ARCHITECTURE_ID)
set(_SET_CMAKE_HIP_COMPILER_ARCHITECTURE_ID
"set(CMAKE_HIP_COMPILER_ARCHITECTURE_ID ${CMAKE_HIP_COMPILER_ARCHITECTURE_ID})")
else()
set(_SET_CMAKE_HIP_COMPILER_ARCHITECTURE_ID "")
endif()
if(MSVC_HIP_ARCHITECTURE_ID)
set(SET_MSVC_HIP_ARCHITECTURE_ID
"set(MSVC_HIP_ARCHITECTURE_ID ${MSVC_HIP_ARCHITECTURE_ID})")
endif()
if(CMAKE_HIP_COMPILER_ID STREQUAL "NVIDIA")
if(NOT "$ENV{CUDAARCHS}" STREQUAL "")
set(CMAKE_HIP_ARCHITECTURES "$ENV{CUDAARCHS}" CACHE STRING "CUDA architectures")
endif()
# If the user did not set CMAKE_HIP_ARCHITECTURES, use the compiler's default.
if("${CMAKE_HIP_ARCHITECTURES}" STREQUAL "")
set(CMAKE_HIP_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES_DEFAULT}" CACHE STRING "HIP architectures" FORCE)
if(NOT CMAKE_HIP_ARCHITECTURES)
message(FATAL_ERROR "Failed to detect a default HIP architecture.\n\nCompiler output:\n${CMAKE_HIP_COMPILER_PRODUCED_OUTPUT}")
endif()
endif()
unset(CMAKE_HIP_ARCHITECTURES_DEFAULT)
elseif(NOT DEFINED CMAKE_HIP_ARCHITECTURES)
# Use 'rocm_agent_enumerator' to get the current GPU architecture.
set(_CMAKE_HIP_ARCHITECTURES)
find_program(_CMAKE_HIP_ROCM_AGENT_ENUMERATOR
NAMES rocm_agent_enumerator
HINTS "${CMAKE_HIP_COMPILER_ROCM_ROOT}/bin"
NO_CACHE)
if(_CMAKE_HIP_ROCM_AGENT_ENUMERATOR)
execute_process(COMMAND "${_CMAKE_HIP_ROCM_AGENT_ENUMERATOR}" -t GPU
RESULT_VARIABLE _CMAKE_ROCM_AGENT_ENUMERATOR_RESULT
OUTPUT_VARIABLE _CMAKE_ROCM_AGENT_ENUMERATOR_STDOUT
ERROR_VARIABLE _CMAKE_ROCM_AGENT_ENUMERATOR_STDERR
)
if(_CMAKE_ROCM_AGENT_ENUMERATOR_RESULT EQUAL 0)
separate_arguments(_hip_archs NATIVE_COMMAND "${_CMAKE_ROCM_AGENT_ENUMERATOR_STDOUT}")
foreach(_hip_arch ${_hip_archs})
if(_hip_arch STREQUAL "gfx000")
continue()
endif()
string(FIND ${_hip_arch} ":" pos)
if(NOT pos STREQUAL "-1")
string(SUBSTRING ${_hip_arch} 0 ${pos} _hip_arch)
endif()
list(APPEND _CMAKE_HIP_ARCHITECTURES "${_hip_arch}")
endforeach()
endif()
unset(_CMAKE_ROCM_AGENT_ENUMERATOR_RESULT)
unset(_CMAKE_ROCM_AGENT_ENUMERATOR_STDOUT)
unset(_CMAKE_ROCM_AGENT_ENUMERATOR_STDERR)
endif()
unset(_CMAKE_HIP_ROCM_AGENT_ENUMERATOR)
if(_CMAKE_HIP_ARCHITECTURES)
set(CMAKE_HIP_ARCHITECTURES "${_CMAKE_HIP_ARCHITECTURES}" CACHE STRING "HIP architectures")
elseif(CMAKE_HIP_COMPILER_PRODUCED_OUTPUT MATCHES " -target-cpu ([a-z0-9]+) ")
set(CMAKE_HIP_ARCHITECTURES "${CMAKE_MATCH_1}" CACHE STRING "HIP architectures")
else()
message(FATAL_ERROR "Failed to find a default HIP architecture.")
endif()
unset(_CMAKE_HIP_ARCHITECTURES)
endif()
# configure variables set in this file for fast reload later on
configure_file(${CMAKE_ROOT}/Modules/CMakeHIPCompiler.cmake.in
${CMAKE_PLATFORM_INFO_DIR}/CMakeHIPCompiler.cmake
@ONLY
)
set(CMAKE_HIP_COMPILER_ENV_VAR "HIPCXX")

File diff suppressed because one or more lines are too long


@ -0,0 +1 @@
Error: listen tcp 127.0.0.1:11434: bind: address already in use

docs/rocm-apu.md Normal file

@ -0,0 +1,92 @@
# Experimental ROCm iGPU Support
This branch adds a ROCm backend path geared toward AMD APUs that only expose a small VRAM aperture but share a large UMA pool with the CPU. The steps below outline how to reproduce the build and how to run Ollama with the staged ROCm runtime.
> **Warning**
> Upstream ROCm does not officially support these APUs yet. Expect driver updates, kernel parameters, or environment variables such as `HSA_OVERRIDE_GFX_VERSION` to change between releases.
## 1. Stage the ROCm runtime
We avoid touching the system installation by unpacking the required RPMs into `build/rocm-stage`.
```bash
mkdir -p build/rocm-stage build/rpm-tmp
cd build/rpm-tmp
dnf download \
hipblas hipblas-devel hipblas-common-devel \
rocblas rocblas-devel \
rocsolver rocsolver-devel \
rocm-hip-devel rocm-device-libs rocm-comgr rocm-comgr-devel
cd ../rocm-stage
for rpm in ../rpm-tmp/*.rpm; do
echo "extracting ${rpm}"
rpm2cpio "${rpm}" | bsdtar -xf -
done
```
Important staged paths after extraction:
| Purpose | Location |
| ------------------------ | ----------------------------------------------- |
| HIP/rocBLAS libraries | `build/rocm-stage/lib64` |
| Tensile kernels (rocBLAS)| `build/rocm-stage/lib64/rocblas/library` |
| Headers (`hip`, `rocblas`) | `build/rocm-stage/include` |
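A quick sanity check of the staged tree (a minimal sketch assuming the layout above; adjust the names to the RPM set you actually downloaded):
```bash
# Confirm the staged rocBLAS library, its Tensile kernels, and the headers are present.
ls build/rocm-stage/lib64/librocblas.so*
ls build/rocm-stage/lib64/rocblas/library | head -n 5
ls build/rocm-stage/include
```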
## 2. Build the ROCm backend
Configure CMake with the preset that targets ROCm 6.x, point `CMAKE_PREFIX_PATH` at the staged runtime, and use the system `hipcc` as the HIP compiler:
```bash
cmake --preset "ROCm 6" -B build/rocm \
-DGGML_VULKAN=OFF \
-DCMAKE_INSTALL_PREFIX=/usr/local \
-DCMAKE_HIP_COMPILER=/usr/bin/hipcc \
-DCMAKE_PREFIX_PATH="$PWD/build/rocm-stage"
cmake --build build/rocm --target ggml-hip -j$(nproc)
```
Artifacts land in `build/lib/ollama/rocm` (mirrored in `dist/lib/ollama/rocm` when packaging). These include `libggml-hip.so`, the CPU fallback variants, the Vulkan backend when it is built, and `librocsolver.so`.
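To confirm the backend was actually produced, listing the output directory is enough (paths as above):
```bash
# libggml-hip.so should appear alongside the other ggml backend libraries.
ls -l build/lib/ollama/rocm
```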
## 3. Run Ollama on ROCm
The runner needs to see both the GGML plugins and the staged ROCm runtime. The following environment block works for an AMD Radeon 760M with a UMA carve-out:
```bash
export BASE=$HOME/ollama-gpu
export OLLAMA_LIBRARY_PATH=$BASE/build/lib/ollama/rocm:$BASE/build/lib/ollama
export LD_LIBRARY_PATH=$OLLAMA_LIBRARY_PATH:$BASE/build/rocm-stage/lib64:${LD_LIBRARY_PATH:-}
export ROCBLAS_TENSILE_LIBPATH=$BASE/build/rocm-stage/lib64/rocblas/library
export ROCBLAS_TENSILE_PATH=$ROCBLAS_TENSILE_LIBPATH
export HSA_OVERRIDE_GFX_VERSION=11.0.0 # spoof gfx1100 for Phoenix
export GGML_HIP_FORCE_GTT=1 # force GTT allocations for UMA memory
export OLLAMA_GPU_DRIVER=rocm
export OLLAMA_GPU=100 # opt into GPU-only scheduling
export OLLAMA_LLM_LIBRARY=rocm # skip CUDA/Vulkan discovery noise
export OLLAMA_VULKAN=0 # optional: suppress Vulkan backend
$BASE/build/ollama serve
```
On launch you should see log lines similar to:
```
library=ROCm compute=gfx1100 name=ROCm0 description="AMD Radeon 760M Graphics"
ggml_hip_get_device_memory using GTT memory for 0000:0e:00.0 (total=16352354304 free=15034097664)
```
If the runner crashes before enumerating devices:
- Double-check that `ROCBLAS_TENSILE_LIBPATH` points to the staged `rocblas/library`.
- Ensure no other `LD_LIBRARY_PATH` entries override `libamdhip64.so`.
- Try unsetting `HSA_OVERRIDE_GFX_VERSION` to confirm whether the override is still needed with your kernel and driver versions.
> Example discovery + run log: [`docs/logs/rocm-760m-run.log`](logs/rocm-760m-run.log). The matching `curl` response is saved as [`docs/logs/rocm-760m-run-response.json`](logs/rocm-760m-run-response.json).
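To reproduce a response like the one linked above, a request along these lines works once `ollama serve` is up; the model name is only a placeholder for whatever model you have pulled:
```bash
# Smoke test against the default API port; replace llama3.2 with a locally available model.
curl -s http://127.0.0.1:11434/api/generate -d '{
  "model": "llama3.2",
  "prompt": "Confirm the Radeon 760M is handling this request.",
  "stream": false
}'
```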
## 4. Sharing this build
- Keep the staged RPMs alongside the branch so others can reproduce the exact runtime.
- Include `/tmp/ollama_rocm_run.log` or similar discovery logs in issues/PRs to help maintainers understand the UMA setup.
- Mention any kernel parameters (e.g., large UMA buffer in firmware) when opening upstream tickets.


@ -360,6 +360,36 @@ func StartRunner(ollamaEngine bool, modelPath string, gpuLibs []string, out io.W
// Note: we always put our dependency paths first
// since these are the exact version we compiled/linked against
userLibs := []string{}
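// Honor user-supplied search paths from OLLAMA_LIBRARY_PATH (the experimental
// ROCm iGPU setup uses this to point the runner at a staged runtime).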
if existing, ok := os.LookupEnv("OLLAMA_LIBRARY_PATH"); ok && existing != "" {
userLibs = filepath.SplitList(existing)
}
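// Merge the user paths ahead of the bundled GPU library paths, dropping
// empty entries and duplicates while preserving order.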
if len(userLibs) != 0 {
seen := make(map[string]struct{}, len(userLibs)+len(gpuLibs))
merged := make([]string, 0, len(userLibs)+len(gpuLibs))
for _, dir := range userLibs {
if dir == "" {
continue
}
if _, ok := seen[dir]; ok {
continue
}
seen[dir] = struct{}{}
merged = append(merged, dir)
}
for _, dir := range gpuLibs {
if dir == "" {
continue
}
if _, ok := seen[dir]; ok {
continue
}
seen[dir] = struct{}{}
merged = append(merged, dir)
}
gpuLibs = merged
}
libraryPaths := append([]string{}, gpuLibs...)
if libraryPath, ok := os.LookupEnv(pathEnv); ok {
libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)


@ -1,5 +1,107 @@
#pragma once
#include <cmath>
#include <type_traits>
#if defined(__HIP_PLATFORM_AMD__)
#include <hip/amd_detail/device_library_decls.h>
#include <hip/amd_detail/math_fwd.h>
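// Host/device wrappers: on the device compilation pass these call the OCML
// intrinsics from the ROCm device library, on the host they fall back to the
// C++ standard library so the header compiles in both contexts.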
static __host__ __device__ inline float ggml_hip_max_f32(float a, float b) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_fmax_f32(a, b);
#else
return a > b ? a : b;
#endif
}
static __host__ __device__ inline double ggml_hip_max_f64(double a, double b) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_fmax_f64(a, b);
#else
return a > b ? a : b;
#endif
}
static __host__ __device__ inline float ggml_hip_min_f32(float a, float b) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_fmin_f32(a, b);
#else
return a < b ? a : b;
#endif
}
static __host__ __device__ inline double ggml_hip_min_f64(double a, double b) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_fmin_f64(a, b);
#else
return a < b ? a : b;
#endif
}
static __host__ __device__ inline float ggml_hip_pow_f32(float base, float exp) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_pow_f32(base, exp);
#else
return std::pow(base, exp);
#endif
}
static __host__ __device__ inline double ggml_hip_pow_f64(double base, double exp) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_pow_f64(base, exp);
#else
return std::pow(base, exp);
#endif
}
template <typename A, typename B,
typename std::enable_if<std::is_integral<A>::value && std::is_integral<B>::value, int>::type = 0>
__host__ __device__ inline typename std::common_type<A, B>::type ggml_hip_int_max(A a, B b) {
using R = typename std::common_type<A, B>::type;
const R aa = static_cast<R>(a);
const R bb = static_cast<R>(b);
return aa > bb ? aa : bb;
}
template <typename A, typename B,
typename std::enable_if<std::is_integral<A>::value && std::is_integral<B>::value, int>::type = 0>
__host__ __device__ inline typename std::common_type<A, B>::type ggml_hip_int_min(A a, B b) {
using R = typename std::common_type<A, B>::type;
const R aa = static_cast<R>(a);
const R bb = static_cast<R>(b);
return aa < bb ? aa : bb;
}
__host__ __device__ inline float max(float a, float b) {
return ggml_hip_max_f32(a, b);
}
__host__ __device__ inline double max(double a, double b) {
return ggml_hip_max_f64(a, b);
}
template <typename A, typename B,
typename std::enable_if<std::is_integral<A>::value && std::is_integral<B>::value, int>::type = 0>
__host__ __device__ inline typename std::common_type<A, B>::type max(A a, B b) {
return ggml_hip_int_max(a, b);
}
__host__ __device__ inline float min(float a, float b) {
return ggml_hip_min_f32(a, b);
}
__host__ __device__ inline double min(double a, double b) {
return ggml_hip_min_f64(a, b);
}
template <typename A, typename B,
typename std::enable_if<std::is_integral<A>::value && std::is_integral<B>::value, int>::type = 0>
__host__ __device__ inline typename std::common_type<A, B>::type min(A a, B b) {
return ggml_hip_int_min(a, b);
}
#endif // defined(__HIP_PLATFORM_AMD__)
#define HIP_DISABLE_WARP_SYNC_BUILTINS 1
#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>
@ -8,6 +110,232 @@
// for rocblas_initialize()
#include "rocblas/rocblas.h"
#if defined(__HIP_PLATFORM_AMD__)
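// Redirect the single-precision libm-style calls (fmaxf, expf, logf, ...) to
// host/device wrappers: OCML intrinsics when compiling for the device, the
// C++ standard library when compiling for the host.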
#undef fmaxf
#define fmaxf(a, b) ggml_hip_max_f32((a), (b))
#undef fminf
#define fminf(a, b) ggml_hip_min_f32((a), (b))
#undef powf
#define powf(a, b) ggml_hip_pow_f32((a), (b))
static __host__ __device__ inline float ggml_hip_expf(float x) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_exp_f32(x);
#else
using std::exp;
return static_cast<float>(exp(x));
#endif
}
#undef expf
#define expf(x) ggml_hip_expf((x))
static __host__ __device__ inline float ggml_hip_expm1f(float x) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_expm1_f32(x);
#else
using std::expm1;
return static_cast<float>(expm1(x));
#endif
}
#undef expm1f
#define expm1f(x) ggml_hip_expm1f((x))
static __host__ __device__ inline float ggml_hip_logf(float x) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_log_f32(x);
#else
using std::log;
return static_cast<float>(log(x));
#endif
}
#undef logf
#define logf(x) ggml_hip_logf((x))
static __host__ __device__ inline float ggml_hip_log1pf(float x) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_log1p_f32(x);
#else
using std::log1p;
return static_cast<float>(log1p(x));
#endif
}
#undef log1pf
#define log1pf(x) ggml_hip_log1pf((x))
static __host__ __device__ inline float ggml_hip_log2f(float x) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_log2_f32(x);
#else
using std::log2;
return static_cast<float>(log2(x));
#endif
}
#undef log2f
#define log2f(x) ggml_hip_log2f((x))
static __host__ __device__ inline float ggml_hip_tanhf(float x) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_tanh_f32(x);
#else
using std::tanh;
return static_cast<float>(tanh(x));
#endif
}
#undef tanhf
#define tanhf(x) ggml_hip_tanhf((x))
static __host__ __device__ inline float ggml_hip_sinf(float x) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_sin_f32(x);
#else
using std::sin;
return static_cast<float>(sin(x));
#endif
}
#undef sinf
#define sinf(x) ggml_hip_sinf((x))
static __host__ __device__ inline float ggml_hip_cosf(float x) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_cos_f32(x);
#else
using std::cos;
return static_cast<float>(cos(x));
#endif
}
#undef cosf
#define cosf(x) ggml_hip_cosf((x))
static __host__ __device__ inline float ggml_hip_erff(float x) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_erf_f32(x);
#else
using std::erf;
return static_cast<float>(erf(x));
#endif
}
#undef erff
#define erff(x) ggml_hip_erff((x))
static __host__ __device__ inline float ggml_hip_fabsf(float x) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_fabs_f32(x);
#else
using std::fabs;
return static_cast<float>(fabs(x));
#endif
}
#undef fabsf
#define fabsf(x) ggml_hip_fabsf((x))
static __host__ __device__ inline float ggml_hip_floorf(float x) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_floor_f32(x);
#else
using std::floor;
return static_cast<float>(floor(x));
#endif
}
#undef floorf
#define floorf(x) ggml_hip_floorf((x))
static __host__ __device__ inline float ggml_hip_ceilf(float x) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_ceil_f32(x);
#else
using std::ceil;
return static_cast<float>(ceil(x));
#endif
}
#undef ceilf
#define ceilf(x) ggml_hip_ceilf((x))
static __host__ __device__ inline float ggml_hip_roundf(float x) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_round_f32(x);
#else
using std::round;
return static_cast<float>(round(x));
#endif
}
#undef roundf
#define roundf(x) ggml_hip_roundf((x))
static __host__ __device__ inline float ggml_hip_round_scalar(float x) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_round_f32(x);
#else
using std::round;
return static_cast<float>(round(x));
#endif
}
static __host__ __device__ inline double ggml_hip_round_scalar(double x) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_round_f64(x);
#else
using std::round;
return round(x);
#endif
}
#undef round
#define round(x) ggml_hip_round_scalar((x))
static __host__ __device__ inline float ggml_hip_sqrtf(float x) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_sqrt_f32(x);
#else
using std::sqrt;
return static_cast<float>(sqrt(x));
#endif
}
#undef sqrtf
#define sqrtf(x) ggml_hip_sqrtf((x))
static __host__ __device__ inline float ggml_hip_rsqrtf(float x) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_rsqrt_f32(x);
#else
using std::sqrt;
return 1.0f / static_cast<float>(sqrt(x));
#endif
}
#undef rsqrtf
#define rsqrtf(x) ggml_hip_rsqrtf((x))
static __host__ __device__ inline float ggml_hip_trunc_scalar(float x) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_trunc_f32(x);
#else
using std::trunc;
return static_cast<float>(trunc(x));
#endif
}
static __host__ __device__ inline double ggml_hip_trunc_scalar(double x) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_trunc_f64(x);
#else
using std::trunc;
return trunc(x);
#endif
}
#undef trunc
#define trunc(x) ggml_hip_trunc_scalar((x))
static __host__ __device__ inline int ggml_hip_isinf(float x) {
#if defined(__HIP_DEVICE_COMPILE__)
return __ocml_isinf_f32(x);
#else
using std::isinf;
return static_cast<int>(isinf(x));
#endif
}
#undef isinf
#define isinf(x) ggml_hip_isinf((x))
#endif
#if defined(GGML_HIP_ROCWMMA_FATTN)
#include <rocwmma/rocwmma-version.hpp>
#endif // defined(GGML_HIP_ROCWMMA_FATTN)


@ -442,6 +442,8 @@ int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total) {
#include <string>
#include <vector>
#include <filesystem>
#include <cstdlib>
#include <strings.h>
#include <sys/stat.h>
#include <dirent.h>
@ -449,6 +451,60 @@ int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total) {
#include <glob.h>
namespace fs = std::filesystem;
namespace {
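// A set environment variable counts as enabled unless its value is "0",
// "false", or "off" (case-insensitive); an unset variable is disabled.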
static bool ggml_env_flag_enabled(const char *name) {
const char *value = getenv(name);
if (value == nullptr) {
return false;
}
if (*value == '\0') {
return true;
}
if ((value[0] == '0' && value[1] == '\0') ||
strcasecmp(value, "false") == 0 ||
strcasecmp(value, "off") == 0) {
return false;
}
return true;
}
static bool ggml_read_sysfs_value(const std::string &path, uint64_t &value) {
std::ifstream stream(path.c_str());
if (!stream.is_open()) {
GGML_LOG_DEBUG("%s unable to open sysfs node %s\n", __func__, path.c_str());
return false;
}
stream >> value;
if (stream.fail()) {
GGML_LOG_DEBUG("%s unable to parse sysfs node %s\n", __func__, path.c_str());
return false;
}
return true;
}
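// Decide whether to report GTT (shared system memory) instead of the VRAM
// aperture. GGML_HIP_DISABLE_GTT / GGML_HIP_FORCE_GTT override the heuristic;
// otherwise GTT is used when no dedicated VRAM is reported, or when the
// carve-out is at most 1 GiB and the GTT pool is larger (UMA-only APUs).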
static bool ggml_should_use_gtt(uint64_t vram_total, uint64_t gtt_total) {
if (ggml_env_flag_enabled("GGML_HIP_DISABLE_GTT")) {
return false;
}
if (ggml_env_flag_enabled("GGML_HIP_FORCE_GTT")) {
return gtt_total > 0;
}
if (gtt_total == 0) {
return false;
}
const uint64_t umaThreshold = 1024ull * 1024ull * 1024ull; // 1 GiB
if (vram_total == 0) {
return true;
}
return vram_total <= umaThreshold && gtt_total > vram_total;
}
} // namespace
extern "C" {
int ggml_hip_mgmt_init() {
@ -461,6 +517,8 @@ int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total) {
const std::string drmTotalMemoryFile = "mem_info_vram_total";
const std::string drmUsedMemoryFile = "mem_info_vram_used";
const std::string drmUeventPCISlotLabel = "PCI_SLOT_NAME=";
const std::string drmGttTotalFile = "mem_info_gtt_total";
const std::string drmGttUsedFile = "mem_info_gtt_used";
glob_t glob_result;
glob(drmDeviceGlob.c_str(), GLOB_NOSORT, NULL, &glob_result);
@ -495,7 +553,6 @@ int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total) {
uint64_t memory;
totalFileStream >> memory;
*total = memory;
std::string usedFile = dir + "/" + drmUsedMemoryFile;
std::ifstream usedFileStream(usedFile.c_str());
@ -508,7 +565,30 @@ int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total) {
uint64_t memoryUsed;
usedFileStream >> memoryUsed;
*free = memory - memoryUsed;
uint64_t gttTotal = 0;
uint64_t gttUsed = 0;
bool hasGttTotal = ggml_read_sysfs_value(dir + "/" + drmGttTotalFile, gttTotal);
bool hasGttUsed = ggml_read_sysfs_value(dir + "/" + drmGttUsedFile, gttUsed);
bool useGtt = ggml_should_use_gtt(memory, hasGttTotal ? gttTotal : 0);
if (useGtt && hasGttTotal) {
uint64_t freeGtt = gttTotal;
if (hasGttUsed && gttTotal > gttUsed) {
freeGtt = gttTotal - gttUsed;
}
*total = gttTotal;
*free = freeGtt;
GGML_LOG_INFO("%s using GTT memory for %s (total=%zu free=%zu)\n",
__func__, id, *total, *free);
} else {
*total = memory;
if (memory > memoryUsed) {
*free = memory - memoryUsed;
} else {
*free = 0;
}
}
file.close();
globfree(&glob_result);
@ -526,4 +606,4 @@ int ggml_hip_get_device_memory(const char *id, size_t *free, size_t *total) {
} // extern "C"
#endif // #ifdef _WIN32
#endif // #ifdef _WIN32


@ -17,6 +17,7 @@ import (
"strings"
"time"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/logutil"
)
@ -538,6 +539,9 @@ func GetVisibleDevicesEnv(l []DeviceInfo, mustFilter bool) map[string]string {
func (d DeviceInfo) NeedsInitValidation() bool {
// ROCm: rocblas will crash on unsupported devices.
// CUDA: verify CC is supported by the version of the library
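// An explicit HSA_OVERRIDE_GFX_VERSION means the user is overriding device
// detection, so skip init validation for ROCm devices in that case.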
if d.Library == "ROCm" && envconfig.HsaOverrideGfxVersion() != "" {
return false
}
return d.Library == "ROCm" || d.Library == "CUDA"
}