mirror of https://github.com/ollama/ollama
ggml: Backport scale kernel fixes
The GGML scale kernel uses signed 32-bit ints to represent the number of elements in the tensor. For large images, mistral-small3.2 overflows this, triggering CUDA errors due to negative arguments.

Currently, this can happen when the user passes a large image to mistral-small3.2. However, with upcoming changes to reserve CUDA memory, it happens every time mistral-small is loaded, as we reserve using a worst-case batch.

This patch is part of an upstream GGML commit and should be removed after GGML is updated past 0a1b398 "ggml: add ops for WAN video model (cuda && cpu) (#15669)".

Fixes #10388
This commit is contained in:
parent 734b57da0e
commit efaee8c2d6
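For context, the failure mode is simply that the element count no longer fits in a signed 32-bit int. A minimal sketch (the element count below is hypothetical, not taken from the model) of how a large tensor's size wraps negative when narrowed to int:

// Hypothetical illustration, not part of the patch: an element count above
// INT_MAX wraps to a negative value when narrowed to a 32-bit int, which is
// the "negative arguments" condition described in the commit message.
#include <cstdint>
#include <cstdio>

int main() {
    int64_t nelements = 3LL * 1024 * 1024 * 1024;  // hypothetical worst-case batch, > INT_MAX
    int k = (int)nelements;                        // old kernel/launcher signature took an int
    // k is now negative, so a block count derived from it is negative too,
    // which is what surfaces as the CUDA errors described above.
    printf("nelements = %lld, as int k = %d\n", (long long)nelements, k);
    return 0;
}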
@@ -0,0 +1,57 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@ollama.com>
Date: Tue, 23 Sep 2025 15:41:58 -0700
Subject: [PATCH] ggml: Backport scale kernel fixes

The GGML scale kernel uses signed 32-bit ints to represent
the number of elements in the tensor. For large images,
mistral-small3.2 overflows this, triggering CUDA errors due
to negative arguments.

Currently, this can happen when the user passes a large image
to mistral-small3.2. However, with upcoming changes to reserve
CUDA memory, it happens every time mistral-small is loaded as
we reserve using a worst case batch.

This patch is part of an upstream GGML commit and should be removed
after GGML is updated past 0a1b398 "ggml: add ops for WAN video model
(cuda && cpu) (#15669)".

Fixes #10388
---
 ggml/src/ggml-cuda/scale.cu | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/ggml/src/ggml-cuda/scale.cu b/ggml/src/ggml-cuda/scale.cu
index 2ee9e5889..0ddeff6a1 100644
--- a/ggml/src/ggml-cuda/scale.cu
+++ b/ggml/src/ggml-cuda/scale.cu
@@ -1,18 +1,19 @@
 #include "scale.cuh"

-static __global__ void scale_f32(const float * x, float * dst, const float scale, const float bias, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+#define MAX_GRIDDIM_X 0x7FFFFFFF

-    if (i >= k) {
-        return;
-    }
+static __global__ void scale_f32(const float * x, float * dst, const float scale, const float bias, const int64_t nelements) {
+    int64_t tid = (int64_t)blockIdx.x * (int64_t)blockDim.x + (int64_t)threadIdx.x;
+    int64_t stride = (int64_t)blockDim.x * (int64_t)gridDim.x;

-    dst[i] = scale * x[i] + bias;
+    for (int64_t i = tid; i < nelements; i += stride) {
+        dst[i] = scale * x[i] + bias;
+    }
 }

-static void scale_f32_cuda(const float * x, float * dst, const float scale, const float bias, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
-    scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, bias, k);
+static void scale_f32_cuda(const float * x, float * dst, const float scale, const float bias, const int64_t nelements, cudaStream_t stream) {
+    const int64_t num_blocks = (nelements + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
+    scale_f32<<<MIN(MAX_GRIDDIM_X, num_blocks), CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, bias, nelements);
 }

 void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
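For reference, a self-contained sketch of the same grid-stride idiom the backported kernel uses (a hypothetical standalone file, with BLOCK_SIZE standing in for CUDA_SCALE_BLOCK_SIZE): thread id and stride are computed in 64 bits, and the launch clamps the block count to the gridDim.x limit so very large tensors are covered by the loop rather than by extra blocks.

// grid_stride_scale_sketch.cu -- hypothetical standalone example, not part of ollama or ggml.
#include <cuda_runtime.h>
#include <algorithm>
#include <cstdint>
#include <cstdio>

#define BLOCK_SIZE    256          // stand-in for CUDA_SCALE_BLOCK_SIZE
#define MAX_GRIDDIM_X 0x7FFFFFFF   // upper bound on gridDim.x

__global__ void scale_f32(const float * x, float * dst, float scale, float bias, int64_t nelements) {
    // 64-bit global thread id and stride so indexing never wraps for huge tensors.
    int64_t tid    = (int64_t)blockIdx.x * blockDim.x + threadIdx.x;
    int64_t stride = (int64_t)blockDim.x * gridDim.x;
    for (int64_t i = tid; i < nelements; i += stride) {
        dst[i] = scale * x[i] + bias;
    }
}

int main() {
    const int64_t n = 1 << 20;     // small size, just to exercise the launch path
    float *x, *dst;
    cudaMallocManaged(&x, n * sizeof(float));
    cudaMallocManaged(&dst, n * sizeof(float));
    for (int64_t i = 0; i < n; ++i) x[i] = 1.0f;

    // Clamp the block count to the gridDim.x limit; the kernel's stride loop
    // picks up whatever a single grid pass cannot reach.
    int64_t num_blocks = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
    int grid = (int)std::min<int64_t>(MAX_GRIDDIM_X, num_blocks);
    scale_f32<<<grid, BLOCK_SIZE>>>(x, dst, 2.0f, 0.5f, n);
    cudaDeviceSynchronize();

    printf("dst[0] = %f\n", dst[0]);   // expect 2.5
    cudaFree(x);
    cudaFree(dst);
    return 0;
}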