From ba463fb577c3f5801cf4e3b545a1d4cb1a1cf64a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= <sigbjorn.skjaeret@scala.com>
Date: Tue, 9 Dec 2025 12:23:47 +0100
Subject: [PATCH] ggml : allow fill node alloc inplace (llama/17870)

---
 ggml/src/ggml-alloc.c      | 1 +
 ggml/src/ggml-cuda/fill.cu | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)
diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
index 218222ec..a5995fdc 100644
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -25,6 +25,7 @@ static bool ggml_is_view(const struct ggml_tensor * t) {
 // ops that return true for this function must not use restrict pointers for their backend implementations
 bool ggml_op_can_inplace(enum ggml_op op) {
     switch (op) {
+        case GGML_OP_FILL:
         case GGML_OP_SCALE:
         case GGML_OP_DIAG_MASK_ZERO:
         case GGML_OP_DIAG_MASK_INF:
diff --git a/ggml/src/ggml-cuda/fill.cu b/ggml/src/ggml-cuda/fill.cu
index eb8ccb78..739062c4 100644
--- a/ggml/src/ggml-cuda/fill.cu
+++ b/ggml/src/ggml-cuda/fill.cu
@@ -4,7 +4,7 @@
 #define CUDA_FILL_BLOCK_SIZE 256
 
 template <typename T>
-static __global__ void fill_kernel(T * __restrict__ dst, const int64_t k, const T value) {
+static __global__ void fill_kernel(T * dst, const int64_t k, const T value) {
     const int64_t i = (int64_t)blockDim.x * blockIdx.x + threadIdx.x;
     if (i >= k) {
         return;