mirror of https://github.com/ollama/ollama
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jeff Bolz <jbolz@nvidia.com>
Date: Wed, 29 Oct 2025 15:13:10 -0500
Subject: [PATCH] vulkan: Fuse rope+set_rows (#16769)

This pattern appears in a lot of models: the rope operation is applied right
before storing into the KV cache (usually on the K tensor).
Add a path to some of the rope shaders that computes the destination address
based on the set_rows tensor. Compile variants of the shader with D_TYPE of
f16 (the usual KV cache type).
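For illustration, a minimal standalone sketch in plain C++ (not the actual GLSL; the buffer
names and sizes here are made up) of the addressing this describes: the fused shader takes
the destination row index supplied by set_rows and multiplies it by the KV cache row stride
(set_rows_stride), rather than storing at the rope output's own offset.

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
        const uint32_t ne0             = 8;    // elements per row produced by rope
        const uint32_t ne1             = 4;    // rows produced this pass
        const uint32_t set_rows_stride = 32;   // destination row stride, in elements

        std::vector<int64_t>  row_ids = {5, 9, 2, 7};        // row indices from set_rows (src3)
        std::vector<float>    roped(ne1 * ne0, 1.0f);        // rope results, f32
        std::vector<uint16_t> kv(64 * set_rows_stride, 0);   // stand-in for the f16 KV cache

        for (uint32_t r = 0; r < ne1; ++r) {
            const uint64_t dst_base = uint64_t(row_ids[r]) * set_rows_stride;
            for (uint32_t c = 0; c < ne0; ++c) {
                // the real shader converts f32 -> f16 here (D_TYPE = float16_t);
                // a plain integer cast stands in for that conversion
                kv[dst_base + c] = (uint16_t) roped[r * ne0 + c];
            }
        }
        std::printf("row 0 lands at element offset %llu\n",
                    (unsigned long long) (row_ids[0] * (uint64_t) set_rows_stride));
        return 0;
    }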
Add a src3 operand to ggml_vk_op_f32 - sometimes rope uses three srcs and needs
the fourth for the row indices.
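A call-shape sketch only, with hypothetical stand-in types (the real helper takes the backend
context and many more parameters): existing callers pass nullptr for the new src3, while the
fused rope path passes the set_rows row-index tensor.

    #include <cstdio>

    struct tensor { const char * name; };

    // Hypothetical stand-in for the extended helper.
    static void op_f32(const tensor * src0, const tensor * src1,
                       const tensor * src2, const tensor * src3, const tensor * dst) {
        std::printf("op(%s, %s, %s, %s) -> %s\n",
                    src0 ? src0->name : "-", src1 ? src1->name : "-",
                    src2 ? src2->name : "-", src3 ? src3->name : "-",
                    dst->name);
    }

    int main() {
        tensor k{"K"}, pos{"pos"}, freqs{"rope_freqs"}, rows{"row_ids"}, cache{"kv_cache"};
        op_f32(&k, &pos, &freqs, nullptr, &k);      // plain rope: no row indices
        op_f32(&k, &pos, &freqs, &rows,  &cache);   // fused rope+set_rows: src3 = row indices
        return 0;
    }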
Add fused_ops_write_mask to indicate which intermediate tensors need to write
their results to memory. Skipping writing the roped K value helps to allow more
nodes to run concurrently.
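A tiny self-contained C++ sketch of how such a bitmask can gate stores across a fused run
(the operator names and the mask value are illustrative, not taken from the backend):

    #include <cstdio>

    int main() {
        // Fused run: nodes[start + 0..2] = ROPE, VIEW, SET_ROWS.
        // Only SET_ROWS must land in memory; the roped K tensor and the view
        // stay "virtual", so other nodes can overlap with them.
        const int num_fused            = 3;
        const int fused_ops_write_mask = 1 << 2;

        const char * names[] = { "ROPE", "VIEW", "SET_ROWS" };
        for (int i = 0; i < num_fused; ++i) {
            const bool writes = (fused_ops_write_mask >> i) & 1;
            std::printf("%-8s writes to memory: %s\n", names[i], writes ? "yes" : "no");
        }
        return 0;
    }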
Add logic to ggml_vk_graph_optimize to make ROPE+VIEW+SET_ROWS consecutive. It
rarely starts out that way in the graph.
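The matching condition is visible below as the rope_view_set_rows_edges table; here is a
self-contained C++ sketch of how an edge list of {node, src, other} triples checks that three
consecutive nodes really form ROPE -> VIEW -> SET_ROWS (the reordering itself, which makes
them consecutive, is not shown):

    #include <array>
    #include <cstdio>

    struct node { const node * src[4] = {}; };

    // Each edge {a, s, b} requires nodes[a]->src[s] == nodes[b] within the run.
    static bool matches_rope_view_set_rows(const node * const nodes[3]) {
        static constexpr std::array<std::array<int, 3>, 2> edges {{
            {{ 1, 0, 0 }},  // view->src[0]     == rope
            {{ 2, 0, 1 }},  // set_rows->src[0] == view
        }};
        for (const auto & e : edges) {
            if (nodes[e[0]]->src[e[1]] != nodes[e[2]]) {
                return false;
            }
        }
        return true;
    }

    int main() {
        node rope, view, set_rows;
        view.src[0]     = &rope;
        set_rows.src[0] = &view;
        const node * run[3] = { &rope, &view, &set_rows };
        std::printf("fusable: %s\n", matches_rope_view_set_rows(run) ? "yes" : "no");
        return 0;
    }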
Add new backend tests.
---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp          | 334 +++++++++++++-----
 .../ggml-vulkan/vulkan-shaders/rope_head.glsl |   2 +
 .../ggml-vulkan/vulkan-shaders/rope_neox.comp |  13 +-
 .../ggml-vulkan/vulkan-shaders/rope_norm.comp |  13 +-
 .../vulkan-shaders/vulkan-shaders-gen.cpp     |   4 +
 tests/test-backend-ops.cpp                    | 122 +++++--
 6 files changed, 371 insertions(+), 117 deletions(-)
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index b2855b078..aaf4334b5 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -458,6 +458,11 @@ static topk_moe_mode ggml_vk_num_additional_ops_to_topk_moe_mode(uint32_t num) {
     return mode;
 }
 
+static constexpr std::initializer_list<std::array<int, 3>> rope_view_set_rows_edges {
+    { 1, 0, 0 }, // view->src[0] == rope
+    { 2, 0, 1 }, // set_rows->src[0] == view
+};
+
 struct vk_device_struct {
     std::recursive_mutex mutex;
 
@@ -640,8 +645,8 @@ struct vk_device_struct {
     vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16;
     vk_pipeline pipeline_soft_max_f32_wg512, pipeline_soft_max_f32_f16_wg512;
     vk_pipeline pipeline_soft_max_back_f32;
-    vk_pipeline pipeline_rope_norm_f32, pipeline_rope_norm_f16;
-    vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16;
+    vk_pipeline pipeline_rope_norm_f32, pipeline_rope_norm_f16, pipeline_rope_norm_f32_f16;
+    vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16, pipeline_rope_neox_f32_f16;
     vk_pipeline pipeline_rope_multi_f32, pipeline_rope_multi_f16;
     vk_pipeline pipeline_rope_vision_f32, pipeline_rope_vision_f16;
     vk_pipeline pipeline_argsort_f32[num_argsort_pipelines];
@@ -1054,6 +1059,7 @@ struct vk_op_rope_push_constants {
     uint32_t s2;
     int32_t sections[4];
     uint32_t is_back;
+    uint32_t set_rows_stride;
 };
 
 struct vk_op_soft_max_push_constants {
@@ -1563,6 +1569,10 @@ struct ggml_backend_vk_context {
     // number of additional consecutive nodes that are being fused with the
     // node currently being processed
     int num_additional_fused_ops {};
+    // Bitmask of which fused ops need to write an intermediate value to memory.
+    // Bit 'i' means nodes[start_of_fusion + i] writes to memory.
+    // If there's no fusion, bit 0 is still set.
+    int fused_ops_write_mask {};
 };
 
 static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT
@@ -3697,21 +3707,27 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_f16_wg512, "soft_max_f32_f16_wg512", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 512 }, 1);
|
|
ggml_vk_create_pipeline(device, device->pipeline_soft_max_back_f32, "soft_max_back_f32", soft_max_back_f32_len, soft_max_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1, true);
|
|
|
|
- ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f32, "rope_norm_f32", rope_norm_f32_len, rope_norm_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
|
- ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
|
- ggml_vk_create_pipeline(device, device->pipeline_rope_multi_f32, "rope_multi_f32", rope_multi_f32_len, rope_multi_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
|
- ggml_vk_create_pipeline(device, device->pipeline_rope_vision_f32, "rope_vision_f32", rope_vision_f32_len, rope_vision_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
|
+ ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f32, "rope_norm_f32", rope_norm_f32_len, rope_norm_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
|
+ ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
|
+ ggml_vk_create_pipeline(device, device->pipeline_rope_multi_f32, "rope_multi_f32", rope_multi_f32_len, rope_multi_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
|
+ ggml_vk_create_pipeline(device, device->pipeline_rope_vision_f32, "rope_vision_f32", rope_vision_f32_len, rope_vision_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
|
|
|
if (device->float_controls_rte_fp16) {
|
|
- ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f16, "rope_norm_f16", rope_norm_f16_rte_len, rope_norm_f16_rte_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
|
- ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_rte_len, rope_neox_f16_rte_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
|
- ggml_vk_create_pipeline(device, device->pipeline_rope_multi_f16, "rope_multi_f16", rope_multi_f16_rte_len, rope_multi_f16_rte_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
|
- ggml_vk_create_pipeline(device, device->pipeline_rope_vision_f16, "rope_vision_f16", rope_vision_f16_rte_len, rope_vision_f16_rte_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
|
+ ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f16, "rope_norm_f16", rope_norm_f16_rte_len, rope_norm_f16_rte_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
|
+ ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_rte_len, rope_neox_f16_rte_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
|
+ ggml_vk_create_pipeline(device, device->pipeline_rope_multi_f16, "rope_multi_f16", rope_multi_f16_rte_len, rope_multi_f16_rte_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
|
+ ggml_vk_create_pipeline(device, device->pipeline_rope_vision_f16, "rope_vision_f16", rope_vision_f16_rte_len, rope_vision_f16_rte_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
|
+
|
|
+ ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f32_f16, "rope_norm_f32_f16", rope_norm_f32_f16_rte_len, rope_norm_f32_f16_rte_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
|
+ ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f32_f16, "rope_neox_f32_f16", rope_neox_f32_f16_rte_len, rope_neox_f32_f16_rte_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
|
} else {
|
|
- ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f16, "rope_norm_f16", rope_norm_f16_len, rope_norm_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
|
- ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
|
- ggml_vk_create_pipeline(device, device->pipeline_rope_multi_f16, "rope_multi_f16", rope_multi_f16_len, rope_multi_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
|
- ggml_vk_create_pipeline(device, device->pipeline_rope_vision_f16, "rope_vision_f16", rope_vision_f16_len, rope_vision_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
|
+ ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f16, "rope_norm_f16", rope_norm_f16_len, rope_norm_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
|
+ ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
|
+ ggml_vk_create_pipeline(device, device->pipeline_rope_multi_f16, "rope_multi_f16", rope_multi_f16_len, rope_multi_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
|
+ ggml_vk_create_pipeline(device, device->pipeline_rope_vision_f16, "rope_vision_f16", rope_vision_f16_len, rope_vision_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
|
+
|
|
+ ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f32_f16, "rope_norm_f32_f16", rope_norm_f32_f16_len, rope_norm_f32_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
|
+ ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f32_f16, "rope_neox_f32_f16", rope_neox_f32_f16_len, rope_neox_f32_f16_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
|
|
}
|
|
|
|
for (uint32_t i = 0; i < num_argsort_pipelines; ++i) {
|
|
@@ -8170,7 +8186,8 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
     case GGML_OP_ROPE:
     case GGML_OP_ROPE_BACK:
         {
-            const int mode = ((const int32_t *) dst->op_params)[2];
+            const ggml_tensor *rope = ctx->num_additional_fused_ops == 2 ? dst->src[0]->src[0] : dst;
+            const int mode = ((const int32_t *) rope->op_params)[2];
             const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
             const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
             const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
@@ -8179,6 +8196,9 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
                 if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
                     return ctx->device->pipeline_rope_neox_f32;
                 }
+                if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) {
+                    return ctx->device->pipeline_rope_neox_f32_f16;
+                }
                 if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
                     return ctx->device->pipeline_rope_neox_f16;
                 }
@@ -8200,6 +8220,9 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
                 if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
                     return ctx->device->pipeline_rope_norm_f32;
                 }
+                if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) {
+                    return ctx->device->pipeline_rope_norm_f32_f16;
+                }
                 if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
                     return ctx->device->pipeline_rope_norm_f16;
                 }
@@ -8409,20 +8432,22 @@ static uint32_t get_misalign_bytes(ggml_backend_vk_context * ctx, const ggml_ten
|
|
return ((vk_tensor_offset(t) + t->view_offs) & (ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1));;
|
|
}
|
|
|
|
-template <typename T> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, T &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
|
|
+template <typename T> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, T &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) {
|
|
GGML_UNUSED(p);
|
|
GGML_UNUSED(src0);
|
|
GGML_UNUSED(src1);
|
|
GGML_UNUSED(src2);
|
|
+ GGML_UNUSED(src3);
|
|
GGML_UNUSED(dst);
|
|
static_assert(!std::is_const<T>::value, "unexpected type");
|
|
GGML_ASSERT(!src0 || get_misalign_bytes(ctx, src0) == 0);
|
|
GGML_ASSERT(!src1 || get_misalign_bytes(ctx, src1) == 0);
|
|
GGML_ASSERT(!src2 || get_misalign_bytes(ctx, src2) == 0);
|
|
+ GGML_ASSERT(!src3 || get_misalign_bytes(ctx, src3) == 0);
|
|
GGML_ASSERT(!dst || get_misalign_bytes(ctx, dst) == 0);
|
|
}
|
|
|
|
-template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_unary_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
|
|
+template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_unary_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) {
|
|
const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
|
|
const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
|
|
|
|
@@ -8430,9 +8455,10 @@ template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk
|
|
|
|
GGML_UNUSED(src1);
|
|
GGML_UNUSED(src2);
|
|
+ GGML_UNUSED(src3);
|
|
}
|
|
|
|
-template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_sum_rows_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
|
|
+template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_sum_rows_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) {
|
|
const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
|
|
const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
|
|
|
|
@@ -8440,9 +8466,10 @@ template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk
|
|
|
|
GGML_UNUSED(src1);
|
|
GGML_UNUSED(src2);
|
|
+ GGML_UNUSED(src3);
|
|
}
|
|
|
|
-template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_pad_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
|
|
+template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_pad_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) {
|
|
const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
|
|
const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
|
|
|
|
@@ -8450,9 +8477,10 @@ template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk
|
|
|
|
GGML_UNUSED(src1);
|
|
GGML_UNUSED(src2);
|
|
+ GGML_UNUSED(src3);
|
|
}
|
|
|
|
-template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_im2col_3d_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
|
|
+template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_im2col_3d_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) {
|
|
const uint32_t a_offset = get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type);
|
|
const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
|
|
|
|
@@ -8460,9 +8488,10 @@ template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk
|
|
|
|
GGML_UNUSED(src0);
|
|
GGML_UNUSED(src2);
|
|
+ GGML_UNUSED(src3);
|
|
}
|
|
|
|
-template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_binary_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
|
|
+template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_binary_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) {
|
|
const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
|
|
const uint32_t b_offset = get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type);
|
|
const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
|
|
@@ -8472,9 +8501,10 @@ template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk
|
|
p.misalign_offsets = (a_offset << 16) | (b_offset << 8) | d_offset;
|
|
|
|
GGML_UNUSED(src2);
|
|
+ GGML_UNUSED(src3);
|
|
}
|
|
|
|
-template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_upscale_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
|
|
+template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_upscale_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) {
|
|
const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type);
|
|
const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type);
|
|
|
|
@@ -8483,10 +8513,11 @@ template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk
|
|
|
|
GGML_UNUSED(src1);
|
|
GGML_UNUSED(src2);
|
|
+ GGML_UNUSED(src3);
|
|
}
|
|
|
|
template<typename PC>
|
|
-static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, PC&& pc, bool dryrun = false) {
|
|
+static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst, ggml_op op, PC&& pc, bool dryrun = false) {
|
|
VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
|
|
if (src1 != nullptr) {
|
|
std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
|
|
@@ -8494,6 +8525,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
|
if (src2 != nullptr) {
|
|
std::cerr << "), (" << src2 << ", name=" << src2->name << ", type=" << src2->type << ", ne0=" << src2->ne[0] << ", ne1=" << src2->ne[1] << ", ne2=" << src2->ne[2] << ", ne3=" << src2->ne[3] << ", nb0=" << src2->nb[0] << ", nb1=" << src2->nb[1] << ", nb2=" << src2->nb[2] << ", nb3=" << src2->nb[3];
|
|
}
|
|
+ if (src3 != nullptr) {
|
|
+ std::cerr << "), (" << src3 << ", name=" << src3->name << ", type=" << src3->type << ", ne0=" << src3->ne[0] << ", ne1=" << src3->ne[1] << ", ne2=" << src3->ne[2] << ", ne3=" << src3->ne[3] << ", nb0=" << src3->nb[0] << ", nb1=" << src3->nb[1] << ", nb2=" << src3->nb[2] << ", nb3=" << src3->nb[3];
|
|
+ }
|
|
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
|
|
std::cerr << "), " << ggml_op_name(op) << ", " << (dryrun ? "dryrun" : "") << ")");
|
|
GGML_ASSERT(op == GGML_OP_GET_ROWS || op == GGML_OP_CPY || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
|
|
@@ -8520,6 +8554,13 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
|
const uint64_t ne23 = use_src2 ? src2->ne[3] : 0;
|
|
const uint64_t ne2 = ne20 * ne21;
|
|
|
|
+ const bool use_src3 = src3 != nullptr;
|
|
+ const uint64_t ne30 = use_src3 ? src3->ne[0] : 0;
|
|
+ const uint64_t ne31 = use_src3 ? src3->ne[1] : 0;
|
|
+ const uint64_t ne32 = use_src3 ? src3->ne[2] : 0;
|
|
+ const uint64_t ne33 = use_src3 ? src3->ne[3] : 0;
|
|
+ const uint64_t ne3 = ne30 * ne31;
|
|
+
|
|
const uint64_t ned0 = dst->ne[0];
|
|
const uint64_t ned1 = dst->ne[1];
|
|
const uint64_t ned2 = dst->ne[2];
|
|
@@ -8550,6 +8591,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
|
ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context;
|
|
ggml_backend_vk_buffer_context * src1_buf_ctx = use_src1 ? (ggml_backend_vk_buffer_context *)src1->buffer->context : nullptr;
|
|
ggml_backend_vk_buffer_context * src2_buf_ctx = use_src2 ? (ggml_backend_vk_buffer_context *)src2->buffer->context : nullptr;
|
|
+ ggml_backend_vk_buffer_context * src3_buf_ctx = use_src3 ? (ggml_backend_vk_buffer_context *)src3->buffer->context : nullptr;
|
|
|
|
vk_buffer d_X = nullptr;
|
|
size_t x_buf_offset = 0;
|
|
@@ -8557,10 +8599,13 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
|
size_t y_buf_offset = 0;
|
|
vk_buffer d_Z = nullptr;
|
|
size_t z_buf_offset = 0;
|
|
+ vk_buffer d_W = nullptr;
|
|
+ size_t w_buf_offset = 0;
|
|
|
|
bool src0_uma = false;
|
|
bool src1_uma = false;
|
|
bool src2_uma = false;
|
|
+ bool src3_uma = false;
|
|
|
|
if (ctx->device->uma) {
|
|
ggml_vk_host_get(ctx->device, src0->data, d_X, x_buf_offset);
|
|
@@ -8573,6 +8618,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
|
ggml_vk_host_get(ctx->device, src2->data, d_Z, z_buf_offset);
|
|
src2_uma = d_Z != nullptr;
|
|
}
|
|
+ if (use_src3) {
|
|
+ ggml_vk_host_get(ctx->device, src3->data, d_W, w_buf_offset);
|
|
+ src3_uma = d_W != nullptr;
|
|
+ }
|
|
}
|
|
|
|
vk_buffer d_D = dst_buf_ctx->dev_buffer;
|
|
@@ -8594,11 +8643,17 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
|
z_buf_offset = vk_tensor_offset(src2) + src2->view_offs;
|
|
GGML_ASSERT(d_Z != nullptr);
|
|
}
|
|
+ if (use_src3 && !src3_uma) {
|
|
+ d_W = src3_buf_ctx->dev_buffer;
|
|
+ w_buf_offset = vk_tensor_offset(src3) + src3->view_offs;
|
|
+ GGML_ASSERT(d_W != nullptr);
|
|
+ }
|
|
// Compute misalignment offset for descriptors and store it in in push constants, then align the descriptor offsets.
|
|
- init_pushconst_tensor_offsets(ctx, pc, src0, src1, src2, dst);
|
|
+ init_pushconst_tensor_offsets(ctx, pc, src0, src1, src2, src3, dst);
|
|
x_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
|
|
y_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
|
|
z_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
|
|
+ w_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
|
|
d_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1);
|
|
|
|
std::array<uint32_t, 3> elements;
|
|
@@ -8799,12 +8854,13 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
|
break;
|
|
}
|
|
|
|
- uint64_t x_sz, y_sz, z_sz, d_sz;
|
|
+ uint64_t x_sz, y_sz, z_sz, w_sz, d_sz;
|
|
|
|
if (op_supports_incontiguous) {
|
|
x_sz = ggml_nbytes(src0) + get_misalign_bytes(ctx, src0);
|
|
y_sz = use_src1 ? ggml_nbytes(src1) + get_misalign_bytes(ctx, src1) : 0;
|
|
z_sz = use_src2 ? ggml_nbytes(src2) + get_misalign_bytes(ctx, src2) : 0;
|
|
+ w_sz = use_src3 ? ggml_nbytes(src3) + get_misalign_bytes(ctx, src3) : 0;
|
|
d_sz = ggml_nbytes(dst) + get_misalign_bytes(ctx, dst);
|
|
|
|
if (x_buf_offset + x_sz >= d_X->size) {
|
|
@@ -8816,6 +8872,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
|
if (use_src2 && z_buf_offset + z_sz >= d_Z->size) {
|
|
z_sz = ggml_vk_get_max_buffer_range(ctx, d_Z, z_buf_offset);
|
|
}
|
|
+ if (use_src3 && w_buf_offset + w_sz >= d_W->size) {
|
|
+ w_sz = ggml_vk_get_max_buffer_range(ctx, d_W, w_buf_offset);
|
|
+ }
|
|
if (d_buf_offset + d_sz >= d_D->size) {
|
|
d_sz = ggml_vk_get_max_buffer_range(ctx, d_D, d_buf_offset);
|
|
}
|
|
@@ -8823,6 +8882,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
|
x_sz = ggml_type_size(src0->type)/ggml_blck_size(src0->type) * ne0 * ne02 * ne03;
|
|
y_sz = use_src1 ? ggml_type_size(src1->type) * ne1 * ne12 * ne13 : 0;
|
|
z_sz = use_src2 ? ggml_type_size(src2->type) * ne2 * ne22 * ne23 : 0;
|
|
+ w_sz = use_src3 ? ggml_type_size(src3->type) * ne3 * ne32 * ne33 : 0;
|
|
d_sz = ggml_type_size(dst->type) * ned * ned2 * ned3;
|
|
}
|
|
|
|
@@ -8864,14 +8924,19 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, subbuf_y, subbuf_z, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
     } else if (op == GGML_OP_ROPE || op == GGML_OP_ROPE_BACK) {
         // Empty src2 is possible in rope, but the shader needs a buffer
-        vk_subbuffer subbuf_z;
+        vk_subbuffer subbuf_z, subbuf_w;
         if (use_src2) {
             subbuf_z = { d_Z, z_buf_offset, z_sz };
         } else {
             subbuf_z = { d_X, 0, x_sz };
         }
+        if (use_src3) {
+            subbuf_w = { d_W, w_buf_offset, w_sz };
+        } else {
+            subbuf_w = { d_X, 0, x_sz };
+        }
 
-        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, subbuf_z, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, subbuf_z, vk_subbuffer{ d_D, d_buf_offset, d_sz }, subbuf_w }, pc, elements);
     } else if (op == GGML_OP_IM2COL || op == GGML_OP_IM2COL_3D) {
|
|
if (ctx->device->shader_int64 && ctx->device->buffer_device_address) {
|
|
// buffer device address path doesn't use dst buffer
|
|
@@ -8887,6 +8952,8 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
|
} else if (op == GGML_OP_OPT_STEP_SGD) {
|
|
// OPT_STEP_SGD works on src0, it does not need dst
|
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz } }, pc, elements);
|
|
+ } else if (use_src3) {
|
|
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz }, vk_subbuffer{ d_W, w_buf_offset, w_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
|
|
} else if (use_src2) {
|
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
|
|
} else if (use_src1) {
|
|
@@ -8901,7 +8968,7 @@ static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context& subctx,
|
|
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
|
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
|
|
|
- ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_GET_ROWS, {
|
|
+ ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_GET_ROWS, {
|
|
(uint32_t)ggml_nelements(src0),
|
|
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
|
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
|
|
@@ -8921,7 +8988,7 @@ static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const
|
|
// int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
|
|
int offset = dst->op_params[3] / 4; // offset in bytes
|
|
|
|
- ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ACC, {
|
|
+ ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_ACC, {
|
|
(uint32_t)ggml_nelements(src0),
|
|
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)nb1, (uint32_t)nb2, (uint32_t)src0->nb[3] / src0_type_size,
|
|
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
|
|
@@ -9046,7 +9113,7 @@ static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context& subctx, const
|
|
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
|
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
|
|
|
- ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ADD, {
|
|
+ ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_ADD, {
|
|
(uint32_t)ggml_nelements(src0),
|
|
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
|
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
|
|
@@ -9061,7 +9128,7 @@ static void ggml_vk_sub(ggml_backend_vk_context * ctx, vk_context& subctx, const
|
|
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
|
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
|
|
|
- ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_SUB, {
|
|
+ ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_SUB, {
|
|
(uint32_t)ggml_nelements(src0),
|
|
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
|
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
|
|
@@ -9076,7 +9143,7 @@ static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context& subctx, const
|
|
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
|
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
|
|
|
- ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_MUL, {
|
|
+ ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_MUL, {
|
|
(uint32_t)ggml_nelements(src0),
|
|
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
|
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
|
|
@@ -9091,7 +9158,7 @@ static void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context& subctx, const
|
|
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
|
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
|
|
|
- ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_DIV, {
|
|
+ ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_DIV, {
|
|
(uint32_t)ggml_nelements(src0),
|
|
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
|
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
|
|
@@ -9106,7 +9173,7 @@ static void ggml_vk_add_id(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
|
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
|
const uint32_t src2_type_size = ggml_type_size(src2->type);
|
|
|
|
- ggml_vk_op_f32<vk_op_add_id_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ADD_ID, {
|
|
+ ggml_vk_op_f32<vk_op_add_id_push_constants>(ctx, subctx, src0, src1, src2, nullptr, dst, GGML_OP_ADD_ID, {
|
|
(uint32_t)dst->ne[0],
|
|
(uint32_t)dst->ne[1],
|
|
(uint32_t)src0->nb[1] / src0_type_size,
|
|
@@ -9339,7 +9406,7 @@ static void ggml_vk_ssm_conv(ggml_backend_vk_context * ctx, vk_context& subctx,
|
|
const ggml_tensor * src0 = dst->src[0];
|
|
const ggml_tensor * src1 = dst->src[1];
|
|
|
|
- ggml_vk_op_f32<vk_op_ssm_conv_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_SSM_CONV, {
|
|
+ ggml_vk_op_f32<vk_op_ssm_conv_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_SSM_CONV, {
|
|
(uint32_t)src0->nb[1], (uint32_t)src0->nb[2],
|
|
(uint32_t)src1->nb[1],
|
|
(uint32_t)dst->nb[0], (uint32_t)dst->nb[1], (uint32_t)dst->nb[2],
|
|
@@ -9457,7 +9524,7 @@ static void ggml_vk_opt_step_adamw(ggml_backend_vk_context * ctx, vk_context& su
|
|
static void ggml_vk_opt_step_sgd(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, bool dryrun = false) {
|
|
const size_t n = ggml_nelements(dst->src[0]);
|
|
|
|
- ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_OPT_STEP_SGD, { (uint32_t)n, 0, 0.0f, 0.0f }, dryrun);
|
|
+ ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, src2, nullptr, dst, GGML_OP_OPT_STEP_SGD, { (uint32_t)n, 0, 0.0f, 0.0f }, dryrun);
|
|
}
|
|
|
|
static void ggml_vk_concat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
|
|
@@ -9467,7 +9534,7 @@ static void ggml_vk_concat(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
|
const uint32_t src1_type_size = ggml_type_size(src1->type);
|
|
const uint32_t dst_type_size = ggml_type_size(dst->type);
|
|
|
|
- ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_CONCAT, {
|
|
+ ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONCAT, {
|
|
(uint32_t)ggml_nelements(dst),
|
|
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
|
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
|
|
@@ -9491,7 +9558,7 @@ static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, c
|
|
sf1 = (float)(dst->ne[1] - 1) / (src0->ne[1] - 1);
|
|
}
|
|
|
|
- ggml_vk_op_f32<vk_op_upscale_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UPSCALE, {
|
|
+ ggml_vk_op_f32<vk_op_upscale_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_UPSCALE, {
|
|
(uint32_t)ggml_nelements(dst), 0, 0,
|
|
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1],
|
|
(uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
|
@@ -9505,23 +9572,23 @@ static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context& subctx, con
|
|
p.param1 = ggml_get_op_params_f32(dst, 0);
|
|
p.param2 = ggml_get_op_params_f32(dst, 1);
|
|
|
|
- ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SCALE, std::move(p), dryrun);
|
|
+ ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SCALE, std::move(p), dryrun);
|
|
}
|
|
|
|
static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
|
- ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SQR, vk_op_unary_push_constants_init(src0, dst), dryrun);
|
|
+ ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SQR, vk_op_unary_push_constants_init(src0, dst), dryrun);
|
|
}
|
|
|
|
static void ggml_vk_sqrt(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
|
- ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SQRT, vk_op_unary_push_constants_init(src0, dst), dryrun);
|
|
+ ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SQRT, vk_op_unary_push_constants_init(src0, dst), dryrun);
|
|
}
|
|
|
|
static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
|
- ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SIN, vk_op_unary_push_constants_init(src0, dst), dryrun);
|
|
+ ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SIN, vk_op_unary_push_constants_init(src0, dst), dryrun);
|
|
}
|
|
|
|
static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
|
- ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_COS, vk_op_unary_push_constants_init(src0, dst), dryrun);
|
|
+ ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_COS, vk_op_unary_push_constants_init(src0, dst), dryrun);
|
|
}
|
|
|
|
static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
|
@@ -9529,12 +9596,12 @@ static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, con
|
|
p.param1 = ggml_get_op_params_f32(dst, 0);
|
|
p.param2 = ggml_get_op_params_f32(dst, 1);
|
|
|
|
- ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CLAMP, std::move(p), dryrun);
|
|
+ ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_CLAMP, std::move(p), dryrun);
|
|
}
|
|
|
|
static void ggml_vk_pad(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
|
vk_op_pad_push_constants p = vk_op_pad_push_constants_init(src0, dst);
|
|
- ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_PAD, std::move(p), dryrun);
|
|
+ ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_PAD, std::move(p), dryrun);
|
|
}
|
|
|
|
static void ggml_vk_roll(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
|
@@ -9549,17 +9616,17 @@ static void ggml_vk_roll(ggml_backend_vk_context * ctx, vk_context& subctx, cons
|
|
memcpy(&p.param1, &s01_packed, sizeof(float));
|
|
memcpy(&p.param2, &s23_packed, sizeof(float));
|
|
|
|
- ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ROLL, std::move(p), dryrun);
|
|
+ ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_ROLL, std::move(p), dryrun);
|
|
}
|
|
|
|
static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
|
vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst));
|
|
- ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_REPEAT, std::move(p), dryrun);
|
|
+ ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_REPEAT, std::move(p), dryrun);
|
|
}
|
|
|
|
static void ggml_vk_repeat_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
|
vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst));
|
|
- ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_REPEAT_BACK, std::move(p), dryrun);
|
|
+ ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_REPEAT_BACK, std::move(p), dryrun);
|
|
}
|
|
|
|
static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
|
@@ -9575,7 +9642,7 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const
|
|
}
|
|
|
|
vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ne);
|
|
- ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, std::move(p), dryrun);
|
|
+ ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_CPY, std::move(p), dryrun);
|
|
}
|
|
|
|
static void ggml_vk_set_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
|
|
@@ -9590,7 +9657,7 @@ static void ggml_vk_set_rows(ggml_backend_vk_context * ctx, vk_context& subctx,
|
|
return;
|
|
}
|
|
|
|
- ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_SET_ROWS, {
|
|
+ ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_SET_ROWS, {
|
|
(uint32_t)ggml_nelements(src0),
|
|
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
|
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
|
|
@@ -9601,13 +9668,13 @@ static void ggml_vk_set_rows(ggml_backend_vk_context * ctx, vk_context& subctx,
|
|
}
|
|
|
|
static void ggml_vk_silu_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
|
|
- ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_SILU_BACK, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }, dryrun);
|
|
+ ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_SILU_BACK, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }, dryrun);
|
|
}
|
|
|
|
static void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
|
float * op_params = (float *)dst->op_params;
|
|
|
|
- ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }, dryrun);
|
|
+ ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }, dryrun);
|
|
}
|
|
|
|
static void ggml_vk_group_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
|
@@ -9618,7 +9685,7 @@ static void ggml_vk_group_norm(ggml_backend_vk_context * ctx, vk_context& subctx
|
|
const float eps = float_op_params[1];
|
|
const uint32_t group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
|
|
|
|
- ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_GROUP_NORM, { group_size, 0, eps, 0.0f }, dryrun);
|
|
+ ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_GROUP_NORM, { group_size, 0, eps, 0.0f }, dryrun);
|
|
}
|
|
|
|
static uint32_t ggml_vk_rms_num_partials(ggml_backend_vk_context * ctx, const ggml_tensor *node) {
|
|
@@ -9641,7 +9708,7 @@ static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx,
|
|
|
|
uint32_t param3 = ctx->do_add_rms_partials ? ggml_vk_rms_num_partials(ctx, dst) : 0;
|
|
|
|
- ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_RMS_NORM, {
|
|
+ ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_RMS_NORM, {
|
|
(uint32_t)ggml_nelements(src0),
|
|
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
|
|
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
|
|
@@ -9658,16 +9725,16 @@ static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx,
|
|
|
|
static void ggml_vk_rms_norm_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
|
|
float * op_params = (float *)dst->op_params;
|
|
- ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_RMS_NORM_BACK, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }, dryrun);
|
|
+ ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_RMS_NORM_BACK, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }, dryrun);
|
|
}
|
|
|
|
static void ggml_vk_l2_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
|
float * op_params = (float *)dst->op_params;
|
|
- ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_L2_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }, dryrun);
|
|
+ ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_L2_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }, dryrun);
|
|
}
|
|
|
|
static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
|
- ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }, dryrun);
|
|
+ ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }, dryrun);
|
|
}
|
|
|
|
static void ggml_vk_glu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
|
|
@@ -9690,7 +9757,7 @@ static void ggml_vk_glu(ggml_backend_vk_context * ctx, vk_context& subctx, const
|
|
|
|
const uint32_t mode = split ? 2 : (swapped ? 1 : 0);
|
|
|
|
- ggml_vk_op_f32<vk_op_glu_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_GLU,
|
|
+ ggml_vk_op_f32<vk_op_glu_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_GLU,
|
|
{
|
|
(uint32_t)ggml_nelements(dst),
|
|
(uint32_t)src0->ne[0],
|
|
@@ -9703,7 +9770,7 @@ static void ggml_vk_glu(ggml_backend_vk_context * ctx, vk_context& subctx, const
|
|
|
|
static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
|
int32_t * op_params = (int32_t *)dst->op_params;
|
|
- ggml_vk_op_f32<vk_op_diag_mask_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] }, dryrun);
|
|
+ ggml_vk_op_f32<vk_op_diag_mask_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] }, dryrun);
|
|
}
|
|
|
|
static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, bool dryrun = false) {
|
|
@@ -9728,7 +9795,7 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx,
|
|
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
|
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
|
|
|
- ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_SOFT_MAX, {
|
|
+ ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, src2, nullptr, dst, GGML_OP_SOFT_MAX, {
|
|
ncols,
|
|
src1 != nullptr ? nrows_y : (uint32_t)0,
|
|
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],
|
|
@@ -9744,7 +9811,7 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx,
|
|
|
|
static void ggml_vk_soft_max_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
|
|
float * op_params = (float *)dst->op_params;
|
|
- ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_SOFT_MAX_BACK, { (uint32_t)src0->ne[0], (uint32_t)ggml_nrows(src0), op_params[0], op_params[1] }, dryrun);
|
|
+ ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_SOFT_MAX_BACK, { (uint32_t)src0->ne[0], (uint32_t)ggml_nrows(src0), op_params[0], op_params[1] }, dryrun);
|
|
}
|
|
|
|
static void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx, bool dryrun = false) {
|
|
@@ -9835,7 +9902,12 @@ static void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx,
|
|
}, pc, elements);
|
|
}
|
|
|
|
-static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, bool backprop, bool dryrun = false) {
+static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_cgraph * cgraph, int node_idx, bool backprop, bool dryrun = false) {
+    ggml_tensor * dst = cgraph->nodes[node_idx];
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    const ggml_tensor * src2 = dst->src[2];
+    const ggml_tensor * src3 = nullptr;
     const int n_dims = ((int32_t *) dst->op_params)[1];
     const int mode = ((int32_t *) dst->op_params)[2];
     // const int n_ctx = ((int32_t *) dst->op_params)[3];
@@ -9859,11 +9931,20 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, cons
     uint32_t s1 = src0->nb[1] / ggml_type_size(src0->type);
     uint32_t s2 = src0->nb[2] / ggml_type_size(src0->type);
 
-    ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
+    uint32_t set_rows_stride = 0;
+    // Fused rope + view + set_rows passes the set_rows destination stride in set_rows_stride
+    // and overrides the dst and sets src3=row_indices
+    if (ctx->num_additional_fused_ops > 0) {
+        set_rows_stride = cgraph->nodes[node_idx + 2]->nb[1] / ggml_type_size(cgraph->nodes[node_idx + 2]->type);
+        src3 = cgraph->nodes[node_idx + 2]->src[1];
+        dst = cgraph->nodes[node_idx + 2];
+    }
+
+    ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, src2, src3, dst, GGML_OP_ROPE, {
         (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
         freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1]}, theta_scale,
         src2 != nullptr, (uint32_t)src0->ne[2], s1, s2,
-        { sections[0], sections[1], sections[2], sections[3] }, backprop
+        { sections[0], sections[1], sections[2], sections[3] }, backprop, set_rows_stride,
     }, dryrun);
 }
|
|
|
|
@@ -9872,7 +9953,7 @@ static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context& subctx, c
|
|
|
|
uint32_t ncols = src0->ne[0];
|
|
|
|
- ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ARGSORT, {
|
|
+ ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_ARGSORT, {
|
|
ncols,
|
|
op_params[0],
|
|
}, dryrun);
|
|
@@ -9880,26 +9961,26 @@ static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context& subctx, c
|
|
|
|
static void ggml_vk_sum(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
|
vk_op_sum_rows_push_constants p = vk_op_sum_rows_push_constants_init(src0, dst, ggml_nelements(src0));
|
|
- ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SUM, p, dryrun);
|
|
+ ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SUM, p, dryrun);
|
|
}
|
|
|
|
static void ggml_vk_sum_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
|
vk_op_sum_rows_push_constants p = vk_op_sum_rows_push_constants_init(src0, dst, src0->ne[0]);
|
|
- ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SUM_ROWS, p, dryrun);
|
|
+ ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_SUM_ROWS, p, dryrun);
|
|
}
|
|
|
|
static void ggml_vk_mean(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
|
vk_op_sum_rows_push_constants p = vk_op_sum_rows_push_constants_init(src0, dst, src0->ne[0]);
|
|
p.weight = 1.0f / (float)src0->ne[0];
|
|
- ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_MEAN, p, dryrun);
|
|
+ ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_MEAN, p, dryrun);
|
|
}
|
|
|
|
static void ggml_vk_argmax(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
|
- ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ARGMAX, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], 0.0f, 0.0f }, dryrun);
|
|
+ ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_ARGMAX, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], 0.0f, 0.0f }, dryrun);
|
|
}
|
|
|
|
static void ggml_vk_count_equal(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
|
|
- ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_COUNT_EQUAL, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }, dryrun);
|
|
+ ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_COUNT_EQUAL, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }, dryrun);
|
|
}
|
|
|
|
static void ggml_vk_im2col(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
|
|
@@ -9932,7 +10013,7 @@ static void ggml_vk_im2col(ggml_backend_vk_context * ctx, vk_context& subctx, co
 
 const vk::DeviceAddress dst_addr = d_buf->bda_addr + vk_tensor_offset(dst) + dst->view_offs;
 
- ggml_vk_op_f32<vk_op_im2col_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_IM2COL, {
+ ggml_vk_op_f32<vk_op_im2col_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_IM2COL, {
 dst_addr,
 batch_offset, offset_delta,
 IC, IW, IH, OW, OH, KW, KH,
@@ -10005,7 +10086,7 @@ static void ggml_vk_im2col_3d(ggml_backend_vk_context * ctx, vk_context& subctx,
 pc.OH_OW_IC_KD_KH_KW = OH*OW*IC*KD*KH*KW;
 pc.OW_IC_KD_KH_KW = OW*IC*KD*KH*KW;
 
- ggml_vk_op_f32<vk_op_im2col_3d_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_IM2COL_3D, std::move(pc), dryrun);
+ ggml_vk_op_f32<vk_op_im2col_3d_push_constants>(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_IM2COL_3D, std::move(pc), dryrun);
 }
 
 static void ggml_vk_timestep_embedding(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
@@ -10013,7 +10094,7 @@ static void ggml_vk_timestep_embedding(ggml_backend_vk_context * ctx, vk_context
 const uint32_t max_period = dst->op_params[1];
 const uint32_t nb1 = dst->nb[1] / ggml_type_size(dst->type);
 
- ggml_vk_op_f32<vk_op_timestep_embedding_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_TIMESTEP_EMBEDDING, {
+ ggml_vk_op_f32<vk_op_timestep_embedding_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_TIMESTEP_EMBEDDING, {
 nb1, dim, max_period,
 }, dryrun);
 }
@@ -10046,7 +10127,7 @@ static void ggml_vk_conv_transpose_1d(ggml_backend_vk_context * ctx, vk_context&
 p.nb1 = static_cast<uint32_t>(nb1 / nb0);
 p.s0 = static_cast<uint32_t>(s0);
 
- ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_CONV_TRANSPOSE_1D, std::move(p), dryrun);
+ ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_TRANSPOSE_1D, std::move(p), dryrun);
 }
 
 static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
@@ -10069,7 +10150,7 @@ static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, c
 
 const uint32_t parallel_elements = N * OC * OH * OW;
 
- ggml_vk_op_f32<vk_op_pool2d_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_POOL_2D, {
+ ggml_vk_op_f32<vk_op_pool2d_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_POOL_2D, {
 IW, IH, OW, OH, OC,
 parallel_elements,
 op,
@@ -10123,7 +10204,7 @@ static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context & subctx,
 GGML_ASSERT(ne03 == ne2);
 GGML_ASSERT(ne02 == ne12);
 
- ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_CONV_2D, std::move(p), dryrun);
+ ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_2D, std::move(p), dryrun);
 }
 
 static void ggml_vk_conv_transpose_2d(ggml_backend_vk_context * ctx, vk_context & subctx, const ggml_tensor * src0,
@@ -10172,7 +10253,7 @@ static void ggml_vk_conv_transpose_2d(ggml_backend_vk_context * ctx, vk_context
 GGML_ASSERT(ne02 == ne2);
 GGML_ASSERT(ne03 == ne12);
 
- ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_CONV_TRANSPOSE_2D, std::move(p), dryrun);
+ ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_TRANSPOSE_2D, std::move(p), dryrun);
 }
 
 static void ggml_vk_conv_2d_dw(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
@@ -10196,12 +10277,12 @@ static void ggml_vk_conv_2d_dw(ggml_backend_vk_context * ctx, vk_context& subctx
 GGML_ASSERT(src0->ne[3] == p.channels);
 GGML_ASSERT(src1->ne[3] == p.batches);
 
- ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_CONV_2D_DW, std::move(p), dryrun);
+ ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_2D_DW, std::move(p), dryrun);
 }
 
 static void ggml_vk_leaky_relu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
 const float * op_params = (const float *)dst->op_params;
- ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_LEAKY_RELU, { (uint32_t)ggml_nelements(src0), 0, op_params[0], 0.0f }, dryrun);
+ ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_LEAKY_RELU, { (uint32_t)ggml_nelements(src0), 0, op_params[0], 0.0f }, dryrun);
 }
 
 #ifdef GGML_VULKAN_RUN_TESTS
@@ -11327,7 +11408,6 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
 case GGML_OP_DIAG_MASK_INF:
 case GGML_OP_SOFT_MAX:
 case GGML_OP_SOFT_MAX_BACK:
- case GGML_OP_ROPE:
 case GGML_OP_ROPE_BACK:
 case GGML_OP_ARGSORT:
 case GGML_OP_SUM:
@@ -11401,9 +11481,12 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
 // nodes require synchronization.
 for (int32_t i = 0; i < ctx->num_additional_fused_ops + 1 && !need_sync; ++i) {
 const ggml_tensor *cur_node = cgraph->nodes[node_idx + i];
- if (overlaps_unsynced(cur_node, ctx->unsynced_nodes_read) || overlaps_unsynced(cur_node, ctx->unsynced_nodes_written)) {
- need_sync = true;
- break;
+ // If the node actually writes to memory, then check if it needs to sync
+ if (ctx->fused_ops_write_mask & (1 << i)) {
+ if (overlaps_unsynced(cur_node, ctx->unsynced_nodes_read) || overlaps_unsynced(cur_node, ctx->unsynced_nodes_written)) {
+ need_sync = true;
+ break;
+ }
 }
 for (uint32_t j = 0; j < GGML_MAX_SRC; ++j) {
 if (!cur_node->src[j]) {
@@ -11430,7 +11513,9 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
 for (int32_t i = 0; i < ctx->num_additional_fused_ops + 1; ++i) {
 const ggml_tensor *cur_node = cgraph->nodes[node_idx + i];
 // Multiple outputs could be written, e.g. in topk_moe. Add them all to the list.
- ctx->unsynced_nodes_written.push_back(cur_node);
+ if (ctx->fused_ops_write_mask & (1 << i)) {
+ ctx->unsynced_nodes_written.push_back(cur_node);
+ }
 for (uint32_t j = 0; j < GGML_MAX_SRC; ++j) {
 if (!cur_node->src[j]) {
 continue;
@@ -11621,11 +11706,11 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
 
 break;
 case GGML_OP_ROPE:
- ggml_vk_rope(ctx, compute_ctx, src0, src1, src2, node, false, dryrun);
+ ggml_vk_rope(ctx, compute_ctx, cgraph, node_idx, false, dryrun);
 
 break;
 case GGML_OP_ROPE_BACK:
- ggml_vk_rope(ctx, compute_ctx, src0, src1, src2, node, true, dryrun);
+ ggml_vk_rope(ctx, compute_ctx, cgraph, node_idx, true, dryrun);
 
 break;
 case GGML_OP_ARGSORT:
@@ -12487,6 +12572,41 @@ static bool ggml_vk_can_fuse_topk_moe(ggml_backend_vk_context * ctx, const struc
 return true;
 }
 
+static bool ggml_vk_can_fuse_rope_set_rows(ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph,
+ int node_idx) {
+ GGML_UNUSED(ctx);
+ const ggml_tensor *rope = cgraph->nodes[node_idx + 0];
+ const ggml_tensor *view = cgraph->nodes[node_idx + 1];
+ const ggml_tensor *set_rows = cgraph->nodes[node_idx + 2];
+
+ // ne3 not tested
+ if (rope->src[0]->ne[3] != 1) {
+ return false;
+ }
+
+ if (set_rows->type != GGML_TYPE_F32 && set_rows->type != GGML_TYPE_F16) {
+ return false;
+ }
+
+ if (set_rows->src[1]->type != GGML_TYPE_I64) {
+ return false;
+ }
+
+ // The view should flatten two dims of rope into one dim
+ if (!ggml_is_contiguous(view) ||
+ view->ne[0] != rope->ne[0] * rope->ne[1]) {
+ return false;
+ }
+
+ // Only norm/neox shaders have the fusion code
+ const int mode = ((const int32_t *) rope->op_params)[2];
+ if (mode != GGML_ROPE_TYPE_NORMAL && mode != GGML_ROPE_TYPE_NEOX) {
+ return false;
+ }
+
+ return true;
+}
+
 static uint32_t ggml_vk_fuse_multi_add(ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, int node_idx) {
 
 const ggml_tensor *first_node = cgraph->nodes[node_idx];
@@ -12562,6 +12682,10 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
 ctx->num_additional_fused_ops = num_adds - 1;
 } else if (ggml_vk_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) {
 ctx->num_additional_fused_ops = 1;
+ } else if (ggml_can_fuse_subgraph(cgraph, i, { GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS }, { i + 2 }) &&
+ ggml_check_edges(cgraph, i, rope_view_set_rows_edges) &&
+ ggml_vk_can_fuse_rope_set_rows(ctx, cgraph, i)) {
+ ctx->num_additional_fused_ops = 2;
 } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax_norm, { i + 3, i + 9 }) &&
 ggml_check_edges(cgraph, i, topk_moe_early_softmax_norm_edges) &&
 ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX_NORM)) {
@@ -12671,20 +12795,31 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
 ctx->num_additional_fused_ops = num_adds - 1;
 } else if (ggml_vk_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) {
 ctx->num_additional_fused_ops = 1;
+ } else if (ggml_can_fuse_subgraph(cgraph, i, { GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS }, { i + 2 }) &&
+ ggml_check_edges(cgraph, i, rope_view_set_rows_edges) &&
+ ggml_vk_can_fuse_rope_set_rows(ctx, cgraph, i)) {
+ ctx->num_additional_fused_ops = 2;
 } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax_norm, { i + 3, i + 9 }) &&
 ggml_check_edges(cgraph, i, topk_moe_early_softmax_norm_edges) &&
 ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX_NORM)) {
 ctx->num_additional_fused_ops = topk_moe_early_softmax_norm.size() - 1;
+ // view of argsort writes to memory
+ ctx->fused_ops_write_mask |= 1 << 3;
 } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax, { i + 3, i + 4 }) &&
 ggml_check_edges(cgraph, i, topk_moe_early_softmax_edges) &&
 ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX)) {
 ctx->num_additional_fused_ops = topk_moe_early_softmax.size() - 1;
+ // view of argsort writes to memory
+ ctx->fused_ops_write_mask |= 1 << 3;
 } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_late_softmax, { i + 1, i + 5 }) &&
 ggml_check_edges(cgraph, i, topk_moe_late_softmax_edges) &&
 ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_LATE_SOFTMAX)) {
 ctx->num_additional_fused_ops = topk_moe_late_softmax.size() - 1;
+ // view of argsort writes to memory
+ ctx->fused_ops_write_mask |= 1 << 1;
 }
 }
+ ctx->fused_ops_write_mask |= 1 << ctx->num_additional_fused_ops;
 
 // Signal the almost_ready fence when the graph is mostly complete (< 20% remaining)
 bool almost_ready = (cgraph->n_nodes - i) < cgraph->n_nodes / 5;
@@ -12730,6 +12865,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
 }
 i += ctx->num_additional_fused_ops;
 ctx->num_additional_fused_ops = 0;
+ ctx->fused_ops_write_mask = 0;
 }
 
 if (vk_perf_logger_enabled) {
@@ -12887,6 +13023,32 @@ static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph *
 }
 if (ok) {
 current_set.push_back(j);
+ // Look for ROPE + VIEW + SET_ROWS and make them consecutive
+ if (graph->nodes[j]->op == GGML_OP_ROPE) {
+ int view_idx = -1;
+ int set_rows_idx = -1;
+ for (int k = j+1; k < std::min(j + 10, graph->n_nodes); ++k) {
+ if (view_idx == -1 &&
+ graph->nodes[k]->op == GGML_OP_VIEW &&
+ graph->nodes[k]->src[0] == graph->nodes[j]) {
+ view_idx = k;
+ continue;
+ }
+ if (view_idx != -1 &&
+ set_rows_idx == -1 &&
+ graph->nodes[k]->op == GGML_OP_SET_ROWS &&
+ graph->nodes[k]->src[0] == graph->nodes[view_idx]) {
+ set_rows_idx = k;
+ break;
+ }
+ }
+ if (set_rows_idx != -1) {
+ current_set.push_back(view_idx);
+ current_set.push_back(set_rows_idx);
+ used[view_idx] = true;
+ used[set_rows_idx] = true;
+ }
+ }
 }
 }
 // Second pass grabs view nodes.
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl
index 50fc1f1e2..0eda186c8 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl
@@ -10,6 +10,7 @@ layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
 layout (binding = 1) readonly buffer Y {int data_pos[];};
 layout (binding = 2) readonly buffer Z {float data_ff[];};
 layout (binding = 3) writeonly buffer D {D_TYPE data_d[];};
+layout (binding = 4) readonly buffer I {uvec2 data_i[];}; // indices for set_rows
 
 layout (push_constant) uniform parameter {
 uint ncols;
@@ -27,6 +28,7 @@ layout (push_constant) uniform parameter {
 uint s2;
 int sections[4];
 uint is_back;
+ uint set_rows_stride;
 } p;
 
 float rope_yarn_ramp(const float low, const float high, const uint i0) {
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp b/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp
index 06e095bef..9f4538155 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp
@@ -16,12 +16,19 @@ void main() {
 const uint row_x = row_dst % ne1;
 const uint channel_x = row_dst / ne1;
 
- const uint idst = row_dst*ne0 + i0/2;
+ uint idst = row_dst*ne0 + i0/2;
 const uint ix = channel_x*p.s2 + row_x*p.s1 + i0/2;
 
+ // Fusion optimization: ROPE + VIEW + SET_ROWS..
+ // The rope output is viewed as a 1D tensor and offset based on a row index in data_i.
+ if (p.set_rows_stride != 0) {
+ idst = row_x*ne0 + i0/2;
+ idst += data_i[channel_x].x * p.set_rows_stride;
+ }
+
 if (i0 >= p.n_dims) {
- data_d[idst + i0/2 + 0] = data_a[ix + i0/2 + 0];
- data_d[idst + i0/2 + 1] = data_a[ix + i0/2 + 1];
+ data_d[idst + i0/2 + 0] = D_TYPE(data_a[ix + i0/2 + 0]);
+ data_d[idst + i0/2 + 1] = D_TYPE(data_a[ix + i0/2 + 1]);
 
 return;
 }
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp
index 6ba957540..f4209ed95 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp
@@ -16,12 +16,19 @@ void main() {
 const uint row_x = row_dst % ne1;
 const uint channel_x = row_dst / ne1;
 
- const uint idst = row_dst*ne0 + i0;
+ uint idst = row_dst*ne0 + i0;
 const uint ix = channel_x*p.s2 + row_x*p.s1 + i0;
 
+ // Fusion optimization: ROPE + VIEW + SET_ROWS..
+ // The rope output is viewed as a 1D tensor and offset based on a row index in data_i.
+ if (p.set_rows_stride != 0) {
+ idst = row_x*ne0 + i0;
+ idst += data_i[channel_x].x * p.set_rows_stride;
+ }
+
 if (i0 >= p.n_dims) {
- data_d[idst + 0] = data_a[ix + 0];
- data_d[idst + 1] = data_a[ix + 1];
+ data_d[idst + 0] = D_TYPE(data_a[ix + 0]);
+ data_d[idst + 1] = D_TYPE(data_a[ix + 1]);
 
 return;
 }
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
index 03fa01639..e6ec589fb 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
@@ -842,10 +842,14 @@ void process_shaders() {
 string_to_spv("rope_norm_f32", "rope_norm.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
 string_to_spv("rope_norm_f16", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
 string_to_spv("rope_norm_f16_rte", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}});
+ string_to_spv("rope_norm_f32_f16", "rope_norm.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}});
+ string_to_spv("rope_norm_f32_f16_rte", "rope_norm.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}});
 
 string_to_spv("rope_neox_f32", "rope_neox.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
 string_to_spv("rope_neox_f16", "rope_neox.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
 string_to_spv("rope_neox_f16_rte", "rope_neox.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}});
+ string_to_spv("rope_neox_f32_f16", "rope_neox.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}});
+ string_to_spv("rope_neox_f32_f16_rte", "rope_neox.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}});
 
 string_to_spv("rope_multi_f32", "rope_multi.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
 string_to_spv("rope_multi_f16", "rope_multi.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 9eb2b6687..657b6cc2f 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -2105,6 +2105,34 @@ struct test_get_rows_back : public test_case {
 }
 };
 
+static void init_set_rows_row_ids(ggml_tensor * t, int num_rows) {
+ std::random_device rd;
+ std::default_random_engine rng(rd());
+ for (int i2 = 0; i2 < t->ne[2]; i2++) {
+ for (int i1 = 0; i1 < t->ne[1]; i1++) {
+ // generate a shuffled subset of row indices
+ std::vector<int64_t> data(num_rows);
+ for (int i = 0; i < num_rows; i++) {
+ data[i] = i;
+ }
+ std::shuffle(data.begin(), data.end(), rng);
+ data.resize(t->ne[0]);
+
+ const size_t offs = i1*t->nb[1] + i2*t->nb[2];
+ if (t->type == GGML_TYPE_I32) {
+ // TODO: Make a template or something
+ std::vector<int32_t> data_i32(t->ne[0]);
+ for (int i = 0; i < t->ne[0]; i++) {
+ data_i32[i] = static_cast<int32_t>(data[i]);
+ }
+ ggml_backend_tensor_set(t, data_i32.data(), offs, t->ne[0]*sizeof(int32_t));
+ } else {
+ ggml_backend_tensor_set(t, data.data(), offs, t->ne[0]*sizeof(int64_t));
+ }
+ }
+ }
+}
+
 // GGML_OP_SET_ROWS
 struct test_set_rows : public test_case {
 const ggml_type type;
@@ -2148,37 +2176,13 @@ struct test_set_rows : public test_case {
 }
 
 void initialize_tensors(ggml_context * ctx) override {
- std::random_device rd;
- std::default_random_engine rng(rd());
 for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
 if (t->type == GGML_TYPE_I64 || t->type == GGML_TYPE_I32) {
 if (ggml_is_view_op(t->op)) {
 continue;
 }
 
- for (int i2 = 0; i2 < t->ne[2]; i2++) {
- for (int i1 = 0; i1 < t->ne[1]; i1++) {
- // generate a shuffled subset of row indices
- std::vector<int64_t> data(ne[1]);
- for (int i = 0; i < ne[1]; i++) {
- data[i] = i;
- }
- std::shuffle(data.begin(), data.end(), rng);
- data.resize(t->ne[0]);
-
- const size_t offs = i1*t->nb[1] + i2*t->nb[2];
- if (t->type == GGML_TYPE_I32) {
- // TODO: Make a template or something
- std::vector<int32_t> data_i32(t->ne[0]);
- for (int i = 0; i < t->ne[0]; i++) {
- data_i32[i] = static_cast<int32_t>(data[i]);
- }
- ggml_backend_tensor_set(t, data_i32.data(), offs, t->ne[0]*sizeof(int32_t));
- } else {
- ggml_backend_tensor_set(t, data.data(), offs, t->ne[0]*sizeof(int64_t));
- }
- }
- }
+ init_set_rows_row_ids(t, ne[1]);
 } else {
 init_tensor_uniform(t);
 }
@@ -2207,6 +2211,67 @@ struct test_set_rows : public test_case {
 }
 };
 
+// GGML_OP_ROPE + GGML_OP_VIEW + GGML_OP_SET_ROWS
+struct test_rope_set_rows : public test_case {
+ const ggml_type type;
+ const ggml_type type_idx;
+ const std::array<int64_t, 4> ne;
+ int mode;
+
+ std::string vars() override {
+ return VARS_TO_STR4(type, type_idx, ne, mode);
+ }
+
+ std::string op_desc(ggml_tensor * t) override {
+ GGML_UNUSED(t);
+ return "ROPE_SET_ROWS";
+ }
+
+ bool run_whole_graph() override { return true; }
+
+ test_rope_set_rows(ggml_type type,
+ ggml_type type_idx,
+ std::array<int64_t, 4> ne,
+ int mode)
+ : type(type), type_idx(type_idx), ne(ne), mode(mode) {}
+
+ ggml_tensor * build_graph(ggml_context * ctx) override {
+ ggml_tensor * src = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, ne[0], ne[1], ne[2], 1);
+ ggml_set_name(src, "src");
+
+ ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne[2]);
+
+ ggml_tensor * rope = ggml_rope(ctx, src, pos, ne[0], mode);
+
+ ggml_tensor * view = ggml_view_2d(ctx, rope, ne[0] * ne[1], ne[2], rope->nb[2], 0);
+
+ ggml_tensor * dst = ggml_new_tensor_4d(ctx, type, ne[0] * ne[1], ne[2] * ne[3], 1, 1);
+ ggml_set_name(dst, "dst");
+
+ ggml_tensor * row_idxs = ggml_new_tensor_3d(ctx, type_idx, ne[2], 1, 1);
+ ggml_set_name(row_idxs, "row_idxs");
+
+ ggml_tensor * out = ggml_set_rows(ctx, dst, view, row_idxs);
+ ggml_set_name(out, "out");
+
+ return out;
+ }
+
+ void initialize_tensors(ggml_context * ctx) override {
+ for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+ if (t->type == GGML_TYPE_I64 || t->type == GGML_TYPE_I32) {
+ if (ggml_is_view_op(t->op)) {
+ continue;
+ }
+
+ init_set_rows_row_ids(t, ne[2]);
+ } else {
+ init_tensor_uniform(t);
+ }
+ }
+ }
+};
+
 // GGML_OP_ARGMAX
 struct test_argmax : public test_case {
 const ggml_type type;
@@ -6008,6 +6073,13 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
 }
 }
 
+ for (int mode : { GGML_ROPE_TYPE_NORMAL, GGML_ROPE_TYPE_NEOX }) {
+ for (ggml_type type : {GGML_TYPE_F16, GGML_TYPE_F32}) {
+ test_cases.emplace_back(new test_rope_set_rows(type, GGML_TYPE_I64, { 128, 32, 1, 100 }, mode));
+ test_cases.emplace_back(new test_rope_set_rows(type, GGML_TYPE_I64, { 128, 32, 512, 1 }, mode));
+ }
+ }
+
 for (ggml_type type_input : {GGML_TYPE_F32}) {
 for (ggml_op_pool pool_type : {GGML_OP_POOL_AVG, GGML_OP_POOL_MAX}) {
 for (int k0 : {1, 3}) {