// mirror of https://github.com/ollama/ollama

package qwen25vl

import (
	"math"
	"slices"

	"github.com/ollama/ollama/fs"
	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/ml/nn"
	"github.com/ollama/ollama/ml/nn/rope"
)

// blockDiagonalMask builds a square additive attention mask in which tokens
// may only attend to other tokens inside the same block. Consecutive entries
// of bounds delimit the blocks; positions outside a block are set to -Inf.
func blockDiagonalMask(ctx ml.Context, seqLength int, bounds []int) ml.Tensor {
	// Initialize a 2D mask with -Inf
	s := make([][]float32, seqLength)
	for i := range s {
		s[i] = slices.Repeat([]float32{float32(math.Inf(-1))}, seqLength)
	}

	// Fill in the mask with zeros for tokens that CAN attend to each other
	for i := 1; i < len(bounds); i++ {
		start, end := bounds[i-1], bounds[i]
		// Enable attention within this sequence block
		for row := start; row < end; row++ {
			for col := start; col < end; col++ {
				s[row][col] = 0.0
			}
		}
	}

	return ctx.Input().FromFloats(slices.Concat(s...), seqLength, seqLength)
}
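
// A worked example (illustrative sketch, not part of the model): with
// seqLength = 4 and bounds = []int{0, 2, 4}, the mask contains two 2x2
// zero blocks on the diagonal and -Inf elsewhere, so rows 0-1 and rows
// 2-3 each attend only within their own window:
//
//	  0    0  -Inf -Inf
//	  0    0  -Inf -Inf
//	-Inf -Inf   0    0
//	-Inf -Inf   0    0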

type VisionSelfAttention struct {
	Query  *nn.Linear `gguf:"attn_q"`
	Key    *nn.Linear `gguf:"attn_k"`
	Value  *nn.Linear `gguf:"attn_v"`
	Output *nn.Linear `gguf:"attn_out"`
}

func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenStates, positions, mask ml.Tensor, opts *VisionModelOptions) ml.Tensor {
	query := sa.Query.Forward(ctx, hiddenStates)
	key := sa.Key.Forward(ctx, hiddenStates)
	value := sa.Value.Forward(ctx, hiddenStates)

	query = query.Reshape(ctx, opts.headDim, opts.numHeads, query.Dim(1))
	key = key.Reshape(ctx, opts.headDim, opts.numHeads, key.Dim(1))
	value = value.Reshape(ctx, opts.headDim, opts.numHeads, value.Dim(1))

	query = opts.applyRotaryPositionEmbeddings(ctx, query, positions)
	key = opts.applyRotaryPositionEmbeddings(ctx, key, positions)

	// Scale factor for scaled dot-product attention
	scale := 1.0 / math.Sqrt(float64(opts.headDim))

	// Scaled dot-product attention
	query = query.Permute(ctx, 0, 2, 1, 3)
	key = key.Permute(ctx, 0, 2, 1, 3)
	value = value.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)

	kq := key.MulmatFullPrec(ctx, query)
	kq = kq.Scale(ctx, scale)
	if mask != nil {
		kq = kq.Add(ctx, mask)
	}
	kq = kq.Softmax(ctx)
	kqv := value.Mulmat(ctx, kq)
	attention := kqv.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
	attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2))

	return sa.Output.Forward(ctx, attention)
}
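
// Shape walkthrough (a sketch assuming the defaults hiddenSize = 1280,
// numHeads = 16, headDim = 80, and N input patches; dims listed innermost
// first, following the ml package convention):
//
//	q/k/v after Reshape  (80, 16, N)  one headDim vector per head per patch
//	q/k after Permute    (80, N, 16)  heads moved to the batch dimension
//	kq                   (N, N, 16)   per-head attention scores
//	kqv -> attention     (80, 16, N) -> (1280, N) after the final Reshape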

// VisionMLP implements the multi-layer perceptron
type VisionMLP struct {
	Gate *nn.Linear `gguf:"ffn_gate"`
	Up   *nn.Linear `gguf:"ffn_up"`
	Down *nn.Linear `gguf:"ffn_down"`
}

func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *VisionModelOptions) ml.Tensor {
	hiddenStates = mlp.Gate.Forward(ctx, hiddenStates).SILU(ctx, mlp.Up.Forward(ctx, hiddenStates))
	return mlp.Down.Forward(ctx, hiddenStates)
}
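
// This is a SwiGLU block: down(silu(gate(x)) * up(x)), assuming the fused
// SILU(ctx, up) helper multiplies the activated gate by the up projection.
// A scalar sketch of the same arithmetic (illustrative only; the model
// operates on whole tensors):
//
//	func swiglu(x, wGate, wUp, wDown float64) float64 {
//		g := wGate * x
//		return wDown * (g / (1 + math.Exp(-g)) * (wUp * x)) // silu(g) * up
//	}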

type VisionEncoderLayer struct {
	Norm1         *nn.RMSNorm `gguf:"ln1"`
	SelfAttention *VisionSelfAttention
	Norm2         *nn.RMSNorm `gguf:"ln2"`
	MLP           *VisionMLP
}

func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenStates, positions, mask ml.Tensor, opts *VisionModelOptions) ml.Tensor {
	residual := hiddenStates
	hiddenStates = e.Norm1.Forward(ctx, hiddenStates, opts.eps)
	hiddenStates = e.SelfAttention.Forward(ctx, hiddenStates, positions, mask, opts)
	hiddenStates = hiddenStates.Add(ctx, residual)

	residual = hiddenStates
	hiddenStates = e.Norm2.Forward(ctx, hiddenStates, opts.eps)
	hiddenStates = e.MLP.Forward(ctx, hiddenStates, opts)
	return hiddenStates.Add(ctx, residual)
}
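
// Each layer is a standard pre-norm transformer block; schematically, the
// Forward above computes (a paraphrase, not additional logic):
//
//	x = x + SelfAttention(RMSNorm(x))
//	x = x + MLP(RMSNorm(x))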

// VisionModelOptions contains configuration options
type VisionModelOptions struct {
	hiddenSize        int
	numHeads          int
	headDim           int
	patchSize         int
	numChannels       int
	eps               float32
	ropeTheta         float32
	spatialMergeSize  int
	windowSize        int
	fullAttnBlocks    []int32
	temporalPatchSize int
}

func (o VisionModelOptions) applyRotaryPositionEmbeddings(ctx ml.Context, states, positions ml.Tensor) ml.Tensor {
	return nn.RoPE(ctx, states, positions, o.headDim/2, o.ropeTheta, 1,
		rope.WithVision([]int{
			o.headDim / 4,
			o.headDim / 4,
			o.headDim / 4,
			o.headDim / 4,
		}),
	)
}
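
// For reference, each rotary pair (x_{2i}, x_{2i+1}) is rotated by an angle
// proportional to its position p:
//
//	theta_i   = ropeTheta^(-2i/ropeDim)
//	x'_{2i}   = x_{2i}*cos(p*theta_i) - x_{2i+1}*sin(p*theta_i)
//	x'_{2i+1} = x_{2i}*sin(p*theta_i) + x_{2i+1}*cos(p*theta_i)
//
// The rope.WithVision sections split the dimensions into four equal chunks
// of headDim/4, alternated between the two spatial coordinates carried by
// the position streams built in VisionModel.Forward (row, column, row,
// column). This is a reading of the code; the exact interleaving is
// handled by the underlying multi-section RoPE kernel.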

type PatchEmbedding struct {
	PatchConv0 *nn.Conv2D `gguf:"patch_embd_0"`
	PatchConv1 *nn.Conv2D `gguf:"patch_embd_1"`
}

func (pe *PatchEmbedding) Forward(ctx ml.Context, pixelValues ml.Tensor, opts *VisionModelOptions) ml.Tensor {
	numPatches := pixelValues.Shape()[1]

	// Reshape the input tensor to match the expected dimensions
	pixelValues = pixelValues.Reshape(ctx, opts.patchSize*opts.patchSize, opts.temporalPatchSize, opts.numChannels, numPatches)

	// Permute the tensor to bring the temporal dimension to the front
	pixelValues = pixelValues.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)

	// Split the tensor into one view per temporal slice, one for each convolution
	in0 := pixelValues.View(ctx, 0, 1, pixelValues.Stride(1), pixelValues.Dim(1), pixelValues.Stride(2), pixelValues.Dim(2), pixelValues.Stride(3), pixelValues.Dim(3)).Contiguous(ctx)
	in0 = in0.Reshape(ctx, opts.patchSize, opts.patchSize, opts.numChannels, numPatches)
	in1 := pixelValues.View(ctx, pixelValues.Stride(0), 1, pixelValues.Stride(1), pixelValues.Dim(1), pixelValues.Stride(2), pixelValues.Dim(2), pixelValues.Stride(3), pixelValues.Dim(3)).Contiguous(ctx)
	in1 = in1.Reshape(ctx, opts.patchSize, opts.patchSize, opts.numChannels, numPatches)

	s0, s1 := opts.patchSize, opts.patchSize // stride: one full patch per step
	p0, p1 := 0, 0                           // padding
	d0, d1 := 1, 1                           // dilation
	out0 := pe.PatchConv0.Forward(ctx, in0, s0, s1, p0, p1, d0, d1)
	out1 := pe.PatchConv1.Forward(ctx, in1, s0, s1, p0, p1, d0, d1)

	// Add the outputs from the two temporal convolutions
	out := out0.Add(ctx, out1)

	// Reshape the output tensor to match the expected dimensions
	return out.Reshape(ctx, opts.hiddenSize, numPatches)
}
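
// Worked sizes (a sketch assuming the defaults patchSize = 14,
// temporalPatchSize = 2, numChannels = 3, hiddenSize = 1280):
//
//	input row length  14*14 * 2 * 3 = 1176 values per patch
//	in0 / in1         (14, 14, 3, numPatches), one temporal slice each
//	conv, stride 14   collapses each 14x14 patch to a single position
//	output            (1280, numPatches)
//
// Splitting the temporal pair across two convolutions and summing is
// equivalent to one convolution over both frames, since convolution is
// linear in its input.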

// VisionPatchMerger implements patch merging for the Qwen vision model
type VisionPatchMerger struct {
	LNQ  *nn.RMSNorm `gguf:"ln_q"`
	MLP0 *nn.Linear  `gguf:"mlp.0"`
	MLP2 *nn.Linear  `gguf:"mlp.2"`
}

// Forward computes patch merging for the vision model
func (pm *VisionPatchMerger) Forward(ctx ml.Context, visionOutputs ml.Tensor, opts *VisionModelOptions) ml.Tensor {
	normalized := pm.LNQ.Forward(ctx, visionOutputs, opts.eps)

	hiddenSize := visionOutputs.Dim(0) * (opts.spatialMergeSize * opts.spatialMergeSize)

	// Reshape so each spatialMergeSize x spatialMergeSize group of patch
	// vectors is concatenated into a single row
	reshaped := normalized.Reshape(ctx, hiddenSize, normalized.Dim(1)/(opts.spatialMergeSize*opts.spatialMergeSize))
	hidden := pm.MLP0.Forward(ctx, reshaped)
	activated := hidden.GELU(ctx)

	return pm.MLP2.Forward(ctx, activated)
}
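
// Worked sizes (a sketch assuming spatialMergeSize = 2 and the default
// hiddenSize of 1280): each 2x2 group of neighbouring patch vectors is
// concatenated into one 5120-dim row, so (1280, N) becomes (5120, N/4)
// before the two-layer MLP projects it toward the language model's
// embedding width, per the upstream Qwen2.5-VL design.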

// VisionModel implements the Qwen vision model
type VisionModel struct {
	PatchEmbedding *PatchEmbedding
	Layers         []VisionEncoderLayer `gguf:"blk"`
	PatchMerger    *VisionPatchMerger   `gguf:"merger"`

	*VisionModelOptions
}

// Forward computes the vision model for an input tensor
func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor, grid *Grid) ml.Tensor {
	// Extract patch embeddings
	hiddenStates := m.PatchEmbedding.Forward(ctx, pixelValues, m.VisionModelOptions)

	index, bounds := m.windowIndex(grid)
	spatialMergeUnit := m.spatialMergeSize * m.spatialMergeSize

	// Gather patches into window-contiguous order, one merged group at a time
	windowIndex := ctx.Input().FromInts(index, len(index))
	hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0)*spatialMergeUnit, hiddenStates.Dim(1)/spatialMergeUnit)
	hiddenStates = hiddenStates.Rows(ctx, windowIndex.Argsort(ctx))
	hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0)/spatialMergeUnit, hiddenStates.Dim(1)*spatialMergeUnit)

	// Build (row, column) position streams for the rotary embeddings,
	// permuted to match the window ordering above
	positions := ctx.Input().FromInts(func() []int32 {
		s := [][]int32{
			make([]int32, grid.Height*grid.Width),
			make([]int32, grid.Height*grid.Width),
			make([]int32, grid.Height*grid.Width),
			make([]int32, grid.Height*grid.Width),
		}

		var cur int
		for y := 0; y < grid.Height; y += m.spatialMergeSize {
			for x := 0; x < grid.Width; x += m.spatialMergeSize {
				// NOTE: the 2x2 inner loops assume spatialMergeSize == 2
				for dy := range 2 {
					for dx := range 2 {
						i := int(index[cur/spatialMergeUnit]) * spatialMergeUnit
						i += cur % spatialMergeUnit
						s[0][i] = int32(y + dy)
						s[1][i] = int32(x + dx)
						s[2][i] = int32(y + dy)
						s[3][i] = int32(x + dx)
						cur++
					}
				}
			}
		}

		return slices.Concat(s...)
	}(), grid.Height*grid.Width*4)

	mask := blockDiagonalMask(ctx, hiddenStates.Dim(1), bounds)

	// Apply encoder layers
	for i, layer := range m.Layers {
		if slices.Contains(m.fullAttnBlocks, int32(i)) {
			hiddenStates = layer.Forward(ctx, hiddenStates, positions, nil, m.VisionModelOptions)
		} else {
			hiddenStates = layer.Forward(ctx, hiddenStates, positions, mask, m.VisionModelOptions)
		}
	}

	hiddenStates = m.PatchMerger.Forward(ctx, hiddenStates, m.VisionModelOptions)

	// Scatter the merged patches back to their original spatial order
	return hiddenStates.Rows(ctx, windowIndex)
}
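
// How the window bookkeeping above fits together (a sketch of the data
// flow): windowIndex ranks every merged patch group in window-contiguous
// order, so Rows with Argsort(windowIndex) gathers patches window by
// window; blockDiagonalMask then confines attention to one window per
// block for every layer except those listed in fullAttnBlocks; and the
// final Rows(ctx, windowIndex) applies the inverse permutation, restoring
// the original spatial order of the merged patches.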

// windowIndex divides the grid into windows and returns:
//  1. A slice of grid point indices organized by windows
//  2. A slice of boundaries that mark where each window's data begins and ends
//     in the flattened representation, scaled by spatialMergeSize squared
//
// The boundaries slice always starts with 0 and contains cumulative ending
// positions for each window, allowing downstream processing to identify
// window boundaries in the tensor data.
func (m *VisionModel) windowIndex(grid *Grid) (index []int32, bounds []int) {
	height := grid.Height / m.spatialMergeSize
	width := grid.Width / m.spatialMergeSize
	window := m.windowSize / m.patchSize / m.spatialMergeSize

	index = make([]int32, height*width)

	bounds = make([]int, 0, ((height+window-1)/window)*((width+window-1)/window)+1)
	bounds = append(bounds, 0)

	var cur int32
	for y := 0; y < height; y += window {
		for x := 0; x < width; x += window {
			h1 := min(window, height-y)
			w1 := min(window, width-x)
			for dy := range h1 {
				for dx := range w1 {
					win := (y+dy)*width + (x + dx)
					index[win] = cur
					cur++
				}
			}
			// cur counts merged grid points; scale by spatialMergeSize
			// squared so each boundary is expressed in patch units
			bounds = append(bounds, int(cur)*m.spatialMergeSize*m.spatialMergeSize)
		}
	}
	return index, bounds
}
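
// A worked example (illustrative; assumes the defaults windowSize = 112,
// patchSize = 14, spatialMergeSize = 2, so window = 4): for a 6x8 merged
// grid, the windows tile as 4x4, 4x4, then 2x4, 2x4 along the bottom edge.
// index maps each merged grid point to its rank within this window
// ordering, and bounds holds the cumulative patch counts
// [0, 64, 128, 160, 192] that blockDiagonalMask turns into diagonal blocks.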

// newVisionModel creates a new instance of the Qwen vision model
func newVisionModel(c fs.Config) *VisionModel {
	patchSize := int(c.Uint("vision.patch_size", 14))
	hiddenSize := int(c.Uint("vision.embedding_length", 1280))
	numHeads := int(c.Uint("vision.attention.head_count", 16))
	numChannels := int(c.Uint("vision.num_channels", 3))
	eps := c.Float("vision.attention.layer_norm_epsilon", 1e-6)
	ropeTheta := c.Float("vision.rope.freq_base", 10000.0)
	spatialMergeSize := int(c.Uint("vision.spatial_merge_size", 2))
	windowSize := int(c.Uint("vision.window_size", 112))
	fullAttnBlocks := c.Ints("qwen25vl.vision.fullatt_block_indexes", []int32{7, 15, 23, 31})
	temporalPatchSize := int(c.Uint("vision.temporal_patch_size", 2))

	model := &VisionModel{
		Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count", 32)),
		VisionModelOptions: &VisionModelOptions{
			hiddenSize:        hiddenSize,
			numHeads:          numHeads,
			headDim:           hiddenSize / numHeads,
			patchSize:         patchSize,
			numChannels:       numChannels,
			eps:               eps,
			ropeTheta:         ropeTheta,
			spatialMergeSize:  spatialMergeSize,
			windowSize:        windowSize,
			temporalPatchSize: temporalPatchSize,
			fullAttnBlocks:    fullAttnBlocks,
		},
	}

	return model
}
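
// A quick instantiation sketch (assumes only what is in this file: a GGUF
// config c supplied by the surrounding model plumbing):
//
//	m := newVisionModel(c)
//	// With the defaults above: headDim = 1280/16 = 80, 32 encoder layers,
//	// and blocks 7, 15, 23, 31 run full (unmasked) attention.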