ollama/model/models/deepseekocr/imageprocessor.go

package deepseekocr

import (
	"bytes"
	"image"
	"image/color"
	"math"
	"slices"

	"golang.org/x/image/draw"

	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/model/imageproc"
)

type ratio struct {
	x, y int
}

func ProcessImage(ctx ml.Context, bts []byte) (ml.Tensor, ml.Tensor, []int, error) {
	img, _, err := image.Decode(bytes.NewReader(bts))
	if err != nil {
		return nil, nil, nil, err
	}

	minNum, maxNum, imageSize, baseSize := 2, 9, 640, 1024
	var targetRatios []ratio
	for n := minNum; n <= maxNum; n++ {
		for i := 1; i <= n; i++ {
			for j := 1; j <= n; j++ {
				if i*j <= maxNum && i*j >= minNum && !slices.Contains(targetRatios, ratio{i, j}) {
					targetRatios = append(targetRatios, ratio{i, j})
				}
			}
		}
	}

	targetRatio := findBestAspectRatio(targetRatios, img.Bounds().Dx(), img.Bounds().Dy(), imageSize)
	targetWidth, targetHeight := imageSize*targetRatio.x, imageSize*targetRatio.y
	blocks := targetRatio.x * targetRatio.y

	mean := imageproc.ImageNetStandardMean
	std := imageproc.ImageNetStandardSTD

	var patches []float32
	resized := imageproc.Resize(img, image.Point{X: targetWidth, Y: targetHeight}, imageproc.ResizeBilinear)
	for i := range blocks {
		patch := image.NewRGBA(image.Rect(0, 0, imageSize, imageSize))
		draw.Draw(patch, patch.Bounds(), resized, image.Point{
			X: i % (targetWidth / imageSize) * imageSize,
			Y: i / (targetWidth / imageSize) * imageSize,
		}, draw.Over)

		patches = append(patches, imageproc.Normalize(patch, mean, std, true, true)...)
	}

	img = imageproc.CompositeColor(img, color.Gray{})
	img = imageproc.Pad(img, image.Point{X: baseSize, Y: baseSize}, color.Gray{127}, draw.BiLinear)

	return ctx.Input().FromFloats(patches, imageSize, imageSize, 3, blocks),
		ctx.Input().FromFloats(imageproc.Normalize(img, mean, std, true, true), baseSize, baseSize, 3),
		[]int{targetRatio.x, targetRatio.y},
		nil
}

func findBestAspectRatio(targetRatios []ratio, width, height, imageSize int) ratio {
	bestDiff := math.MaxFloat64
	best := ratio{1, 1}
	realRatio := float64(width) / float64(height)
	for _, target := range targetRatios {
		targetRatio := float64(target.x) / float64(target.y)
		diff := math.Abs(realRatio - targetRatio)
		if diff < bestDiff {
			bestDiff = diff
			best = target
		} else if diff == bestDiff {
			if float64(width*height) > 0.5*float64(imageSize*imageSize*best.x*best.y) {
				best = target
			}
		}
	}
	return best
}