use ollama engine for bert models (#13501)

register the BPE ("gpt2") tokenizer for bert models, which enables granite-embedding
2025-12-16 11:29:19 -08:00 · 2025-12-16 11:29:19 -08:00 · 903b1fc97f
parent 89eb795293
commit 903b1fc97f
2 changed files with 33 additions and 30 deletions
--- a/fs/ggml/ggml.go
+++ b/fs/ggml/ggml.go
@@ -241,19 +241,20 @@ func (kv KV) Bools(key string, defaultValue ...[]bool) []bool {

 func (kv KV) OllamaEngineRequired() bool {
 	return slices.Contains([]string{
+		"bert",
+		"deepseek2",
+		"deepseekocr",
 		"gemma3",
 		"gemma3n",
 		"gptoss", "gpt-oss",
 		"llama4",
 		"mistral3",
 		"mllama",
+		"nomic-bert",
+		"olmo3",
 		"qwen25vl",
 		"qwen3", "qwen3moe",
 		"qwen3vl", "qwen3vlmoe",
-		"deepseekocr",
-		"deepseek2",
-		"nomic-bert",
-		"olmo3",
 	}, kv.Architecture())
 }

@@ -839,6 +840,7 @@ func (f GGML) SupportsFlashAttention() bool {
 // FlashAttention checks if the model should enable flash attention
 func (f GGML) FlashAttention() bool {
 	return slices.Contains([]string{
+		"bert",
 		"gemma3",
 		"gptoss", "gpt-oss",
 		"mistral3",
--- a/model/models/bert/embed.go
+++ b/model/models/bert/embed.go
@@ -129,11 +129,7 @@ func (o Options) headDim() int {
 }

 func New(c fs.Config) (model.Model, error) {
-	var processor model.TextProcessor
-	switch c.String("tokenizer.ggml.model", "bert") {
-	case "bert":
-		processor = model.NewWordPiece(
-			&model.Vocabulary{
+	vocab := &model.Vocabulary{
 		Values: c.Strings("tokenizer.ggml.tokens"),
 		Scores: c.Floats("tokenizer.ggml.scores"),
 		Types:  c.Ints("tokenizer.ggml.token_type"),
@@ -155,9 +151,14 @@ func New(c fs.Config) (model.Model, error) {
 				c.Uint("tokenizer.ggml.eos_token_id"),
 			)),
 		},
-			},
-			true,
-		)
+	}
+
+	var processor model.TextProcessor
+	switch c.String("tokenizer.ggml.model", "bert") {
+	case "bert":
+		processor = model.NewWordPiece(vocab, true)
+	case "gpt2":
+		processor = model.NewBytePairEncoding(vocab)
 	default:
 		return nil, model.ErrUnsupportedTokenizer
 	}