mirror of https://github.com/ollama/ollama
llm: Support KV cache quantization with gpt-oss
With the new version of GGML in #12245, KV cache quantization no longer causes a fallback to CPU.
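Usage note (the environment variables here come from Ollama's documentation, not from this diff): with this change, a quantized KV cache can be requested for gpt-oss the same way as for other models, by setting OLLAMA_KV_CACHE_TYPE=q8_0 or OLLAMA_KV_CACHE_TYPE=q4_0 before starting the server; KV cache quantization also requires flash attention to be enabled via OLLAMA_FLASH_ATTENTION=1.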
This commit is contained in:
parent 33801c1597
commit 19e6796eac
@@ -870,11 +870,6 @@ func (f GGML) SupportsKVCacheType(cacheType string) bool {
 		return true
 	}
 
-	if arch := f.KV().Architecture(); slices.Contains([]string{"gptoss", "gpt-oss"}, arch) {
-		// gpt-oss uses attention with sinks which does not support quantized cache types
-		slog.Warn("model only supports non-quantized cache types", "model", arch)
-		return false
-	}
 
 	return slices.Contains([]string{"q8_0", "q4_0"}, cacheType)
 }
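For reference, a sketch of the function after this commit. Only the context lines are taken from the hunk above; the f16 guard at the top is an assumption, since the diff elides the start of the function, and slices refers to the standard library package.

func (f GGML) SupportsKVCacheType(cacheType string) bool {
	// Assumed guard for the default, non-quantized cache type;
	// not shown in the hunk above.
	if cacheType == "" || cacheType == "f16" {
		return true
	}

	// With the gpt-oss special case removed, the generic check applies
	// to every architecture, including gptoss / gpt-oss.
	return slices.Contains([]string{"q8_0", "q4_0"}, cacheType)
}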