Add optional setting to prevent unloading currently loaded models for new requests

- Added an optional setting (OLLAMA_NO_MODEL_EVICT) to prevent unloading currently loaded models to make room for new requests
- Added tests for this feature
YetheSamartaka 2025-12-05 10:33:25 +01:00
parent 31b8c6a214
commit 251512721f
5 changed files with 93 additions and 0 deletions

View File

@@ -1873,6 +1873,7 @@ func NewCLI() *cobra.Command {
envVars["OLLAMA_CONTEXT_LENGTH"],
envVars["OLLAMA_KEEP_ALIVE"],
envVars["OLLAMA_MAX_LOADED_MODELS"],
envVars["OLLAMA_NO_MODEL_EVICT"],
envVars["OLLAMA_MAX_QUEUE"],
envVars["OLLAMA_MODELS"],
envVars["OLLAMA_NUM_PARALLEL"],

View File

@@ -312,6 +312,7 @@ Parallel request processing for a given model results in increasing the context
The following server settings may be used to adjust how Ollama handles concurrent requests on most platforms:
- `OLLAMA_MAX_LOADED_MODELS` - The maximum number of models that can be loaded concurrently provided they fit in available memory. The default is 3 \* the number of GPUs or 3 for CPU inference.
- `OLLAMA_NO_MODEL_EVICT` - If set to `1`, Ollama will not unload already loaded models to make room for a new request. Requests that would require unloading another model will return an error instead.
- `OLLAMA_NUM_PARALLEL` - The maximum number of parallel requests each model will process at the same time. The default will auto-select either 4 or 1 based on available memory.
- `OLLAMA_MAX_QUEUE` - The maximum number of requests Ollama will queue when busy before rejecting additional requests. The default is 512
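For API callers, a request blocked by `OLLAMA_NO_MODEL_EVICT` surfaces as an ordinary error whose text mentions the variable (see `ErrEvictionDisabled` in the scheduler diff below). The following is a minimal, hypothetical sketch using the Go client from `github.com/ollama/ollama/api`; it assumes the server was started with `OLLAMA_NO_MODEL_EVICT=1` and that the scheduler's error string is passed through to the client unchanged:

```go
package main

import (
	"context"
	"fmt"
	"log"
	"strings"

	"github.com/ollama/ollama/api"
)

func main() {
	// Assumes OLLAMA_HOST points at a server running with OLLAMA_NO_MODEL_EVICT=1.
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	req := &api.GenerateRequest{
		Model:  "llama3.2", // hypothetical model name
		Prompt: "hello",
	}

	err = client.Generate(context.Background(), req, func(r api.GenerateResponse) error {
		fmt.Print(r.Response)
		return nil
	})
	if err != nil && strings.Contains(err.Error(), "OLLAMA_NO_MODEL_EVICT") {
		// The server refused to evict an already loaded model; retry later or
		// free a slot explicitly instead of waiting for an eviction.
		log.Printf("model slot unavailable: %v", err)
	} else if err != nil {
		log.Fatal(err)
	}
}
```

With this setting enabled, callers are expected to retry once an existing model's keep-alive expires, rather than relying on the scheduler to evict it for them.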

View File

@@ -198,6 +198,8 @@ var (
SchedSpread = Bool("OLLAMA_SCHED_SPREAD")
// MultiUserCache optimizes prompt caching for multi-user scenarios
MultiUserCache = Bool("OLLAMA_MULTIUSER_CACHE")
// NoEvict prevents unloading currently loaded models to make room for a new model.
NoEvict = Bool("OLLAMA_NO_MODEL_EVICT")
// Enable the new Ollama engine
NewEngine = Bool("OLLAMA_NEW_ENGINE")
// ContextLength sets the default context length
@@ -285,6 +287,7 @@ func AsMap() map[string]EnvVar {
"OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueue(), "Maximum number of queued requests"},
"OLLAMA_MODELS": {"OLLAMA_MODELS", Models(), "The path to the models directory"},
"OLLAMA_NOHISTORY": {"OLLAMA_NOHISTORY", NoHistory(), "Do not preserve readline history"},
"OLLAMA_NO_MODEL_EVICT": {"OLLAMA_NO_MODEL_EVICT", NoEvict(), "Prevent unloading loaded models to make room for new models"},
"OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune(), "Do not prune model blobs on startup"},
"OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel(), "Maximum number of parallel requests"},
"OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", AllowedOrigins(), "A comma separated list of allowed origins"},

View File

@@ -62,6 +62,7 @@ type Scheduler struct {
var defaultModelsPerGPU = 3
var ErrMaxQueue = errors.New("server busy, please try again. maximum pending requests exceeded")
var ErrEvictionDisabled = errors.New("unloading existing models is disabled (set OLLAMA_NO_MODEL_EVICT=0 to allow evictions)")
func InitScheduler(ctx context.Context) *Scheduler {
maxQueue := envconfig.MaxQueue()
@@ -129,6 +130,7 @@ func (s *Scheduler) Run(ctx context.Context) {
func (s *Scheduler) processPending(ctx context.Context) {
maxRunners := envconfig.MaxRunners()
preventEvictions := envconfig.NoEvict()
for {
select {
@@ -167,6 +169,13 @@ func (s *Scheduler) processPending(ctx context.Context) {
break
}
} else if maxRunners > 0 && loadedCount >= int(maxRunners) {
if preventEvictions {
err := fmt.Errorf("%w: maximum loaded models reached while loading %s", ErrEvictionDisabled, pending.model.ModelPath)
slog.Info("skipping eviction because OLLAMA_NO_MODEL_EVICT is set", "runner_count", loadedCount, "model", pending.model.ModelPath)
pending.errCh <- err
s.abortActiveLoading()
break
}
slog.Debug("max runners achieved, unloading one to make room", "runner_count", loadedCount)
runnerToExpire = s.findRunnerToUnload()
} else {
@@ -222,6 +231,14 @@ func (s *Scheduler) processPending(ctx context.Context) {
break
}
if preventEvictions {
err := fmt.Errorf("%w: unable to load %s without unloading a loaded model", ErrEvictionDisabled, pending.model.ModelPath)
slog.Info("skipping eviction because OLLAMA_NO_MODEL_EVICT is set", "model", pending.model.ModelPath)
pending.errCh <- err
s.abortActiveLoading()
break
}
runnerToExpire = s.findRunnerToUnload()
}
@@ -847,6 +864,16 @@ func (s *Scheduler) unloadAllRunners() {
}
}
func (s *Scheduler) abortActiveLoading() {
s.loadedMu.Lock()
defer s.loadedMu.Unlock()
if s.activeLoading != nil {
s.activeLoading.Close()
s.activeLoading = nil
}
}
func (s *Scheduler) expireRunner(model *Model) {
s.loadedMu.Lock()
runner, ok := s.loaded[model.ModelPath]
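Taken together, the two guards added to `processPending` apply the same rule at both decision points: if the pending model cannot be placed without evicting a loaded runner and `OLLAMA_NO_MODEL_EVICT` is set, the request fails and any in-flight load is aborted instead of calling `findRunnerToUnload`. Below is a standalone, hypothetical distillation of that control flow, with the scheduler's fields and helpers simplified into function parameters:

```go
package schedsketch

import (
	"errors"
	"fmt"
)

// errEvictionDisabled mirrors the scheduler's ErrEvictionDisabled sentinel.
var errEvictionDisabled = errors.New("unloading existing models is disabled")

// chooseRunnerToExpire is a simplified, hypothetical stand-in for the logic
// the diff adds to processPending; it is only reached once the pending model
// cannot fit alongside the currently loaded runners.
func chooseRunnerToExpire(modelPath string, preventEvictions bool,
	findRunnerToUnload func() string, fail func(error), abortActiveLoading func()) (string, bool) {
	if preventEvictions {
		// Reject the request instead of evicting; the real code also sends the
		// error on pending.errCh and breaks out of the scheduling loop.
		fail(fmt.Errorf("%w: unable to load %s without unloading a loaded model",
			errEvictionDisabled, modelPath))
		abortActiveLoading()
		return "", false
	}
	// Default behaviour: pick an existing runner to unload and make room.
	return findRunnerToUnload(), true
}
```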

View File

@@ -634,6 +634,67 @@ func TestSchedFindRunnerToUnload(t *testing.T) {
require.Equal(t, r1, resp)
}
func TestSchedNoEvictPreventsUnload(t *testing.T) {
t.Setenv("OLLAMA_MAX_LOADED_MODELS", "1")
t.Setenv("OLLAMA_NO_MODEL_EVICT", "1")
ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond)
defer done()
first := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: time.Second}, nil)
second := newScenarioRequest(t, ctx, "ollama-model-2", 10, &api.Duration{Duration: time.Second}, nil)
s := InitScheduler(ctx)
s.waitForRecovery = 10 * time.Millisecond
s.getGpuFn = getGpuFn
s.getSystemInfoFn = getSystemInfoFn
servers := map[string]*mockLlm{
first.req.model.ModelPath: first.srv,
second.req.model.ModelPath: second.srv,
}
s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
if srv, ok := servers[model]; ok {
srv.modelPath = model
return srv, nil
}
return nil, errors.New("unexpected model")
}
s.Run(ctx)
firstSuccess, firstErr := s.GetRunner(first.ctx, first.req.model, first.req.opts, first.req.sessionDuration)
select {
case resp := <-firstSuccess:
require.Equal(t, first.srv, resp.llama)
require.Empty(t, firstErr)
case err := <-firstErr:
t.Fatalf("unexpected error: %s", err)
case <-ctx.Done():
t.Fatal("timeout waiting for first runner")
}
secondSuccess, secondErr := s.GetRunner(second.ctx, second.req.model, second.req.opts, second.req.sessionDuration)
select {
case <-secondSuccess:
t.Fatal("expected eviction to be blocked")
case err := <-secondErr:
require.ErrorContains(t, err, "OLLAMA_NO_MODEL_EVICT")
case <-ctx.Done():
t.Fatal("timeout waiting for eviction error")
}
s.loadedMu.Lock()
require.Len(t, s.loaded, 1)
_, ok := s.loaded[first.req.model.ModelPath]
s.loadedMu.Unlock()
require.True(t, ok)
require.False(t, first.srv.closeCalled)
require.False(t, second.srv.closeCalled)
first.ctxDone()
time.Sleep(10 * time.Millisecond)
}
func TestSchedNeedsReload(t *testing.T) {
ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
defer done()