mirror of https://github.com/ollama/ollama
Add optional setting for preventing unloading current models for new requests
- Added an optional setting (OLLAMA_NO_MODEL_EVICT) that prevents unloading currently loaded models to serve new requests
- Added tests for this feature
This commit is contained in:
parent 31b8c6a214
commit 251512721f

@@ -1873,6 +1873,7 @@ func NewCLI() *cobra.Command {
 				envVars["OLLAMA_CONTEXT_LENGTH"],
 				envVars["OLLAMA_KEEP_ALIVE"],
 				envVars["OLLAMA_MAX_LOADED_MODELS"],
+				envVars["OLLAMA_NO_MODEL_EVICT"],
 				envVars["OLLAMA_MAX_QUEUE"],
 				envVars["OLLAMA_MODELS"],
 				envVars["OLLAMA_NUM_PARALLEL"],

@@ -312,6 +312,7 @@ Parallel request processing for a given model results in increasing the context
 The following server settings may be used to adjust how Ollama handles concurrent requests on most platforms:
 
 - `OLLAMA_MAX_LOADED_MODELS` - The maximum number of models that can be loaded concurrently provided they fit in available memory. The default is 3 \* the number of GPUs or 3 for CPU inference.
+- `OLLAMA_NO_MODEL_EVICT` - If set to `1`, Ollama will not unload already loaded models to make room for a new request. Requests that would require unloading another model will return an error instead.
 - `OLLAMA_NUM_PARALLEL` - The maximum number of parallel requests each model will process at the same time. The default will auto-select either 4 or 1 based on available memory.
 - `OLLAMA_MAX_QUEUE` - The maximum number of requests Ollama will queue when busy before rejecting additional requests. The default is 512
 
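As an aside (not part of this diff): a minimal client-side sketch of how a caller might react when the server was started with `OLLAMA_NO_MODEL_EVICT=1 ollama serve` and all model slots are occupied. It uses the existing github.com/ollama/ollama/api client; the model name is a placeholder, and matching on the error text assumes the scheduler's "unloading existing models is disabled" message is propagated to the API response.

package main

import (
	"context"
	"fmt"
	"log"
	"strings"

	"github.com/ollama/ollama/api"
)

func main() {
	// Assumes a server running with OLLAMA_NO_MODEL_EVICT=1 and another
	// model already occupying the only available slot.
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	req := &api.GenerateRequest{Model: "llama3.2", Prompt: "hello"} // placeholder model name
	err = client.Generate(context.Background(), req, func(r api.GenerateResponse) error {
		fmt.Print(r.Response)
		return nil
	})
	if err != nil && strings.Contains(err.Error(), "unloading existing models is disabled") {
		// The server refused to evict a loaded model; retry later or free a
		// slot instead of waiting for this request to succeed.
		log.Println("eviction disabled on server:", err)
	}
}
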
@@ -198,6 +198,8 @@ var (
 	SchedSpread = Bool("OLLAMA_SCHED_SPREAD")
 	// MultiUserCache optimizes prompt caching for multi-user scenarios
 	MultiUserCache = Bool("OLLAMA_MULTIUSER_CACHE")
+	// NoEvict prevents unloading currently loaded models to make room for a new model.
+	NoEvict = Bool("OLLAMA_NO_MODEL_EVICT")
 	// Enable the new Ollama engine
 	NewEngine = Bool("OLLAMA_NEW_ENGINE")
 	// ContextLength sets the default context length

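For context on the pattern above: `Bool` returns a `func() bool`, so `NoEvict` reads the environment when it is called (as the scheduler does with `preventEvictions := envconfig.NoEvict()`), not when the package is initialized. The following is an illustrative standalone sketch of that lazy-accessor idea, not the package's actual implementation; the real helper's parsing rules may differ.

package main

import (
	"fmt"
	"os"
	"strconv"
)

// boolVar mimics the accessor pattern: the environment variable is consulted
// each time the returned function is invoked.
func boolVar(key string) func() bool {
	return func() bool {
		v, err := strconv.ParseBool(os.Getenv(key))
		if err != nil {
			return false // unset or unparsable values count as "off" in this sketch
		}
		return v
	}
}

var noEvict = boolVar("OLLAMA_NO_MODEL_EVICT")

func main() {
	os.Setenv("OLLAMA_NO_MODEL_EVICT", "1")
	fmt.Println(noEvict()) // true: the value is read at call time
}
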
@@ -285,6 +287,7 @@ func AsMap() map[string]EnvVar {
 		"OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueue(), "Maximum number of queued requests"},
 		"OLLAMA_MODELS": {"OLLAMA_MODELS", Models(), "The path to the models directory"},
 		"OLLAMA_NOHISTORY": {"OLLAMA_NOHISTORY", NoHistory(), "Do not preserve readline history"},
+		"OLLAMA_NO_MODEL_EVICT": {"OLLAMA_NO_MODEL_EVICT", NoEvict(), "Prevent unloading loaded models to make room for new models"},
 		"OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune(), "Do not prune model blobs on startup"},
 		"OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel(), "Maximum number of parallel requests"},
 		"OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", AllowedOrigins(), "A comma separated list of allowed origins"},

@@ -62,6 +62,7 @@ type Scheduler struct {
 var defaultModelsPerGPU = 3
 
 var ErrMaxQueue = errors.New("server busy, please try again. maximum pending requests exceeded")
+var ErrEvictionDisabled = errors.New("unloading existing models is disabled (set OLLAMA_NO_MODEL_EVICT=0 to allow evictions)")
 
 func InitScheduler(ctx context.Context) *Scheduler {
 	maxQueue := envconfig.MaxQueue()

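Because the scheduler wraps this sentinel with `%w` in the processPending hunks below, code inside the server package can match it with `errors.Is` instead of comparing error strings. A small illustrative sketch, with a locally defined sentinel and a placeholder model path standing in for the real values:

package main

import (
	"errors"
	"fmt"
)

var errEvictionDisabled = errors.New("unloading existing models is disabled (set OLLAMA_NO_MODEL_EVICT=0 to allow evictions)")

func main() {
	// %w keeps the sentinel in the error chain, so errors.Is still matches
	// after per-request context (the model path) has been added.
	err := fmt.Errorf("%w: maximum loaded models reached while loading %s", errEvictionDisabled, "/models/example.gguf")

	if errors.Is(err, errEvictionDisabled) {
		fmt.Println("eviction is disabled:", err)
	}
}
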
@@ -129,6 +130,7 @@ func (s *Scheduler) Run(ctx context.Context) {
 
 func (s *Scheduler) processPending(ctx context.Context) {
 	maxRunners := envconfig.MaxRunners()
+	preventEvictions := envconfig.NoEvict()
 
 	for {
 		select {

@@ -167,6 +169,13 @@ func (s *Scheduler) processPending(ctx context.Context) {
 						break
 					}
 				} else if maxRunners > 0 && loadedCount >= int(maxRunners) {
+					if preventEvictions {
+						err := fmt.Errorf("%w: maximum loaded models reached while loading %s", ErrEvictionDisabled, pending.model.ModelPath)
+						slog.Info("skipping eviction because OLLAMA_NO_MODEL_EVICT is set", "runner_count", loadedCount, "model", pending.model.ModelPath)
+						pending.errCh <- err
+						s.abortActiveLoading()
+						break
+					}
 					slog.Debug("max runners achieved, unloading one to make room", "runner_count", loadedCount)
 					runnerToExpire = s.findRunnerToUnload()
 				} else {

@@ -222,6 +231,14 @@ func (s *Scheduler) processPending(ctx context.Context) {
 						break
 					}
 
+					if preventEvictions {
+						err := fmt.Errorf("%w: unable to load %s without unloading a loaded model", ErrEvictionDisabled, pending.model.ModelPath)
+						slog.Info("skipping eviction because OLLAMA_NO_MODEL_EVICT is set", "model", pending.model.ModelPath)
+						pending.errCh <- err
+						s.abortActiveLoading()
+						break
+					}
+
 					runnerToExpire = s.findRunnerToUnload()
 				}
 

@@ -847,6 +864,16 @@ func (s *Scheduler) unloadAllRunners() {
 	}
 }
 
+func (s *Scheduler) abortActiveLoading() {
+	s.loadedMu.Lock()
+	defer s.loadedMu.Unlock()
+
+	if s.activeLoading != nil {
+		s.activeLoading.Close()
+		s.activeLoading = nil
+	}
+}
+
 func (s *Scheduler) expireRunner(model *Model) {
 	s.loadedMu.Lock()
 	runner, ok := s.loaded[model.ModelPath]

@@ -634,6 +634,67 @@ func TestSchedFindRunnerToUnload(t *testing.T) {
 	require.Equal(t, r1, resp)
 }
 
+func TestSchedNoEvictPreventsUnload(t *testing.T) {
+	t.Setenv("OLLAMA_MAX_LOADED_MODELS", "1")
+	t.Setenv("OLLAMA_NO_MODEL_EVICT", "1")
+
+	ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond)
+	defer done()
+
+	first := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: time.Second}, nil)
+	second := newScenarioRequest(t, ctx, "ollama-model-2", 10, &api.Duration{Duration: time.Second}, nil)
+
+	s := InitScheduler(ctx)
+	s.waitForRecovery = 10 * time.Millisecond
+	s.getGpuFn = getGpuFn
+	s.getSystemInfoFn = getSystemInfoFn
+	servers := map[string]*mockLlm{
+		first.req.model.ModelPath:  first.srv,
+		second.req.model.ModelPath: second.srv,
+	}
+	s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+		if srv, ok := servers[model]; ok {
+			srv.modelPath = model
+			return srv, nil
+		}
+		return nil, errors.New("unexpected model")
+	}
+
+	s.Run(ctx)
+
+	firstSuccess, firstErr := s.GetRunner(first.ctx, first.req.model, first.req.opts, first.req.sessionDuration)
+	select {
+	case resp := <-firstSuccess:
+		require.Equal(t, first.srv, resp.llama)
+		require.Empty(t, firstErr)
+	case err := <-firstErr:
+		t.Fatalf("unexpected error: %s", err)
+	case <-ctx.Done():
+		t.Fatal("timeout waiting for first runner")
+	}
+
+	secondSuccess, secondErr := s.GetRunner(second.ctx, second.req.model, second.req.opts, second.req.sessionDuration)
+	select {
+	case <-secondSuccess:
+		t.Fatal("expected eviction to be blocked")
+	case err := <-secondErr:
+		require.ErrorContains(t, err, "OLLAMA_NO_MODEL_EVICT")
+	case <-ctx.Done():
+		t.Fatal("timeout waiting for eviction error")
+	}
+
+	s.loadedMu.Lock()
+	require.Len(t, s.loaded, 1)
+	_, ok := s.loaded[first.req.model.ModelPath]
+	s.loadedMu.Unlock()
+	require.True(t, ok)
+	require.False(t, first.srv.closeCalled)
+	require.False(t, second.srv.closeCalled)
+
+	first.ctxDone()
+	time.Sleep(10 * time.Millisecond)
+}
+
 func TestSchedNeedsReload(t *testing.T) {
 	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
 	defer done()