mirror of https://github.com/ollama/ollama
llamarunner: update metrics
this change updates how performance metrics are collected. until now, the initial prompt processing and subsequent generation durations were derived from three timestamps: sequence creation, first token generation, and generation completion. processing duration was computed as first token generation minus sequence creation, and generation duration as generation completion minus first token generation. while this approach is an accurate end-to-end measure of processing and generation, it is not comparable to other tools, which measure only the active, i.e. decode, duration. this change updates the metrics to capture only decode duration so they can be compared directly to other tools
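for illustration, a minimal sketch of the two strategies; this is not the runner code, and the sleeps are stand-ins for tokenization, queueing, and the decode call:

package main

import (
	"fmt"
	"time"
)

func main() {
	// old approach: wall-clock timestamps spanning the whole request,
	// which also count time spent outside decode (scheduling, queueing)
	start := time.Now() // sequence created
	time.Sleep(10 * time.Millisecond)
	firstToken := time.Now() // first token generated
	time.Sleep(30 * time.Millisecond)
	fmt.Println("end-to-end processing:", firstToken.Sub(start))
	fmt.Println("end-to-end generation:", time.Since(firstToken))

	// new approach: accumulate only the time spent inside decode calls,
	// mirroring the processingDuration/generationDuration fields below
	var processing, generation time.Duration
	numDecoded := 0
	for i := 0; i < 3; i++ {
		t := time.Now()
		time.Sleep(5 * time.Millisecond) // stands in for s.lc.Decode(batch)
		numDecoded++
		if numDecoded > 1 {
			generation += time.Since(t)
		} else {
			processing += time.Since(t)
		}
	}
	fmt.Println("decode-only processing:", processing)
	fmt.Println("decode-only generation:", generation)
}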
parent 15e3611d3d
commit bbbc73d637
@@ -82,10 +82,10 @@ type Sequence struct {
 	doneReason llm.DoneReason
 
 	// Metrics
-	startProcessingTime time.Time
-	startGenerationTime time.Time
+	processingDuration time.Duration
+	generationDuration time.Duration
 	numDecoded      int
 	numPromptInputs int
 }
 
 type NewSequenceParams struct {
@@ -99,8 +99,6 @@ type NewSequenceParams struct {
 func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSequenceParams) (*Sequence, error) {
 	s.ready.Wait()
 
-	startTime := time.Now()
-
 	inputs, err := s.inputs(prompt, images)
 	if err != nil {
 		return nil, fmt.Errorf("failed to process inputs: %w", err)
@@ -142,18 +140,17 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe
 	}
 
 	return &Sequence{
 		inputs:           inputs,
 		numPromptInputs:  len(inputs),
-		startProcessingTime: startTime,
 		numPredict:       params.numPredict,
 		pendingResponses: make([]string, 0),
 		responses:        make(chan string, 100),
 		quit:             make(chan bool, 1),
 		embedding:        make(chan []float32, 1),
 		samplingCtx:      sc,
 		embeddingOnly:    params.embedding,
 		stop:             params.stop,
 		numKeep:          params.numKeep,
 	}, nil
 }
 
@@ -438,8 +435,8 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 		return nil
 	}
 
-	err := s.lc.Decode(batch)
-	if err != nil {
+	t := time.Now()
+	if err := s.lc.Decode(batch); err != nil {
 		return fmt.Errorf("failed to decode batch: %w", err)
 	}
 
@@ -459,9 +456,12 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 			continue
 		}
 
-		seq.numDecoded += 1
-		if seq.numDecoded == 1 {
-			seq.startGenerationTime = time.Now()
+		s.lc.Synchronize()
+		seq.numDecoded++
+		if seq.numDecoded > 1 {
+			seq.generationDuration += time.Since(t)
+		} else {
+			seq.processingDuration += time.Since(t)
 		}
 
 		// if done processing the prompt, generate an embedding and return
@@ -646,9 +646,9 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 			Done:             true,
 			DoneReason:       seq.doneReason,
 			PromptEvalCount:  seq.numPromptInputs,
-			PromptEvalDuration: seq.startGenerationTime.Sub(seq.startProcessingTime),
+			PromptEvalDuration: seq.processingDuration,
 			EvalCount:        seq.numDecoded,
-			EvalDuration:     time.Since(seq.startGenerationTime),
+			EvalDuration:     seq.generationDuration,
 		}); err != nil {
 			http.Error(w, fmt.Sprintf("failed to encode final response: %v", err), http.StatusInternalServerError)
 		}
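with decode-only durations, throughput derived from the final response fields above is directly comparable across tools. a hypothetical consumer of these fields (the values here are made up for illustration) might compute it as:

package main

import (
	"fmt"
	"time"
)

func main() {
	// stand-ins for the EvalCount and EvalDuration response fields
	evalCount := 128                        // tokens generated
	evalDuration := 1600 * time.Millisecond // decode-only generation time
	fmt.Printf("%.1f tokens/s\n", float64(evalCount)/evalDuration.Seconds()) // 80.0 tokens/s
}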