mirror of
https://github.com/fnproject/fn.git
synced 2022-10-28 21:29:17 +03:00
fn: introducing lb placer basic metrics (#1058)
* fn: introducing lb placer basic metrics This change adds basic metrics to the naive and consistent hash LB placers. The stats show how many times we scanned the full runner list, whether the runner pool failed to return a runner list, or whether the runner pool returned an empty list. Placed and not-placed statuses are also tracked, along with whether TryExec returned an error. The most common error code, Too-Busy, is specifically tracked. If the client cancels or times out, this is also tracked as a client cancel metric. For placer latency, we would like to know how much time the placer spent searching for a runner until it successfully places a call. This includes round-trip times for NACK responses from the runners until a successful TryExec() call. By excluding the last successful TryExec() latency, we try to exclude function execution & runner container startup time from this metric in an attempt to isolate Placer-only latency. * fn: latency and attempt tracker Removing the full scan metric. Tracking the number of runners attempted is a better metric for this purpose. Also, if rp.Runners() fails, this is an unrecoverable error and we should bail out instead of retrying. * fn: typo fix, ch placer finalize err return * fn: enable LB placer metrics in WithAgentFromEnv if prometheus is enabled
This commit is contained in:
103
api/runnerpool/placer_stats.go
Normal file
103
api/runnerpool/placer_stats.go
Normal file
@@ -0,0 +1,103 @@
|
||||
package runnerpool
|
||||
|
||||
import (
|
||||
"context"
|
||||
"math"
|
||||
"time"
|
||||
|
||||
"github.com/sirupsen/logrus"
|
||||
"go.opencensus.io/stats"
|
||||
"go.opencensus.io/stats/view"
|
||||
"go.opencensus.io/tag"
|
||||
)
|
||||
|
||||
var (
|
||||
attemptCountMeasure = stats.Int64("lb_placer_attempt_count", "LB Placer Number of Runners Attempted Count", "")
|
||||
errorPoolCountMeasure = stats.Int64("lb_placer_rp_error_count", "LB Placer RunnerPool RunnerList Error Count", "")
|
||||
emptyPoolCountMeasure = stats.Int64("lb_placer_rp_empty_count", "LB Placer RunnerPool RunnerList Empty Count", "")
|
||||
cancelCountMeasure = stats.Int64("lb_placer_client_cancelled_count", "LB Placer Client Cancel Count", "")
|
||||
placedErrorCountMeasure = stats.Int64("lb_placer_placed_error_count", "LB Placer Placed Call Count With Errors", "")
|
||||
placedOKCountMeasure = stats.Int64("lb_placer_placed_ok_count", "LB Placer Placed Call Count Without Errors", "")
|
||||
retryTooBusyCountMeasure = stats.Int64("lb_placer_retry_busy_count", "LB Placer Retry Count - Too Busy", "")
|
||||
retryErrorCountMeasure = stats.Int64("lb_placer_retry_error_count", "LB Placer Retry Count - Errors", "")
|
||||
placerLatencyMeasure = stats.Int64("lb_placer_latency", "LB Placer Latency", "msecs")
|
||||
)
|
||||
|
||||
// attemptTracker is a helper for tracking LB Placer latency and attempt counts
// over the lifetime of a single placement.
type attemptTracker struct {
	// ctx is the context passed to stats.Record when the metrics are emitted.
	ctx context.Context

	// startTime is set when the tracker is created; lastAttemptTime is
	// refreshed on every recorded attempt.
	startTime, lastAttemptTime time.Time

	// attemptCount is the number of attempts recorded so far.
	attemptCount int64
}
|
||||
|
||||
func newAttemptTracker(ctx context.Context) *attemptTracker {
|
||||
return &attemptTracker{
|
||||
ctx: ctx,
|
||||
startTime: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
func (data *attemptTracker) finalizeAttempts(isSuccess bool) {
|
||||
stats.Record(data.ctx, attemptCountMeasure.M(data.attemptCount))
|
||||
|
||||
// IMPORTANT: here we use (lastAttemptTime - startTime). We want to exclude TryExec
|
||||
// latency *if* TryExec() goes through with success. Placer latency metric only shows
|
||||
// how much time are spending in Placer loop/retries. The metric includes rtt/latency of
|
||||
// *all* unsuccessful NACK (retriable) responses from runners as well. For example, if
|
||||
// Placer loop here retries 4 runners (which takes 5 msecs each) and then 5th runner
|
||||
// succeeds (but takes 35 seconds to finish execution), we report 20 msecs as our LB
|
||||
// latency.
|
||||
endTime := data.lastAttemptTime
|
||||
if !isSuccess {
|
||||
endTime = time.Now()
|
||||
}
|
||||
|
||||
stats.Record(data.ctx, placerLatencyMeasure.M(int64(endTime.Sub(data.startTime)/time.Millisecond)))
|
||||
}
|
||||
|
||||
func (data *attemptTracker) recordAttempt() {
|
||||
data.lastAttemptTime = time.Now()
|
||||
if data.attemptCount != math.MaxInt64 {
|
||||
data.attemptCount++
|
||||
}
|
||||
}
|
||||
|
||||
func makeKeys(names []string) []tag.Key {
|
||||
var tagKeys []tag.Key
|
||||
for _, name := range names {
|
||||
key, err := tag.NewKey(name)
|
||||
if err != nil {
|
||||
logrus.WithError(err).Fatal("cannot create tag key for %v", name)
|
||||
}
|
||||
tagKeys = append(tagKeys, key)
|
||||
}
|
||||
return tagKeys
|
||||
}
|
||||
|
||||
func createView(measure stats.Measure, agg *view.Aggregation, tagKeys []string) *view.View {
|
||||
return &view.View{
|
||||
Name: measure.Name(),
|
||||
Description: measure.Description(),
|
||||
TagKeys: makeKeys(tagKeys),
|
||||
Measure: measure,
|
||||
Aggregation: agg,
|
||||
}
|
||||
}
|
||||
|
||||
func RegisterPlacerViews(tagKeys []string) {
|
||||
err := view.Register(
|
||||
createView(attemptCountMeasure, view.Distribution(0, 1, 2, 4, 8, 32, 64, 256), tagKeys),
|
||||
createView(errorPoolCountMeasure, view.Count(), tagKeys),
|
||||
createView(emptyPoolCountMeasure, view.Count(), tagKeys),
|
||||
createView(cancelCountMeasure, view.Count(), tagKeys),
|
||||
createView(placedErrorCountMeasure, view.Count(), tagKeys),
|
||||
createView(placedOKCountMeasure, view.Count(), tagKeys),
|
||||
createView(retryTooBusyCountMeasure, view.Count(), tagKeys),
|
||||
createView(retryErrorCountMeasure, view.Count(), tagKeys),
|
||||
createView(placerLatencyMeasure, view.Distribution(1, 10, 25, 50, 200, 1000, 10000, 60000), tagKeys),
|
||||
)
|
||||
if err != nil {
|
||||
logrus.WithError(err).Fatal("cannot create view")
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user