mirror of
https://github.com/fnproject/fn.git
synced 2022-10-28 21:29:17 +03:00
fn: placer stats and client context (#1322)
For the sake of completeness and also as defensive coding, let's record client context cancel/timeout cases. In retry reason, if error is same as client context (timeout/cancel), we should not report as retry due to error. Similarly in placed calls, do not flag the placed call as error if client canceled or timedout. We track client context timeout/cancel separately in addition to this.
This commit is contained in:
@@ -17,8 +17,10 @@ var (
|
||||
errorPoolCountMeasure = common.MakeMeasure("lb_placer_rp_error_count", "LB Placer RunnerPool RunnerList Error Count", "")
|
||||
emptyPoolCountMeasure = common.MakeMeasure("lb_placer_rp_empty_count", "LB Placer RunnerPool RunnerList Empty Count", "")
|
||||
cancelCountMeasure = common.MakeMeasure("lb_placer_client_cancelled_count", "LB Placer Client Cancel Count", "")
|
||||
timeoutCountMeasure = common.MakeMeasure("lb_placer_client_timeout_count", "LB Placer Client Timeout Count", "")
|
||||
placerTimeoutMeasure = common.MakeMeasure("lb_placer_timeout_count", "LB Placer Timeout Count", "")
|
||||
placedErrorCountMeasure = common.MakeMeasure("lb_placer_placed_error_count", "LB Placer Placed Call Count With Errors", "")
|
||||
placedAbortCountMeasure = common.MakeMeasure("lb_placer_placed_abort_count", "LB Placer Placed Call Count With Client Timeout/Cancel", "")
|
||||
placedOKCountMeasure = common.MakeMeasure("lb_placer_placed_ok_count", "LB Placer Placed Call Count Without Errors", "")
|
||||
retryTooBusyCountMeasure = common.MakeMeasure("lb_placer_retry_busy_count", "LB Placer Retry Count - Too Busy", "")
|
||||
retryErrorCountMeasure = common.MakeMeasure("lb_placer_retry_error_count", "LB Placer Retry Count - Errors", "")
|
||||
@@ -70,9 +72,11 @@ func RegisterPlacerViews(tagKeys []string, latencyDist []float64) {
|
||||
common.CreateView(attemptCountMeasure, view.Distribution(0, 2, 3, 4, 8, 16, 32, 64, 128, 256), tagKeys),
|
||||
common.CreateView(errorPoolCountMeasure, view.Count(), tagKeys),
|
||||
common.CreateView(emptyPoolCountMeasure, view.Count(), tagKeys),
|
||||
common.CreateView(timeoutCountMeasure, view.Count(), tagKeys),
|
||||
common.CreateView(cancelCountMeasure, view.Count(), tagKeys),
|
||||
common.CreateView(placerTimeoutMeasure, view.Count(), tagKeys),
|
||||
common.CreateView(placedErrorCountMeasure, view.Count(), tagKeys),
|
||||
common.CreateView(placedAbortCountMeasure, view.Count(), tagKeys),
|
||||
common.CreateView(placedOKCountMeasure, view.Count(), tagKeys),
|
||||
common.CreateView(retryTooBusyCountMeasure, view.Count(), tagKeys),
|
||||
common.CreateView(retryErrorCountMeasure, view.Count(), tagKeys),
|
||||
|
||||
@@ -63,22 +63,24 @@ func (tr *placerTracker) TryRunner(r Runner, call RunnerCall) (bool, error) {
|
||||
// Too Busy is super common case, we track it separately
|
||||
if err == models.ErrCallTimeoutServerBusy {
|
||||
stats.Record(tr.requestCtx, retryTooBusyCountMeasure.M(0))
|
||||
} else {
|
||||
} else if tr.requestCtx.Err() != err {
|
||||
// only record retry due to an error if client did not abort/cancel/timeout
|
||||
stats.Record(tr.requestCtx, retryErrorCountMeasure.M(0))
|
||||
}
|
||||
|
||||
} else {
|
||||
|
||||
// Only log unusual (except for too-busy) errors for isPlaced (customer impacting) calls
|
||||
if err != nil && err != models.ErrCallTimeoutServerBusy {
|
||||
// Only log unusual errors for isPlaced (customer impacting) calls
|
||||
if err != nil && tr.requestCtx.Err() != err {
|
||||
logger := common.Logger(ctx).WithField("runner_addr", r.Address())
|
||||
logger.WithError(err).Errorf("Failed during call placement")
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
stats.Record(tr.requestCtx, placedErrorCountMeasure.M(0))
|
||||
} else {
|
||||
if err == nil {
|
||||
stats.Record(tr.requestCtx, placedOKCountMeasure.M(0))
|
||||
} else if tr.requestCtx.Err() == err {
|
||||
stats.Record(tr.requestCtx, placedAbortCountMeasure.M(0))
|
||||
} else {
|
||||
stats.Record(tr.requestCtx, placedErrorCountMeasure.M(0))
|
||||
}
|
||||
|
||||
// Call is now committed. In other words, it was 'run'. We are done.
|
||||
@@ -93,8 +95,10 @@ func (tr *placerTracker) TryRunner(r Runner, call RunnerCall) (bool, error) {
|
||||
func (tr *placerTracker) HandleDone() {
|
||||
|
||||
// Cancel Exit Path / Client cancelled/timedout
|
||||
if tr.requestCtx.Err() != nil {
|
||||
if tr.requestCtx.Err() == context.Canceled {
|
||||
stats.Record(tr.requestCtx, cancelCountMeasure.M(0))
|
||||
} else if tr.requestCtx.Err() == context.DeadlineExceeded {
|
||||
stats.Record(tr.requestCtx, timeoutCountMeasure.M(0))
|
||||
}
|
||||
|
||||
// This means our placer timed out. We ignore tr.isPlaced calls
|
||||
|
||||
Reference in New Issue
Block a user