From 549ba65deab1e2a5e3ae6f92ef074425dc12940e Mon Sep 17 00:00:00 2001 From: Tolga Ceylan Date: Wed, 28 Nov 2018 10:32:55 -0800 Subject: [PATCH] fn: placer stats and client context (#1322) For the sake of completeness and also as defensive coding, let's record client context cancel/timeout cases. In retry reason, if error is same as client context (timeout/cancel), we should not report as retry due to error. Similarly in placed calls, do not flag the placed call as error if client canceled or timedout. We track client context timeout/cancel separately in addition to this. --- api/runnerpool/placer_stats.go | 4 ++++ api/runnerpool/placer_tracker.go | 20 ++++++++++++-------- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/api/runnerpool/placer_stats.go b/api/runnerpool/placer_stats.go index 7f02ba4ce..740a83779 100644 --- a/api/runnerpool/placer_stats.go +++ b/api/runnerpool/placer_stats.go @@ -17,8 +17,10 @@ var ( errorPoolCountMeasure = common.MakeMeasure("lb_placer_rp_error_count", "LB Placer RunnerPool RunnerList Error Count", "") emptyPoolCountMeasure = common.MakeMeasure("lb_placer_rp_empty_count", "LB Placer RunnerPool RunnerList Empty Count", "") cancelCountMeasure = common.MakeMeasure("lb_placer_client_cancelled_count", "LB Placer Client Cancel Count", "") + timeoutCountMeasure = common.MakeMeasure("lb_placer_client_timeout_count", "LB Placer Client Timeout Count", "") placerTimeoutMeasure = common.MakeMeasure("lb_placer_timeout_count", "LB Placer Timeout Count", "") placedErrorCountMeasure = common.MakeMeasure("lb_placer_placed_error_count", "LB Placer Placed Call Count With Errors", "") + placedAbortCountMeasure = common.MakeMeasure("lb_placer_placed_abort_count", "LB Placer Placed Call Count With Client Timeout/Cancel", "") placedOKCountMeasure = common.MakeMeasure("lb_placer_placed_ok_count", "LB Placer Placed Call Count Without Errors", "") retryTooBusyCountMeasure = common.MakeMeasure("lb_placer_retry_busy_count", "LB Placer Retry Count - Too Busy", "") retryErrorCountMeasure = common.MakeMeasure("lb_placer_retry_error_count", "LB Placer Retry Count - Errors", "") @@ -70,9 +72,11 @@ func RegisterPlacerViews(tagKeys []string, latencyDist []float64) { common.CreateView(attemptCountMeasure, view.Distribution(0, 2, 3, 4, 8, 16, 32, 64, 128, 256), tagKeys), common.CreateView(errorPoolCountMeasure, view.Count(), tagKeys), common.CreateView(emptyPoolCountMeasure, view.Count(), tagKeys), + common.CreateView(timeoutCountMeasure, view.Count(), tagKeys), common.CreateView(cancelCountMeasure, view.Count(), tagKeys), common.CreateView(placerTimeoutMeasure, view.Count(), tagKeys), common.CreateView(placedErrorCountMeasure, view.Count(), tagKeys), + common.CreateView(placedAbortCountMeasure, view.Count(), tagKeys), common.CreateView(placedOKCountMeasure, view.Count(), tagKeys), common.CreateView(retryTooBusyCountMeasure, view.Count(), tagKeys), common.CreateView(retryErrorCountMeasure, view.Count(), tagKeys), diff --git a/api/runnerpool/placer_tracker.go b/api/runnerpool/placer_tracker.go index 208a347cb..8a8a58133 100644 --- a/api/runnerpool/placer_tracker.go +++ b/api/runnerpool/placer_tracker.go @@ -63,22 +63,24 @@ func (tr *placerTracker) TryRunner(r Runner, call RunnerCall) (bool, error) { // Too Busy is super common case, we track it separately if err == models.ErrCallTimeoutServerBusy { stats.Record(tr.requestCtx, retryTooBusyCountMeasure.M(0)) - } else { + } else if tr.requestCtx.Err() != err { + // only record retry due to an error if client did not abort/cancel/timeout stats.Record(tr.requestCtx, retryErrorCountMeasure.M(0)) } } else { - // Only log unusual (except for too-busy) errors for isPlaced (customer impacting) calls - if err != nil && err != models.ErrCallTimeoutServerBusy { + // Only log unusual errors for isPlaced (customer impacting) calls + if err != nil && tr.requestCtx.Err() != err { logger := common.Logger(ctx).WithField("runner_addr", r.Address()) logger.WithError(err).Errorf("Failed during call placement") } - - if err != nil { - stats.Record(tr.requestCtx, placedErrorCountMeasure.M(0)) - } else { + if err == nil { stats.Record(tr.requestCtx, placedOKCountMeasure.M(0)) + } else if tr.requestCtx.Err() == err { + stats.Record(tr.requestCtx, placedAbortCountMeasure.M(0)) + } else { + stats.Record(tr.requestCtx, placedErrorCountMeasure.M(0)) } // Call is now committed. In other words, it was 'run'. We are done. @@ -93,8 +95,10 @@ func (tr *placerTracker) TryRunner(r Runner, call RunnerCall) (bool, error) { func (tr *placerTracker) HandleDone() { // Cancel Exit Path / Client cancelled/timedout - if tr.requestCtx.Err() != nil { + if tr.requestCtx.Err() == context.Canceled { stats.Record(tr.requestCtx, cancelCountMeasure.M(0)) + } else if tr.requestCtx.Err() == context.DeadlineExceeded { + stats.Record(tr.requestCtx, timeoutCountMeasure.M(0)) } // This means our placer timed out. We ignore tr.isPlaced calls