fn: placer stats and client context (#1322)

For the sake of completeness and also as defensive coding,
let's record client context cancel/timeout cases. In retry
reason, if error is same as client context (timeout/cancel),
we should not report as retry due to error. Similarly in
placed calls, do not flag the placed call as error if client
canceled or timedout. We track client context timeout/cancel
separately in addition to this.
This commit is contained in:
Tolga Ceylan
2018-11-28 10:32:55 -08:00
committed by GitHub
parent 272887b1e2
commit 549ba65dea
2 changed files with 16 additions and 8 deletions

View File

@@ -17,8 +17,10 @@ var (
errorPoolCountMeasure = common.MakeMeasure("lb_placer_rp_error_count", "LB Placer RunnerPool RunnerList Error Count", "")
emptyPoolCountMeasure = common.MakeMeasure("lb_placer_rp_empty_count", "LB Placer RunnerPool RunnerList Empty Count", "")
cancelCountMeasure = common.MakeMeasure("lb_placer_client_cancelled_count", "LB Placer Client Cancel Count", "")
timeoutCountMeasure = common.MakeMeasure("lb_placer_client_timeout_count", "LB Placer Client Timeout Count", "")
placerTimeoutMeasure = common.MakeMeasure("lb_placer_timeout_count", "LB Placer Timeout Count", "")
placedErrorCountMeasure = common.MakeMeasure("lb_placer_placed_error_count", "LB Placer Placed Call Count With Errors", "")
placedAbortCountMeasure = common.MakeMeasure("lb_placer_placed_abort_count", "LB Placer Placed Call Count With Client Timeout/Cancel", "")
placedOKCountMeasure = common.MakeMeasure("lb_placer_placed_ok_count", "LB Placer Placed Call Count Without Errors", "")
retryTooBusyCountMeasure = common.MakeMeasure("lb_placer_retry_busy_count", "LB Placer Retry Count - Too Busy", "")
retryErrorCountMeasure = common.MakeMeasure("lb_placer_retry_error_count", "LB Placer Retry Count - Errors", "")
@@ -70,9 +72,11 @@ func RegisterPlacerViews(tagKeys []string, latencyDist []float64) {
common.CreateView(attemptCountMeasure, view.Distribution(0, 2, 3, 4, 8, 16, 32, 64, 128, 256), tagKeys),
common.CreateView(errorPoolCountMeasure, view.Count(), tagKeys),
common.CreateView(emptyPoolCountMeasure, view.Count(), tagKeys),
common.CreateView(timeoutCountMeasure, view.Count(), tagKeys),
common.CreateView(cancelCountMeasure, view.Count(), tagKeys),
common.CreateView(placerTimeoutMeasure, view.Count(), tagKeys),
common.CreateView(placedErrorCountMeasure, view.Count(), tagKeys),
common.CreateView(placedAbortCountMeasure, view.Count(), tagKeys),
common.CreateView(placedOKCountMeasure, view.Count(), tagKeys),
common.CreateView(retryTooBusyCountMeasure, view.Count(), tagKeys),
common.CreateView(retryErrorCountMeasure, view.Count(), tagKeys),

View File

@@ -63,22 +63,24 @@ func (tr *placerTracker) TryRunner(r Runner, call RunnerCall) (bool, error) {
// Too Busy is super common case, we track it separately
if err == models.ErrCallTimeoutServerBusy {
stats.Record(tr.requestCtx, retryTooBusyCountMeasure.M(0))
} else {
} else if tr.requestCtx.Err() != err {
// only record retry due to an error if client did not abort/cancel/timeout
stats.Record(tr.requestCtx, retryErrorCountMeasure.M(0))
}
} else {
// Only log unusual (except for too-busy) errors for isPlaced (customer impacting) calls
if err != nil && err != models.ErrCallTimeoutServerBusy {
// Only log unusual errors for isPlaced (customer impacting) calls
if err != nil && tr.requestCtx.Err() != err {
logger := common.Logger(ctx).WithField("runner_addr", r.Address())
logger.WithError(err).Errorf("Failed during call placement")
}
if err != nil {
stats.Record(tr.requestCtx, placedErrorCountMeasure.M(0))
} else {
if err == nil {
stats.Record(tr.requestCtx, placedOKCountMeasure.M(0))
} else if tr.requestCtx.Err() == err {
stats.Record(tr.requestCtx, placedAbortCountMeasure.M(0))
} else {
stats.Record(tr.requestCtx, placedErrorCountMeasure.M(0))
}
// Call is now committed. In other words, it was 'run'. We are done.
@@ -93,8 +95,10 @@ func (tr *placerTracker) TryRunner(r Runner, call RunnerCall) (bool, error) {
func (tr *placerTracker) HandleDone() {
// Cancel Exit Path / Client cancelled/timedout
if tr.requestCtx.Err() != nil {
if tr.requestCtx.Err() == context.Canceled {
stats.Record(tr.requestCtx, cancelCountMeasure.M(0))
} else if tr.requestCtx.Err() == context.DeadlineExceeded {
stats.Record(tr.requestCtx, timeoutCountMeasure.M(0))
}
// This means our placer timed out. We ignore tr.isPlaced calls