fn: introducing lb placer basic metrics (#1058)

* fn: introducing lb placer basic metrics. This change adds basic metrics to the naive and consistent hash LB placers. The stats show how many times we scanned the full runner list, and whether the runner pool failed to return a runner list or returned an empty list. Placed and not-placed statuses are also tracked, along with whether TryExec returned an error. The most common error code, Too-Busy, is tracked specifically. If the client cancels or times out, this is also tracked as a client cancel metric. For placer latency, we would like to know how much time the placer spent searching for a runner until it successfully placed a call. This includes round-trip times for NACK responses from the runners up to a successful TryExec() call. By excluding the latency of the last successful TryExec(), we try to exclude function execution and runner container startup time from this metric, in an attempt to isolate placer-only latency.
* fn: latency and attempt tracker. Removing the full scan metric. Tracking the number of runners attempted is a better metric for this purpose. Also, if rp.Runners() fails, this is an unrecoverable error and we should bail out instead of retrying.
* fn: typo fix, ch placer finalize err return
* fn: enable LB placer metrics in WithAgentFromEnv if prometheus is enabled
2022-10-28 21:29:17 +03:00 · 2018-06-12 13:36:05 -07:00
parent bd5150f1ac
commit f24172aa9d
9 changed files with 269 additions and 59 deletions
--- a/api/runnerpool/naive_placer.go
+++ b/api/runnerpool/naive_placer.go
@@ -5,10 +5,11 @@ import (
 	"sync/atomic"
 	"time"

+	"github.com/fnproject/fn/api/common"
 	"github.com/fnproject/fn/api/models"

-	"github.com/fnproject/fn/api/common"
 	"github.com/sirupsen/logrus"
+	"go.opencensus.io/stats"
 )

 type naivePlacer struct {
@@ -27,41 +28,69 @@ func NewNaivePlacer() Placer {

 func (sp *naivePlacer) PlaceCall(rp RunnerPool, ctx context.Context, call RunnerCall) error {

+	tracker := newAttemptTracker(ctx)
 	log := common.Logger(ctx)
+
+OutTries:
 	for {
 		runners, err := rp.Runners(call)
 		if err != nil {
 			log.WithError(err).Error("Failed to find runners for call")
-		} else {
-			for j := 0; j < len(runners); j++ {
+			stats.Record(ctx, errorPoolCountMeasure.M(0))
+			tracker.finalizeAttempts(false)
+			return err
+		}

-				select {
-				case <-ctx.Done():
-					return models.ErrCallTimeoutServerBusy
-				default:
-				}
-
-				i := atomic.AddUint64(&sp.rrIndex, uint64(1))
-				r := runners[int(i)%len(runners)]
-
-				tryCtx, tryCancel := context.WithCancel(ctx)
-				placed, err := r.TryExec(tryCtx, call)
-				tryCancel()
-
-				if err != nil && err != models.ErrCallTimeoutServerBusy {
-					log.WithError(err).Error("Failed during call placement")
-				}
-				if placed {
-					return err
-				}
+		for j := 0; j < len(runners); j++ {
+			if ctx.Err() != nil {
+				break OutTries
 			}
+
+			i := atomic.AddUint64(&sp.rrIndex, uint64(1))
+			r := runners[int(i)%len(runners)]
+
+			tracker.recordAttempt()
+			tryCtx, tryCancel := context.WithCancel(ctx)
+			placed, err := r.TryExec(tryCtx, call)
+			tryCancel()
+
+			// Only log unusual (except for too-busy) errors
+			if err != nil && err != models.ErrCallTimeoutServerBusy {
+				log.WithError(err).Errorf("Failed during call placement, placed=%v", placed)
+			}
+
+			if placed {
+				if err != nil {
+					stats.Record(ctx, placedErrorCountMeasure.M(0))
+				} else {
+					stats.Record(ctx, placedOKCountMeasure.M(0))
+				}
+				tracker.finalizeAttempts(true)
+				return err
+			}
+
+			// Too Busy is super common case, we track it separately
+			if err == models.ErrCallTimeoutServerBusy {
+				stats.Record(ctx, retryTooBusyCountMeasure.M(0))
+			} else {
+				stats.Record(ctx, retryErrorCountMeasure.M(0))
+			}
+		}
+
+		if len(runners) == 0 {
+			stats.Record(ctx, emptyPoolCountMeasure.M(0))
 		}

 		// backoff
 		select {
 		case <-ctx.Done():
-			return models.ErrCallTimeoutServerBusy
+			break OutTries
 		case <-time.After(sp.rrInterval):
 		}
 	}
+
+	// Cancel Exit Path / Client cancelled/timedout
+	stats.Record(ctx, cancelCountMeasure.M(0))
+	tracker.finalizeAttempts(false)
+	return models.ErrCallTimeoutServerBusy
 }