// fn-serverless/api/runnerpool/placer_tracker.go

package runnerpool

import (
	"context"
	"time"

	"github.com/fnproject/fn/api/common"
	"github.com/fnproject/fn/api/models"

	"go.opencensus.io/stats"
)

// placerTracker bundles the state of a single placement session: the
// contexts that bound it, the attempt bookkeeping and the final outcome.
//
// Fields:
//   - cfg: placer configuration; PlacerTimeout and RetryAllDelay are read from it.
//   - requestCtx: the context handed to NewPlacerTracker. It is used for
//     logging, for recording stats and as the parent of the context passed
//     to Runner.TryExec.
//   - placerCtx: a context created from context.Background() with a
//     cfg.PlacerTimeout deadline; it bounds the placement attempts only.
//   - cancel: the cancel function belonging to placerCtx.
//   - tracker: attempt tracker; recordAttempt is invoked on every TryRunner
//     call and finalizeAttempts in HandleDone.
//   - isPlaced: set to true once a Runner.TryExec call reports the call as
//     placed.
type placerTracker struct {
	cfg        *PlacerConfig
	requestCtx context.Context
	placerCtx  context.Context
	cancel     context.CancelFunc
	tracker    *attemptTracker
	isPlaced   bool
}

// NewPlacerTracker builds the tracker for one placement session.
//
// requestCtx is kept for logging, stats and call execution. A separate
// placer context is created with a cfg.PlacerTimeout deadline; it is rooted
// at context.Background() rather than at requestCtx, so the two contexts
// expire independently. HandleDone cancels the placer context.
func NewPlacerTracker(requestCtx context.Context, cfg *PlacerConfig) *placerTracker {
	placerCtx, stopPlacer := context.WithTimeout(context.Background(), cfg.PlacerTimeout)

	tr := new(placerTracker)
	tr.cfg = cfg
	tr.requestCtx = requestCtx
	tr.placerCtx = placerCtx
	tr.cancel = stopPlacer
	tr.tracker = newAttemptTracker(requestCtx)
	return tr
}

// IsDone reports, without blocking, whether either the request context or
// the placer context has already ended (cancelled or deadline exceeded).
func (tr *placerTracker) IsDone() bool {
	if tr.requestCtx.Err() != nil {
		return true
	}
	return tr.placerCtx.Err() != nil
}

// HandleFindRunnersFailure logs an error returned by runnerpool.Runners()
// and records it against the pool error measure.
func (tr *placerTracker) HandleFindRunnersFailure(err error) {
	ctx := tr.requestCtx

	log := common.Logger(ctx).WithError(err)
	log.Error("Failed to find runners for call")

	stats.Record(ctx, errorPoolCountMeasure.M(0))
}

// TryRunner runs TryExec for the call on the given runner, records the
// matching stats for the outcome and marks the tracker as placed when the
// runner reports the call as placed. The placed flag and the error from
// TryExec are returned unchanged.
func (tr *placerTracker) TryRunner(r Runner, call RunnerCall) (bool, error) {
	tr.tracker.recordAttempt()

	// WARNING: the execution context must descend from requestCtx and not
	// from placerCtx, so that container execution is bounded only by the
	// request and not by the placer timeout.
	execCtx, stopExec := context.WithCancel(tr.requestCtx)
	placed, err := r.TryExec(execCtx, call)
	stopExec()

	if !placed {
		// Too-busy is by far the most common outcome, so it has its own measure.
		switch err {
		case models.ErrCallTimeoutServerBusy:
			stats.Record(tr.requestCtx, retryTooBusyCountMeasure.M(0))
		default:
			stats.Record(tr.requestCtx, retryErrorCountMeasure.M(0))
		}
		return placed, err
	}

	if err == nil {
		stats.Record(tr.requestCtx, placedOKCountMeasure.M(0))
	} else {
		// Placed calls are customer impacting: log every error except the
		// expected too-busy one.
		if err != models.ErrCallTimeoutServerBusy {
			log := common.Logger(execCtx).WithField("runner_addr", r.Address())
			log.WithError(err).Errorf("Failed during call placement")
		}
		stats.Record(tr.requestCtx, placedErrorCountMeasure.M(0))
	}

	// The call is committed, i.e. it was 'run'. Placement is finished.
	tr.isPlaced = true

	return placed, err
}

// HandleDone closes the placement session: it records the final stats,
// finalizes the attempt tracker and cancels the placer context.
func (tr *placerTracker) HandleDone() {
	reqCtx := tr.requestCtx

	// Request context ended: the client cancelled or timed out.
	if reqErr := reqCtx.Err(); reqErr != nil {
		stats.Record(reqCtx, cancelCountMeasure.M(0))
	}

	// A placer timeout is counted only for calls that were never placed.
	// Once a call has run on a runner the placer deadline is no longer
	// relevant: the placer timeout can be 10 secs while the call itself
	// executes for 60 secs in a container.
	if !tr.isPlaced {
		if placerErr := tr.placerCtx.Err(); placerErr != nil {
			stats.Record(reqCtx, placerTimeoutMeasure.M(0))
		}
	}

	tr.tracker.finalizeAttempts(tr.isPlaced)
	tr.cancel()
}

// RetryAllBackoff blocks until it is time to try the runner list again.
//
// numOfRunners is the size of the runner list the placer just went through;
// when it is zero an empty-pool stat is recorded before waiting.
//
// It returns true once cfg.RetryAllDelay has elapsed, and false if the
// request context or the placer context ends first, meaning the placer
// should stop trying.
func (tr *placerTracker) RetryAllBackoff(numOfRunners int) bool {

	// This means Placer is operating on an empty list. No runners
	// available. Record it.
	if numOfRunners == 0 {
		stats.Record(tr.requestCtx, emptyPoolCountMeasure.M(0))
	}

	// Use an explicit timer instead of time.After so that it is released
	// right away when one of the contexts wins the race, rather than
	// lingering until the delay expires.
	timer := time.NewTimer(tr.cfg.RetryAllDelay)
	defer timer.Stop()

	select {
	case <-tr.requestCtx.Done(): // client side timeout/cancel
		return false
	case <-tr.placerCtx.Done(): // placer wait timeout
		return false
	case <-timer.C:
		return true
	}
}