fn: naive placer rr fixup with tests (#1566)

If naive placer is not instantiated per call/runner group (aka LBG), then the rr index will not trigger an round-robin behavior since the index is initialized and stored in the placer configuration. With this PR, moving rr index to per RunnerPool.Runners() inner loop to ensure a round robin within that set. Each time we fetch a set, since the set might be different, we reset our rr index. This means we rr within that set once, then randomly start from another node for the next RunnerPool.Runners() iteration. In busy systems, no significant behavior change is expected (accept the removal of atomic operations with respect to performance), but in idle systems round robin behavior should be more observable and simple to follow and can reduce same hit cases for the given RunnerPool.Runners(). In addition, introducing naive placer tests to ensure we observe this behavior.
2022-10-28 21:29:17 +03:00 · 2019-11-04 10:21:13 -08:00
parent 02445ef54d
commit 850508a9bd
3 changed files with 235 additions and 44 deletions
--- a/api/agent/lb_agent_test.go
+++ b/api/agent/lb_agent_test.go
@@ -200,43 +200,6 @@ func TestDetachedPlacerTimeout(t *testing.T) {

 }

-func TestRRRunner(t *testing.T) {
-	cfg := pool.NewPlacerConfig()
-	placer := pool.NewNaivePlacer(&cfg)
-	// TEST-NET-1 unreachable
-	rp := setupMockRunnerPool([]string{"192.0.2.0", "192.0.2.1"}, 10*time.Millisecond, 2)
-
-	parallelCalls := 2
-	var wg sync.WaitGroup
-	failures := make(chan error, parallelCalls)
-	for i := 0; i < parallelCalls; i++ {
-		wg.Add(1)
-		go func(i int) {
-			defer wg.Done()
-			ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(10*time.Millisecond))
-			defer cancel()
-			modelCall := &models.Call{Type: models.TypeSync}
-			call := &mockRunnerCall{model: modelCall}
-
-			err := placer.PlaceCall(ctx, rp, call)
-			if err != nil {
-				failures <- fmt.Errorf("Timed out call %d", i)
-			}
-		}(i)
-	}
-
-	wg.Wait()
-	close(failures)
-
-	err := <-failures
-	if err != nil {
-		t.Fatalf("Expected no error %s", err.Error())
-	}
-	if rp.runners[1].(*mockRunner).procCalls != 1 && rp.runners[0].(*mockRunner).procCalls != 1 {
-		t.Fatal("Expected rr runner")
-	}
-}
-
 func TestEnforceLbTimeout(t *testing.T) {
 	cfg := pool.NewPlacerConfig()
 	placer := pool.NewNaivePlacer(&cfg)
--- a/api/runnerpool/naive_placer.go
+++ b/api/runnerpool/naive_placer.go
@@ -2,7 +2,6 @@ package runnerpool

 import (
 	"context"
-	"sync/atomic"
 	"time"

 	"github.com/fnproject/fn/api/models"
@@ -12,14 +11,12 @@ import (

 type naivePlacer struct {
 	cfg PlacerConfig
-	rrIndex uint64
 }

 func NewNaivePlacer(cfg *PlacerConfig) Placer {
 	logrus.Infof("Creating new naive runnerpool placer with config=%+v", cfg)
 	return &naivePlacer{
 		cfg: *cfg,
-		rrIndex: uint64(time.Now().Nanosecond()),
 	}
 }

@@ -36,10 +33,12 @@ func (sp *naivePlacer) PlaceCall(ctx context.Context, rp RunnerPool, call Runner
 		var runners []Runner
 		runners, runnerPoolErr = rp.Runners(ctx, call)

+		rrIndex := uint64(time.Now().Nanosecond())
+
 		for j := 0; j < len(runners) && !state.IsDone(); j++ {

-			i := atomic.AddUint64(&sp.rrIndex, uint64(1))
-			r := runners[int(i)%len(runners)]
+			rrIndex += 1
+			r := runners[rrIndex%uint64(len(runners))]

 			placed, err := state.TryRunner(r, call)
 			if placed {
--- a/api/runnerpool/naive_placer_test.go
+++ b/api/runnerpool/naive_placer_test.go
@@ -0,0 +1,229 @@
+package runnerpool
+
+import (
+	"context"
+	"io"
+	"math"
+	"net/http"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/mock"
+
+	"github.com/fnproject/fn/api/models"
+)
+
+// implements RunnerCall
+type dummyCall struct {
+	models.Call
+}
+
+func (o *dummyCall) SlotHashId() string                     { return "" }
+func (o *dummyCall) Extensions() map[string]string          { return nil }
+func (o *dummyCall) RequestBody() io.ReadCloser             { return nil }
+func (o *dummyCall) ResponseWriter() http.ResponseWriter    { return nil }
+func (o *dummyCall) StdErr() io.ReadWriteCloser             { return nil }
+func (o *dummyCall) Model() *models.Call                    { return &o.Call }
+func (o *dummyCall) AddUserExecutionTime(dur time.Duration) {}
+func (o *dummyCall) GetUserExecutionTime() *time.Duration   { return nil }
+
+var _ RunnerCall = &dummyCall{}
+
+// implements RunnerPool
+type dummyPool struct {
+	mock.Mock
+}
+
+func (o *dummyPool) Shutdown(ctx context.Context) error { return o.Called(ctx).Error(0) }
+func (o *dummyPool) Runners(ctx context.Context, call RunnerCall) ([]Runner, error) {
+	args := o.Called(ctx, call)
+	return args.Get(0).([]Runner), args.Error(1)
+}
+
+var _ RunnerPool = &dummyPool{}
+
+// implements Runner
+type dummyRunner struct {
+	mock.Mock
+}
+
+func (o *dummyRunner) Status(ctx context.Context) (*RunnerStatus, error) { return nil, nil }
+func (o *dummyRunner) Close(ctx context.Context) error                   { return nil }
+func (o *dummyRunner) Address() string                                   { return "" }
+func (o *dummyRunner) TryExec(ctx context.Context, call RunnerCall) (bool, error) {
+	args := o.Called(ctx, call)
+	return args.Bool(0), args.Error(1)
+}
+
+var _ Runner = &dummyRunner{}
+
+// Convenience function for us to get number of calls
+func CallCount(o *mock.Mock, funcName string) int {
+	var actualCalls int
+	for _, call := range o.Calls {
+		if call.Method == funcName {
+			actualCalls++
+		}
+	}
+	return actualCalls
+}
+
+// Simple empty list with user error, should fail quick with user error
+func TestNaivePlacer_EmptyList_UserError(t *testing.T) {
+
+	ctx, cancel := context.WithTimeout(context.Background(), time.Duration(2*time.Second))
+	defer cancel()
+
+	cfg := NewPlacerConfig()
+	placer := NewNaivePlacer(&cfg)
+
+	pool := &dummyPool{}
+	call := &dummyCall{}
+
+	// api error with 502, should short cut
+	poolErr := models.ErrFunctionFailed
+
+	pool.On("Runners", ctx, call).Return([]Runner{}, poolErr)
+
+	assert.Equal(t, poolErr, placer.PlaceCall(ctx, pool, call))
+	assert.Nil(t, ctx.Err()) // no ctx timeout
+
+	pCount := CallCount(&pool.Mock, "Runners")
+	assert.True(t, pCount <= 1, "should not be spinning, hit count %d", pCount)
+}
+
+// Simple empty list without no error, should spin and wait for runners until placer timeout
+func TestNaivePlacer_EmptyList_NoError(t *testing.T) {
+
+	ctx, cancel := context.WithTimeout(context.Background(), time.Duration(2*time.Second))
+	defer cancel()
+
+	cfg := NewPlacerConfig()
+	cfg.PlacerTimeout = time.Duration(500 * time.Millisecond)
+
+	placer := NewNaivePlacer(&cfg)
+
+	pool := &dummyPool{}
+	call := &dummyCall{}
+
+	pool.On("Runners", ctx, call).Return([]Runner{}, nil)
+
+	// we should get 503
+	assert.Equal(t, models.ErrCallTimeoutServerBusy, placer.PlaceCall(ctx, pool, call))
+	assert.Nil(t, ctx.Err()) // no ctx timeout (placer timeout handled internally)
+
+	pCount := CallCount(&pool.Mock, "Runners")
+	assert.True(t, pCount > 1, "should be spinning, hit count %d", pCount)
+}
+
+// Simple empty list with service error, should spin and wait for runners until placer timeout
+func TestNaivePlacer_EmptyList_WithServiceError(t *testing.T) {
+
+	ctx, cancel := context.WithTimeout(context.Background(), time.Duration(2*time.Second))
+	defer cancel()
+
+	cfg := NewPlacerConfig()
+	cfg.PlacerTimeout = time.Duration(500 * time.Millisecond)
+
+	placer := NewNaivePlacer(&cfg)
+
+	pool := &dummyPool{}
+	call := &dummyCall{}
+
+	// service error with 500, should not short cut, but should return this error
+	poolErr := models.ErrCallHandlerNotFound
+
+	pool.On("Runners", ctx, call).Return([]Runner{}, poolErr)
+
+	// we should get service error back
+	assert.Equal(t, poolErr, placer.PlaceCall(ctx, pool, call))
+	assert.Nil(t, ctx.Err()) // no ctx timeout (placer timeout handled internally)
+
+	pCount := CallCount(&pool.Mock, "Runners")
+	assert.True(t, pCount > 1, "should be spinning, hit count %d", pCount)
+}
+
+// Start with simple list with no error with runners-503 twice, then empty list with user error.
+// should round robin the runners twice, but then fail quick with user error once the runner
+// list becomes empty.
+func TestNaivePlacer_SimpleList_UserError(t *testing.T) {
+
+	ctx, cancel := context.WithTimeout(context.Background(), time.Duration(2*time.Second))
+	defer cancel()
+
+	cfg := NewPlacerConfig()
+	cfg.PlacerTimeout = time.Duration(500 * time.Millisecond)
+	placer := NewNaivePlacer(&cfg)
+
+	pool := &dummyPool{}
+	call := &dummyCall{}
+
+	// two runners
+	runner1 := &dummyRunner{}
+	runner2 := &dummyRunner{}
+
+	// backpressure from runners (returns "not placed" along with 503)
+	runner1.On("TryExec", mock.AnythingOfType("*context.cancelCtx"), call).Return(false, models.ErrCallTimeoutServerBusy)
+	runner2.On("TryExec", mock.AnythingOfType("*context.cancelCtx"), call).Return(false, models.ErrCallTimeoutServerBusy)
+
+	// return 2-runner list twice.
+	pool.On("Runners", ctx, call).Return([]Runner{runner1, runner2}, nil).Twice()
+
+	// finally, return empty list, with api error with 502, should short cut
+	poolErr := models.ErrFunctionFailed
+	pool.On("Runners", ctx, call).Return([]Runner{}, poolErr).Once()
+
+	assert.Equal(t, poolErr, placer.PlaceCall(ctx, pool, call))
+	assert.Nil(t, ctx.Err()) // no ctx timeout
+
+	pCount := CallCount(&pool.Mock, "Runners")
+	assert.True(t, pCount == 3, "should not be spinning, hit count %d", pCount)
+
+	r1Count := CallCount(&runner1.Mock, "TryExec")
+	r2Count := CallCount(&runner2.Mock, "TryExec")
+
+	assert.True(t, r1Count == 2, "runner1 round robin twice failed, hit count %d", r1Count)
+	assert.True(t, r2Count == 2, "runner2 round robin twice failed, hit count %d", r2Count)
+}
+
+// Start with simple list with no error with runners-503 indefinitely
+func TestNaivePlacer_SimpleList_NoError(t *testing.T) {
+
+	ctx, cancel := context.WithTimeout(context.Background(), time.Duration(2*time.Second))
+	defer cancel()
+
+	cfg := NewPlacerConfig()
+	cfg.PlacerTimeout = time.Duration(500 * time.Millisecond)
+	placer := NewNaivePlacer(&cfg)
+
+	pool := &dummyPool{}
+	call := &dummyCall{}
+
+	// two runners
+	runner1 := &dummyRunner{}
+	runner2 := &dummyRunner{}
+
+	// backpressure from runners (returns "not placed" along with 503)
+	runner1.On("TryExec", mock.AnythingOfType("*context.cancelCtx"), call).Return(false, models.ErrCallTimeoutServerBusy)
+	runner2.On("TryExec", mock.AnythingOfType("*context.cancelCtx"), call).Return(false, models.ErrCallTimeoutServerBusy)
+
+	// return 2-runner list twice.
+	pool.On("Runners", ctx, call).Return([]Runner{runner1, runner2}, nil)
+
+	assert.Equal(t, models.ErrCallTimeoutServerBusy, placer.PlaceCall(ctx, pool, call))
+	assert.Nil(t, ctx.Err()) // no ctx timeout
+
+	pCount := CallCount(&pool.Mock, "Runners")
+	assert.True(t, pCount > 1, "should be spinning hit count %d", pCount)
+
+	r1Count := CallCount(&runner1.Mock, "TryExec")
+	r2Count := CallCount(&runner2.Mock, "TryExec")
+
+	assert.True(t, r1Count > 1, "runner1 round robin failed, not enough hit count %d", r1Count)
+	assert.True(t, r2Count > 1, "runner2 round robin failed, not enough hit count %d", r2Count)
+
+	// once the timeout occurs even in the middle of runner list processing, the hit delta
+	// should not be more than 1
+	assert.True(t, math.Abs(float64(r1Count)-float64(r2Count)) <= float64(1), "runner hit count inbalance")
+}