fn: naive placer rr fixup with tests (#1566)

If naive placer is not instantiated per call/runner group (aka LBG),
then the rr index will not trigger an round-robin behavior since
the index is initialized and stored in the placer configuration.

With this PR, moving rr index to per RunnerPool.Runners() inner loop
to ensure a round robin within that set. Each time we fetch a set,
since the set might be different, we reset our rr index. This means
we rr within that set once, then randomly start from another node
for the next RunnerPool.Runners() iteration. In busy systems,
no significant behavior change is expected (accept the removal of
atomic operations with respect to performance), but in idle systems
round robin behavior should be more observable and simple to follow and can
reduce same hit cases for the given RunnerPool.Runners().

In addition, introducing naive placer tests to ensure we observe
this behavior.
This commit is contained in:
Tolga Ceylan
2019-11-04 10:21:13 -08:00
committed by GitHub
parent 02445ef54d
commit 850508a9bd
3 changed files with 235 additions and 44 deletions

View File

@@ -200,43 +200,6 @@ func TestDetachedPlacerTimeout(t *testing.T) {
}
func TestRRRunner(t *testing.T) {
cfg := pool.NewPlacerConfig()
placer := pool.NewNaivePlacer(&cfg)
// TEST-NET-1 unreachable
rp := setupMockRunnerPool([]string{"192.0.2.0", "192.0.2.1"}, 10*time.Millisecond, 2)
parallelCalls := 2
var wg sync.WaitGroup
failures := make(chan error, parallelCalls)
for i := 0; i < parallelCalls; i++ {
wg.Add(1)
go func(i int) {
defer wg.Done()
ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(10*time.Millisecond))
defer cancel()
modelCall := &models.Call{Type: models.TypeSync}
call := &mockRunnerCall{model: modelCall}
err := placer.PlaceCall(ctx, rp, call)
if err != nil {
failures <- fmt.Errorf("Timed out call %d", i)
}
}(i)
}
wg.Wait()
close(failures)
err := <-failures
if err != nil {
t.Fatalf("Expected no error %s", err.Error())
}
if rp.runners[1].(*mockRunner).procCalls != 1 && rp.runners[0].(*mockRunner).procCalls != 1 {
t.Fatal("Expected rr runner")
}
}
func TestEnforceLbTimeout(t *testing.T) {
cfg := pool.NewPlacerConfig()
placer := pool.NewNaivePlacer(&cfg)

View File

@@ -2,7 +2,6 @@ package runnerpool
import (
"context"
"sync/atomic"
"time"
"github.com/fnproject/fn/api/models"
@@ -12,14 +11,12 @@ import (
type naivePlacer struct {
cfg PlacerConfig
rrIndex uint64
}
func NewNaivePlacer(cfg *PlacerConfig) Placer {
logrus.Infof("Creating new naive runnerpool placer with config=%+v", cfg)
return &naivePlacer{
cfg: *cfg,
rrIndex: uint64(time.Now().Nanosecond()),
}
}
@@ -36,10 +33,12 @@ func (sp *naivePlacer) PlaceCall(ctx context.Context, rp RunnerPool, call Runner
var runners []Runner
runners, runnerPoolErr = rp.Runners(ctx, call)
rrIndex := uint64(time.Now().Nanosecond())
for j := 0; j < len(runners) && !state.IsDone(); j++ {
i := atomic.AddUint64(&sp.rrIndex, uint64(1))
r := runners[int(i)%len(runners)]
rrIndex += 1
r := runners[rrIndex%uint64(len(runners))]
placed, err := state.TryRunner(r, call)
if placed {

View File

@@ -0,0 +1,229 @@
package runnerpool
import (
"context"
"io"
"math"
"net/http"
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/mock"
"github.com/fnproject/fn/api/models"
)
// implements RunnerCall
type dummyCall struct {
models.Call
}
func (o *dummyCall) SlotHashId() string { return "" }
func (o *dummyCall) Extensions() map[string]string { return nil }
func (o *dummyCall) RequestBody() io.ReadCloser { return nil }
func (o *dummyCall) ResponseWriter() http.ResponseWriter { return nil }
func (o *dummyCall) StdErr() io.ReadWriteCloser { return nil }
func (o *dummyCall) Model() *models.Call { return &o.Call }
func (o *dummyCall) AddUserExecutionTime(dur time.Duration) {}
func (o *dummyCall) GetUserExecutionTime() *time.Duration { return nil }
var _ RunnerCall = &dummyCall{}
// implements RunnerPool
type dummyPool struct {
mock.Mock
}
func (o *dummyPool) Shutdown(ctx context.Context) error { return o.Called(ctx).Error(0) }
func (o *dummyPool) Runners(ctx context.Context, call RunnerCall) ([]Runner, error) {
args := o.Called(ctx, call)
return args.Get(0).([]Runner), args.Error(1)
}
var _ RunnerPool = &dummyPool{}
// implements Runner
type dummyRunner struct {
mock.Mock
}
func (o *dummyRunner) Status(ctx context.Context) (*RunnerStatus, error) { return nil, nil }
func (o *dummyRunner) Close(ctx context.Context) error { return nil }
func (o *dummyRunner) Address() string { return "" }
func (o *dummyRunner) TryExec(ctx context.Context, call RunnerCall) (bool, error) {
args := o.Called(ctx, call)
return args.Bool(0), args.Error(1)
}
var _ Runner = &dummyRunner{}
// Convenience function for us to get number of calls
func CallCount(o *mock.Mock, funcName string) int {
var actualCalls int
for _, call := range o.Calls {
if call.Method == funcName {
actualCalls++
}
}
return actualCalls
}
// Simple empty list with user error, should fail quick with user error
func TestNaivePlacer_EmptyList_UserError(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(2*time.Second))
defer cancel()
cfg := NewPlacerConfig()
placer := NewNaivePlacer(&cfg)
pool := &dummyPool{}
call := &dummyCall{}
// api error with 502, should short cut
poolErr := models.ErrFunctionFailed
pool.On("Runners", ctx, call).Return([]Runner{}, poolErr)
assert.Equal(t, poolErr, placer.PlaceCall(ctx, pool, call))
assert.Nil(t, ctx.Err()) // no ctx timeout
pCount := CallCount(&pool.Mock, "Runners")
assert.True(t, pCount <= 1, "should not be spinning, hit count %d", pCount)
}
// Simple empty list without no error, should spin and wait for runners until placer timeout
func TestNaivePlacer_EmptyList_NoError(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(2*time.Second))
defer cancel()
cfg := NewPlacerConfig()
cfg.PlacerTimeout = time.Duration(500 * time.Millisecond)
placer := NewNaivePlacer(&cfg)
pool := &dummyPool{}
call := &dummyCall{}
pool.On("Runners", ctx, call).Return([]Runner{}, nil)
// we should get 503
assert.Equal(t, models.ErrCallTimeoutServerBusy, placer.PlaceCall(ctx, pool, call))
assert.Nil(t, ctx.Err()) // no ctx timeout (placer timeout handled internally)
pCount := CallCount(&pool.Mock, "Runners")
assert.True(t, pCount > 1, "should be spinning, hit count %d", pCount)
}
// Simple empty list with service error, should spin and wait for runners until placer timeout
func TestNaivePlacer_EmptyList_WithServiceError(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(2*time.Second))
defer cancel()
cfg := NewPlacerConfig()
cfg.PlacerTimeout = time.Duration(500 * time.Millisecond)
placer := NewNaivePlacer(&cfg)
pool := &dummyPool{}
call := &dummyCall{}
// service error with 500, should not short cut, but should return this error
poolErr := models.ErrCallHandlerNotFound
pool.On("Runners", ctx, call).Return([]Runner{}, poolErr)
// we should get service error back
assert.Equal(t, poolErr, placer.PlaceCall(ctx, pool, call))
assert.Nil(t, ctx.Err()) // no ctx timeout (placer timeout handled internally)
pCount := CallCount(&pool.Mock, "Runners")
assert.True(t, pCount > 1, "should be spinning, hit count %d", pCount)
}
// Start with simple list with no error with runners-503 twice, then empty list with user error.
// should round robin the runners twice, but then fail quick with user error once the runner
// list becomes empty.
func TestNaivePlacer_SimpleList_UserError(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(2*time.Second))
defer cancel()
cfg := NewPlacerConfig()
cfg.PlacerTimeout = time.Duration(500 * time.Millisecond)
placer := NewNaivePlacer(&cfg)
pool := &dummyPool{}
call := &dummyCall{}
// two runners
runner1 := &dummyRunner{}
runner2 := &dummyRunner{}
// backpressure from runners (returns "not placed" along with 503)
runner1.On("TryExec", mock.AnythingOfType("*context.cancelCtx"), call).Return(false, models.ErrCallTimeoutServerBusy)
runner2.On("TryExec", mock.AnythingOfType("*context.cancelCtx"), call).Return(false, models.ErrCallTimeoutServerBusy)
// return 2-runner list twice.
pool.On("Runners", ctx, call).Return([]Runner{runner1, runner2}, nil).Twice()
// finally, return empty list, with api error with 502, should short cut
poolErr := models.ErrFunctionFailed
pool.On("Runners", ctx, call).Return([]Runner{}, poolErr).Once()
assert.Equal(t, poolErr, placer.PlaceCall(ctx, pool, call))
assert.Nil(t, ctx.Err()) // no ctx timeout
pCount := CallCount(&pool.Mock, "Runners")
assert.True(t, pCount == 3, "should not be spinning, hit count %d", pCount)
r1Count := CallCount(&runner1.Mock, "TryExec")
r2Count := CallCount(&runner2.Mock, "TryExec")
assert.True(t, r1Count == 2, "runner1 round robin twice failed, hit count %d", r1Count)
assert.True(t, r2Count == 2, "runner2 round robin twice failed, hit count %d", r2Count)
}
// Start with simple list with no error with runners-503 indefinitely
func TestNaivePlacer_SimpleList_NoError(t *testing.T) {
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(2*time.Second))
defer cancel()
cfg := NewPlacerConfig()
cfg.PlacerTimeout = time.Duration(500 * time.Millisecond)
placer := NewNaivePlacer(&cfg)
pool := &dummyPool{}
call := &dummyCall{}
// two runners
runner1 := &dummyRunner{}
runner2 := &dummyRunner{}
// backpressure from runners (returns "not placed" along with 503)
runner1.On("TryExec", mock.AnythingOfType("*context.cancelCtx"), call).Return(false, models.ErrCallTimeoutServerBusy)
runner2.On("TryExec", mock.AnythingOfType("*context.cancelCtx"), call).Return(false, models.ErrCallTimeoutServerBusy)
// return 2-runner list twice.
pool.On("Runners", ctx, call).Return([]Runner{runner1, runner2}, nil)
assert.Equal(t, models.ErrCallTimeoutServerBusy, placer.PlaceCall(ctx, pool, call))
assert.Nil(t, ctx.Err()) // no ctx timeout
pCount := CallCount(&pool.Mock, "Runners")
assert.True(t, pCount > 1, "should be spinning hit count %d", pCount)
r1Count := CallCount(&runner1.Mock, "TryExec")
r2Count := CallCount(&runner2.Mock, "TryExec")
assert.True(t, r1Count > 1, "runner1 round robin failed, not enough hit count %d", r1Count)
assert.True(t, r2Count > 1, "runner2 round robin failed, not enough hit count %d", r2Count)
// once the timeout occurs even in the middle of runner list processing, the hit delta
// should not be more than 1
assert.True(t, math.Abs(float64(r1Count)-float64(r2Count)) <= float64(1), "runner hit count inbalance")
}