fn: lb and pure-runner with non-blocking agent (#989)

* fn: lb and pure-runner with non-blocking agent

*) Removed pure-runner capacity tracking code. This did
not play well with internal agent resource tracker.
*) In LB and runner gRPC comm, removed ACK. Now,
upon TryCall, pure-runner quickly proceeds to call
Submit. This is good since at this stage pure-runner
already has all relevant data to initiate the call.
*) Unless pure-runner emits a NACK, LB immediately
streams http body to runners.
*) For retriable requests added a CachedReader for
http.Request Body.
*) Idempotency/retry is similar to previous code.
After initial success in Engagement, after attempting
a TryCall, unless we receive NACK, we cannot retry
that call.
*) ch and naive placers now wrap each TryExec with
a cancellable context to clean up gRPC contexts
quicker.

* fn: err for simpler one-time read GetBody approach

This allows for a more flexible approach since we let
users define GetBody() to allow repeated http body
reads. In the default LB case, LB executes a one-time io.ReadAll
and sets GetBody, which is detected by RunnerCall.RequestBody().

* fn: additional check for non-nil req.body

* fn: attempt to override IO errors with ctx for TryExec

* fn: system-tests log dest

* fn: LB: EOF send handling

* fn: logging for partial IO

* fn: use buffer pool for IO storage in lb agent

* fn: pure runner should use chunks for data msgs

* fn: required config validations and pass APIErrors

* fn: additional tests and gRPC proto simplification

*) remove ACK/NACK messages as Finish message type works
OK for this purpose.
*) return resp in api tests to check the status code
*) empty body json test in api tests for lb & pure-runner

* fn: buffer adjustments

*) setRequestBody result handling correction
*) switch to bytes.Reader for read-only safety
*) io.EOF can be returned for non-nil Body in request.

* fn: clarify detection of 503 / Server Too Busy
This commit is contained in:
Tolga Ceylan
2018-05-17 12:09:03 -07:00
committed by GitHub
parent 1083623045
commit 4ccde8897e
13 changed files with 541 additions and 336 deletions

View File

@@ -5,6 +5,7 @@ import (
"encoding/json"
"errors"
"fmt"
"net/http"
"net/url"
"path"
"strings"
@@ -55,6 +56,7 @@ func TestCanExecuteFunction(t *testing.T) {
rt := s.BasicRoute()
rt.Image = "fnproject/fn-test-utils"
rt.Format = "json"
rt.Memory = 64
rt.Type = "sync"
s.GivenRouteExists(t, s.AppName, rt)
@@ -73,7 +75,7 @@ func TestCanExecuteFunction(t *testing.T) {
content := bytes.NewBuffer([]byte(body))
output := &bytes.Buffer{}
_, err = apiutils.CallFN(u.String(), content, output, "POST", []string{})
resp, err := apiutils.CallFN(u.String(), content, output, "POST", []string{})
if err != nil {
t.Errorf("Got unexpected error: %v", err)
}
@@ -82,11 +84,147 @@ func TestCanExecuteFunction(t *testing.T) {
if err != nil || echo != "HelloWorld" {
t.Fatalf("getEchoContent/HelloWorld check failed on %v", output)
}
if resp.StatusCode != http.StatusOK {
t.Fatalf("StatusCode check failed on %v", resp.StatusCode)
}
}
// TestCanExecuteBigOutput calls a sync JSON route through the LB with a
// request that asks the test function for a large (approx 5.3MB) response,
// then checks that the echoed content and a 200 status come back.
func TestCanExecuteBigOutput(t *testing.T) {
	s := apiutils.SetupHarness()
	s.GivenAppExists(t, &sdkmodels.App{Name: s.AppName})
	defer s.Cleanup()

	rt := s.BasicRoute()
	rt.Image = "fnproject/fn-test-utils"
	rt.Format = "json"
	rt.Memory = 64
	rt.Type = "sync"
	s.GivenRouteExists(t, s.AppName, rt)

	lb, err := LB()
	if err != nil {
		t.Fatalf("Got unexpected error: %v", err)
	}
	u := url.URL{
		Scheme: "http",
		Host:   lb,
	}
	u.Path = path.Join(u.Path, "r", s.AppName, s.RoutePath)

	// Approx 5.3MB output
	body := `{"echoContent": "HelloWorld", "sleepTime": 0, "isDebug": true, "trailerRepeat": 410000}`
	content := bytes.NewBuffer([]byte(body))
	output := &bytes.Buffer{}

	resp, err := apiutils.CallFN(u.String(), content, output, "POST", []string{})
	if err != nil {
		// Stop here: once the call itself failed, resp is not safe to
		// inspect and the remaining checks would only add noise.
		t.Fatalf("Got unexpected error: %v", err)
	}

	t.Logf("getEchoContent/HelloWorld size %d", len(output.Bytes()))

	echo, err := getEchoContent(output.Bytes())
	if err != nil || echo != "HelloWorld" {
		t.Fatalf("getEchoContent/HelloWorld check failed on %v", output)
	}

	if resp.StatusCode != http.StatusOK {
		t.Fatalf("StatusCode check failed on %v", resp.StatusCode)
	}
}
// TestCanExecuteTooBigOutput calls a sync JSON route through the LB with a
// request that asks the test function for a > 6MB response, and expects the
// "function response too large" error body together with a 502 status.
func TestCanExecuteTooBigOutput(t *testing.T) {
	s := apiutils.SetupHarness()
	s.GivenAppExists(t, &sdkmodels.App{Name: s.AppName})
	defer s.Cleanup()

	rt := s.BasicRoute()
	rt.Image = "fnproject/fn-test-utils"
	rt.Format = "json"
	rt.Memory = 64
	rt.Type = "sync"
	s.GivenRouteExists(t, s.AppName, rt)

	lb, err := LB()
	if err != nil {
		t.Fatalf("Got unexpected error: %v", err)
	}
	u := url.URL{
		Scheme: "http",
		Host:   lb,
	}
	u.Path = path.Join(u.Path, "r", s.AppName, s.RoutePath)

	// > 6MB output
	body := `{"echoContent": "HelloWorld", "sleepTime": 0, "isDebug": true, "trailerRepeat": 600000}`
	content := bytes.NewBuffer([]byte(body))
	output := &bytes.Buffer{}

	resp, err := apiutils.CallFN(u.String(), content, output, "POST", []string{})
	if err != nil {
		// Stop here: resp is dereferenced below and is not safe to
		// inspect once the call itself has failed.
		t.Fatalf("Got unexpected error: %v", err)
	}

	// The error payload must match exactly, trailing newline included.
	exp := "{\"error\":{\"message\":\"function response too large\"}}\n"
	if actual := output.String(); actual != exp {
		t.Errorf("Assertion error.\n\tExpected: %v\n\tActual: %v", exp, actual)
	}

	if resp.StatusCode != http.StatusBadGateway {
		t.Fatalf("StatusCode check failed on %v", resp.StatusCode)
	}
}
// TestCanExecuteEmptyOutput calls a sync JSON route through the LB with a
// request that asks the test function for an empty body, and expects an
// empty response together with a 200 status.
func TestCanExecuteEmptyOutput(t *testing.T) {
	s := apiutils.SetupHarness()
	s.GivenAppExists(t, &sdkmodels.App{Name: s.AppName})
	defer s.Cleanup()

	rt := s.BasicRoute()
	rt.Image = "fnproject/fn-test-utils"
	rt.Format = "json"
	rt.Memory = 64
	rt.Type = "sync"
	s.GivenRouteExists(t, s.AppName, rt)

	lb, err := LB()
	if err != nil {
		t.Fatalf("Got unexpected error: %v", err)
	}
	u := url.URL{
		Scheme: "http",
		Host:   lb,
	}
	u.Path = path.Join(u.Path, "r", s.AppName, s.RoutePath)

	// empty body output
	body := `{"sleepTime": 0, "isDebug": true, "isEmptyBody": true}`
	content := bytes.NewBuffer([]byte(body))
	output := &bytes.Buffer{}

	resp, err := apiutils.CallFN(u.String(), content, output, "POST", []string{})
	if err != nil {
		// Stop here: an empty output would otherwise pass the check below
		// and the test would go on to dereference an unusable resp.
		t.Fatalf("Got unexpected error: %v", err)
	}

	if actual := output.String(); actual != "" {
		t.Errorf("Assertion error.\n\tExpected empty\n\tActual: %v", actual)
	}

	if resp.StatusCode != http.StatusOK {
		t.Fatalf("StatusCode check failed on %v", resp.StatusCode)
	}
}
func TestBasicConcurrentExecution(t *testing.T) {
SystemTweaker().ChangeNodeCapacities(512)
defer SystemTweaker().RestoreInitialNodeCapacities()
s := apiutils.SetupHarness()
@@ -96,6 +234,7 @@ func TestBasicConcurrentExecution(t *testing.T) {
rt := s.BasicRoute()
rt.Image = "fnproject/fn-test-utils"
rt.Format = "json"
rt.Memory = 32
rt.Type = "sync"
s.GivenRouteExists(t, s.AppName, rt)
@@ -117,7 +256,7 @@ func TestBasicConcurrentExecution(t *testing.T) {
body := `{"echoContent": "HelloWorld", "sleepTime": 0, "isDebug": true}`
content := bytes.NewBuffer([]byte(body))
output := &bytes.Buffer{}
_, err = apiutils.CallFN(u.String(), content, output, "POST", []string{})
resp, err := apiutils.CallFN(u.String(), content, output, "POST", []string{})
if err != nil {
results <- fmt.Errorf("Got unexpected error: %v", err)
return
@@ -128,6 +267,10 @@ func TestBasicConcurrentExecution(t *testing.T) {
results <- fmt.Errorf("Assertion error.\n\tActual: %v", output.String())
return
}
if resp.StatusCode != http.StatusOK {
results <- fmt.Errorf("StatusCode check failed on %v", resp.StatusCode)
return
}
results <- nil
}()
@@ -142,18 +285,19 @@ func TestBasicConcurrentExecution(t *testing.T) {
}
func TestSaturatedSystem(t *testing.T) {
// Set the capacity to 0 so we always look out of capacity.
SystemTweaker().ChangeNodeCapacities(0)
defer SystemTweaker().RestoreInitialNodeCapacities()
s := apiutils.SetupHarness()
s.GivenAppExists(t, &sdkmodels.App{Name: s.AppName})
defer s.Cleanup()
timeout := int32(5)
rt := s.BasicRoute()
rt.Image = "fnproject/fn-test-utils"
rt.Format = "json"
rt.Timeout = &timeout
rt.Memory = 300
rt.Type = "sync"
s.GivenRouteExists(t, s.AppName, rt)
@@ -172,15 +316,28 @@ func TestSaturatedSystem(t *testing.T) {
content := bytes.NewBuffer([]byte(body))
output := &bytes.Buffer{}
_, err = apiutils.CallFN(u.String(), content, output, "POST", []string{})
resp, err := apiutils.CallFN(u.String(), content, output, "POST", []string{})
if err != nil {
if err != apimodels.ErrCallTimeoutServerBusy {
t.Errorf("Got unexpected error: %v", err)
}
}
expectedOutput := "{\"error\":{\"message\":\"Timed out - server too busy\"}}\n"
// LB may respond either with:
// timeout: a timeout during a call to a runner
// too busy: a timeout during LB retry loop
exp1 := "{\"error\":{\"message\":\"Timed out - server too busy\"}}\n"
exp2 := "{\"error\":{\"message\":\"Timed out\"}}\n"
actual := output.String()
if !strings.Contains(expectedOutput, actual) || len(expectedOutput) != len(actual) {
t.Errorf("Assertion error.\n\tExpected: %v\n\tActual: %v", expectedOutput, output.String())
if strings.Contains(exp1, actual) && len(exp1) == len(actual) {
} else if strings.Contains(exp2, actual) && len(exp2) == len(actual) {
} else {
t.Errorf("Assertion error.\n\tExpected: %v or %v\n\tActual: %v", exp1, exp2, output.String())
}
if resp.StatusCode != http.StatusServiceUnavailable && resp.StatusCode != http.StatusGatewayTimeout {
t.Fatalf("StatusCode check failed on %v", resp.StatusCode)
}
}

View File

@@ -7,7 +7,6 @@ import (
"github.com/fnproject/fn/api/agent"
"github.com/fnproject/fn/api/agent/hybrid"
"github.com/fnproject/fn/api/models"
pool "github.com/fnproject/fn/api/runnerpool"
"github.com/fnproject/fn/api/server"
@@ -18,7 +17,6 @@ import (
"os"
"strconv"
"strings"
"sync"
"testing"
"time"
)
@@ -37,35 +35,42 @@ func NewSystemTestNodePool() (pool.RunnerPool, error) {
return agent.DefaultStaticRunnerPool(runners), nil
}
func SetUpSystem() error {
// state carries process settings captured by SetUpSystem so that
// CleanUpSystem can put them back once the system tests are done.
type state struct {
// memory is the value of the agent.EnvMaxTotalMemory environment variable
// as read before SetUpSystem overrides it; empty if it was not set.
memory string
}
func SetUpSystem() (*state, error) {
ctx := context.Background()
state := &state{}
api, err := SetUpAPINode(ctx)
if err != nil {
return err
return state, err
}
logrus.Info("Created API node")
lb, err := SetUpLBNode(ctx)
if err != nil {
return err
return state, err
}
logrus.Info("Created LB node")
pr0, nc0, err := SetUpPureRunnerNode(ctx, 0)
state.memory = os.Getenv(agent.EnvMaxTotalMemory)
os.Setenv(agent.EnvMaxTotalMemory, strconv.FormatUint(256*1024*1024, 10))
pr0, err := SetUpPureRunnerNode(ctx, 0)
if err != nil {
return err
return state, err
}
pr1, nc1, err := SetUpPureRunnerNode(ctx, 1)
pr1, err := SetUpPureRunnerNode(ctx, 1)
if err != nil {
return err
return state, err
}
pr2, nc2, err := SetUpPureRunnerNode(ctx, 2)
pr2, err := SetUpPureRunnerNode(ctx, 2)
if err != nil {
return err
return state, err
}
logrus.Info("Created Pure Runner nodes")
internalSystemTweaker.nodeCaps = []*testCapacityGate{nc0, nc1, nc2}
go func() { api.Start(ctx) }()
logrus.Info("Started API node")
@@ -77,10 +82,10 @@ func SetUpSystem() error {
logrus.Info("Started Pure Runner nodes")
// Wait for init - not great
time.Sleep(5 * time.Second)
return nil
return state, nil
}
func CleanUpSystem() error {
func CleanUpSystem(st *state) error {
_, err := http.Get("http://127.0.0.1:8081/shutdown")
if err != nil {
return err
@@ -103,6 +108,13 @@ func CleanUpSystem() error {
}
// Wait for shutdown - not great
time.Sleep(5 * time.Second)
if st.memory != "" {
os.Setenv(agent.EnvMaxTotalMemory, st.memory)
} else {
os.Unsetenv(agent.EnvMaxTotalMemory)
}
return nil
}
@@ -116,7 +128,7 @@ func SetUpAPINode(ctx context.Context) (*server.Server, error) {
opts = append(opts, server.WithWebPort(8085))
opts = append(opts, server.WithType(nodeType))
opts = append(opts, server.WithLogLevel(getEnv(server.EnvLogLevel, server.DefaultLogLevel)))
opts = append(opts, server.WithLogDest(server.DefaultLogDest, "API"))
opts = append(opts, server.WithLogDest(getEnv(server.EnvLogDest, server.DefaultLogDest), "API"))
opts = append(opts, server.WithDBURL(getEnv(server.EnvDBURL, defaultDB)))
opts = append(opts, server.WithMQURL(getEnv(server.EnvMQURL, defaultMQ)))
opts = append(opts, server.WithLogURL(""))
@@ -131,7 +143,7 @@ func SetUpLBNode(ctx context.Context) (*server.Server, error) {
opts = append(opts, server.WithWebPort(8081))
opts = append(opts, server.WithType(nodeType))
opts = append(opts, server.WithLogLevel(getEnv(server.EnvLogLevel, server.DefaultLogLevel)))
opts = append(opts, server.WithLogDest(server.DefaultLogDest, "LB"))
opts = append(opts, server.WithLogDest(getEnv(server.EnvLogDest, server.DefaultLogDest), "LB"))
opts = append(opts, server.WithDBURL(""))
opts = append(opts, server.WithMQURL(""))
opts = append(opts, server.WithLogURL(""))
@@ -156,85 +168,14 @@ func SetUpLBNode(ctx context.Context) (*server.Server, error) {
return server.New(ctx, opts...), nil
}
// testCapacityGate is a test-only capacity tracker for one pure-runner node.
// It counts committed capacity units against an adjustable maximum.
type testCapacityGate struct {
// runnerNumber identifies the node; it is reported as "nodeNumber" in logs.
runnerNumber int
// committedCapacityUnits is the number of units currently reserved.
committedCapacityUnits uint64
// maxCapacityUnits is the ceiling that reservations may not exceed.
maxCapacityUnits uint64
// mtx is held by every method that reads or writes the counters.
mtx sync.Mutex
}
const (
// InitialTestCapacityUnitsPerRunner is the maximum capacity each test
// runner's gate is created with, and the value that
// RestoreInitialNodeCapacities resets every gate to.
InitialTestCapacityUnitsPerRunner = 1024
)
// NewTestCapacityGate returns a gate for runner nodeNum whose maximum is
// capacity units and which starts with nothing committed.
func NewTestCapacityGate(nodeNum int, capacity uint64) *testCapacityGate {
	gate := new(testCapacityGate)
	gate.runnerNumber = nodeNum
	gate.maxCapacityUnits = capacity
	gate.committedCapacityUnits = 0
	return gate
}
// CheckAndReserveCapacity commits units against the gate when the new total
// stays within the maximum and returns nil. Otherwise it leaves the gate
// untouched and returns models.ErrCallTimeoutServerBusy.
func (tcg *testCapacityGate) CheckAndReserveCapacity(units uint64) error {
	tcg.mtx.Lock()
	defer tcg.mtx.Unlock()

	entry := logrus.WithField("nodeNumber", tcg.runnerNumber)

	// Reject first so the reservation path reads straight through.
	if tcg.committedCapacityUnits+units > tcg.maxCapacityUnits {
		entry.WithField("currentlyCommitted", tcg.committedCapacityUnits).Debug("Runner is out of capacity")
		return models.ErrCallTimeoutServerBusy
	}

	// Log the committed amount as it was before this reservation.
	entry.WithField("units", units).WithField("currentlyCommitted", tcg.committedCapacityUnits).Info("Runner is committing capacity")
	tcg.committedCapacityUnits += units
	return nil
}
// ReleaseCapacity gives back units previously committed on the gate.
// It panics if asked to release more than is currently committed, since
// that indicates broken bookkeeping in the test.
func (tcg *testCapacityGate) ReleaseCapacity(units uint64) {
	tcg.mtx.Lock()
	defer tcg.mtx.Unlock()

	if units > tcg.committedCapacityUnits {
		panic("Fatal error in test capacity calculation, getting to sub-zero capacity")
	}

	// Log the committed amount as it was before this release.
	entry := logrus.WithField("nodeNumber", tcg.runnerNumber)
	entry = entry.WithField("units", units)
	entry = entry.WithField("currentlyCommitted", tcg.committedCapacityUnits)
	entry.Info("Runner is releasing capacity")

	tcg.committedCapacityUnits -= units
}
// ChangeMaxCapacity replaces the gate's maximum with newCapacity, logging
// both the old and new values. Units already committed are left as they are.
func (tcg *testCapacityGate) ChangeMaxCapacity(newCapacity uint64) {
	tcg.mtx.Lock()
	defer tcg.mtx.Unlock()

	previous := tcg.maxCapacityUnits
	entry := logrus.WithField("nodeNumber", tcg.runnerNumber)
	entry = entry.WithField("oldCapacity", previous)
	entry = entry.WithField("newCapacity", newCapacity)
	entry.Info("Runner is changing max capacity")

	tcg.maxCapacityUnits = newCapacity
}
// systemTweaker lets tests adjust the capacity gates of all runner nodes
// at once.
type systemTweaker struct {
	// nodeCaps holds one gate per runner node.
	nodeCaps []*testCapacityGate
}

// internalSystemTweaker is the single package-wide tweaker instance.
var internalSystemTweaker systemTweaker

// SystemTweaker returns a pointer to the package-wide tweaker.
func SystemTweaker() *systemTweaker {
	return &internalSystemTweaker
}

// ChangeNodeCapacities sets the maximum capacity of every known gate to
// newCapacity.
func (twk *systemTweaker) ChangeNodeCapacities(newCapacity uint64) {
	for i := 0; i < len(twk.nodeCaps); i++ {
		twk.nodeCaps[i].ChangeMaxCapacity(newCapacity)
	}
}

// RestoreInitialNodeCapacities sets every known gate back to
// InitialTestCapacityUnitsPerRunner.
func (twk *systemTweaker) RestoreInitialNodeCapacities() {
	twk.ChangeNodeCapacities(InitialTestCapacityUnitsPerRunner)
}
func SetUpPureRunnerNode(ctx context.Context, nodeNum int) (*server.Server, *testCapacityGate, error) {
func SetUpPureRunnerNode(ctx context.Context, nodeNum int) (*server.Server, error) {
nodeType := server.ServerTypePureRunner
opts := make([]server.ServerOption, 0)
opts = append(opts, server.WithWebPort(8082+nodeNum))
opts = append(opts, server.WithGRPCPort(9190+nodeNum))
opts = append(opts, server.WithType(nodeType))
opts = append(opts, server.WithLogLevel(getEnv(server.EnvLogLevel, server.DefaultLogLevel)))
opts = append(opts, server.WithLogDest(server.DefaultLogDest, "PURE-RUNNER"))
opts = append(opts, server.WithLogDest(getEnv(server.EnvLogDest, server.DefaultLogDest), "PURE-RUNNER"))
opts = append(opts, server.WithDBURL(""))
opts = append(opts, server.WithMQURL(""))
opts = append(opts, server.WithLogURL(""))
@@ -242,18 +183,18 @@ func SetUpPureRunnerNode(ctx context.Context, nodeNum int) (*server.Server, *tes
ds, err := hybrid.NewNopDataStore()
if err != nil {
return nil, nil, err
return nil, err
}
grpcAddr := fmt.Sprintf(":%d", 9190+nodeNum)
cancelCtx, cancel := context.WithCancel(ctx)
capacityGate := NewTestCapacityGate(nodeNum, InitialTestCapacityUnitsPerRunner)
prAgent, err := agent.NewPureRunner(cancel, grpcAddr, ds, "", "", "", capacityGate)
prAgent, err := agent.NewPureRunner(cancel, grpcAddr, ds, "", "", "", nil)
if err != nil {
return nil, nil, err
return nil, err
}
opts = append(opts, server.WithAgent(prAgent), server.WithExtraCtx(cancelCtx))
return server.New(ctx, opts...), capacityGate, nil
return server.New(ctx, opts...), nil
}
func pwd() string {
@@ -308,20 +249,15 @@ func whoAmI() net.IP {
return nil
}
// TestCanInstantiateSystem sets every node's capacity to 128 and restores
// the initial capacities on exit; it makes no assertions of its own.
func TestCanInstantiateSystem(t *testing.T) {
	tweaker := SystemTweaker()
	tweaker.ChangeNodeCapacities(128)
	defer tweaker.RestoreInitialNodeCapacities()
}
func TestMain(m *testing.M) {
err := SetUpSystem()
state, err := SetUpSystem()
if err != nil {
logrus.WithError(err).Fatal("Could not initialize system")
os.Exit(1)
}
// call flag.Parse() here if TestMain uses flags
result := m.Run()
err = CleanUpSystem()
err = CleanUpSystem(state)
if err != nil {
logrus.WithError(err).Warn("Could not clean up system")
}