fn: lb and pure-runner with non-blocking agent (#989)

* fn: lb and pure-runner with non-blocking agent *) Removed pure-runner capacity tracking code. This did not play well with the internal agent resource tracker. *) In LB and runner gRPC comm, removed ACK. Now, upon TryCall, pure-runner quickly proceeds to call Submit. This is good since at this stage pure-runner already has all relevant data to initiate the call. *) Unless pure-runner emits a NACK, LB immediately streams the http body to runners. *) For retriable requests, added a CachedReader for http.Request Body. *) Idempotency/retry is similar to previous code. After initial success in Engagement, after attempting a TryCall, unless we receive a NACK, we cannot retry that call. *) ch and naive placers now wrap each TryExec with a cancellable context to clean up gRPC contexts quicker. * fn: err for simpler one-time read GetBody approach This allows for a more flexible approach since we let users define GetBody() to allow repetitive http body reads. In the default LB case, LB executes a one-time io.ReadAll and sets GetBody, which is detected by RunnerCall.RequestBody(). * fn: additional check for non-nil req.body * fn: attempt to override IO errors with ctx for TryExec * fn: system-tests log dest * fn: LB: EOF send handling * fn: logging for partial IO * fn: use buffer pool for IO storage in lb agent * fn: pure runner should use chunks for data msgs * fn: required config validations and pass APIErrors * fn: additional tests and gRPC proto simplification *) remove ACK/NACK messages as the Finish message type works OK for this purpose. *) return resp in api tests to check the status code *) empty body json test in api tests for lb & pure-runner * fn: buffer adjustments *) setRequestBody result handling correction *) switch to bytes.Reader for read-only safety *) io.EOF can be returned for non-nil Body in request. * fn: clarify detection of 503 / Server Too Busy
2022-10-28 21:29:17 +03:00 · 2018-05-17 12:09:03 -07:00
parent 1083623045
commit 4ccde8897e
13 changed files with 541 additions and 336 deletions
--- a/test/fn-system-tests/exec_test.go
+++ b/test/fn-system-tests/exec_test.go
@@ -5,6 +5,7 @@ import (
 	"encoding/json"
 	"errors"
 	"fmt"
+	"net/http"
 	"net/url"
 	"path"
 	"strings"
@@ -55,6 +56,7 @@ func TestCanExecuteFunction(t *testing.T) {
 	rt := s.BasicRoute()
 	rt.Image = "fnproject/fn-test-utils"
 	rt.Format = "json"
+	rt.Memory = 64
 	rt.Type = "sync"

 	s.GivenRouteExists(t, s.AppName, rt)
@@ -73,7 +75,7 @@ func TestCanExecuteFunction(t *testing.T) {
 	content := bytes.NewBuffer([]byte(body))
 	output := &bytes.Buffer{}

-	_, err = apiutils.CallFN(u.String(), content, output, "POST", []string{})
+	resp, err := apiutils.CallFN(u.String(), content, output, "POST", []string{})
 	if err != nil {
 		t.Errorf("Got unexpected error: %v", err)
 	}
@@ -82,11 +84,147 @@ func TestCanExecuteFunction(t *testing.T) {
 	if err != nil || echo != "HelloWorld" {
 		t.Fatalf("getEchoContent/HelloWorld check failed on %v", output)
 	}
+
+	if resp.StatusCode != http.StatusOK {
+		t.Fatalf("StatusCode check failed on %v", resp.StatusCode)
+	}
+}
+
+func TestCanExecuteBigOutput(t *testing.T) {
+	s := apiutils.SetupHarness()
+	s.GivenAppExists(t, &sdkmodels.App{Name: s.AppName})
+	defer s.Cleanup()
+
+	rt := s.BasicRoute()
+	rt.Image = "fnproject/fn-test-utils"
+	rt.Format = "json"
+	rt.Memory = 64
+	rt.Type = "sync"
+
+	s.GivenRouteExists(t, s.AppName, rt)
+
+	lb, err := LB()
+	if err != nil {
+		t.Fatalf("Got unexpected error: %v", err)
+	}
+	u := url.URL{
+		Scheme: "http",
+		Host:   lb,
+	}
+	u.Path = path.Join(u.Path, "r", s.AppName, s.RoutePath)
+
+	// Approx 5.3MB output
+	body := `{"echoContent": "HelloWorld", "sleepTime": 0, "isDebug": true, "trailerRepeat": 410000}`
+	content := bytes.NewBuffer([]byte(body))
+	output := &bytes.Buffer{}
+
+	resp, err := apiutils.CallFN(u.String(), content, output, "POST", []string{})
+	if err != nil {
+		t.Errorf("Got unexpected error: %v", err)
+	}
+
+	t.Logf("getEchoContent/HelloWorld size %d", len(output.Bytes()))
+
+	echo, err := getEchoContent(output.Bytes())
+	if err != nil || echo != "HelloWorld" {
+		t.Fatalf("getEchoContent/HelloWorld check failed on %v", output)
+	}
+
+	if resp.StatusCode != http.StatusOK {
+		t.Fatalf("StatusCode check failed on %v", resp.StatusCode)
+	}
+}
+
+func TestCanExecuteTooBigOutput(t *testing.T) {
+	s := apiutils.SetupHarness()
+	s.GivenAppExists(t, &sdkmodels.App{Name: s.AppName})
+	defer s.Cleanup()
+
+	rt := s.BasicRoute()
+	rt.Image = "fnproject/fn-test-utils"
+	rt.Format = "json"
+	rt.Memory = 64
+	rt.Type = "sync"
+
+	s.GivenRouteExists(t, s.AppName, rt)
+
+	lb, err := LB()
+	if err != nil {
+		t.Fatalf("Got unexpected error: %v", err)
+	}
+	u := url.URL{
+		Scheme: "http",
+		Host:   lb,
+	}
+	u.Path = path.Join(u.Path, "r", s.AppName, s.RoutePath)
+
+	// > 6MB output
+	body := `{"echoContent": "HelloWorld", "sleepTime": 0, "isDebug": true, "trailerRepeat": 600000}`
+	content := bytes.NewBuffer([]byte(body))
+	output := &bytes.Buffer{}
+
+	resp, err := apiutils.CallFN(u.String(), content, output, "POST", []string{})
+	if err != nil {
+		t.Errorf("Got unexpected error: %v", err)
+	}
+
+	exp := "{\"error\":{\"message\":\"function response too large\"}}\n"
+	actual := output.String()
+
+	if !strings.Contains(exp, actual) || len(exp) != len(actual) {
+		t.Errorf("Assertion error.\n\tExpected: %v\n\tActual: %v", exp, output.String())
+	}
+
+	if resp.StatusCode != http.StatusBadGateway {
+		t.Fatalf("StatusCode check failed on %v", resp.StatusCode)
+	}
+}
+
+func TestCanExecuteEmptyOutput(t *testing.T) {
+	s := apiutils.SetupHarness()
+	s.GivenAppExists(t, &sdkmodels.App{Name: s.AppName})
+	defer s.Cleanup()
+
+	rt := s.BasicRoute()
+	rt.Image = "fnproject/fn-test-utils"
+	rt.Format = "json"
+	rt.Memory = 64
+	rt.Type = "sync"
+
+	s.GivenRouteExists(t, s.AppName, rt)
+
+	lb, err := LB()
+	if err != nil {
+		t.Fatalf("Got unexpected error: %v", err)
+	}
+	u := url.URL{
+		Scheme: "http",
+		Host:   lb,
+	}
+	u.Path = path.Join(u.Path, "r", s.AppName, s.RoutePath)
+
+	// empty body output
+	body := `{"sleepTime": 0, "isDebug": true, "isEmptyBody": true}`
+	content := bytes.NewBuffer([]byte(body))
+	output := &bytes.Buffer{}
+
+	resp, err := apiutils.CallFN(u.String(), content, output, "POST", []string{})
+	if err != nil {
+		t.Errorf("Got unexpected error: %v", err)
+	}
+
+	actual := output.String()
+
+	if 0 != len(actual) {
+		t.Errorf("Assertion error.\n\tExpected empty\n\tActual: %v", output.String())
+	}
+
+	if resp.StatusCode != http.StatusOK {
+		t.Fatalf("StatusCode check failed on %v", resp.StatusCode)
+	}
 }

 func TestBasicConcurrentExecution(t *testing.T) {
-	SystemTweaker().ChangeNodeCapacities(512)
-	defer SystemTweaker().RestoreInitialNodeCapacities()

 	s := apiutils.SetupHarness()

@@ -96,6 +234,7 @@ func TestBasicConcurrentExecution(t *testing.T) {
 	rt := s.BasicRoute()
 	rt.Image = "fnproject/fn-test-utils"
 	rt.Format = "json"
+	rt.Memory = 32
 	rt.Type = "sync"

 	s.GivenRouteExists(t, s.AppName, rt)
@@ -117,7 +256,7 @@ func TestBasicConcurrentExecution(t *testing.T) {
 			body := `{"echoContent": "HelloWorld", "sleepTime": 0, "isDebug": true}`
 			content := bytes.NewBuffer([]byte(body))
 			output := &bytes.Buffer{}
-			_, err = apiutils.CallFN(u.String(), content, output, "POST", []string{})
+			resp, err := apiutils.CallFN(u.String(), content, output, "POST", []string{})
 			if err != nil {
 				results <- fmt.Errorf("Got unexpected error: %v", err)
 				return
@@ -128,6 +267,10 @@ func TestBasicConcurrentExecution(t *testing.T) {
 				results <- fmt.Errorf("Assertion error.\n\tActual: %v", output.String())
 				return
 			}
+			if resp.StatusCode != http.StatusOK {
+				results <- fmt.Errorf("StatusCode check failed on %v", resp.StatusCode)
+				return
+			}

 			results <- nil
 		}()
@@ -142,18 +285,19 @@ func TestBasicConcurrentExecution(t *testing.T) {
 }

 func TestSaturatedSystem(t *testing.T) {
-	// Set the capacity to 0 so we always look out of capacity.
-	SystemTweaker().ChangeNodeCapacities(0)
-	defer SystemTweaker().RestoreInitialNodeCapacities()

 	s := apiutils.SetupHarness()

 	s.GivenAppExists(t, &sdkmodels.App{Name: s.AppName})
 	defer s.Cleanup()

+	timeout := int32(5)
+
 	rt := s.BasicRoute()
 	rt.Image = "fnproject/fn-test-utils"
 	rt.Format = "json"
+	rt.Timeout = &timeout
+	rt.Memory = 300
 	rt.Type = "sync"

 	s.GivenRouteExists(t, s.AppName, rt)
@@ -172,15 +316,28 @@ func TestSaturatedSystem(t *testing.T) {
 	content := bytes.NewBuffer([]byte(body))
 	output := &bytes.Buffer{}

-	_, err = apiutils.CallFN(u.String(), content, output, "POST", []string{})
+	resp, err := apiutils.CallFN(u.String(), content, output, "POST", []string{})
 	if err != nil {
 		if err != apimodels.ErrCallTimeoutServerBusy {
 			t.Errorf("Got unexpected error: %v", err)
 		}
 	}
-	expectedOutput := "{\"error\":{\"message\":\"Timed out - server too busy\"}}\n"
+
+	// LB may respond either with:
+	//  timeout: a timeout during a call to a runner
+	//  too busy: a timeout during LB retry loop
+	exp1 := "{\"error\":{\"message\":\"Timed out - server too busy\"}}\n"
+	exp2 := "{\"error\":{\"message\":\"Timed out\"}}\n"
+
 	actual := output.String()
-	if !strings.Contains(expectedOutput, actual) || len(expectedOutput) != len(actual) {
-		t.Errorf("Assertion error.\n\tExpected: %v\n\tActual: %v", expectedOutput, output.String())
+
+	if strings.Contains(exp1, actual) && len(exp1) == len(actual) {
+	} else if strings.Contains(exp2, actual) && len(exp2) == len(actual) {
+	} else {
+		t.Errorf("Assertion error.\n\tExpected: %v or %v\n\tActual: %v", exp1, exp2, output.String())
+	}
+
+	if resp.StatusCode != http.StatusServiceUnavailable && resp.StatusCode != http.StatusGatewayTimeout {
+		t.Fatalf("StatusCode check failed on %v", resp.StatusCode)
 	}
 }
--- a/test/fn-system-tests/system_test.go
+++ b/test/fn-system-tests/system_test.go
@@ -7,7 +7,6 @@ import (

 	"github.com/fnproject/fn/api/agent"
 	"github.com/fnproject/fn/api/agent/hybrid"
-	"github.com/fnproject/fn/api/models"
 	pool "github.com/fnproject/fn/api/runnerpool"
 	"github.com/fnproject/fn/api/server"

@@ -18,7 +17,6 @@ import (
 	"os"
 	"strconv"
 	"strings"
-	"sync"
 	"testing"
 	"time"
 )
@@ -37,35 +35,42 @@ func NewSystemTestNodePool() (pool.RunnerPool, error) {
 	return agent.DefaultStaticRunnerPool(runners), nil
 }

-func SetUpSystem() error {
+type state struct {
+	memory string
+}
+
+func SetUpSystem() (*state, error) {
 	ctx := context.Background()
+	state := &state{}

 	api, err := SetUpAPINode(ctx)
 	if err != nil {
-		return err
+		return state, err
 	}
 	logrus.Info("Created API node")

 	lb, err := SetUpLBNode(ctx)
 	if err != nil {
-		return err
+		return state, err
 	}
 	logrus.Info("Created LB node")

-	pr0, nc0, err := SetUpPureRunnerNode(ctx, 0)
+	state.memory = os.Getenv(agent.EnvMaxTotalMemory)
+	os.Setenv(agent.EnvMaxTotalMemory, strconv.FormatUint(256*1024*1024, 10))
+
+	pr0, err := SetUpPureRunnerNode(ctx, 0)
 	if err != nil {
-		return err
+		return state, err
 	}
-	pr1, nc1, err := SetUpPureRunnerNode(ctx, 1)
+	pr1, err := SetUpPureRunnerNode(ctx, 1)
 	if err != nil {
-		return err
+		return state, err
 	}
-	pr2, nc2, err := SetUpPureRunnerNode(ctx, 2)
+	pr2, err := SetUpPureRunnerNode(ctx, 2)
 	if err != nil {
-		return err
+		return state, err
 	}
 	logrus.Info("Created Pure Runner nodes")
-	internalSystemTweaker.nodeCaps = []*testCapacityGate{nc0, nc1, nc2}

 	go func() { api.Start(ctx) }()
 	logrus.Info("Started API node")
@@ -77,10 +82,10 @@ func SetUpSystem() error {
 	logrus.Info("Started Pure Runner nodes")
 	// Wait for init - not great
 	time.Sleep(5 * time.Second)
-	return nil
+	return state, nil
 }

-func CleanUpSystem() error {
+func CleanUpSystem(st *state) error {
 	_, err := http.Get("http://127.0.0.1:8081/shutdown")
 	if err != nil {
 		return err
@@ -103,6 +108,13 @@ func CleanUpSystem() error {
 	}
 	// Wait for shutdown - not great
 	time.Sleep(5 * time.Second)
+
+	if st.memory != "" {
+		os.Setenv(agent.EnvMaxTotalMemory, st.memory)
+	} else {
+		os.Unsetenv(agent.EnvMaxTotalMemory)
+	}
+
 	return nil
 }

@@ -116,7 +128,7 @@ func SetUpAPINode(ctx context.Context) (*server.Server, error) {
 	opts = append(opts, server.WithWebPort(8085))
 	opts = append(opts, server.WithType(nodeType))
 	opts = append(opts, server.WithLogLevel(getEnv(server.EnvLogLevel, server.DefaultLogLevel)))
-	opts = append(opts, server.WithLogDest(server.DefaultLogDest, "API"))
+	opts = append(opts, server.WithLogDest(getEnv(server.EnvLogDest, server.DefaultLogDest), "API"))
 	opts = append(opts, server.WithDBURL(getEnv(server.EnvDBURL, defaultDB)))
 	opts = append(opts, server.WithMQURL(getEnv(server.EnvMQURL, defaultMQ)))
 	opts = append(opts, server.WithLogURL(""))
@@ -131,7 +143,7 @@ func SetUpLBNode(ctx context.Context) (*server.Server, error) {
 	opts = append(opts, server.WithWebPort(8081))
 	opts = append(opts, server.WithType(nodeType))
 	opts = append(opts, server.WithLogLevel(getEnv(server.EnvLogLevel, server.DefaultLogLevel)))
-	opts = append(opts, server.WithLogDest(server.DefaultLogDest, "LB"))
+	opts = append(opts, server.WithLogDest(getEnv(server.EnvLogDest, server.DefaultLogDest), "LB"))
 	opts = append(opts, server.WithDBURL(""))
 	opts = append(opts, server.WithMQURL(""))
 	opts = append(opts, server.WithLogURL(""))
@@ -156,85 +168,14 @@ func SetUpLBNode(ctx context.Context) (*server.Server, error) {
 	return server.New(ctx, opts...), nil
 }

-type testCapacityGate struct {
-	runnerNumber           int
-	committedCapacityUnits uint64
-	maxCapacityUnits       uint64
-	mtx                    sync.Mutex
-}
-
-const (
-	InitialTestCapacityUnitsPerRunner = 1024
-)
-
-func NewTestCapacityGate(nodeNum int, capacity uint64) *testCapacityGate {
-	return &testCapacityGate{
-		runnerNumber:           nodeNum,
-		maxCapacityUnits:       capacity,
-		committedCapacityUnits: 0,
-	}
-}
-
-func (tcg *testCapacityGate) CheckAndReserveCapacity(units uint64) error {
-	tcg.mtx.Lock()
-	defer tcg.mtx.Unlock()
-	if tcg.committedCapacityUnits+units <= tcg.maxCapacityUnits {
-		logrus.WithField("nodeNumber", tcg.runnerNumber).WithField("units", units).WithField("currentlyCommitted", tcg.committedCapacityUnits).Info("Runner is committing capacity")
-		tcg.committedCapacityUnits = tcg.committedCapacityUnits + units
-		return nil
-	}
-	logrus.WithField("nodeNumber", tcg.runnerNumber).WithField("currentlyCommitted", tcg.committedCapacityUnits).Debug("Runner is out of capacity")
-	return models.ErrCallTimeoutServerBusy
-}
-
-func (tcg *testCapacityGate) ReleaseCapacity(units uint64) {
-	tcg.mtx.Lock()
-	defer tcg.mtx.Unlock()
-	if units <= tcg.committedCapacityUnits {
-		logrus.WithField("nodeNumber", tcg.runnerNumber).WithField("units", units).WithField("currentlyCommitted", tcg.committedCapacityUnits).Info("Runner is releasing capacity")
-		tcg.committedCapacityUnits = tcg.committedCapacityUnits - units
-		return
-	}
-	panic("Fatal error in test capacity calculation, getting to sub-zero capacity")
-}
-
-func (tcg *testCapacityGate) ChangeMaxCapacity(newCapacity uint64) {
-	tcg.mtx.Lock()
-	defer tcg.mtx.Unlock()
-	logrus.WithField("nodeNumber", tcg.runnerNumber).WithField("oldCapacity", tcg.maxCapacityUnits).WithField("newCapacity", newCapacity).Info("Runner is changing max capacity")
-	tcg.maxCapacityUnits = newCapacity
-}
-
-type systemTweaker struct {
-	nodeCaps []*testCapacityGate
-}
-
-var internalSystemTweaker systemTweaker
-
-func SystemTweaker() *systemTweaker {
-	return &internalSystemTweaker
-}
-
-func (twk *systemTweaker) ChangeNodeCapacities(newCapacity uint64) {
-	for _, nc := range twk.nodeCaps {
-		nc.ChangeMaxCapacity(newCapacity)
-	}
-}
-
-func (twk *systemTweaker) RestoreInitialNodeCapacities() {
-	for _, nc := range twk.nodeCaps {
-		nc.ChangeMaxCapacity(InitialTestCapacityUnitsPerRunner)
-	}
-}
-
-func SetUpPureRunnerNode(ctx context.Context, nodeNum int) (*server.Server, *testCapacityGate, error) {
+func SetUpPureRunnerNode(ctx context.Context, nodeNum int) (*server.Server, error) {
 	nodeType := server.ServerTypePureRunner
 	opts := make([]server.ServerOption, 0)
 	opts = append(opts, server.WithWebPort(8082+nodeNum))
 	opts = append(opts, server.WithGRPCPort(9190+nodeNum))
 	opts = append(opts, server.WithType(nodeType))
 	opts = append(opts, server.WithLogLevel(getEnv(server.EnvLogLevel, server.DefaultLogLevel)))
-	opts = append(opts, server.WithLogDest(server.DefaultLogDest, "PURE-RUNNER"))
+	opts = append(opts, server.WithLogDest(getEnv(server.EnvLogDest, server.DefaultLogDest), "PURE-RUNNER"))
 	opts = append(opts, server.WithDBURL(""))
 	opts = append(opts, server.WithMQURL(""))
 	opts = append(opts, server.WithLogURL(""))
@@ -242,18 +183,18 @@ func SetUpPureRunnerNode(ctx context.Context, nodeNum int) (*server.Server, *tes

 	ds, err := hybrid.NewNopDataStore()
 	if err != nil {
-		return nil, nil, err
+		return nil, err
 	}
 	grpcAddr := fmt.Sprintf(":%d", 9190+nodeNum)
 	cancelCtx, cancel := context.WithCancel(ctx)
-	capacityGate := NewTestCapacityGate(nodeNum, InitialTestCapacityUnitsPerRunner)
-	prAgent, err := agent.NewPureRunner(cancel, grpcAddr, ds, "", "", "", capacityGate)
+
+	prAgent, err := agent.NewPureRunner(cancel, grpcAddr, ds, "", "", "", nil)
 	if err != nil {
-		return nil, nil, err
+		return nil, err
 	}
 	opts = append(opts, server.WithAgent(prAgent), server.WithExtraCtx(cancelCtx))

-	return server.New(ctx, opts...), capacityGate, nil
+	return server.New(ctx, opts...), nil
 }

 func pwd() string {
@@ -308,20 +249,15 @@ func whoAmI() net.IP {
 	return nil
 }

-func TestCanInstantiateSystem(t *testing.T) {
-	SystemTweaker().ChangeNodeCapacities(128)
-	defer SystemTweaker().RestoreInitialNodeCapacities()
-}
-
 func TestMain(m *testing.M) {
-	err := SetUpSystem()
+	state, err := SetUpSystem()
 	if err != nil {
 		logrus.WithError(err).Fatal("Could not initialize system")
 		os.Exit(1)
 	}
 	// call flag.Parse() here if TestMain uses flags
 	result := m.Run()
-	err = CleanUpSystem()
+	err = CleanUpSystem(state)
 	if err != nil {
 		logrus.WithError(err).Warn("Could not clean up system")
 	}