fn: user friendly timeout handling changes (#1021)

* fn: user friendly timeout handling changes Timeout setting in routes now means "maximum amount of time a function can run in a container". Total wait time for a given http request is now expected to be handled by the client. As long as the client waits, the LB, runner or agents will search for resources to schedule it.
2022-10-28 21:29:17 +03:00 · 2018-06-01 13:18:13 -07:00
parent ffefcf5773
commit a57907eed0
15 changed files with 105 additions and 138 deletions
--- a/api/agent/agent.go
+++ b/api/agent/agent.go
@@ -228,10 +228,7 @@ func (a *agent) Submit(callI Call) error {

 	call := callI.(*call)

-	ctx, cancel := context.WithDeadline(call.req.Context(), call.execDeadline)
-	call.req = call.req.WithContext(ctx)
-	defer cancel()
-
+	ctx := call.req.Context()
 	ctx, span := trace.StartSpan(ctx, "agent_submit")
 	defer span.End()

@@ -277,7 +274,11 @@ func (a *agent) submit(ctx context.Context, call *call) error {

 	statsDequeueAndStart(ctx)

-	// pass this error (nil or otherwise) to end directly, to store status, etc
+	// We are about to execute the function, set container Exec Deadline (call.Timeout)
+	ctx, cancel := context.WithTimeout(ctx, time.Duration(call.Timeout)*time.Second)
+	defer cancel()
+
+	// Pass this error (nil or otherwise) to end directly, to store status, etc.
 	err = slot.exec(ctx, call)
 	return a.handleCallEnd(ctx, call, slot, err, true)
 }
@@ -383,9 +384,17 @@ func handleStatsEnd(ctx context.Context, err error) {
 // request type, this may launch a new container or wait for other containers to become idle
 // or it may wait for resources to become available to launch a new container.
 func (a *agent) getSlot(ctx context.Context, call *call) (Slot, error) {
-	// start the deadline context for waiting for slots
-	ctx, cancel := context.WithDeadline(ctx, call.slotDeadline)
-	defer cancel()
+	if call.Type == models.TypeAsync {
+		// *) for async, slot deadline is also call.Timeout. This is because we would like to
+		// allocate enough time for docker-pull, slot-wait, docker-start, etc.
+		// and also make sure we have call.Timeout inside the container. Total time
+		// to run an async becomes 2 * call.Timeout.
+		// *) for sync, there's no slot deadline, the timeout is controlled by http-client
+		// context (or runner gRPC context)
+		tmp, cancel := context.WithTimeout(ctx, time.Duration(call.Timeout)*time.Second)
+		ctx = tmp
+		defer cancel()
+	}

 	ctx, span := trace.StartSpan(ctx, "agent_get_slot")
 	defer span.End()
@@ -721,7 +730,7 @@ func (s *hotSlot) exec(ctx context.Context, call *call) error {

 	errApp := make(chan error, 1)
 	go func() {
-		ci := protocol.NewCallInfo(call.IsCloudEvent, call.Call, call.req)
+		ci := protocol.NewCallInfo(call.IsCloudEvent, call.Call, call.req.WithContext(ctx))
 		errApp <- proto.Dispatch(ctx, ci, call.w)
 	}()

@@ -752,8 +761,10 @@ func (a *agent) prepCold(ctx context.Context, call *call, tok ResourceToken, ch

 	call.containerState.UpdateState(ctx, ContainerStateStart, call.slots)

+	deadline := time.Now().Add(time.Duration(call.Timeout) * time.Second)
+
 	// add Fn-specific information to the config to shove everything into env vars for cold
-	call.Config["FN_DEADLINE"] = strfmt.DateTime(call.execDeadline).String()
+	call.Config["FN_DEADLINE"] = strfmt.DateTime(deadline).String()
 	call.Config["FN_METHOD"] = call.Model().Method
 	call.Config["FN_REQUEST_URL"] = call.Model().URL
 	call.Config["FN_CALL_ID"] = call.Model().ID
--- a/api/agent/call.go
+++ b/api/agent/call.go
@@ -254,40 +254,36 @@ func (a *agent) GetCall(opts ...CallOpt) (Call, error) {
 	}

 	mem := c.Memory + uint64(c.TmpFsSize)
-
 	if !a.resources.IsResourcePossible(mem, uint64(c.CPUs), c.Type == models.TypeAsync) {
 		// if we're not going to be able to run this call on this machine, bail here.
 		return nil, models.ErrCallTimeoutServerBusy
 	}
+
 	err := setMaxBodyLimit(&a.cfg, &c)
 	if err != nil {
 		return nil, err
 	}

+	setupCtx(&c)
+
 	c.da = a.da
 	c.ct = a
-
-	ctx, _ := common.LoggerWithFields(c.req.Context(),
-		logrus.Fields{"id": c.ID, "app_id": c.AppID, "route": c.Path})
-	c.req = c.req.WithContext(ctx)
-
-	c.stderr = setupLogger(ctx, a.cfg.MaxLogSize, c.Call)
+	c.stderr = setupLogger(c.req.Context(), a.cfg.MaxLogSize, c.Call)
 	if c.w == nil {
 		// send STDOUT to logs if no writer given (async...)
 		// TODO we could/should probably make this explicit to GetCall, ala 'WithLogger', but it's dupe code (who cares?)
 		c.w = c.stderr
 	}

-	now := time.Now()
-	slotDeadline := now.Add(time.Duration(c.Call.Timeout) * time.Second / 2)
-	execDeadline := now.Add(time.Duration(c.Call.Timeout) * time.Second)
-
-	c.slotDeadline = slotDeadline
-	c.execDeadline = execDeadline
-
 	return &c, nil
 }

+func setupCtx(c *call) {
+	ctx, _ := common.LoggerWithFields(c.req.Context(),
+		logrus.Fields{"id": c.ID, "app_id": c.AppID, "route": c.Path})
+	c.req = c.req.WithContext(ctx)
+}
+
 func setMaxBodyLimit(cfg *AgentConfig, c *call) error {
 	if cfg.MaxRequestSize > 0 && c.req.ContentLength > 0 && uint64(c.req.ContentLength) > cfg.MaxRequestSize {
 		return models.ErrRequestContentTooBig
@@ -310,16 +306,10 @@ type call struct {
 	stderr         io.ReadWriteCloser
 	ct             callTrigger
 	slots          *slotQueue
-	slotDeadline   time.Time
-	lbDeadline     time.Time
-	execDeadline   time.Time
 	requestState   RequestState
 	containerState ContainerState
 	slotHashId     string
-}
-
-func (c *call) LbDeadline() time.Time {
-	return c.lbDeadline
+	isLB           bool
 }

 func (c *call) SlotHashId() string {
@@ -358,8 +348,7 @@ func (c *call) Start(ctx context.Context) error {
 	c.StartedAt = strfmt.DateTime(time.Now())
 	c.Status = "running"

-	// Do not write this header if lb-agent
-	if c.lbDeadline.IsZero() {
+	if !c.isLB {
 		if rw, ok := c.w.(http.ResponseWriter); ok { // TODO need to figure out better way to wire response headers in
 			rw.Header().Set("XXX-FXLB-WAIT", time.Time(c.StartedAt).Sub(time.Time(c.CreatedAt)).String())
 		}
--- a/api/agent/lb_agent.go
+++ b/api/agent/lb_agent.go
@@ -7,7 +7,6 @@ import (
 	"io"
 	"io/ioutil"
 	"sync/atomic"
-	"time"

 	"github.com/sirupsen/logrus"
 	"go.opencensus.io/trace"
@@ -104,20 +103,18 @@ func (a *lbAgent) GetCall(opts ...CallOpt) (Call, error) {
 	if c.req == nil || c.Call == nil {
 		return nil, errors.New("no model or request provided for call")
 	}
+
 	err := setMaxBodyLimit(&a.cfg, &c)
 	if err != nil {
 		return nil, err
 	}

+	setupCtx(&c)
+
+	c.isLB = true
 	c.da = a.da
 	c.ct = a
 	c.stderr = &nullReadWriter{}
-
-	ctx, _ := common.LoggerWithFields(c.req.Context(),
-		logrus.Fields{"id": c.ID, "app_id": c.AppID, "route": c.Path})
-	c.req = c.req.WithContext(ctx)
-
-	c.lbDeadline = time.Now().Add(time.Duration(c.Call.Timeout) * time.Second)
 	c.slotHashId = getSlotQueueKey(&c)

 	return &c, nil
@@ -157,12 +154,7 @@ func (a *lbAgent) Submit(callI Call) error {
 	}

 	call := callI.(*call)
-
-	ctx, cancel := context.WithDeadline(call.req.Context(), call.lbDeadline)
-	call.req = call.req.WithContext(ctx)
-	defer cancel()
-
-	ctx, span := trace.StartSpan(ctx, "agent_submit")
+	ctx, span := trace.StartSpan(call.req.Context(), "agent_submit")
 	defer span.End()

 	statsEnqueue(ctx)
--- a/api/agent/lb_agent_test.go
+++ b/api/agent/lb_agent_test.go
@@ -117,7 +117,6 @@ func (r *mockRunner) Address() string {
 }

 type mockRunnerCall struct {
-	lbDeadline time.Time
 	r          *http.Request
 	rw         http.ResponseWriter
 	stdErr     io.ReadWriteCloser
@@ -125,10 +124,6 @@ type mockRunnerCall struct {
 	slotHashId string
 }

-func (c *mockRunnerCall) LbDeadline() time.Time {
-	return c.lbDeadline
-}
-
 func (c *mockRunnerCall) SlotHashId() string {
 	return c.slotHashId
 }
@@ -157,8 +152,10 @@ func setupMockRunnerPool(expectedRunners []string, execSleep time.Duration, maxC
 func TestOneRunner(t *testing.T) {
 	placer := pool.NewNaivePlacer()
 	rp := setupMockRunnerPool([]string{"171.19.0.1"}, 10*time.Millisecond, 5)
-	call := &mockRunnerCall{lbDeadline: time.Now().Add(1 * time.Second)}
-	err := placer.PlaceCall(rp, context.Background(), call)
+	call := &mockRunnerCall{}
+	ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(1*time.Second))
+	defer cancel()
+	err := placer.PlaceCall(rp, ctx, call)
 	if err != nil {
 		t.Fatalf("Failed to place call on runner %v", err)
 	}
@@ -167,7 +164,7 @@ func TestOneRunner(t *testing.T) {
 func TestEnforceTimeoutFromContext(t *testing.T) {
 	placer := pool.NewNaivePlacer()
 	rp := setupMockRunnerPool([]string{"171.19.0.1"}, 10*time.Millisecond, 5)
-	call := &mockRunnerCall{lbDeadline: time.Now().Add(1 * time.Second)}
+	call := &mockRunnerCall{}
 	ctx, cancel := context.WithDeadline(context.Background(), time.Now())
 	defer cancel()
 	err := placer.PlaceCall(rp, ctx, call)
@@ -187,8 +184,10 @@ func TestRRRunner(t *testing.T) {
 		wg.Add(1)
 		go func(i int) {
 			defer wg.Done()
-			call := &mockRunnerCall{lbDeadline: time.Now().Add(10 * time.Millisecond)}
-			err := placer.PlaceCall(rp, context.Background(), call)
+			ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(10*time.Millisecond))
+			defer cancel()
+			call := &mockRunnerCall{}
+			err := placer.PlaceCall(rp, ctx, call)
 			if err != nil {
 				failures <- fmt.Errorf("Timed out call %d", i)
 			}
@@ -218,8 +217,11 @@ func TestEnforceLbTimeout(t *testing.T) {
 		wg.Add(1)
 		go func(i int) {
 			defer wg.Done()
-			call := &mockRunnerCall{lbDeadline: time.Now().Add(10 * time.Millisecond)}
-			err := placer.PlaceCall(rp, context.Background(), call)
+			ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(10*time.Millisecond))
+			defer cancel()
+
+			call := &mockRunnerCall{}
+			err := placer.PlaceCall(rp, ctx, call)
 			if err != nil {
 				failures <- fmt.Errorf("Timed out call %d", i)
 			}
--- a/api/agent/protocol/factory.go
+++ b/api/agent/protocol/factory.go
@@ -5,7 +5,6 @@ import (
 	"errors"
 	"io"
 	"net/http"
-	"time"

 	"github.com/fnproject/fn/api/models"
 	"github.com/go-openapi/strfmt"
@@ -79,16 +78,8 @@ func (ci callInfoImpl) Input() io.Reader {
 func (ci callInfoImpl) Deadline() strfmt.DateTime {
 	deadline, ok := ci.req.Context().Deadline()
 	if !ok {
-		// In theory deadline must have been set here, but if it wasn't then
-		// at this point it is already too late to raise an error. Set it to
-		// something meaningful.
-		// This assumes StartedAt was set to something other than the default.
-		// If that isn't set either, then how many things have gone wrong?
-		if ci.call.StartedAt == strfmt.NewDateTime() {
-			// We just panic if StartedAt is the default (i.e. not set)
-			panic("No context deadline and zero-value StartedAt - this should never happen")
-		}
-		deadline = ((time.Time)(ci.call.StartedAt)).Add(time.Duration(ci.call.Timeout) * time.Second)
+		// In theory deadline must have been set here
+		panic("No context deadline is set in protocol, should never happen")
 	}
 	return strfmt.DateTime(deadline)
 }
--- a/api/agent/protocol/json_test.go
+++ b/api/agent/protocol/json_test.go
@@ -2,12 +2,14 @@ package protocol

 import (
 	"bytes"
+	"context"
 	"encoding/json"
 	"io"
 	"io/ioutil"
 	"net/http"
 	"net/url"
 	"testing"
+	"time"

 	"github.com/fnproject/fn/api/models"
 )
@@ -16,7 +18,7 @@ type RequestData struct {
 	A string `json:"a"`
 }

-func setupRequest(data interface{}) *callInfoImpl {
+func setupRequest(data interface{}) (*callInfoImpl, context.CancelFunc) {
 	req := &http.Request{
 		Method: http.MethodPost,
 		URL: &url.URL{
@@ -46,12 +48,14 @@ func setupRequest(data interface{}) *callInfoImpl {
 	// fixup URL in models.Call
 	call.URL = req.URL.String()

-	ci := &callInfoImpl{call: call, req: req}
-	return ci
+	ctx, cancel := context.WithTimeout(req.Context(), 1*time.Second)
+	ci := &callInfoImpl{call: call, req: req.WithContext(ctx)}
+	return ci, cancel
 }

 func TestJSONProtocolwriteJSONInputRequestBasicFields(t *testing.T) {
-	ci := setupRequest(nil)
+	ci, cancel := setupRequest(nil)
+	defer cancel()
 	r, w := io.Pipe()
 	proto := JSONProtocol{w, r}
 	go func() {
@@ -88,7 +92,8 @@ func TestJSONProtocolwriteJSONInputRequestBasicFields(t *testing.T) {

 func TestJSONProtocolwriteJSONInputRequestWithData(t *testing.T) {
 	rDataBefore := RequestData{A: "a"}
-	ci := setupRequest(rDataBefore)
+	ci, cancel := setupRequest(rDataBefore)
+	defer cancel()
 	r, w := io.Pipe()
 	proto := JSONProtocol{w, r}
 	go func() {
@@ -133,7 +138,8 @@ func TestJSONProtocolwriteJSONInputRequestWithData(t *testing.T) {
 }

 func TestJSONProtocolwriteJSONInputRequestWithoutData(t *testing.T) {
-	ci := setupRequest(nil)
+	ci, cancel := setupRequest(nil)
+	defer cancel()
 	r, w := io.Pipe()
 	proto := JSONProtocol{w, r}
 	go func() {
@@ -177,7 +183,8 @@ func TestJSONProtocolwriteJSONInputRequestWithoutData(t *testing.T) {
 }

 func TestJSONProtocolwriteJSONInputRequestWithQuery(t *testing.T) {
-	ci := setupRequest(nil)
+	ci, cancel := setupRequest(nil)
+	defer cancel()
 	r, w := io.Pipe()
 	proto := JSONProtocol{w, r}
 	go func() {
--- a/api/agent/pure_runner.go
+++ b/api/agent/pure_runner.go
@@ -67,8 +67,7 @@ type callHandle struct {
 	c          *call // the agent's version of call

 	// Timings, for metrics:
-	receivedTime  strfmt.DateTime // When was the call received?
-	allocatedTime strfmt.DateTime // When did we finish allocating capacity?
+	receivedTime strfmt.DateTime // When was the call received?

 	// For implementing http.ResponseWriter:
 	headers http.Header
@@ -530,7 +529,7 @@ func (pr *pureRunner) handleTryCall(tc *runner.TryCall, state *callHandle) error
 		return err
 	}

-	agent_call, err := pr.a.GetCall(FromModelAndInput(&c, state.pipeToFnR), WithWriter(state))
+	agent_call, err := pr.a.GetCall(FromModelAndInput(&c, state.pipeToFnR), WithWriter(state), WithContext(state.ctx))
 	if err != nil {
 		state.enqueueCallResponse(err)
 		return err
@@ -545,7 +544,6 @@ func (pr *pureRunner) handleTryCall(tc *runner.TryCall, state *callHandle) error
 		}
 		state.c.slotHashId = string(hashId[:])
 	}
-	state.allocatedTime = strfmt.DateTime(time.Now())
 	pr.spawnSubmit(state)

 	return nil