From e9d5221e15b842d28da79d8646bc9dc1d131b0cb Mon Sep 17 00:00:00 2001
From: Tolga Ceylan
Date: Mon, 16 Jul 2018 18:33:23 -0700
Subject: [PATCH] fn: Status gRPC call timeout handling (#1125)

Status calls should not directly use client gRPC context deadlines/timeouts
during Status execution. Status should allow plenty of time for the scheduler
agent and docker to run and emit useful error information. This sets the
timeout to 60 seconds, which should be enough to surface disk I/O, docker,
and similar issues.
---
 api/agent/pure_runner.go | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/api/agent/pure_runner.go b/api/agent/pure_runner.go
index f32c2bd2f..89a43b000 100644
--- a/api/agent/pure_runner.go
+++ b/api/agent/pure_runner.go
@@ -501,6 +501,10 @@ const (
 	StatusCallTimeout       = int32(5)
 	StatusCallIdleTimeout   = int32(1)
 	StatusCallCacheDuration = time.Duration(500)*time.Millisecond + time.Duration(StatusCallIdleTimeout)*time.Second
+
+	// Total context timeout (scheduler + execution). We need to allocate plenty of time here.
+	// 60 seconds should be enough to surface disk I/O errors, docker timeouts, etc.
+	StatusCtxTimeout = time.Duration(60 * time.Second)
 )
 
 // statusTracker maintains cache data/state/locks for Status Call invocations.
@@ -668,7 +672,16 @@ DataLoop:
 }
 
 // Runs a status call using status image with baked in parameters.
-func (pr *pureRunner) runStatusCall(ctx context.Context, timeout, idleTimeout int32) *runner.RunnerStatus {
+func (pr *pureRunner) runStatusCall(ctx context.Context) *runner.RunnerStatus {
+
+	// IMPORTANT: We have to use our own context with a preset timeout in order
+	// to ignore client timeouts. The original 'ctx' here carries client-side deadlines.
+	// Since these deadlines can vary, we use the hardcoded 'StatusCtxTimeout' instead
+	// to make sure Status runs predictably.
+	// TODO: It would be good to copy the original ctx key/value pairs into our new
+	// context for tracking, etc., but the Go context package does not currently allow this.
+	execCtx, execCtxCancel := context.WithTimeout(context.Background(), StatusCtxTimeout)
+	defer execCtxCancel()
 
 	result := &runner.RunnerStatus{}
 	log := common.Logger(ctx)
@@ -693,19 +706,19 @@ func (pr *pureRunner) runStatusCall(ctx context.Context, timeout, idleTimeout in
 	c.Config = make(models.Config)
 	c.Config["FN_FORMAT"] = c.Format
 	c.Payload = "{}"
-	c.Timeout = timeout
-	c.IdleTimeout = idleTimeout
+	c.Timeout = StatusCallTimeout
+	c.IdleTimeout = StatusCallIdleTimeout
 
 	// TODO: reliably shutdown this container after executing one request.
-	log.Debugf("Running status call with id=%v timeout=%v image=%v", c.ID, c.Timeout, c.Image)
+	log.Debugf("Running status call with id=%v image=%v", c.ID, c.Image)
 
 	recorder := httptest.NewRecorder()
 	player := ioutil.NopCloser(strings.NewReader(c.Payload))
 
 	agent_call, err := pr.a.GetCall(FromModelAndInput(&c, player),
 		WithWriter(recorder),
-		WithContext(ctx),
+		WithContext(execCtx),
 	)
 
 	if err == nil {
@@ -810,7 +823,7 @@ func (pr *pureRunner) handleStatusCall(ctx context.Context) (*runner.RunnerStatu
 		return &cacheObj, nil
 	}
 
-	cachePtr := pr.runStatusCall(ctx, StatusCallTimeout, StatusCallIdleTimeout)
+	cachePtr := pr.runStatusCall(ctx)
 	cachePtr.Active = atomic.LoadInt32(&pr.status.inflight)
 
 	now = time.Now()
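
Note (illustration, not part of the patch): the standalone sketch below shows the pattern this change applies, namely running status work under a fresh context with a fixed timeout so that a short client-side gRPC deadline cannot cut the check short. The names statusTimeout, runStatus, and checkDisk, and the 1s/2s durations, are made up for the demo and do not exist in pure_runner.go.

package main

import (
	"context"
	"fmt"
	"time"
)

// statusTimeout plays the role of StatusCtxTimeout in the patch: a fixed
// budget for the whole status check, independent of any client deadline.
const statusTimeout = 60 * time.Second

// runStatus demonstrates the pattern: the inbound ctx may carry a short
// client-side deadline, so the actual work runs under a fresh context
// derived from context.Background() with a hardcoded timeout.
func runStatus(ctx context.Context) error {
	if dl, ok := ctx.Deadline(); ok {
		// The client deadline is observed but deliberately not inherited.
		fmt.Println("ignoring client deadline in", time.Until(dl).Round(time.Millisecond))
	}

	// Deliberately NOT derived from ctx: deriving from ctx would also
	// inherit its (possibly very short) deadline.
	execCtx, cancel := context.WithTimeout(context.Background(), statusTimeout)
	defer cancel()

	return checkDisk(execCtx)
}

// checkDisk stands in for the real work (docker, disk I/O, etc.) and simply
// takes two seconds unless its context expires first.
func checkDisk(ctx context.Context) error {
	select {
	case <-time.After(2 * time.Second):
		return nil
	case <-ctx.Done():
		return fmt.Errorf("status check aborted: %v", ctx.Err())
	}
}

func main() {
	// Simulate a client that only allows one second for the call.
	clientCtx, cancel := context.WithTimeout(context.Background(), 1*time.Second)
	defer cancel()

	// The check still completes because runStatus uses its own 60s budget
	// rather than the client's 1s deadline.
	if err := runStatus(clientCtx); err != nil {
		fmt.Println("status error:", err)
		return
	}
	fmt.Println("status ok despite the 1s client deadline")
}

The trade-off, called out in the patch's own TODO, is that values stored on the inbound context (request IDs, tracing metadata) are not carried over to the detached context.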