fn: Status gRPC call timeout handling (#1125)

Status calls should not directly use client gRPC context deadlines/timeouts during Status execution. Status should allow plenty of time for the scheduler agent and docker to run and emit useful error information. Setting this timeout to 60 seconds, which should surface disk I/O, docker, etc. issues.
2022-10-28 21:29:17 +03:00 · 2018-07-16 18:33:23 -07:00
parent b0d27958e3
commit e9d5221e15
1 changed files with 19 additions and 6 deletions
--- a/api/agent/pure_runner.go
+++ b/api/agent/pure_runner.go
@@ -501,6 +501,10 @@ const (
 	StatusCallTimeout       = int32(5)
 	StatusCallIdleTimeout   = int32(1)
 	StatusCallCacheDuration = time.Duration(500)*time.Millisecond + time.Duration(StatusCallIdleTimeout)*time.Second
+
+	// Total context timeout (scheduler+execution.) We need to allocate plenty of time here.
+	// 60 seconds should be enough to provoke disk I/O errors, docker timeouts. etc.
+	StatusCtxTimeout = time.Duration(60 * time.Second)
 )

 // statusTracker maintains cache data/state/locks for Status Call invocations.
@@ -668,7 +672,16 @@ DataLoop:
 }

 // Runs a status call using status image with baked in parameters.
-func (pr *pureRunner) runStatusCall(ctx context.Context, timeout, idleTimeout int32) *runner.RunnerStatus {
+func (pr *pureRunner) runStatusCall(ctx context.Context) *runner.RunnerStatus {
+
+	// IMPORTANT: We have to use our own context with a set timeout in order
+	// to ignore client timeouts. Original 'ctx' here carries client side deadlines.
+	// Since these deadlines can vary, in order to make sure Status runs predictably we
+	// use a hardcoded preset timeout 'ctxTimeout' instead.
+	// TODO: It would be good to copy original ctx key/value pairs into our new
+	// context for tracking, etc. But go-lang context today does not seem to allow this.
+	execCtx, execCtxCancel := context.WithTimeout(context.Background(), StatusCtxTimeout)
+	defer execCtxCancel()

 	result := &runner.RunnerStatus{}
 	log := common.Logger(ctx)
@@ -693,19 +706,19 @@ func (pr *pureRunner) runStatusCall(ctx context.Context, timeout, idleTimeout in
 	c.Config = make(models.Config)
 	c.Config["FN_FORMAT"] = c.Format
 	c.Payload = "{}"
-	c.Timeout = timeout
-	c.IdleTimeout = idleTimeout
+	c.Timeout = StatusCallTimeout
+	c.IdleTimeout = StatusCallIdleTimeout

 	// TODO: reliably shutdown this container after executing one request.

-	log.Debugf("Running status call with id=%v timeout=%v image=%v", c.ID, c.Timeout, c.Image)
+	log.Debugf("Running status call with id=%v image=%v", c.ID, c.Image)

 	recorder := httptest.NewRecorder()
 	player := ioutil.NopCloser(strings.NewReader(c.Payload))

 	agent_call, err := pr.a.GetCall(FromModelAndInput(&c, player),
 		WithWriter(recorder),
-		WithContext(ctx),
+		WithContext(execCtx),
 	)

 	if err == nil {
@@ -810,7 +823,7 @@ func (pr *pureRunner) handleStatusCall(ctx context.Context) (*runner.RunnerStatu
 		return &cacheObj, nil
 	}

-	cachePtr := pr.runStatusCall(ctx, StatusCallTimeout, StatusCallIdleTimeout)
+	cachePtr := pr.runStatusCall(ctx)
 	cachePtr.Active = atomic.LoadInt32(&pr.status.inflight)
 	now = time.Now()