From a9a828cc4044edcea608fbf3b142bca120f4aa2f Mon Sep 17 00:00:00 2001 From: Tolga Ceylan Date: Mon, 3 Dec 2018 14:58:48 -0800 Subject: [PATCH] fn: error handling updates (#1337) * docker-pull timeout is now a 504 which classifies it as a service error. Avoid using 503 to make sure LB does not retry. * Only applicable to detached mode, a timeout on LB is now a ErrServiceReservationFailure (500). In detached mode, this is unlikely to make it back to a client and it is mostly for documentation/metrics purposes. * For Triggers, avoid scrubbing service code. --- api/agent/lb_agent.go | 16 ++++++++++++---- api/models/error.go | 7 +++++-- api/server/runner_httptrigger.go | 18 +++++++++--------- 3 files changed, 26 insertions(+), 15 deletions(-) diff --git a/api/agent/lb_agent.go b/api/agent/lb_agent.go index abb32f6a9..abba3b76f 100644 --- a/api/agent/lb_agent.go +++ b/api/agent/lb_agent.go @@ -304,6 +304,18 @@ func (a *lbAgent) handleCallEnd(ctx context.Context, call *call, err error, isFo if err == nil { statsComplete(ctx) recordCallLatency(ctx, call, completedMetricName) + } else if err == context.DeadlineExceeded { + // We are here because we were unable to service this request for the given + // reservation. In detached case, the reservation is calculated based on estimated + // total time to run a request. (See: spawnPlaceCall) Otherwise, there's no set + // deadline in the request context. This is also a bit more robust going forward + // if we start enforcing a maximum overall deadline for clients. For detached case, the + // error is unlikely to be delivered to the client since this is essentially an async + // operation. + statsTimedout(ctx) + recordCallLatency(ctx, call, timedoutMetricName) + // We have failed: http 500 Internal Server Error + return models.ErrServiceReservationFailure } } else { statsDequeue(ctx) @@ -318,10 +330,6 @@ func (a *lbAgent) handleCallEnd(ctx context.Context, call *call, err error, isFo statsTooBusy(ctx) recordCallLatency(ctx, call, serverBusyMetricName) return models.ErrCallTimeoutServerBusy - } else if err == context.DeadlineExceeded { - statsTimedout(ctx) - recordCallLatency(ctx, call, timedoutMetricName) - return models.ErrCallTimeout } else if err == context.Canceled { statsCanceled(ctx) recordCallLatency(ctx, call, canceledMetricName) diff --git a/api/models/error.go b/api/models/error.go index 05b9e2e3f..f470dfd56 100644 --- a/api/models/error.go +++ b/api/models/error.go @@ -37,7 +37,7 @@ var ( error: errors.New("Timed out - server too busy"), } ErrDockerPullTimeout = err{ - code: http.StatusServiceUnavailable, + code: http.StatusGatewayTimeout, error: errors.New("Docker pull timed out"), } ErrContainerInitTimeout = err{ @@ -188,7 +188,10 @@ var ( code: http.StatusInternalServerError, error: errors.New("Unable to find the call handle"), } - + ErrServiceReservationFailure = err{ + code: http.StatusInternalServerError, + error: errors.New("Unable to service the request for the reservation period"), + } ErrContainerInitFail = err{ code: http.StatusBadGateway, error: errors.New("container failed to initialize, please ensure you are using the latest fdk / format and check the logs"), diff --git a/api/server/runner_httptrigger.go b/api/server/runner_httptrigger.go index 5adba92a6..2197a6ded 100644 --- a/api/server/runner_httptrigger.go +++ b/api/server/runner_httptrigger.go @@ -76,13 +76,13 @@ func (trw *triggerResponseWriter) Write(b []byte) (int, error) { return trw.inner.Write(b) } -func (trw *triggerResponseWriter) WriteHeader(statusCode int) { +func (trw *triggerResponseWriter) WriteHeader(serviceStatus int) { if trw.committed { return } trw.committed = true - var fnStatus int + userStatus := 0 realHeaders := trw.Header() gwHeaders := make(http.Header, len(realHeaders)) for k, vs := range realHeaders { @@ -96,7 +96,7 @@ func (trw *triggerResponseWriter) WriteHeader(statusCode int) { if len(vs) > 0 { statusInt, err := strconv.Atoi(vs[0]) if err == nil { - fnStatus = statusInt + userStatus = statusInt } } case k == "Content-Type", k == "Fn-Call-Id": @@ -113,14 +113,14 @@ func (trw *triggerResponseWriter) WriteHeader(statusCode int) { } // XXX(reed): simplify / add tests for these behaviors... - gatewayStatus := 200 - if statusCode >= 400 { - gatewayStatus = 502 - } else if fnStatus > 0 { - gatewayStatus = fnStatus + finalStatus := 200 + if serviceStatus >= 400 { + finalStatus = serviceStatus + } else if userStatus > 0 { + finalStatus = userStatus } - trw.inner.WriteHeader(gatewayStatus) + trw.inner.WriteHeader(finalStatus) } var skipTriggerHeaders = map[string]bool{