From cbfd659e7eb15a3ea5bcdf09fa839d489a06aeba Mon Sep 17 00:00:00 2001 From: Reed Allman Date: Mon, 12 Feb 2018 18:45:30 -0800 Subject: [PATCH] cap docker retries to fixed number (#762) previously we would retry infinitely up to the context with some backoff in between. for hot functions, since we don't set any dead line on pulling or creating the image, this means it would retry forever without making any progress if e.g. the registry is inaccessable or any other temporary error that isn't actually temporary. this adds a hard cap of 10 retries, which gives approximately 13s if the ops take no time, still respecting the context deadline enclosed. the case where this was coming up is now tested for and was otherwise confusing for users to debug, now it spits out an ECONNREFUSED with the address of the registry, which should help users debug without having to poke around fn logs (though I don't like this as an excuse, not all users will be operators at some point in the near future, and this one makes sense) closes #727 --- api/agent/drivers/docker/docker_client.go | 7 +++++-- api/server/runner_test.go | 4 ++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/api/agent/drivers/docker/docker_client.go b/api/agent/drivers/docker/docker_client.go index 76b6ee842..e68fe6a35 100644 --- a/api/agent/drivers/docker/docker_client.go +++ b/api/agent/drivers/docker/docker_client.go @@ -100,12 +100,14 @@ type dockerWrap struct { func (d *dockerWrap) retry(ctx context.Context, f func() error) error { var i int + var err error span := opentracing.SpanFromContext(ctx) defer func() { span.LogFields(log.Int("docker_call_retries", i)) }() logger := common.Logger(ctx) var b common.Backoff - for ; ; i++ { + // 10 retries w/o change to backoff is ~13s if ops take ~0 time + for ; i < 10; i++ { select { case <-ctx.Done(): span.LogFields(log.String("task", "fail.docker")) @@ -114,7 +116,7 @@ func (d *dockerWrap) retry(ctx context.Context, f func() error) error { default: } - err := filter(ctx, f()) + err = filter(ctx, f()) if common.IsTemporary(err) || isDocker50x(err) { logger.WithError(err).Warn("docker temporary error, retrying") b.Sleep(ctx) @@ -126,6 +128,7 @@ func (d *dockerWrap) retry(ctx context.Context, f func() error) error { } return err } + return err // TODO could return context.DeadlineExceeded which ~makes sense } func isDocker50x(err error) bool { diff --git a/api/server/runner_test.go b/api/server/runner_test.go index 63c4c5bf1..57de18adb 100644 --- a/api/server/runner_test.go +++ b/api/server/runner_test.go @@ -137,6 +137,7 @@ func TestRouteRunnerExecution(t *testing.T) { {Path: "/myerror", AppName: "myapp", Image: "fnproject/fn-test-utils", Type: "sync", Memory: 128, Timeout: 30, IdleTimeout: 30, Headers: map[string][]string{"X-Function": {"Test"}}}, {Path: "/mydne", AppName: "myapp", Image: "fnproject/imagethatdoesnotexist", Type: "sync", Memory: 128, Timeout: 30, IdleTimeout: 30}, {Path: "/mydnehot", AppName: "myapp", Image: "fnproject/imagethatdoesnotexist", Type: "sync", Format: "http", Memory: 128, Timeout: 30, IdleTimeout: 30}, + {Path: "/mydneregistry", AppName: "myapp", Image: "localhost:5000/fnproject/imagethatdoesnotexist", Type: "sync", Format: "http", Memory: 128, Timeout: 30, IdleTimeout: 30}, {Path: "/myoom", AppName: "myapp", Image: "fnproject/fn-test-utils", Type: "sync", Memory: 8, Timeout: 30, IdleTimeout: 30}, }, nil, ) @@ -177,6 +178,9 @@ func TestRouteRunnerExecution(t *testing.T) { {"/r/myapp/myerror", crasher, "GET", http.StatusBadGateway, expHeaders, "container exit code 2"}, {"/r/myapp/mydne", ``, "GET", http.StatusNotFound, nil, "pull access denied"}, {"/r/myapp/mydnehot", ``, "GET", http.StatusNotFound, nil, "pull access denied"}, + // hit a registry that doesn't exist, make sure the real error body gets plumbed out + {"/r/myapp/mydneregistry", ``, "GET", http.StatusInternalServerError, nil, "connection refused"}, + {"/r/myapp/myoom", oomer, "GET", http.StatusBadGateway, nil, "container out of memory"}, } { body := strings.NewReader(test.body)