cap docker retries to fixed number (#762)

previously we would retry infinitely up to the context with some backoff in
between. for hot functions, since we don't set any dead line on pulling or
creating the image, this means it would retry forever without making any
progress if e.g. the registry is inaccessable or any other temporary error
that isn't actually temporary.  this adds a hard cap of 10 retries, which
gives approximately 13s if the ops take no time, still respecting the context
deadline enclosed.

the case where this was coming up is now tested for and was otherwise
confusing for users to debug, now it spits out an ECONNREFUSED with the
address of the registry, which should help users debug without having to poke
around fn logs (though I don't like this as an excuse, not all users will be
operators at some point in the near future, and this one makes sense)

closes #727
This commit is contained in:
Reed Allman
2018-02-12 18:45:30 -08:00
committed by GitHub
parent 726f615a03
commit cbfd659e7e
2 changed files with 9 additions and 2 deletions

View File

@@ -100,12 +100,14 @@ type dockerWrap struct {
func (d *dockerWrap) retry(ctx context.Context, f func() error) error { func (d *dockerWrap) retry(ctx context.Context, f func() error) error {
var i int var i int
var err error
span := opentracing.SpanFromContext(ctx) span := opentracing.SpanFromContext(ctx)
defer func() { span.LogFields(log.Int("docker_call_retries", i)) }() defer func() { span.LogFields(log.Int("docker_call_retries", i)) }()
logger := common.Logger(ctx) logger := common.Logger(ctx)
var b common.Backoff var b common.Backoff
for ; ; i++ { // 10 retries w/o change to backoff is ~13s if ops take ~0 time
for ; i < 10; i++ {
select { select {
case <-ctx.Done(): case <-ctx.Done():
span.LogFields(log.String("task", "fail.docker")) span.LogFields(log.String("task", "fail.docker"))
@@ -114,7 +116,7 @@ func (d *dockerWrap) retry(ctx context.Context, f func() error) error {
default: default:
} }
err := filter(ctx, f()) err = filter(ctx, f())
if common.IsTemporary(err) || isDocker50x(err) { if common.IsTemporary(err) || isDocker50x(err) {
logger.WithError(err).Warn("docker temporary error, retrying") logger.WithError(err).Warn("docker temporary error, retrying")
b.Sleep(ctx) b.Sleep(ctx)
@@ -126,6 +128,7 @@ func (d *dockerWrap) retry(ctx context.Context, f func() error) error {
} }
return err return err
} }
return err // TODO could return context.DeadlineExceeded which ~makes sense
} }
func isDocker50x(err error) bool { func isDocker50x(err error) bool {

View File

@@ -137,6 +137,7 @@ func TestRouteRunnerExecution(t *testing.T) {
{Path: "/myerror", AppName: "myapp", Image: "fnproject/fn-test-utils", Type: "sync", Memory: 128, Timeout: 30, IdleTimeout: 30, Headers: map[string][]string{"X-Function": {"Test"}}}, {Path: "/myerror", AppName: "myapp", Image: "fnproject/fn-test-utils", Type: "sync", Memory: 128, Timeout: 30, IdleTimeout: 30, Headers: map[string][]string{"X-Function": {"Test"}}},
{Path: "/mydne", AppName: "myapp", Image: "fnproject/imagethatdoesnotexist", Type: "sync", Memory: 128, Timeout: 30, IdleTimeout: 30}, {Path: "/mydne", AppName: "myapp", Image: "fnproject/imagethatdoesnotexist", Type: "sync", Memory: 128, Timeout: 30, IdleTimeout: 30},
{Path: "/mydnehot", AppName: "myapp", Image: "fnproject/imagethatdoesnotexist", Type: "sync", Format: "http", Memory: 128, Timeout: 30, IdleTimeout: 30}, {Path: "/mydnehot", AppName: "myapp", Image: "fnproject/imagethatdoesnotexist", Type: "sync", Format: "http", Memory: 128, Timeout: 30, IdleTimeout: 30},
{Path: "/mydneregistry", AppName: "myapp", Image: "localhost:5000/fnproject/imagethatdoesnotexist", Type: "sync", Format: "http", Memory: 128, Timeout: 30, IdleTimeout: 30},
{Path: "/myoom", AppName: "myapp", Image: "fnproject/fn-test-utils", Type: "sync", Memory: 8, Timeout: 30, IdleTimeout: 30}, {Path: "/myoom", AppName: "myapp", Image: "fnproject/fn-test-utils", Type: "sync", Memory: 8, Timeout: 30, IdleTimeout: 30},
}, nil, }, nil,
) )
@@ -177,6 +178,9 @@ func TestRouteRunnerExecution(t *testing.T) {
{"/r/myapp/myerror", crasher, "GET", http.StatusBadGateway, expHeaders, "container exit code 2"}, {"/r/myapp/myerror", crasher, "GET", http.StatusBadGateway, expHeaders, "container exit code 2"},
{"/r/myapp/mydne", ``, "GET", http.StatusNotFound, nil, "pull access denied"}, {"/r/myapp/mydne", ``, "GET", http.StatusNotFound, nil, "pull access denied"},
{"/r/myapp/mydnehot", ``, "GET", http.StatusNotFound, nil, "pull access denied"}, {"/r/myapp/mydnehot", ``, "GET", http.StatusNotFound, nil, "pull access denied"},
// hit a registry that doesn't exist, make sure the real error body gets plumbed out
{"/r/myapp/mydneregistry", ``, "GET", http.StatusInternalServerError, nil, "connection refused"},
{"/r/myapp/myoom", oomer, "GET", http.StatusBadGateway, nil, "container out of memory"}, {"/r/myapp/myoom", oomer, "GET", http.StatusBadGateway, nil, "container out of memory"},
} { } {
body := strings.NewReader(test.body) body := strings.NewReader(test.body)