From f08ea57bc015278dc70e43495395708b753cded1 Mon Sep 17 00:00:00 2001
From: Reed Allman
Date: Fri, 17 Nov 2017 20:31:33 -0800
Subject: [PATCH] add docker health check waiter to start (#434)

before returning the cookie in the driver, wait for health checks
(https://docs.docker.com/engine/reference/builder/#healthcheck) if the
image provides one. for images that don't have a health check, this has
no effect (one added call to inspect the container; for hot containers
it's small potatoes). this will be useful so that containers can pull
large files or do setup that takes a while before accepting tasks.
since this happens before start, it won't run into the idle timeout.
we could likely use these for hot containers in general and check
between runs or something, but that isn't done here.

one nascent concern: for hot containers, if they never become healthy
I don't think we will ever kill them, and the slot will 'leak'. this is
true here and for other cases (pulling an image) I think; we should
probably recycle hot containers every hour or something, which would
also close this. anyway, not a huge blocker I don't think. there will
likely be 1 user of this feature for a bit, and it's not documented
since we're not sure we want to support it.

closes #336
---
 api/agent/drivers/docker/docker.go        | 30 +++++++++++++++++++++++
 api/agent/drivers/docker/docker_client.go | 13 ++++++++++
 2 files changed, 43 insertions(+)

diff --git a/api/agent/drivers/docker/docker.go b/api/agent/drivers/docker/docker.go
index 7009bc0e1..0c3a00f84 100644
--- a/api/agent/drivers/docker/docker.go
+++ b/api/agent/drivers/docker/docker.go
@@ -10,6 +10,7 @@ import (
 	"os"
 	"path"
 	"strings"
+	"time"
 
 	"github.com/fnproject/fn/api/agent/drivers"
 	"github.com/fnproject/fn/api/common"
@@ -440,6 +441,35 @@ func (drv *DockerDriver) startTask(ctx context.Context, container string) error
 			return err
 		}
 	}
+
+	// see if there's any healthcheck, and if so, wait for it to complete
+	return drv.awaitHealthcheck(ctx, container)
+}
+
+func (drv *DockerDriver) awaitHealthcheck(ctx context.Context, container string) error {
+	// inspect the container and check whether a health check is present;
+	// if there is one, wait for it to move to healthy before returning.
+	for {
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		default:
+		}
+
+		cont, err := drv.docker.InspectContainerWithContext(container, ctx)
+		if err != nil {
+			// TODO unknown fiddling to be had
+			return err
+		}
+
+		// if no health check for this image (""), or it's healthy, then stop waiting.
+		// state machine is "starting" -> "healthy" | "unhealthy"
+		if cont.State.Health.Status == "" || cont.State.Health.Status == "healthy" {
+			break
+		}
+
+		time.Sleep(100 * time.Millisecond) // avoid spin loop in case docker is actually fast
+	}
 	return nil
 }
 
diff --git a/api/agent/drivers/docker/docker_client.go b/api/agent/drivers/docker/docker_client.go
index ee94fd430..82b24f9bc 100644
--- a/api/agent/drivers/docker/docker_client.go
+++ b/api/agent/drivers/docker/docker_client.go
@@ -36,6 +36,7 @@ type dockerClient interface {
 	RemoveContainer(opts docker.RemoveContainerOptions) error
 	PullImage(opts docker.PullImageOptions, auth docker.AuthConfiguration) error
 	InspectImage(ctx context.Context, name string) (*docker.Image, error)
+	InspectContainerWithContext(container string, ctx context.Context) (*docker.Container, error)
 	Stats(opts docker.StatsOptions) error
 }
 
@@ -257,6 +258,18 @@ func (d *dockerWrap) InspectImage(ctx context.Context, name string) (i *docker.I
 	return i, err
 }
 
+func (d *dockerWrap) InspectContainerWithContext(container string, ctx context.Context) (c *docker.Container, err error) {
+	span, ctx := opentracing.StartSpanFromContext(ctx, "docker_inspect_container")
+	defer span.Finish()
+	ctx, cancel := context.WithTimeout(ctx, retryTimeout)
+	defer cancel()
+	err = d.retry(ctx, func() error {
+		c, err = d.docker.InspectContainerWithContext(container, ctx)
+		return err
+	})
+	return c, err
+}
+
 func (d *dockerWrap) Stats(opts docker.StatsOptions) (err error) {
 	// we can't retry this one this way since the callee closes the
 	// stats chan, need a fancier retry mechanism where we can swap out
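Aside, not part of the patch: one way to address the 'leak' concern in the commit
message above would be to treat "unhealthy" as a terminal state instead of polling
until the context gives up. Below is a rough sketch only, assuming the same
fsouza/go-dockerclient types the driver already uses; the awaitHealthy name, the
dockerutil package, and the error text are made up for illustration and are not
code in this PR.

package dockerutil

import (
	"context"
	"fmt"
	"time"

	docker "github.com/fsouza/go-dockerclient"
)

// awaitHealthy polls a container's health status and returns nil once the
// image's health check passes (or if the image defines no health check).
// unlike awaitHealthcheck in the patch above, it bails out as soon as docker
// reports "unhealthy" rather than spinning until the context is cancelled.
func awaitHealthy(ctx context.Context, client *docker.Client, container string) error {
	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
		}

		cont, err := client.InspectContainerWithContext(container, ctx)
		if err != nil {
			return err
		}

		switch cont.State.Health.Status {
		case "", "healthy":
			// "" means the image has no HEALTHCHECK; "healthy" means it passed
			return nil
		case "unhealthy":
			// terminal state: hand the slot back instead of leaking it
			return fmt.Errorf("container %s failed its health check", container)
		}

		// status is "starting"; give docker a moment before asking again
		time.Sleep(100 * time.Millisecond)
	}
}

Bounding the wait like this only covers the health-check path; recycling hot
containers periodically, as the commit message suggests, would also cover the
image-pull case.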