From f08ea57bc015278dc70e43495395708b753cded1 Mon Sep 17 00:00:00 2001
From: Reed Allman
Date: Fri, 17 Nov 2017 20:31:33 -0800
Subject: [PATCH] add docker health check waiter to start (#434)

before returning the cookie in the driver, wait for health checks
(https://docs.docker.com/engine/reference/builder/#healthcheck) if the
image provides one. for images that don't have a health check, this has
no effect (one added call to inspect the container; for hot containers
it's small potatoes). this will be useful so that containers can pull
large files or do setup that takes a while before accepting tasks.
since this happens before start, it won't run into the idle timeout.
we could likely use these for hot containers in general and check
between runs or something, but that isn't done here.

one nascent concern: for hot containers, if they never become healthy
I don't think we will ever kill them, and the slot will 'leak'. this is
true here and for other cases (pulling an image) I think; we should
probably recycle hot containers every hour or something, which would
also close this. anyway, not a huge blocker I don't think. there will
likely be 1 user of this feature for a bit, and it's not documented
since we're not sure we want to support it.

closes #336
---
 api/agent/drivers/docker/docker.go        | 30 +++++++++++++++++++++++
 api/agent/drivers/docker/docker_client.go | 13 ++++++++++
 2 files changed, 43 insertions(+)

diff --git a/api/agent/drivers/docker/docker.go b/api/agent/drivers/docker/docker.go
index 7009bc0e1..0c3a00f84 100644
--- a/api/agent/drivers/docker/docker.go
+++ b/api/agent/drivers/docker/docker.go
@@ -10,6 +10,7 @@ import (
 	"os"
 	"path"
 	"strings"
+	"time"
 
 	"github.com/fnproject/fn/api/agent/drivers"
 	"github.com/fnproject/fn/api/common"
@@ -440,6 +441,35 @@ func (drv *DockerDriver) startTask(ctx context.Context, container string) error
 			return err
 		}
 	}
+
+	// see if there's any healthcheck, and if so, wait for it to complete
+	return drv.awaitHealthcheck(ctx, container)
+}
+
+func (drv *DockerDriver) awaitHealthcheck(ctx context.Context, container string) error {
+	// inspect the container and check whether a health check is present;
+	// if there is one, wait for it to move to healthy before returning.
+	for {
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		default:
+		}
+
+		cont, err := drv.docker.InspectContainerWithContext(container, ctx)
+		if err != nil {
+			// TODO unknown fiddling to be had
+			return err
+		}
+
+		// if no health check for this image (""), or it's healthy, then stop waiting.
+		// state machine is "starting" -> "healthy" | "unhealthy"
+		if cont.State.Health.Status == "" || cont.State.Health.Status == "healthy" {
+			break
+		}
+
+		time.Sleep(100 * time.Millisecond) // avoid spin loop in case docker is actually fast
+	}
 	return nil
 }
 
diff --git a/api/agent/drivers/docker/docker_client.go b/api/agent/drivers/docker/docker_client.go
index ee94fd430..82b24f9bc 100644
--- a/api/agent/drivers/docker/docker_client.go
+++ b/api/agent/drivers/docker/docker_client.go
@@ -36,6 +36,7 @@ type dockerClient interface {
 	RemoveContainer(opts docker.RemoveContainerOptions) error
 	PullImage(opts docker.PullImageOptions, auth docker.AuthConfiguration) error
 	InspectImage(ctx context.Context, name string) (*docker.Image, error)
+	InspectContainerWithContext(container string, ctx context.Context) (*docker.Container, error)
 	Stats(opts docker.StatsOptions) error
 }
 
@@ -257,6 +258,18 @@ func (d *dockerWrap) InspectImage(ctx context.Context, name string) (i *docker.I
 	return i, err
 }
 
+func (d *dockerWrap) InspectContainerWithContext(container string, ctx context.Context) (c *docker.Container, err error) {
+	span, ctx := opentracing.StartSpanFromContext(ctx, "docker_inspect_container")
+	defer span.Finish()
+	ctx, cancel := context.WithTimeout(ctx, retryTimeout)
+	defer cancel()
+	err = d.retry(ctx, func() error {
+		c, err = d.docker.InspectContainerWithContext(container, ctx)
+		return err
+	})
+	return c, err
+}
+
 func (d *dockerWrap) Stats(opts docker.StatsOptions) (err error) {
 	// we can't retry this one this way since the callee closes the
 	// stats chan, need a fancier retry mechanism where we can swap out
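Aside, not part of the patch: one way to address the 'leak' concern in the commit
message above would be to treat "unhealthy" as a terminal state instead of polling
until the context gives up. Below is a rough sketch only, assuming the same
fsouza/go-dockerclient types the driver already uses; the awaitHealthy name, the
dockerutil package, and the error text are made up for illustration and are not
code in this PR.

package dockerutil

import (
	"context"
	"fmt"
	"time"

	docker "github.com/fsouza/go-dockerclient"
)

// awaitHealthy polls a container's health status and returns nil once the
// image's health check passes (or if the image defines no health check).
// unlike awaitHealthcheck in the patch above, it bails out as soon as docker
// reports "unhealthy" rather than spinning until the context is cancelled.
func awaitHealthy(ctx context.Context, client *docker.Client, container string) error {
	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
		}

		cont, err := client.InspectContainerWithContext(container, ctx)
		if err != nil {
			return err
		}

		switch cont.State.Health.Status {
		case "", "healthy":
			// "" means the image has no HEALTHCHECK; "healthy" means it passed
			return nil
		case "unhealthy":
			// terminal state: hand the slot back instead of leaking it
			return fmt.Errorf("container %s failed its health check", container)
		}

		// status is "starting"; give docker a moment before asking again
		time.Sleep(100 * time.Millisecond)
	}
}

Bounding the wait like this only covers the health-check path; recycling hot
containers periodically, as the commit message suggests, would also cover the
image-pull case.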