fn: cleanup of docker timeouts and docker health check (#1292)

Moving the timeout management of various docker operations
into the agent. This allows finer control over which timeout
each operation should use. For instance, for pause/unpause our
tolerance is very low, to avoid resource issues. For docker
remove, a failure could eventually lead to agent failure, so
we wait up to 10 minutes. For cookie create/prepare (which
includes docker pull) we cap the timeout at 10 minutes by default.
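
A minimal sketch of what these caller-side timeouts can look like,
assuming hypothetical constants, helper names, and a trimmed-down
client interface (none of this is the actual agent code):

// Illustrative sketch only: per-operation timeouts chosen by the caller
// and applied through the context, instead of inside the docker wrapper.
package timeouts

import (
	"context"
	"time"
)

const (
	pauseTimeout  = 5 * time.Second  // pause/unpause: low tolerance to avoid resource issues
	removeTimeout = 10 * time.Minute // remove: failures are costly, so wait long
	cookieTimeout = 10 * time.Minute // cookie create/prepare, including docker pull
)

// dockerClient is a hypothetical subset of the docker API used below.
type dockerClient interface {
	PauseContainer(ctx context.Context, id string) error
	RemoveContainer(ctx context.Context, id string) error
}

// pauseContainer bounds the pause call with the short pause timeout.
func pauseContainer(ctx context.Context, c dockerClient, id string) error {
	ctx, cancel := context.WithTimeout(ctx, pauseTimeout)
	defer cancel()
	return c.PauseContainer(ctx, id)
}

// removeContainer deliberately starts from context.Background so an already
// cancelled request context cannot prevent cleanup, but still caps the wait.
func removeContainer(c dockerClient, id string) error {
	ctx, cancel := context.WithTimeout(context.Background(), removeTimeout)
	defer cancel()
	return c.RemoveContainer(ctx, id)
}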

With the new UDS/FDK contract, the docker health check is now
obsolete, since containers advertise health via UDS availability.
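
A minimal sketch of readiness based on UDS availability, assuming the
agent knows the socket path and simply waits until the socket accepts
connections (the path handling and retry cadence are illustrative
assumptions, not the actual agent code):

// Illustrative sketch only: readiness via UDS availability instead of a
// docker health check.
package udsready

import (
	"context"
	"net"
	"time"
)

// awaitUDS blocks until the container's unix domain socket accepts a
// connection, or until ctx is done.
func awaitUDS(ctx context.Context, sockPath string) error {
	for {
		conn, err := net.DialTimeout("unix", sockPath, 100*time.Millisecond)
		if err == nil {
			conn.Close()
			return nil // socket is up: the container is considered ready
		}
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(100 * time.Millisecond):
			// not listening yet, retry
		}
	}
}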
Tolga Ceylan, 2018-11-01 14:22:47 -07:00, committed by GitHub
parent 1e3104c649
commit de9c2cbb63
5 changed files with 24 additions and 77 deletions


@@ -233,7 +233,6 @@ func (drv *DockerDriver) CreateCookie(ctx context.Context, task drivers.Containe
 			ReadonlyRootfs: drv.conf.EnableReadOnlyRootFs,
 			Init: drv.conf.EnableTini,
 		},
-		Context: ctx,
 	}
 	cookie := &cookie{
@@ -276,6 +275,7 @@ func (drv *DockerDriver) PrepareCookie(ctx context.Context, c drivers.Cookie) er
 		return err
 	}
+	cookie.opts.Context = ctx
 	_, err = drv.docker.CreateContainer(cookie.opts)
 	if err != nil {
 		// since we retry under the hood, if the container gets created and retry fails, we can just ignore error
@@ -577,36 +577,7 @@ func (drv *DockerDriver) startTask(ctx context.Context, container string) error
 			return err
 		}
 	}
-	// see if there's any healthcheck, and if so, wait for it to complete
-	return drv.awaitHealthcheck(ctx, container)
-}
-func (drv *DockerDriver) awaitHealthcheck(ctx context.Context, container string) error {
-	// inspect the container and check if there is any health check presented,
-	// if there is, then wait for it to move to healthy before returning.
-	for {
-		select {
-		case <-ctx.Done():
-			return ctx.Err()
-		default:
-		}
-		cont, err := drv.docker.InspectContainerWithContext(container, ctx)
-		if err != nil {
-			// TODO unknown fiddling to be had
-			return err
-		}
-		// if no health check for this image (""), or it's healthy, then stop waiting.
-		// state machine is "starting" -> "healthy" | "unhealthy"
-		if cont.State.Health.Status == "" || cont.State.Health.Status == "healthy" {
-			break
-		}
-		time.Sleep(100 * time.Millisecond) // avoid spin loop in case docker is actually fast
-	}
-	return nil
+	return err
 }
 func (w *waitResult) wait(ctx context.Context) (status string, err error) {


@@ -20,8 +20,6 @@ import (
 )
 const (
-	retryTimeout = 10 * time.Minute
-	pauseTimeout = 5 * time.Second
 	eventRetryDelay = 1 * time.Second
 )
@@ -43,7 +41,6 @@ type dockerClient interface {
 	UnpauseContainer(id string, ctx context.Context) error
 	PullImage(opts docker.PullImageOptions, auth docker.AuthConfiguration) error
 	InspectImage(ctx context.Context, name string) (*docker.Image, error)
-	InspectContainerWithContext(container string, ctx context.Context) (*docker.Container, error)
 	Stats(opts docker.StatsOptions) error
 	Info(ctx context.Context) (*docker.DockerInfo, error)
 	LoadImages(ctx context.Context, filePath string) error
@@ -386,15 +383,9 @@ func (d *dockerWrap) PullImage(opts docker.PullImageOptions, auth docker.AuthCon
 }
 func (d *dockerWrap) RemoveContainer(opts docker.RemoveContainerOptions) (err error) {
-	// extract the span, but do not keep the context, since the enclosing context
-	// may be timed out, and we still want to remove the container. TODO in caller? who cares?
-	ctx := common.BackgroundContext(opts.Context)
-	ctx, closer := makeTracker(ctx, "docker_remove_container")
+	ctx, closer := makeTracker(opts.Context, "docker_remove_container")
 	defer closer()
-	ctx, cancel := context.WithTimeout(ctx, retryTimeout)
-	defer cancel()
 	logger := common.Logger(ctx).WithField("docker_cmd", "RemoveContainer")
 	err = d.retry(ctx, logger, func() error {
 		err = d.docker.RemoveContainer(opts)
@@ -407,9 +398,6 @@ func (d *dockerWrap) PauseContainer(id string, ctx context.Context) (err error)
 	ctx, closer := makeTracker(ctx, "docker_pause_container")
 	defer closer()
-	ctx, cancel := context.WithTimeout(ctx, pauseTimeout)
-	defer cancel()
 	logger := common.Logger(ctx).WithField("docker_cmd", "PauseContainer")
 	err = d.retry(ctx, logger, func() error {
 		err = d.docker.PauseContainer(id)
@@ -422,9 +410,6 @@ func (d *dockerWrap) UnpauseContainer(id string, ctx context.Context) (err error
 	ctx, closer := makeTracker(ctx, "docker_unpause_container")
 	defer closer()
-	ctx, cancel := context.WithTimeout(ctx, pauseTimeout)
-	defer cancel()
 	logger := common.Logger(ctx).WithField("docker_cmd", "UnpauseContainer")
 	err = d.retry(ctx, logger, func() error {
 		err = d.docker.UnpauseContainer(id)
@@ -437,9 +422,6 @@ func (d *dockerWrap) InspectImage(ctx context.Context, name string) (i *docker.I
 	ctx, closer := makeTracker(ctx, "docker_inspect_image")
 	defer closer()
-	ctx, cancel := context.WithTimeout(ctx, retryTimeout)
-	defer cancel()
 	logger := common.Logger(ctx).WithField("docker_cmd", "InspectImage")
 	err = d.retry(ctx, logger, func() error {
 		i, err = d.docker.InspectImage(name)
@@ -448,21 +430,6 @@ func (d *dockerWrap) InspectImage(ctx context.Context, name string) (i *docker.I
 	return i, err
 }
-func (d *dockerWrap) InspectContainerWithContext(container string, ctx context.Context) (c *docker.Container, err error) {
-	ctx, closer := makeTracker(ctx, "docker_inspect_container")
-	defer closer()
-	ctx, cancel := context.WithTimeout(ctx, retryTimeout)
-	defer cancel()
-	logger := common.Logger(ctx).WithField("docker_cmd", "InspectContainer")
-	err = d.retry(ctx, logger, func() error {
-		c, err = d.docker.InspectContainerWithContext(container, ctx)
-		return err
-	})
-	return c, err
-}
 func (d *dockerWrap) Stats(opts docker.StatsOptions) (err error) {
 	// we can't retry this one this way since the callee closes the
 	// stats chan, need a fancier retry mechanism where we can swap out


@@ -44,8 +44,6 @@ const (
 const (
 	LimitPerSec = 10
 	LimitBurst = 20
-	ShutdownTimeout = time.Duration(1) * time.Second
 )
 type poolTask struct {
@@ -254,15 +252,11 @@ func (pool *dockerPool) performReadyState(ctx context.Context, driver *DockerDri
 }
 func (pool *dockerPool) performTeardown(ctx context.Context, driver *DockerDriver, task *poolTask) {
-	ctx, cancel := context.WithTimeout(context.Background(), ShutdownTimeout)
-	defer cancel()
 	removeOpts := docker.RemoveContainerOptions{
 		ID: task.Id(),
 		Force: true,
 		RemoveVolumes: true,
-		Context: ctx,
+		Context: context.Background(),
 	}
 	driver.docker.RemoveContainer(removeOpts)