remove docker inspect container

we had the inspect container call here for 3 reasons:

1) get exit code
2) see if container is still running (debugging madness)
3) see if docker thinks it was an OOM

1) is something wait already returns, but because of 2) and 3) we were
delaying it until inspection

2) was really just for debugging since we had 3)

3) seems unnecessary. to me, an OOM is an OOM is an OOM. so why make a whole
docker inspect call just to find that out? (we could push the call down into
the sad path and only make it when necessary, but are we really getting any
value from the distinction anyway? i've never run into it myself)
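
(for reference, what we're left with is a plain exit-code mapping, roughly the
sketch below. names are simplified stand-ins, not the driver's real types.
137 is 128 + SIGKILL, which is what an OOM kill looks like from the outside,
so we call any 137 an OOM and stop trying to tell it apart from other
SIGKILLs, which is all the inspect check bought us.)

    package main

    import "fmt"

    // simplified sketch: classify a task purely from the exit code that docker's
    // wait returns, with no inspect call. 137 = 128 + SIGKILL, which is how the
    // OOM killer terminates a container, so treat it as out of memory.
    func statusFromExitCode(exitCode int) (status string, err error) {
        switch exitCode {
        case 0:
            return "success", nil
        case 137: // SIGKILL; in our containers that is almost always the OOM killer
            return "killed", fmt.Errorf("out of memory")
        default:
            return "error", fmt.Errorf("non-zero exit code %d", exitCode)
        }
    }

    func main() {
        status, err := statusFromExitCode(137)
        fmt.Println(status, err) // killed out of memory
    }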

inspect was actually causing tasks to time out: the inspect call could push us
over the task timeout even though the container itself ran to completion. we
could have fixed that by checking the context earlier, but we don't really
need inspect at all, and dropping it means fewer docker calls, which means
more unicorn puppers. tasks should now have more 'true' timeouts.
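
(the shape of the timeout handling we end up with, sketched here with
placeholder names rather than the real driver types: a non-zero exit code is
proof the container terminated, so only a zero exit code falls back to the
context to decide between success, timeout, and cancellation. the real thing
is in the diff below.)

    package main

    import (
        "context"
        "fmt"
        "time"
    )

    // sketch: docker's wait handed back 0, which could mean a clean exit or that
    // we gave up waiting. the context tells us which, with no extra inspect call.
    func classifyZeroExit(ctx context.Context) string {
        select {
        case <-ctx.Done():
            if ctx.Err() == context.DeadlineExceeded {
                return "timeout"
            }
            return "cancelled"
        default:
            return "success"
        }
    }

    func main() {
        ctx, cancel := context.WithTimeout(context.Background(), time.Millisecond)
        defer cancel()
        <-ctx.Done() // pretend the task blew past its deadline and wait returned 0
        fmt.Println(classifyZeroExit(ctx)) // timeout
    }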

tried to boy scout, but the tracing patch cleans this block up too.
Reed Allman
2017-07-24 13:37:29 -07:00
parent 9f9c9cc40e
commit c215dcf5dd
2 changed files with 28 additions and 71 deletions


@@ -3,7 +3,6 @@ package docker
 import (
 	"context"
 	"crypto/tls"
-	"errors"
 	"fmt"
 	"io"
 	"net"
@@ -385,7 +384,6 @@ func (drv *DockerDriver) pullImage(ctx context.Context, task drivers.ContainerTa
 // Run executes the docker container. If task runs, drivers.RunResult will be returned. If something fails outside the task (ie: Docker), it will return error.
 // The docker driver will attempt to cast the task to a Auther. If that succeeds, private image support is available. See the Auther interface for how to implement this.
 func (drv *DockerDriver) run(ctx context.Context, container string, task drivers.ContainerTask) (drivers.RunResult, error) {
-	log := common.Logger(ctx)
 	timeout := task.Timeout()
 	var cancel context.CancelFunc
@@ -405,35 +403,28 @@ func (drv *DockerDriver) run(ctx context.Context, container string, task drivers
 		Stream: true, Logs: true, Stdout: true, Stderr: true,
 		Stdin: true, InputStream: task.Input()})
 	timer.Measure()
-	if err != nil {
+	if err != nil && ctx.Err() == nil {
+		// ignore if ctx has errored, rewrite status lay below
 		return nil, err
 	}
 	start := time.Now()
 	err = drv.startTask(ctx, container)
-	if err != nil {
-		if err == context.DeadlineExceeded {
-			// if there's just a timeout making the docker calls, rewrite it as such
-			return &runResult{start: start, status: drivers.StatusTimeout}, nil
-		}
+	if err != nil && ctx.Err() == nil {
+		// if there's just a timeout making the docker calls, drv.wait below will rewrite it to timeout
 		return nil, err
 	}
 	taskTimer := drv.NewTimer("docker", "container_runtime", 1)
-	// can discard error, inspect will tell us about the task and wait will retry under the hood
-	drv.docker.WaitContainerWithContext(container, ctx)
-	taskTimer.Measure()
-	waiter.Close()
-	err = waiter.Wait()
-	if err != nil {
-		// TODO need to make sure this error isn't just a context error or something we can ignore
-		log.WithError(err).Error("attach to container returned error, task may be missing logs")
-	}
+	defer func() {
+		taskTimer.Measure()
+		waiter.Close()
+		waiter.Wait() // make sure we gather all logs
+	}()

-	status, err := drv.status(ctx, container)
+	status, err := drv.wait(ctx, container)
 	return &runResult{
 		start:  start,
 		status: status,
@@ -553,40 +544,26 @@ func (drv *DockerDriver) startTask(ctx context.Context, container string) error
 	return nil
 }

-func (drv *DockerDriver) status(ctx context.Context, container string) (status string, err error) {
-	log := common.Logger(ctx)
-	cinfo, err := drv.docker.InspectContainer(container)
-	if err != nil {
-		// this is pretty sad, but better to say we had an error than to not.
-		// task has run to completion and logs will be uploaded, user can decide
-		log.WithFields(logrus.Fields{"container": container}).WithError(err).Error("Inspecting container")
-		return drivers.StatusError, err
-	}
-	exitCode := cinfo.State.ExitCode
-	log.WithFields(logrus.Fields{
-		"exit_code":          exitCode,
-		"container_running":  cinfo.State.Running,
-		"container_status":   cinfo.State.Status,
-		"container_finished": cinfo.State.FinishedAt,
-		"container_error":    cinfo.State.Error,
-	}).Info("container status")
-	select { // do this after inspect so we can see exit code
-	case <-ctx.Done(): // check if task was canceled or timed out
-		switch ctx.Err() {
-		case context.DeadlineExceeded:
-			return drivers.StatusTimeout, nil
-		case context.Canceled:
-			return drivers.StatusCancelled, nil
-		}
-	default:
-	}
-	if cinfo.State.Running {
-		log.Warn("getting status of task that is still running, need to fix this")
-		return drivers.StatusError, errors.New("task in running state but not timed out. weird")
-	}
+func (drv *DockerDriver) wait(ctx context.Context, container string) (status string, err error) {
+	// wait retries internally until ctx is up, so we can ignore the error and
+	// just say it was a timeout if we have [fatal] errors talking to docker, etc.
+	// a more prevalent case is calling wait & container already finished, so again ignore err.
+	exitCode, _ := drv.docker.WaitContainerWithContext(container, ctx)
+	// check the context first, if it's done then exitCode is invalid iff zero
+	// (can't know 100% without inspecting, but that's expensive and this is a good guess)
+	// if exitCode is non-zero, we prefer that since it proves termination.
+	if exitCode == 0 {
+		select {
+		case <-ctx.Done(): // check if task was canceled or timed out
+			switch ctx.Err() {
+			case context.DeadlineExceeded:
+				return drivers.StatusTimeout, nil
+			case context.Canceled:
+				return drivers.StatusCancelled, nil
+			}
+		default:
+		}
+	}

 	switch exitCode {
@@ -596,15 +573,6 @@ func (drv *DockerDriver) status(ctx context.Context, container string) (status s
 		return drivers.StatusSuccess, nil
 	case 137: // OOM
 		drv.Inc("docker", "oom", 1, 1)
-		if !cinfo.State.OOMKilled {
-			// It is possible that the host itself is running out of memory and
-			// the host kernel killed one of the container processes.
-			// See: https://github.com/moby/moby/issues/15621
-			// TODO reed: isn't an OOM an OOM? this is wasting space imo
-			log.WithFields(logrus.Fields{"container": container}).Info("Setting task as OOM killed, but docker disagreed.")
-			drv.Inc("docker", "possible_oom_false_alarm", 1, 1.0)
-		}
 		return drivers.StatusKilled, drivers.ErrOutOfMemory
 	}
 }


@@ -36,7 +36,6 @@ type dockerClient interface {
 	RemoveContainer(opts docker.RemoveContainerOptions) error
 	PullImage(opts docker.PullImageOptions, auth docker.AuthConfiguration) error
 	InspectImage(name string) (*docker.Image, error)
-	InspectContainer(id string) (*docker.Container, error)
 	Stats(opts docker.StatsOptions) error
 }
@@ -270,16 +269,6 @@ func (d *dockerWrap) InspectImage(name string) (i *docker.Image, err error) {
 	return i, err
 }

-func (d *dockerWrap) InspectContainer(id string) (c *docker.Container, err error) {
-	ctx, cancel := context.WithTimeout(context.Background(), retryTimeout)
-	defer cancel()
-	err = d.retry(ctx, func() error {
-		c, err = d.docker.InspectContainer(id)
-		return err
-	})
-	return c, err
-}

 func (d *dockerWrap) Stats(opts docker.StatsOptions) (err error) {
 	// we can't retry this one this way since the callee closes the
 	// stats chan, need a fancier retry mechanism where we can swap out