Files
fn-serverless/api/runner/drivers/docker/docker_client.go
Reed Allman c215dcf5dd remove docker inspect container
we had the inspect container call here for 3 reasons:

1) get exit code
2) see if container is still running (debugging madness)
3) see if docker thinks it was an OOM

1) is something wait already returns, but because of 2) and 3) we just delayed
getting it until the inspect call

2) was really just for debugging since we had 3)

3) seems unnecessary. to me, an OOM is an OOM is an OOM. so why have a whole
docker inspect call just to find out? (we could move this down to the sad path
and make the call only when necessary, but are we really getting any value
from this distinction anyway? i've never run into it, myself)

inspect was actually causing tasks to time out: the inspect call could push us
over our task timeout even though the container ran to completion. we could
have fixed this by checking the context earlier, but we don't really need
inspect at all, and dropping it reduces the docker calls we make, which will
make more unicorn puppers. now tasks should have more 'true' timeouts.
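
for illustration only, a rough sketch of the resulting pattern (not the actual
diff; waitForTask, d, and id are made-up names, using the dockerClient wrapper
defined in the file below):

// sketch: wait already returns the exit code, so no inspect is needed on
// the happy path. if we ever wanted the OOM distinction back, an inspect
// would only be worth paying for on the sad path.
func waitForTask(ctx context.Context, d dockerClient, id string) (int, error) {
	exitCode, err := d.WaitContainerWithContext(id, ctx)
	if err != nil {
		return 0, err
	}
	// sad path only (hypothetical): an InspectContainer call here could
	// still surface State.OOMKilled if we cared about it.
	return exitCode, nil
}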

tried to boy scout, but the tracing patch cleans this block up too.
2017-07-24 13:37:29 -07:00

284 lines
7.9 KiB
Go

// +build go1.7

package docker

import (
	"context"
	"crypto/tls"
	"encoding/json"
	"fmt"
	"net"
	"net/http"
	"strings"
	"time"

	"github.com/Sirupsen/logrus"
	"github.com/fsouza/go-dockerclient"
	"gitlab-odx.oracle.com/odx/functions/api/runner/common"
)

const (
	retryTimeout = 10 * time.Minute
)

// wrap docker client calls so we can retry 500s, kind of sucks but fsouza doesn't
// bake in retries we can use internally, could contribute it at some point, would
// be much more convenient if we didn't have to do this, but it's better than ad hoc retries.
// also adds timeouts to many operations, varying by operation
// TODO could generate this, maybe not worth it, may not change often
type dockerClient interface {
	// Each of these are github.com/fsouza/go-dockerclient methods
	AttachToContainerNonBlocking(opts docker.AttachToContainerOptions) (docker.CloseWaiter, error)
	WaitContainerWithContext(id string, ctx context.Context) (int, error)
	StartContainerWithContext(id string, hostConfig *docker.HostConfig, ctx context.Context) error
	CreateContainer(opts docker.CreateContainerOptions) (*docker.Container, error)
	RemoveContainer(opts docker.RemoveContainerOptions) error
	PullImage(opts docker.PullImageOptions, auth docker.AuthConfiguration) error
	InspectImage(name string) (*docker.Image, error)
	Stats(opts docker.StatsOptions) error
}

// TODO: switch to github.com/docker/engine-api
func newClient(env *common.Environment) dockerClient {
	// TODO this was much easier, don't need special settings at the moment
	// docker, err := docker.NewClient(conf.Docker)
	client, err := docker.NewClientFromEnv()
	if err != nil {
		logrus.WithError(err).Fatal("couldn't create docker client")
	}

	t := &http.Transport{
		Dial: (&net.Dialer{
			Timeout:   10 * time.Second,
			KeepAlive: 1 * time.Minute,
		}).Dial,
		TLSClientConfig: &tls.Config{
			ClientSessionCache: tls.NewLRUClientSessionCache(8192),
		},
		TLSHandshakeTimeout:   10 * time.Second,
		MaxIdleConnsPerHost:   512,
		Proxy:                 http.ProxyFromEnvironment,
		MaxIdleConns:          512,
		IdleConnTimeout:       90 * time.Second,
		ExpectContinueTimeout: 1 * time.Second,
	}
	client.HTTPClient = &http.Client{Transport: t}

	if err := client.Ping(); err != nil {
		logrus.WithError(err).Fatal("couldn't connect to docker daemon")
	}
	client.SetTimeout(120 * time.Second)

	// get 2 clients, one with a small timeout, one with no timeout to use contexts
	clientNoTimeout, err := docker.NewClientFromEnv()
	if err != nil {
		logrus.WithError(err).Fatal("couldn't create other docker client")
	}
	clientNoTimeout.HTTPClient = &http.Client{Transport: t}

	if err := clientNoTimeout.Ping(); err != nil {
		logrus.WithError(err).Fatal("couldn't connect to other docker daemon")
	}

	return &dockerWrap{client, clientNoTimeout, env}
}

type dockerWrap struct {
	docker          *docker.Client
	dockerNoTimeout *docker.Client
	*common.Environment
}

func (d *dockerWrap) retry(ctx context.Context, f func() error) error {
	log := common.Logger(ctx)
	var b common.Backoff
	for {
		select {
		case <-ctx.Done():
			d.Inc("task", "fail.docker", 1, 1)
			log.WithError(ctx.Err()).Warnf("retrying on docker errors timed out, restart docker or rotate this instance?")
			return ctx.Err()
		default:
		}

		err := filter(ctx, f())
		if common.IsTemporary(err) || isDocker50x(err) {
			log.WithError(err).Warn("docker temporary error, retrying")
			b.Sleep()
			d.Inc("task", "error.docker", 1, 1)
			continue
		}
		if err != nil {
			d.Inc("task", "error.docker", 1, 1)
		}
		return err
	}
}

func isDocker50x(err error) bool {
	derr, ok := err.(*docker.Error)
	return ok && derr.Status >= 500
}

func containerConfigError(err error) error {
	derr, ok := err.(*docker.Error)
	if ok && derr.Status == 400 {
		// derr.Message is a JSON response from docker, which has a "message" field we want to extract if possible.
		var v struct {
			Msg string `json:"message"`
		}
		err := json.Unmarshal([]byte(derr.Message), &v)
		if err != nil {
			// If message was not valid JSON, the raw body is still better than nothing.
			return fmt.Errorf("%s", derr.Message)
		}
		return fmt.Errorf("%s", v.Msg)
	}
	return nil
}

type temporary struct {
	error
}

func (t *temporary) Temporary() bool { return true }

func temp(err error) error {
	return &temporary{err}
}

// some 500s are totally cool
func filter(ctx context.Context, err error) error {
	log := common.Logger(ctx)
	// "API error (500): {\"message\":\"service endpoint with name task-57d722ecdecb9e7be16aff17 already exists\"}\n" -> ok since container exists
	switch {
	default:
		return err
	case err == nil:
		return err
	case strings.Contains(err.Error(), "service endpoint with name"):
	}
	log.WithError(err).Warn("filtering error")
	return nil
}

func filterNoSuchContainer(ctx context.Context, err error) error {
	log := common.Logger(ctx)
	if err == nil {
		return nil
	}
	_, containerNotFound := err.(*docker.NoSuchContainer)
	dockerErr, ok := err.(*docker.Error)
	if containerNotFound || (ok && dockerErr.Status == 404) {
		log.WithError(err).Error("filtering error")
		return nil
	}
	return err
}

func filterNotRunning(ctx context.Context, err error) error {
	log := common.Logger(ctx)
	if err == nil {
		return nil
	}
	_, containerNotRunning := err.(*docker.ContainerNotRunning)
	dockerErr, ok := err.(*docker.Error)
	if containerNotRunning || (ok && dockerErr.Status == 304) {
		log.WithError(err).Error("filtering error")
		return nil
	}
	return err
}

func (d *dockerWrap) AttachToContainerNonBlocking(opts docker.AttachToContainerOptions) (w docker.CloseWaiter, err error) {
	ctx, cancel := context.WithTimeout(context.Background(), retryTimeout)
	defer cancel()
	err = d.retry(ctx, func() error {
		w, err = d.docker.AttachToContainerNonBlocking(opts)
		if err != nil {
			// always retry if attach errors, task is running, we want logs!
			err = temp(err)
		}
		return err
	})
	return w, err
}

func (d *dockerWrap) WaitContainerWithContext(id string, ctx context.Context) (code int, err error) {
	err = d.retry(ctx, func() error {
		code, err = d.dockerNoTimeout.WaitContainerWithContext(id, ctx)
		return err
	})
	return code, filterNoSuchContainer(ctx, err)
}

func (d *dockerWrap) StartContainerWithContext(id string, hostConfig *docker.HostConfig, ctx context.Context) (err error) {
	err = d.retry(ctx, func() error {
		err = d.dockerNoTimeout.StartContainerWithContext(id, hostConfig, ctx)
		if _, ok := err.(*docker.NoSuchContainer); ok {
			// for some reason create will sometimes return successfully then say no such container here. wtf. so just retry like normal
			return temp(err)
		}
		return err
	})
	return err
}

func (d *dockerWrap) CreateContainer(opts docker.CreateContainerOptions) (c *docker.Container, err error) {
	err = d.retry(opts.Context, func() error {
		c, err = d.dockerNoTimeout.CreateContainer(opts)
		return err
	})
	return c, err
}

func (d *dockerWrap) PullImage(opts docker.PullImageOptions, auth docker.AuthConfiguration) (err error) {
	err = d.retry(opts.Context, func() error {
		err = d.dockerNoTimeout.PullImage(opts, auth)
		return err
	})
	return err
}

func (d *dockerWrap) RemoveContainer(opts docker.RemoveContainerOptions) (err error) {
	ctx, cancel := context.WithTimeout(context.Background(), retryTimeout)
	defer cancel()
	err = d.retry(ctx, func() error {
		err = d.docker.RemoveContainer(opts)
		return err
	})
	return filterNoSuchContainer(ctx, err)
}

func (d *dockerWrap) InspectImage(name string) (i *docker.Image, err error) {
	ctx, cancel := context.WithTimeout(context.Background(), retryTimeout)
	defer cancel()
	err = d.retry(ctx, func() error {
		i, err = d.docker.InspectImage(name)
		return err
	})
	return i, err
}

func (d *dockerWrap) Stats(opts docker.StatsOptions) (err error) {
	// we can't retry this one this way since the callee closes the
	// stats chan, need a fancier retry mechanism where we can swap out
	// channels, but stats isn't crucial so... be lazy for now
	return d.docker.Stats(opts)
	//err = d.retry(func() error {
	//err = d.docker.Stats(opts)
	//return err
	//})
	//return err
}