Mirror of https://github.com/fnproject/fn.git, synced 2022-10-28 21:29:17 +03:00
We had the container inspect call here for 3 reasons:

1) get the exit code
2) see if the container is still running (debugging madness)
3) see if docker thinks it was an OOM

1) is something wait already returns, but because of 2) and 3) we delayed it until the inspect. 2) was really just for debugging since we had 3). 3) seems unnecessary: to me, an OOM is an OOM is an OOM, so why make a whole docker inspect call just to find out? (We could move it down to the sad path and only call when necessary, but are we really getting any value from the distinction? I've never run into it myself.)

Inspect was actually causing tasks to time out, since the inspect call could put us over the task timeout even though the container ran to completion. We could have fixed that by checking the context earlier, but we don't really need inspect at all, which reduces the docker calls we make, which makes more unicorn puppers. Tasks should now have more 'true' timeouts.

Tried to boy scout, but the tracing patch cleans this block up further as well.
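A rough sketch of the call site this implies (the names below are illustrative, not the actual agent code): the exit code comes straight from the wait call, with no follow-up inspect, so a slow inspect can no longer push a finished container past its task timeout.

	// hypothetical call site; WaitContainerWithContext is the wrapper defined in this file
	code, err := drv.docker.WaitContainerWithContext(containerID, ctx)
	if err != nil {
		return nil, err // includes ctx.Err(), i.e. a 'true' task timeout
	}
	// a non-zero exit code covers the OOM case too; no docker inspect round trip needed
	return &runResult{status: statusForExitCode(code)}, nil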
284 lines
7.9 KiB
Go
// +build go1.7

package docker

import (
	"context"
	"crypto/tls"
	"encoding/json"
	"fmt"
	"net"
	"net/http"
	"strings"
	"time"

	"github.com/Sirupsen/logrus"
	"github.com/fsouza/go-dockerclient"
	"gitlab-odx.oracle.com/odx/functions/api/runner/common"
)

const (
	retryTimeout = 10 * time.Minute
)

// wrap docker client calls so we can retry 500s, kind of sucks but fsouza doesn't
// bake in retries we can use internally, could contribute it at some point, would
// be much more convenient if we didn't have to do this, but it's better than ad hoc retries.
// also adds timeouts to many operations, varying by operation
// TODO could generate this, maybe not worth it, may not change often
type dockerClient interface {
	// Each of these is a github.com/fsouza/go-dockerclient method

	AttachToContainerNonBlocking(opts docker.AttachToContainerOptions) (docker.CloseWaiter, error)
	WaitContainerWithContext(id string, ctx context.Context) (int, error)
	StartContainerWithContext(id string, hostConfig *docker.HostConfig, ctx context.Context) error
	CreateContainer(opts docker.CreateContainerOptions) (*docker.Container, error)
	RemoveContainer(opts docker.RemoveContainerOptions) error
	PullImage(opts docker.PullImageOptions, auth docker.AuthConfiguration) error
	InspectImage(name string) (*docker.Image, error)
	Stats(opts docker.StatsOptions) error
}
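
// A sketch for illustration (not something this package ships): because callers
// only see this small dockerClient interface, a test can embed it in a stub and
// override just the calls it cares about, e.g.
//
//	type fakeDocker struct {
//		dockerClient // embedded nil interface; un-stubbed methods panic if called
//	}
//
//	func (f *fakeDocker) InspectImage(name string) (*docker.Image, error) {
//		return &docker.Image{ID: name}, nil
//	}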

// TODO: switch to github.com/docker/engine-api
func newClient(env *common.Environment) dockerClient {
	// TODO this was much easier, don't need special settings at the moment
	// docker, err := docker.NewClient(conf.Docker)
	client, err := docker.NewClientFromEnv()
	if err != nil {
		logrus.WithError(err).Fatal("couldn't create docker client")
	}

	t := &http.Transport{
		Dial: (&net.Dialer{
			Timeout:   10 * time.Second,
			KeepAlive: 1 * time.Minute,
		}).Dial,
		TLSClientConfig: &tls.Config{
			ClientSessionCache: tls.NewLRUClientSessionCache(8192),
		},
		TLSHandshakeTimeout:   10 * time.Second,
		MaxIdleConnsPerHost:   512,
		Proxy:                 http.ProxyFromEnvironment,
		MaxIdleConns:          512,
		IdleConnTimeout:       90 * time.Second,
		ExpectContinueTimeout: 1 * time.Second,
	}

	client.HTTPClient = &http.Client{Transport: t}

	if err := client.Ping(); err != nil {
		logrus.WithError(err).Fatal("couldn't connect to docker daemon")
	}

	client.SetTimeout(120 * time.Second)

	// use 2 clients: this one with a small (120s) hard timeout, and a second one
	// with no client-level timeout whose calls are bounded by their contexts instead
	clientNoTimeout, err := docker.NewClientFromEnv()
	if err != nil {
		logrus.WithError(err).Fatal("couldn't create other docker client")
	}

	clientNoTimeout.HTTPClient = &http.Client{Transport: t}

	if err := clientNoTimeout.Ping(); err != nil {
		logrus.WithError(err).Fatal("couldn't connect to other docker daemon")
	}

	return &dockerWrap{client, clientNoTimeout, env}
}
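
// Note on the split: WaitContainerWithContext, StartContainerWithContext,
// CreateContainer and PullImage below go through dockerNoTimeout and are bounded
// only by their contexts; AttachToContainerNonBlocking, RemoveContainer and
// InspectImage use the 120s-timeout client plus a retryTimeout-bounded retry
// loop, and Stats uses the 120s client with no retry at all.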

type dockerWrap struct {
	docker          *docker.Client
	dockerNoTimeout *docker.Client
	*common.Environment
}

func (d *dockerWrap) retry(ctx context.Context, f func() error) error {
	log := common.Logger(ctx)
	var b common.Backoff
	for {
		select {
		case <-ctx.Done():
			d.Inc("task", "fail.docker", 1, 1)
			log.WithError(ctx.Err()).Warnf("retrying on docker errors timed out, restart docker or rotate this instance?")
			return ctx.Err()
		default:
		}

		err := filter(ctx, f())
		if common.IsTemporary(err) || isDocker50x(err) {
			log.WithError(err).Warn("docker temporary error, retrying")
			b.Sleep()
			d.Inc("task", "error.docker", 1, 1)
			continue
		}
		if err != nil {
			d.Inc("task", "error.docker", 1, 1)
		}
		return err
	}
}

func isDocker50x(err error) bool {
	derr, ok := err.(*docker.Error)
	return ok && derr.Status >= 500
}

func containerConfigError(err error) error {
	derr, ok := err.(*docker.Error)
	if ok && derr.Status == 400 {
		// derr.Message is a JSON response from docker, which has a "message" field we want to extract if possible.
		var v struct {
			Msg string `json:"message"`
		}

		err := json.Unmarshal([]byte(derr.Message), &v)
		if err != nil {
			// If message was not valid JSON, the raw body is still better than nothing.
			return fmt.Errorf("%s", derr.Message)
		}
		return fmt.Errorf("%s", v.Msg)
	}

	return nil
}

type temporary struct {
	error
}

func (t *temporary) Temporary() bool { return true }

func temp(err error) error {
	return &temporary{err}
}
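
// Why temp() is enough to make retry() try again: common.IsTemporary presumably
// follows the usual net.Error-style convention, i.e. roughly (a guess at the
// helper's shape, not its actual source):
//
//	func IsTemporary(err error) bool {
//		t, ok := err.(interface{ Temporary() bool })
//		return ok && t.Temporary()
//	}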

// some 500s are totally cool
func filter(ctx context.Context, err error) error {
	log := common.Logger(ctx)
	// "API error (500): {\"message\":\"service endpoint with name task-57d722ecdecb9e7be16aff17 already exists\"}\n" -> ok since container exists
	switch {
	case err == nil:
		return err
	case strings.Contains(err.Error(), "service endpoint with name"):
	default:
		return err
	}
	log.WithError(err).Warn("filtering error")
	return nil
}

func filterNoSuchContainer(ctx context.Context, err error) error {
	log := common.Logger(ctx)
	if err == nil {
		return nil
	}
	_, containerNotFound := err.(*docker.NoSuchContainer)
	dockerErr, ok := err.(*docker.Error)
	if containerNotFound || (ok && dockerErr.Status == 404) {
		log.WithError(err).Error("filtering error")
		return nil
	}
	return err
}

func filterNotRunning(ctx context.Context, err error) error {
	log := common.Logger(ctx)
	if err == nil {
		return nil
	}

	_, containerNotRunning := err.(*docker.ContainerNotRunning)
	dockerErr, ok := err.(*docker.Error)
	if containerNotRunning || (ok && dockerErr.Status == 304) {
		log.WithError(err).Error("filtering error")
		return nil
	}

	return err
}

func (d *dockerWrap) AttachToContainerNonBlocking(opts docker.AttachToContainerOptions) (w docker.CloseWaiter, err error) {
	ctx, cancel := context.WithTimeout(context.Background(), retryTimeout)
	defer cancel()
	err = d.retry(ctx, func() error {
		w, err = d.docker.AttachToContainerNonBlocking(opts)
		if err != nil {
			// always retry if attach errors, task is running, we want logs!
			err = temp(err)
		}
		return err
	})
	return w, err
}

func (d *dockerWrap) WaitContainerWithContext(id string, ctx context.Context) (code int, err error) {
	err = d.retry(ctx, func() error {
		code, err = d.dockerNoTimeout.WaitContainerWithContext(id, ctx)
		return err
	})
	return code, filterNoSuchContainer(ctx, err)
}

func (d *dockerWrap) StartContainerWithContext(id string, hostConfig *docker.HostConfig, ctx context.Context) (err error) {
	err = d.retry(ctx, func() error {
		err = d.dockerNoTimeout.StartContainerWithContext(id, hostConfig, ctx)
		if _, ok := err.(*docker.NoSuchContainer); ok {
			// for some reason create will sometimes return successfully then say no such container here. wtf. so just retry like normal
			return temp(err)
		}
		return err
	})
	return err
}

func (d *dockerWrap) CreateContainer(opts docker.CreateContainerOptions) (c *docker.Container, err error) {
	err = d.retry(opts.Context, func() error {
		c, err = d.dockerNoTimeout.CreateContainer(opts)
		return err
	})
	return c, err
}

func (d *dockerWrap) PullImage(opts docker.PullImageOptions, auth docker.AuthConfiguration) (err error) {
	err = d.retry(opts.Context, func() error {
		err = d.dockerNoTimeout.PullImage(opts, auth)
		return err
	})
	return err
}

func (d *dockerWrap) RemoveContainer(opts docker.RemoveContainerOptions) (err error) {
	ctx, cancel := context.WithTimeout(context.Background(), retryTimeout)
	defer cancel()
	err = d.retry(ctx, func() error {
		err = d.docker.RemoveContainer(opts)
		return err
	})
	return filterNoSuchContainer(ctx, err)
}

func (d *dockerWrap) InspectImage(name string) (i *docker.Image, err error) {
	ctx, cancel := context.WithTimeout(context.Background(), retryTimeout)
	defer cancel()
	err = d.retry(ctx, func() error {
		i, err = d.docker.InspectImage(name)
		return err
	})
	return i, err
}

func (d *dockerWrap) Stats(opts docker.StatsOptions) (err error) {
	// we can't retry this one this way since the callee closes the
	// stats chan, need a fancier retry mechanism where we can swap out
	// channels, but stats isn't crucial so... be lazy for now
	return d.docker.Stats(opts)

	//err = d.retry(func() error {
	//err = d.docker.Stats(opts)
	//return err
	//})
	//return err
}
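
// A minimal sketch of the "fancier retry mechanism" mentioned above, assuming
// (as the comment says) that the callee closes whatever stats channel it is
// given when it returns: swap a fresh channel in per attempt and forward onto
// the caller's channel. Untested, illustrative only.
//
//	func (d *dockerWrap) statsWithRetry(ctx context.Context, opts docker.StatsOptions) error {
//		defer close(opts.Stats) // the caller still sees a single channel, closed exactly once
//		return d.retry(ctx, func() error {
//			inner := make(chan *docker.Stats)
//			attempt := opts
//			attempt.Stats = inner // per-attempt channel, closed by the callee on return
//
//			done := make(chan error, 1)
//			go func() { done <- d.docker.Stats(attempt) }()
//
//			for s := range inner {
//				opts.Stats <- s // forward to the caller
//			}
//			return <-done
//		})
//	}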