fn-serverless/api/runner/drivers/docker/docker_client.go
Reed Allman afcec04c24 remove the nanny
we finally graduated high school and can make our own ramen

we no longer need this since fn appears to have no concept of canceling tasks
through an api we need to watch, and the context is plumbed if the request is
canceled. since tasks are short, we may never need to do cancellation of
running tasks like we had with iron worker. this was an added docker call
that's unnecessary since we are doing force removal of the container at the
end anyway.
2017-07-24 11:56:58 -07:00


// +build go1.7

package docker

import (
	"context"
	"crypto/tls"
	"encoding/json"
	"fmt"
	"net"
	"net/http"
	"strings"
	"time"

	"github.com/Sirupsen/logrus"
	"github.com/fsouza/go-dockerclient"
	"gitlab-odx.oracle.com/odx/functions/api/runner/common"
)
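// retryTimeout bounds the retry loops in the wrapper methods below that have
// no caller-supplied context (attach, remove, inspect).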
const (
	retryTimeout = 10 * time.Minute
)

// dockerClient wraps the docker client calls we make so we can retry 500s;
// fsouza doesn't bake in retries we can use internally. we could contribute
// that upstream at some point, but for now this beats ad hoc retries at every
// call site. it also adds timeouts to many operations, varying by operation.
// TODO could generate this, maybe not worth it, may not change often
type dockerClient interface {
	// Each of these are github.com/fsouza/go-dockerclient methods
	AttachToContainerNonBlocking(opts docker.AttachToContainerOptions) (docker.CloseWaiter, error)
	WaitContainerWithContext(id string, ctx context.Context) (int, error)
	StartContainerWithContext(id string, hostConfig *docker.HostConfig, ctx context.Context) error
	CreateContainer(opts docker.CreateContainerOptions) (*docker.Container, error)
	RemoveContainer(opts docker.RemoveContainerOptions) error
	PullImage(opts docker.PullImageOptions, auth docker.AuthConfiguration) error
	InspectImage(name string) (*docker.Image, error)
	InspectContainer(id string) (*docker.Container, error)
	Stats(opts docker.StatsOptions) error
}

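// newClient builds the wrapped client from the standard DOCKER_HOST et al.
// environment variables and fails hard (Fatal) if the daemon can't be reached.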
// TODO: switch to github.com/docker/engine-api
func newClient(env *common.Environment) dockerClient {
	// TODO this was much easier, don't need special settings at the moment
	// docker, err := docker.NewClient(conf.Docker)
	client, err := docker.NewClientFromEnv()
	if err != nil {
		logrus.WithError(err).Fatal("couldn't create docker client")
	}

	t := &http.Transport{
		Dial: (&net.Dialer{
			Timeout:   10 * time.Second,
			KeepAlive: 1 * time.Minute,
		}).Dial,
		TLSClientConfig: &tls.Config{
			ClientSessionCache: tls.NewLRUClientSessionCache(8192),
		},
		TLSHandshakeTimeout:   10 * time.Second,
		MaxIdleConnsPerHost:   512,
		Proxy:                 http.ProxyFromEnvironment,
		MaxIdleConns:          512,
		IdleConnTimeout:       90 * time.Second,
		ExpectContinueTimeout: 1 * time.Second,
	}
	client.HTTPClient = &http.Client{Transport: t}

	if err := client.Ping(); err != nil {
		logrus.WithError(err).Fatal("couldn't connect to docker daemon")
	}
	client.SetTimeout(120 * time.Second)

	// get 2 clients: one with a hard timeout for short calls, and one with no
	// timeout whose calls are bounded by contexts instead
	clientNoTimeout, err := docker.NewClientFromEnv()
	if err != nil {
		logrus.WithError(err).Fatal("couldn't create second docker client")
	}
	clientNoTimeout.HTTPClient = &http.Client{Transport: t}

	if err := clientNoTimeout.Ping(); err != nil {
		logrus.WithError(err).Fatal("couldn't connect to docker daemon with second client")
	}

	return &dockerWrap{client, clientNoTimeout, env}
}

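// dockerWrap implements dockerClient, routing every call through retry and
// choosing the plain or no-timeout client per operation. The embedded
// *common.Environment supplies the Inc stats counter used in retry.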
type dockerWrap struct {
	docker          *docker.Client
	dockerNoTimeout *docker.Client
	*common.Environment
}

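// retry runs f until it returns a non-retryable error or ctx is done.
// Temporary errors (common.IsTemporary) and docker 5xx responses are retried
// with backoff; anything else is returned to the caller immediately.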
func (d *dockerWrap) retry(ctx context.Context, f func() error) error {
	log := common.Logger(ctx)
	var b common.Backoff
	for {
		select {
		case <-ctx.Done():
			d.Inc("task", "fail.docker", 1, 1)
			log.WithError(ctx.Err()).Warn("retrying on docker errors timed out, restart docker or rotate this instance?")
			return ctx.Err()
		default:
		}

		err := filter(ctx, f())
		if common.IsTemporary(err) || isDocker50x(err) {
			log.WithError(err).Warn("docker temporary error, retrying")
			b.Sleep()
			d.Inc("task", "error.docker", 1, 1)
			continue
		}
		if err != nil {
			d.Inc("task", "error.docker", 1, 1)
		}
		return err
	}
}

func isDocker50x(err error) bool {
	derr, ok := err.(*docker.Error)
	return ok && derr.Status >= 500
}

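// containerConfigError pulls a human-readable message out of a docker 400
// response. Note it returns nil for any other error, so a nil result means
// "not a config error", not success.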
func containerConfigError(err error) error {
	derr, ok := err.(*docker.Error)
	if ok && derr.Status == 400 {
		// derr.Message is a JSON response from docker, which has a "message"
		// field we want to extract if possible.
		var v struct {
			Msg string `json:"message"`
		}
		err := json.Unmarshal([]byte(derr.Message), &v)
		if err != nil {
			// If the message was not valid JSON, the raw body is still better than nothing.
			return fmt.Errorf("%s", derr.Message)
		}
		return fmt.Errorf("%s", v.Msg)
	}
	return nil
}

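// temporary marks an error as retryable, presumably via a Temporary() bool
// check inside common.IsTemporary, so retry will loop on it even though it
// isn't a docker 5xx.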
type temporary struct {
	error
}

func (t *temporary) Temporary() bool { return true }

func temp(err error) error {
	return &temporary{err}
}

// some 500s are totally cool
func filter(ctx context.Context, err error) error {
	log := common.Logger(ctx)
	// "API error (500): {\"message\":\"service endpoint with name task-57d722ecdecb9e7be16aff17 already exists\"}\n" -> ok since container exists
	switch {
	default:
		return err
	case err == nil:
		return err
	case strings.Contains(err.Error(), "service endpoint with name"):
		// fall through: this one gets logged and swallowed below
	}

	log.WithError(err).Warn("filtering error")
	return nil
}

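// filterNoSuchContainer swallows "no such container" errors, whether typed
// (*docker.NoSuchContainer) or a raw 404, and treats them as success.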
func filterNoSuchContainer(ctx context.Context, err error) error {
	log := common.Logger(ctx)
	if err == nil {
		return nil
	}

	_, containerNotFound := err.(*docker.NoSuchContainer)
	dockerErr, ok := err.(*docker.Error)
	if containerNotFound || (ok && dockerErr.Status == 404) {
		log.WithError(err).Error("filtering error")
		return nil
	}
	return err
}

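// filterNotRunning does the same for "container not running" errors, whether
// typed (*docker.ContainerNotRunning) or docker's 304 status.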
func filterNotRunning(ctx context.Context, err error) error {
	log := common.Logger(ctx)
	if err == nil {
		return nil
	}

	_, containerNotRunning := err.(*docker.ContainerNotRunning)
	dockerErr, ok := err.(*docker.Error)
	if containerNotRunning || (ok && dockerErr.Status == 304) {
		log.WithError(err).Error("filtering error")
		return nil
	}
	return err
}

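// AttachToContainerNonBlocking bounds its retries with its own
// retryTimeout context; attach failures are always marked temporary since
// the task is running and we want its logs.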
func (d *dockerWrap) AttachToContainerNonBlocking(opts docker.AttachToContainerOptions) (w docker.CloseWaiter, err error) {
	ctx, cancel := context.WithTimeout(context.Background(), retryTimeout)
	defer cancel()
	err = d.retry(ctx, func() error {
		w, err = d.docker.AttachToContainerNonBlocking(opts)
		if err != nil {
			// always retry if attach errors, task is running, we want logs!
			err = temp(err)
		}
		return err
	})
	return w, err
}

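// WaitContainerWithContext uses the no-timeout client: a task may outlive any
// fixed client timeout, so the caller's ctx is the only bound here.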
func (d *dockerWrap) WaitContainerWithContext(id string, ctx context.Context) (code int, err error) {
	err = d.retry(ctx, func() error {
		code, err = d.dockerNoTimeout.WaitContainerWithContext(id, ctx)
		return err
	})
	return code, filterNoSuchContainer(ctx, err)
}

func (d *dockerWrap) StartContainerWithContext(id string, hostConfig *docker.HostConfig, ctx context.Context) (err error) {
	err = d.retry(ctx, func() error {
		err = d.dockerNoTimeout.StartContainerWithContext(id, hostConfig, ctx)
		if _, ok := err.(*docker.NoSuchContainer); ok {
			// for some reason create will sometimes return successfully then say
			// no such container here. wtf. so just retry like normal
			return temp(err)
		}
		return err
	})
	return err
}

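// CreateContainer and PullImage are bounded by the caller's opts.Context
// rather than a locally constructed timeout.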
func (d *dockerWrap) CreateContainer(opts docker.CreateContainerOptions) (c *docker.Container, err error) {
	err = d.retry(opts.Context, func() error {
		c, err = d.dockerNoTimeout.CreateContainer(opts)
		return err
	})
	return c, err
}

func (d *dockerWrap) PullImage(opts docker.PullImageOptions, auth docker.AuthConfiguration) (err error) {
	err = d.retry(opts.Context, func() error {
		err = d.dockerNoTimeout.PullImage(opts, auth)
		return err
	})
	return err
}

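// RemoveContainer, InspectImage and InspectContainer run under their own
// retryTimeout-bounded contexts since no caller context reaches them here.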
func (d *dockerWrap) RemoveContainer(opts docker.RemoveContainerOptions) (err error) {
	ctx, cancel := context.WithTimeout(context.Background(), retryTimeout)
	defer cancel()
	err = d.retry(ctx, func() error {
		err = d.docker.RemoveContainer(opts)
		return err
	})
	return filterNoSuchContainer(ctx, err)
}

func (d *dockerWrap) InspectImage(name string) (i *docker.Image, err error) {
	ctx, cancel := context.WithTimeout(context.Background(), retryTimeout)
	defer cancel()
	err = d.retry(ctx, func() error {
		i, err = d.docker.InspectImage(name)
		return err
	})
	return i, err
}

func (d *dockerWrap) InspectContainer(id string) (c *docker.Container, err error) {
	ctx, cancel := context.WithTimeout(context.Background(), retryTimeout)
	defer cancel()
	err = d.retry(ctx, func() error {
		c, err = d.docker.InspectContainer(id)
		return err
	})
	return c, err
}

func (d *dockerWrap) Stats(opts docker.StatsOptions) (err error) {
	// we can't retry this one this way since the callee closes the
	// stats chan, need a fancier retry mechanism where we can swap out
	// channels, but stats isn't crucial so... be lazy for now
	return d.docker.Stats(opts)

	//err = d.retry(func() error {
	//	err = d.docker.Stats(opts)
	//	return err
	//})
	//return err
}