mirror of https://github.com/fnproject/fn.git, synced 2022-10-28 21:29:17 +03:00
Moved runner into this repo, updated dep files; now builds.
638 api/runner/drivers/docker/docker.go Normal file
@@ -0,0 +1,638 @@
// Copyright 2016 Iron.io
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package docker

import (
	"context"
	"crypto/tls"
	"errors"
	"fmt"
	"io"
	"net"
	"net/http"
	"net/url"
	"os"
	"path"
	"strings"
	"time"

	"github.com/Sirupsen/logrus"
	manifest "github.com/docker/distribution/manifest/schema1"
	"github.com/fsouza/go-dockerclient"
	"github.com/heroku/docker-registry-client/registry"
	"github.com/kumokit/functions/api/runner/common"
	"github.com/kumokit/functions/api/runner/common/stats"
	"github.com/kumokit/functions/api/runner/drivers"
)

const hubURL = "https://registry.hub.docker.com"

var registryClient = &http.Client{
	Transport: &http.Transport{
		Dial: (&net.Dialer{
			Timeout:   10 * time.Second,
			KeepAlive: 2 * time.Minute,
		}).Dial,
		TLSClientConfig: &tls.Config{
			ClientSessionCache: tls.NewLRUClientSessionCache(8192),
		},
		TLSHandshakeTimeout:   10 * time.Second,
		MaxIdleConnsPerHost:   32, // TODO tune; we will likely be making lots of requests to the same place
		Proxy:                 http.ProxyFromEnvironment,
		IdleConnTimeout:       90 * time.Second,
		MaxIdleConns:          512,
		ExpectContinueTimeout: 1 * time.Second,
	},
}

// A drivers.ContainerTask should implement the Auther interface if it would
// like to use not-necessarily-public docker images for any or all task
// invocations.
type Auther interface {
	// DockerAuth should return docker auth credentials that will authenticate
	// against a docker registry for a given drivers.ContainerTask.Image(). An
	// error may be returned, which will cause the task not to be run. This can
	// be useful for an implementer to do things like testing auth
	// configurations before returning them; e.g. if the implementer would like
	// to impose certain restrictions on images, or if credentials must be
	// acquired right before runtime and there's an error doing so. If these
	// credentials don't work, the docker pull will fail and the task will be
	// set to error status.
	DockerAuth() (docker.AuthConfiguration, error)
}
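
// For illustration, a minimal Auther implementation might look like the
// following (hypothetical type and field names, shown only as a sketch):
//
//	type privateImageTask struct {
//		drivers.ContainerTask
//		creds docker.AuthConfiguration // e.g. loaded from a per-user secret store
//	}
//
//	// DockerAuth satisfies Auther; returning an error here prevents the run.
//	func (t *privateImageTask) DockerAuth() (docker.AuthConfiguration, error) {
//		return t.creds, nil
//	}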

type runResult struct {
	error
	StatusValue string
}

func (r *runResult) Error() string {
	if r.error == nil {
		return ""
	}
	return r.error.Error()
}

func (r *runResult) Status() string    { return r.StatusValue }
func (r *runResult) UserVisible() bool { return common.IsUserVisibleError(r.error) }

type DockerDriver struct {
	conf     drivers.Config
	docker   dockerClient // retries on *docker.Client, restricts ad hoc *docker.Client usage / retries
	hostname string

	*common.Environment
}

// NewDocker returns a DockerDriver, which implements drivers.Driver.
func NewDocker(env *common.Environment, conf drivers.Config) *DockerDriver {
	hostname, err := os.Hostname()
	if err != nil {
		logrus.WithError(err).Fatal("couldn't resolve hostname")
	}

	return &DockerDriver{
		conf:        conf,
		docker:      newClient(env),
		hostname:    hostname,
		Environment: env,
	}
}

// CheckRegistry returns a Sizer, which can be used to check the size of an
// image if the returned error is nil; a nil error also means authentication
// against the given credentials was successful. If the configuration does not
// specify a config.ServerAddress, https://hub.docker.com will be tried.
// CheckRegistry is a package-level function since rkt can also use docker
// images and we may be interested in using rkt without a docker driver
// configured; also, we don't have to tote around a driver in any tasker that
// may be interested in registry information (2/2 cases thus far).
func CheckRegistry(image string, config docker.AuthConfiguration) (Sizer, error) {
	registry, repo, tag := drivers.ParseImage(image)

	reg, err := registryForConfig(config, registry)
	if err != nil {
		return nil, err
	}

	mani, err := reg.Manifest(repo, tag)
	if err != nil {
		logrus.WithFields(logrus.Fields{"username": config.Username, "server": config.ServerAddress, "image": image}).WithError(err).Error("Credentials not authorized, trying next.")
		//if !isAuthError(err) {
		//	// TODO we might retry this, since if this was the registry that was supposed to
		//	// auth the task will erroneously be set to 'error'
		//}

		return nil, err
	}

	return &sizer{mani, reg, repo}, nil
}
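
// A sketch of intended usage (image name and credentials are illustrative):
//
//	sizer, err := CheckRegistry("iron/hello:latest", docker.AuthConfiguration{})
//	if err != nil {
//		// auth or manifest lookup failed; try the next credential set
//	}
//	size, err := sizer.Size() // total size of all image layers, in bytes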

// Sizer returns size information. This interface is liable to contain more
// than a size at some point; change as needed.
type Sizer interface {
	Size() (int64, error)
}

type sizer struct {
	mani *manifest.SignedManifest
	reg  *registry.Registry
	repo string
}

func (s *sizer) Size() (int64, error) {
	var sum int64
	for _, r := range s.mani.References() {
		desc, err := s.reg.LayerMetadata(s.repo, r.Digest)
		if err != nil {
			return 0, err
		}
		sum += desc.Size
	}
	return sum, nil
}

func registryURL(addr string) (string, error) {
	if addr == "" || strings.Contains(addr, "hub.docker.com") || strings.Contains(addr, "index.docker.io") {
		return hubURL, nil
	}

	url, err := url.Parse(addr)
	if err != nil {
		// TODO we could error the task out from this with a user error, but since
		// we have a list of auths to check, just return the error so this one is
		// skipped... horrible api as it is
		logrus.WithFields(logrus.Fields{"auth_addr": addr}).WithError(err).Error("error parsing server address url, skipping")
		return "", err
	}

	if url.Scheme == "" {
		url.Scheme = "https"
	}
	url.Path = strings.TrimSuffix(url.Path, "/")
	url.Path = strings.TrimPrefix(url.Path, "/v2")
	url.Path = strings.TrimPrefix(url.Path, "/v1") // just try this; if it fails it fails, not supporting v1
	return url.String(), nil
}
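
// For reference, a few input/output pairs this normalization should produce
// (the third host name is illustrative):
//
//	registryURL("")                                 // -> "https://registry.hub.docker.com"
//	registryURL("index.docker.io")                  // -> "https://registry.hub.docker.com"
//	registryURL("https://my.registry.example/v2/")  // -> "https://my.registry.example"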

func isAuthError(err error) bool {
	// AARGH!
	if urlError, ok := err.(*url.Error); ok {
		if httpError, ok := urlError.Err.(*registry.HttpStatusError); ok {
			if httpError.Response.StatusCode == 401 {
				return true
			}
		}
	}

	return false
}

func registryForConfig(config docker.AuthConfiguration, reg string) (*registry.Registry, error) {
	if reg == "" {
		reg = config.ServerAddress
	}

	var err error
	config.ServerAddress, err = registryURL(reg)
	if err != nil {
		return nil, err
	}

	// Use this instead of registry.New to avoid the Ping().
	transport := registry.WrapTransport(registryClient.Transport, reg, config.Username, config.Password)
	r := &registry.Registry{
		URL: config.ServerAddress,
		Client: &http.Client{
			Transport: transport,
		},
		Logf: registry.Quiet,
	}
	return r, nil
}

func (drv *DockerDriver) Prepare(ctx context.Context, task drivers.ContainerTask) (drivers.Cookie, error) {
	var cmd []string
	if task.Command() != "" {
		// NOTE: this is hyper-sensitive and may not even be correct like this, but it passes the old tests
		// task.Command() in swapi is always "sh /mnt/task/.runtask" so Fields is safe
		cmd = strings.Fields(task.Command())
		logrus.WithFields(logrus.Fields{"task_id": task.Id(), "cmd": cmd, "len": len(cmd)}).Debug("docker command")
	}

	envvars := make([]string, 0, len(task.EnvVars()))
	for name, val := range task.EnvVars() {
		envvars = append(envvars, name+"="+val)
	}

	containerName := newContainerID(task)
	container := docker.CreateContainerOptions{
		Name: containerName,
		Config: &docker.Config{
			Env:         envvars,
			Cmd:         cmd,
			Memory:      int64(drv.conf.Memory),
			CPUShares:   drv.conf.CPUShares,
			Hostname:    drv.hostname,
			Image:       task.Image(),
			Volumes:     map[string]struct{}{},
			Labels:      task.Labels(),
			OpenStdin:   true,
			AttachStdin: true,
			StdinOnce:   true,
		},
		HostConfig: &docker.HostConfig{},
		Context:    ctx,
	}

	volumes := task.Volumes()
	for _, mapping := range volumes {
		hostDir := mapping[0]
		containerDir := mapping[1]
		container.Config.Volumes[containerDir] = struct{}{}
		mapn := fmt.Sprintf("%s:%s", hostDir, containerDir)
		container.HostConfig.Binds = append(container.HostConfig.Binds, mapn)
		logrus.WithFields(logrus.Fields{"volumes": mapn, "task_id": task.Id()}).Debug("setting volumes")
	}

	if wd := task.WorkDir(); wd != "" {
		logrus.WithFields(logrus.Fields{"wd": wd, "task_id": task.Id()}).Debug("setting work dir")
		container.Config.WorkingDir = wd
	}

	err := drv.ensureImage(ctx, task)
	if err != nil {
		return nil, err
	}

	createTimer := drv.NewTimer("docker", "create_container", 1.0)
	_, err = drv.docker.CreateContainer(container)
	createTimer.Measure()
	if err != nil {
		// since we retry under the hood, if the container gets created and a retry fails, we can just ignore the error
		if err != docker.ErrContainerAlreadyExists {
			logrus.WithFields(logrus.Fields{"task_id": task.Id(), "command": container.Config.Cmd, "memory": container.Config.Memory,
				"cpu_shares": container.Config.CPUShares, "hostname": container.Config.Hostname, "name": container.Name,
				"image": container.Config.Image, "volumes": container.Config.Volumes, "binds": container.HostConfig.Binds, "container": containerName,
			}).WithError(err).Error("Could not create container")

			if ce := containerConfigError(err); ce != nil {
				return nil, common.UserError(fmt.Errorf("Failed to create container from task configuration '%s'", ce))
			}
			return nil, err
		}
	}

	// discard removal error
	return &cookie{id: containerName, task: task, drv: drv}, nil
}

type cookie struct {
	id   string
	task drivers.ContainerTask
	drv  *DockerDriver
}

func (c *cookie) Close() error { return c.drv.removeContainer(c.id) }

func (c *cookie) Run(ctx context.Context) (drivers.RunResult, error) {
	return c.drv.run(ctx, c.id, c.task)
}

func (drv *DockerDriver) removeContainer(container string) error {
	removeTimer := drv.NewTimer("docker", "remove_container", 1.0)
	defer removeTimer.Measure()
	err := drv.docker.RemoveContainer(docker.RemoveContainerOptions{
		ID: container, Force: true, RemoveVolumes: true})

	if err != nil {
		logrus.WithError(err).WithFields(logrus.Fields{"container": container}).Error("error removing container")
	}
	return nil
}

func (drv *DockerDriver) ensureImage(ctx context.Context, task drivers.ContainerTask) error {
	reg, _, _ := drivers.ParseImage(task.Image())

	// ask for docker creds before looking for the image, as the tasker may need to
	// validate creds even if the image is already downloaded.

	var config docker.AuthConfiguration // default, tries docker hub w/o user/pass
	if task, ok := task.(Auther); ok {
		var err error
		config, err = task.DockerAuth()
		if err != nil {
			return err
		}
	}

	if reg != "" {
		config.ServerAddress = reg
	}

	// see if we already have it; if not, pull it
	_, err := drv.docker.InspectImage(task.Image())
	if err == docker.ErrNoSuchImage {
		err = drv.pullImage(ctx, task, config)
	}

	return err
}

func (drv *DockerDriver) pullImage(ctx context.Context, task drivers.ContainerTask, config docker.AuthConfiguration) error {
	log := common.Logger(ctx)

	reg, repo, tag := drivers.ParseImage(task.Image())
	globalRepo := path.Join(reg, repo)

	pullTimer := drv.NewTimer("docker", "pull_image", 1.0)
	defer pullTimer.Measure()

	drv.Inc("docker", "pull_image_count."+stats.AsStatField(task.Image()), 1, 1)

	if reg != "" {
		config.ServerAddress = reg
	}

	var err error
	config.ServerAddress, err = registryURL(config.ServerAddress)
	if err != nil {
		return err
	}

	log.WithFields(logrus.Fields{"registry": config.ServerAddress, "username": config.Username, "image": task.Image()}).Info("Pulling image")

	err = drv.docker.PullImage(docker.PullImageOptions{Repository: globalRepo, Tag: tag, Context: ctx}, config)
	if err != nil {
		drv.Inc("task", "error.pull."+stats.AsStatField(task.Image()), 1, 1)
		log.WithFields(logrus.Fields{"registry": config.ServerAddress, "username": config.Username, "image": task.Image()}).WithError(err).Error("Failed to pull image")

		// TODO need to inspect for hub or network errors and pick.
		return common.UserError(fmt.Errorf("Failed to pull image '%s': %s", task.Image(), err))

		// TODO what about a case where credentials were good, then credentials
		// were invalidated -- do we need to keep the credential cache docker
		// driver side and check after pull for this case alone?
	}

	return nil
}

// run executes the docker container. If the task runs, a drivers.RunResult is
// returned. If something fails outside the task (i.e. in docker), an error is
// returned. The docker driver will attempt to cast the task to an Auther; if
// that succeeds, private image support is available. See the Auther interface
// for how to implement this.
func (drv *DockerDriver) run(ctx context.Context, container string, task drivers.ContainerTask) (drivers.RunResult, error) {
	log := common.Logger(ctx)
	timeout := task.Timeout()

	var cancel context.CancelFunc
	if timeout <= 0 {
		ctx, cancel = context.WithCancel(ctx)
	} else {
		ctx, cancel = context.WithTimeout(ctx, timeout)
	}
	defer cancel() // do this so that after run exits, nanny and collect stop

	var complete bool
	defer func() { complete = true }() // runs before cancel is called
	ctx = context.WithValue(ctx, completeKey, &complete)

	go drv.nanny(ctx, container)
	go drv.collectStats(ctx, container, task)

	mwOut, mwErr := task.Logger()

	timer := drv.NewTimer("docker", "attach_container", 1)
	waiter, err := drv.docker.AttachToContainerNonBlocking(docker.AttachToContainerOptions{
		Container: container, OutputStream: mwOut, ErrorStream: mwErr,
		Stream: true, Logs: true, Stdout: true, Stderr: true,
		Stdin: true, InputStream: task.Input()})
	timer.Measure()
	if err != nil {
		return nil, err
	}

	err = drv.startTask(ctx, container)
	if err != nil {
		return nil, err
	}

	taskTimer := drv.NewTimer("docker", "container_runtime", 1)

	// can discard the error; inspect will tell us about the task, and wait will retry under the hood
	drv.docker.WaitContainerWithContext(container, ctx)
	taskTimer.Measure()

	waiter.Close()
	err = waiter.Wait()
	if err != nil {
		// TODO need to make sure this error isn't just a context error or something we can ignore
		log.WithError(err).Error("attach to container returned error, task may be missing logs")
	}

	status, err := drv.status(ctx, container)
	return &runResult{
		StatusValue: status,
		error:       err,
	}, nil
}

const completeKey = "complete"

// nanny watches for cancel or timeout and kills the container process.
func (drv *DockerDriver) nanny(ctx context.Context, container string) {
	select {
	case <-ctx.Done():
		if *(ctx.Value(completeKey).(*bool)) {
			return
		}
		drv.cancel(container)
	}
}

func (drv *DockerDriver) cancel(container string) {
	stopTimer := drv.NewTimer("docker", "stop_container", 1.0)
	err := drv.docker.StopContainer(container, 30)
	stopTimer.Measure()
	if err != nil {
		logrus.WithError(err).WithFields(logrus.Fields{"container": container, "errType": fmt.Sprintf("%T", err)}).Error("something managed to escape our retry web, could not kill container")
	}
}

func (drv *DockerDriver) collectStats(ctx context.Context, container string, task drivers.ContainerTask) {
	done := make(chan bool)
	defer close(done)
	dstats := make(chan *docker.Stats, 1)
	go func() {
		// NOTE: docker automatically streams every 1s. we could skip or average samples, but
		// the memory overhead is < 1MB for 3600 stat points so this seems fine; it seems better to stream
		// (the internal docker api streams) than to open/close a stream for 1 sample over and over.
		// must be called in a goroutine; docker.Stats() blocks
		err := drv.docker.Stats(docker.StatsOptions{
			ID:     container,
			Stats:  dstats,
			Stream: true,
			Done:   done, // A flag that enables stopping the stats operation
		})

		if err != nil && err != io.ErrClosedPipe {
			logrus.WithError(err).WithFields(logrus.Fields{"container": container, "task_id": task.Id()}).Error("error streaming docker stats for task")
		}
	}()

	for {
		select {
		case <-ctx.Done():
			return
		case ds, ok := <-dstats:
			if !ok {
				return
			}
			task.WriteStat(cherryPick(ds))
		}
	}
}

func cherryPick(ds *docker.Stats) drivers.Stat {
	// TODO cpu % is as a % of the whole system... cpu is weird since we're sharing it
	// across a bunch of containers and it scales based on how many we're sharing with;
	// do we want users to see it as a % of the system?
	systemDelta := float64(ds.CPUStats.SystemCPUUsage - ds.PreCPUStats.SystemCPUUsage)
	cores := float64(len(ds.CPUStats.CPUUsage.PercpuUsage))
	var cpuUser, cpuKernel, cpuTotal float64
	if systemDelta > 0 {
		// TODO we could leave these in docker format and let hud/viz tools do this instead of us... like net is; could do the same for mem, too. thoughts?
		cpuUser = (float64(ds.CPUStats.CPUUsage.UsageInUsermode-ds.PreCPUStats.CPUUsage.UsageInUsermode) / systemDelta) * cores * 100.0
		cpuKernel = (float64(ds.CPUStats.CPUUsage.UsageInKernelmode-ds.PreCPUStats.CPUUsage.UsageInKernelmode) / systemDelta) * cores * 100.0
		cpuTotal = (float64(ds.CPUStats.CPUUsage.TotalUsage-ds.PreCPUStats.CPUUsage.TotalUsage) / systemDelta) * cores * 100.0
	}

	var rx, tx float64
	for _, v := range ds.Networks {
		rx += float64(v.RxBytes)
		tx += float64(v.TxBytes)
	}

	var blkRead, blkWrite uint64
	for _, bioEntry := range ds.BlkioStats.IOServiceBytesRecursive {
		switch strings.ToLower(bioEntry.Op) {
		case "read":
			blkRead = blkRead + bioEntry.Value
		case "write":
			blkWrite = blkWrite + bioEntry.Value
		}
	}

	return drivers.Stat{
		Timestamp: ds.Read,
		Metrics: map[string]uint64{
			// source: https://godoc.org/github.com/fsouza/go-dockerclient#Stats
			// ex (for future expansion): {"read":"2016-08-03T18:08:05Z","pids_stats":{},"network":{},"networks":{"eth0":{"rx_bytes":508,"tx_packets":6,"rx_packets":6,"tx_bytes":508}},"memory_stats":{"stats":{"cache":16384,"pgpgout":281,"rss":8826880,"pgpgin":2440,"total_rss":8826880,"hierarchical_memory_limit":536870912,"total_pgfault":3809,"active_anon":8843264,"total_active_anon":8843264,"total_pgpgout":281,"total_cache":16384,"pgfault":3809,"total_pgpgin":2440},"max_usage":8953856,"usage":8953856,"limit":536870912},"blkio_stats":{"io_service_bytes_recursive":[{"major":202,"op":"Read"},{"major":202,"op":"Write"},{"major":202,"op":"Sync"},{"major":202,"op":"Async"},{"major":202,"op":"Total"}],"io_serviced_recursive":[{"major":202,"op":"Read"},{"major":202,"op":"Write"},{"major":202,"op":"Sync"},{"major":202,"op":"Async"},{"major":202,"op":"Total"}]},"cpu_stats":{"cpu_usage":{"percpu_usage":[47641874],"usage_in_usermode":30000000,"total_usage":47641874},"system_cpu_usage":8880800500000000,"throttling_data":{}},"precpu_stats":{"cpu_usage":{"percpu_usage":[44946186],"usage_in_usermode":30000000,"total_usage":44946186},"system_cpu_usage":8880799510000000,"throttling_data":{}}}
			// mostly stolen values from the docker stats cli api...

			// net
			"net_rx": uint64(rx),
			"net_tx": uint64(tx),
			// mem
			"mem_limit": ds.MemoryStats.Limit,
			"mem_usage": ds.MemoryStats.Usage,
			// i/o
			"disk_read":  blkRead,
			"disk_write": blkWrite,
			// cpu
			"cpu_user":   uint64(cpuUser),
			"cpu_total":  uint64(cpuTotal),
			"cpu_kernel": uint64(cpuKernel),
		},
	}
}
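
// A worked example using the sample JSON above (one core): cpuTotal =
// (47641874 - 44946186) / (8880800500000000 - 8880799510000000) * 1 * 100
// = 2695688 / 990000000 * 100 ≈ 0.27, i.e. the container used about 0.27%
// of the system's CPU over that 1s sampling window.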

// newContainerID introduces some randomness to prevent container name clashes
// where the task ID remains the same.
func newContainerID(task drivers.ContainerTask) string {
	return fmt.Sprintf("task-%d-%s", time.Now().UnixNano(), task.Id())
}
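
// For example (timestamp illustrative, task ID taken from the filter comment
// in docker_client.go):
//
//	newContainerID(task) // -> "task-1475531523000000000-57d722ecdecb9e7be16aff17"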

func (drv *DockerDriver) startTask(ctx context.Context, container string) error {
	log := common.Logger(ctx)
	startTimer := drv.NewTimer("docker", "start_container", 1.0)
	log.WithFields(logrus.Fields{"container": container}).Debug("Starting container execution")
	err := drv.docker.StartContainerWithContext(container, nil, ctx)
	startTimer.Measure()
	if err != nil {
		dockerErr, ok := err.(*docker.Error)
		_, containerAlreadyRunning := err.(*docker.ContainerAlreadyRunning)
		if containerAlreadyRunning || (ok && dockerErr.Status == 304) {
			// 304 = container already started -- so we can ignore the error
		} else {
			return err
		}
	}
	return nil
}

func (drv *DockerDriver) status(ctx context.Context, container string) (status string, err error) {
	log := common.Logger(ctx)

	cinfo, err := drv.docker.InspectContainer(container)
	if err != nil {
		// this is pretty sad, but better to say we had an error than not to.
		// the task has run to completion and logs will be uploaded; the user can decide
		log.WithFields(logrus.Fields{"container": container}).WithError(err).Error("Inspecting container")
		return drivers.StatusError, err
	}

	exitCode := cinfo.State.ExitCode
	log.WithFields(logrus.Fields{
		"exit_code":          exitCode,
		"container_running":  cinfo.State.Running,
		"container_status":   cinfo.State.Status,
		"container_finished": cinfo.State.FinishedAt,
		"container_error":    cinfo.State.Error,
	}).Info("container status")

	select { // do this after inspect so we can see the exit code
	case <-ctx.Done(): // check if the task was canceled or timed out
		switch ctx.Err() {
		case context.DeadlineExceeded:
			return drivers.StatusTimeout, nil
		case context.Canceled:
			return drivers.StatusCancelled, nil
		}
	default:
	}

	if cinfo.State.Running {
		log.Warn("getting status of task that is still running, need to fix this")
		return drivers.StatusError, errors.New("task in running state but not timed out. weird")
	}

	switch exitCode {
	default:
		return drivers.StatusError, common.UserError(fmt.Errorf("exit code %d", exitCode))
	case 0:
		return drivers.StatusSuccess, nil
	case 137: // OOM
		drv.Inc("docker", "oom", 1, 1)
		if !cinfo.State.OOMKilled {
			// It is possible that the host itself is running out of memory and
			// the host kernel killed one of the container processes.
			// See: https://github.com/docker/docker/issues/15621
			// TODO reed: isn't an OOM an OOM? this is wasting space imo
			log.WithFields(logrus.Fields{"container": container}).Info("Setting task as OOM killed, but docker disagreed.")
			drv.Inc("docker", "possible_oom_false_alarm", 1, 1.0)
		}

		return drivers.StatusKilled, drivers.ErrOutOfMemory
	}
}
315 api/runner/drivers/docker/docker_client.go Normal file
@@ -0,0 +1,315 @@
// +build go1.7

// Copyright 2016 Iron.io
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package docker

import (
	"context"
	"crypto/tls"
	"encoding/json"
	"fmt"
	"net"
	"net/http"
	"strings"
	"time"

	"github.com/Sirupsen/logrus"
	"github.com/fsouza/go-dockerclient"
	"github.com/kumokit/functions/api/runner/common"
)

const (
	retryTimeout = 10 * time.Minute
)

// dockerClient wraps docker client calls so we can retry 500s. It kind of sucks, but fsouza
// doesn't bake in retries we can use internally; we could contribute that at some point. It
// would be much more convenient if we didn't have to do this, but it's better than ad hoc
// retries. It also adds timeouts to many operations, varying by operation.
// TODO we could generate this; maybe not worth it, it may not change often.
type dockerClient interface {
	// Each of these are github.com/fsouza/go-dockerclient methods

	AttachToContainerNonBlocking(opts docker.AttachToContainerOptions) (docker.CloseWaiter, error)
	WaitContainerWithContext(id string, ctx context.Context) (int, error)
	StartContainerWithContext(id string, hostConfig *docker.HostConfig, ctx context.Context) error
	CreateContainer(opts docker.CreateContainerOptions) (*docker.Container, error)
	RemoveContainer(opts docker.RemoveContainerOptions) error
	PullImage(opts docker.PullImageOptions, auth docker.AuthConfiguration) error
	InspectImage(name string) (*docker.Image, error)
	InspectContainer(id string) (*docker.Container, error)
	StopContainer(id string, timeout uint) error
	Stats(opts docker.StatsOptions) error
}

// TODO: switch to github.com/docker/engine-api
func newClient(env *common.Environment) dockerClient {
	// TODO this was much easier; we don't need special settings at the moment
	// docker, err := docker.NewClient(conf.Docker)
	client, err := docker.NewClientFromEnv()
	if err != nil {
		logrus.WithError(err).Fatal("couldn't create docker client")
	}

	t := &http.Transport{
		Dial: (&net.Dialer{
			Timeout:   10 * time.Second,
			KeepAlive: 1 * time.Minute,
		}).Dial,
		TLSClientConfig: &tls.Config{
			ClientSessionCache: tls.NewLRUClientSessionCache(8192),
		},
		TLSHandshakeTimeout:   10 * time.Second,
		MaxIdleConnsPerHost:   512,
		Proxy:                 http.ProxyFromEnvironment,
		MaxIdleConns:          512,
		IdleConnTimeout:       90 * time.Second,
		ExpectContinueTimeout: 1 * time.Second,
	}

	client.HTTPClient = &http.Client{Transport: t}

	if err := client.Ping(); err != nil {
		logrus.WithError(err).Fatal("couldn't connect to docker daemon")
	}

	client.SetTimeout(120 * time.Second)

	// get 2 clients: one with a small timeout, and one with no timeout to use with contexts

	clientNoTimeout, err := docker.NewClientFromEnv()
	if err != nil {
		logrus.WithError(err).Fatal("couldn't create other docker client")
	}

	clientNoTimeout.HTTPClient = &http.Client{Transport: t}

	if err := clientNoTimeout.Ping(); err != nil {
		logrus.WithError(err).Fatal("couldn't connect to docker daemon with other client")
	}

	return &dockerWrap{client, clientNoTimeout, env}
}

type dockerWrap struct {
	docker          *docker.Client
	dockerNoTimeout *docker.Client
	*common.Environment
}

func (d *dockerWrap) retry(ctx context.Context, f func() error) error {
	var b common.Backoff
	for {
		select {
		case <-ctx.Done():
			d.Inc("task", "fail.docker", 1, 1)
			logrus.WithError(ctx.Err()).Warnf("retrying on docker errors timed out, restart docker or rotate this instance?")
			return ctx.Err()
		default:
		}

		err := filter(f())
		if common.IsTemporary(err) || isDocker50x(err) {
			logrus.WithError(err).Warn("docker temporary error, retrying")
			b.Sleep()
			d.Inc("task", "error.docker", 1, 1)
			continue
		}
		if err != nil {
			d.Inc("task", "error.docker", 1, 1)
		}
		return err
	}
}
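
// retry keeps calling f until it returns nil, a non-retryable error, or ctx
// expires; assuming common.Backoff sleeps with (roughly) increasing delays,
// a wrapped call looks like the StopContainer wrapper further below:
//
//	ctx, cancel := context.WithTimeout(context.Background(), retryTimeout)
//	defer cancel()
//	err := d.retry(ctx, func() error {
//		return d.docker.StopContainer(id, 30) // retried on 50x / temporary errors
//	})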

func isDocker50x(err error) bool {
	derr, ok := err.(*docker.Error)
	return ok && derr.Status >= 500
}

func containerConfigError(err error) error {
	derr, ok := err.(*docker.Error)
	if ok && derr.Status == 400 {
		// derr.Message is a JSON response from docker, which has a "message" field we want to extract if possible.
		var v struct {
			Msg string `json:"message"`
		}

		err := json.Unmarshal([]byte(derr.Message), &v)
		if err != nil {
			// If the message was not valid JSON, the raw body is still better than nothing.
			return fmt.Errorf("%s", derr.Message)
		}
		return fmt.Errorf("%s", v.Msg)
	}

	return nil
}

type temporary struct {
	error
}

func (t *temporary) Temporary() bool { return true }

func temp(err error) error {
	return &temporary{err}
}

// filter swallows some 500s that are totally cool.
func filter(err error) error {
	// "API error (500): {\"message\":\"service endpoint with name task-57d722ecdecb9e7be16aff17 already exists\"}\n" -> ok since the container exists
	switch {
	default:
		return err
	case err == nil:
		return err
	case strings.Contains(err.Error(), "service endpoint with name"):
	}
	logrus.WithError(err).Warn("filtering error")
	return nil
}

func filterNoSuchContainer(err error) error {
	if err == nil {
		return nil
	}
	_, containerNotFound := err.(*docker.NoSuchContainer)
	dockerErr, ok := err.(*docker.Error)
	if containerNotFound || (ok && dockerErr.Status == 404) {
		logrus.WithError(err).Error("filtering error")
		return nil
	}
	return err
}

func filterNotRunning(err error) error {
	if err == nil {
		return nil
	}

	_, containerNotRunning := err.(*docker.ContainerNotRunning)
	dockerErr, ok := err.(*docker.Error)
	if containerNotRunning || (ok && dockerErr.Status == 304) {
		logrus.WithError(err).Error("filtering error")
		return nil
	}

	return err
}

func (d *dockerWrap) AttachToContainerNonBlocking(opts docker.AttachToContainerOptions) (w docker.CloseWaiter, err error) {
	ctx, cancel := context.WithTimeout(context.Background(), retryTimeout)
	defer cancel()
	err = d.retry(ctx, func() error {
		w, err = d.docker.AttachToContainerNonBlocking(opts)
		if err != nil {
			// always retry if attach errors; the task is running and we want logs!
			err = temp(err)
		}
		return err
	})
	return w, err
}

func (d *dockerWrap) WaitContainerWithContext(id string, ctx context.Context) (code int, err error) {
	err = d.retry(ctx, func() error {
		code, err = d.dockerNoTimeout.WaitContainerWithContext(id, ctx)
		return err
	})
	return code, filterNoSuchContainer(err)
}

func (d *dockerWrap) StartContainerWithContext(id string, hostConfig *docker.HostConfig, ctx context.Context) (err error) {
	err = d.retry(ctx, func() error {
		err = d.dockerNoTimeout.StartContainerWithContext(id, hostConfig, ctx)
		if _, ok := err.(*docker.NoSuchContainer); ok {
			// for some reason create will sometimes return successfully then say no such container here. wtf. so just retry like normal
			return temp(err)
		}
		return err
	})
	return err
}

func (d *dockerWrap) CreateContainer(opts docker.CreateContainerOptions) (c *docker.Container, err error) {
	err = d.retry(opts.Context, func() error {
		c, err = d.dockerNoTimeout.CreateContainer(opts)
		return err
	})
	return c, err
}

func (d *dockerWrap) PullImage(opts docker.PullImageOptions, auth docker.AuthConfiguration) (err error) {
	err = d.retry(opts.Context, func() error {
		err = d.dockerNoTimeout.PullImage(opts, auth)
		return err
	})
	return err
}

func (d *dockerWrap) RemoveContainer(opts docker.RemoveContainerOptions) (err error) {
	ctx, cancel := context.WithTimeout(context.Background(), retryTimeout)
	defer cancel()
	err = d.retry(ctx, func() error {
		err = d.docker.RemoveContainer(opts)
		return err
	})
	return filterNoSuchContainer(err)
}

func (d *dockerWrap) InspectImage(name string) (i *docker.Image, err error) {
	ctx, cancel := context.WithTimeout(context.Background(), retryTimeout)
	defer cancel()
	err = d.retry(ctx, func() error {
		i, err = d.docker.InspectImage(name)
		return err
	})
	return i, err
}

func (d *dockerWrap) InspectContainer(id string) (c *docker.Container, err error) {
	ctx, cancel := context.WithTimeout(context.Background(), retryTimeout)
	defer cancel()
	err = d.retry(ctx, func() error {
		c, err = d.docker.InspectContainer(id)
		return err
	})
	return c, err
}

func (d *dockerWrap) StopContainer(id string, timeout uint) (err error) {
	ctx, cancel := context.WithTimeout(context.Background(), retryTimeout)
	defer cancel()
	err = d.retry(ctx, func() error {
		err = d.docker.StopContainer(id, timeout)
		return err
	})
	return filterNotRunning(filterNoSuchContainer(err))
}

func (d *dockerWrap) Stats(opts docker.StatsOptions) (err error) {
	// we can't retry this one this way since the callee closes the
	// stats chan; we'd need a fancier retry mechanism where we can swap out
	// channels, but stats isn't crucial so... be lazy for now
	return d.docker.Stats(opts)

	//err = d.retry(func() error {
	//	err = d.docker.Stats(opts)
	//	return err
	//})
	//return err
}
121 api/runner/drivers/docker/docker_test.go Normal file
@@ -0,0 +1,121 @@
// Copyright 2016 Iron.io
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package docker

import (
	"bytes"
	"context"
	"io"
	"os"
	"strings"
	"testing"
	"time"

	"github.com/kumokit/functions/api/runner/common"
	"github.com/kumokit/functions/api/runner/drivers"
	"github.com/vrischmann/envconfig"
)
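
// taskDockerTest is a minimal drivers.ContainerTask stub: it runs the public
// iron/hello image with optional stdin/stdout wiring and no env vars, labels,
// volumes, or working directory.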
type taskDockerTest struct {
	id     string
	input  io.Reader
	output io.Writer
}

func (f *taskDockerTest) Command() string { return "" }
func (f *taskDockerTest) EnvVars() map[string]string {
	return map[string]string{}
}
func (f *taskDockerTest) Labels() map[string]string          { return nil }
func (f *taskDockerTest) Id() string                         { return f.id }
func (f *taskDockerTest) Group() string                      { return "" }
func (f *taskDockerTest) Image() string                      { return "iron/hello" }
func (f *taskDockerTest) Timeout() time.Duration             { return 30 * time.Second }
func (f *taskDockerTest) Logger() (stdout, stderr io.Writer) { return f.output, nil }
func (f *taskDockerTest) WriteStat(drivers.Stat)             { /* TODO */ }
func (f *taskDockerTest) Volumes() [][2]string               { return [][2]string{} }
func (f *taskDockerTest) WorkDir() string                    { return "" }
func (f *taskDockerTest) Close()                             {}
func (f *taskDockerTest) Input() io.Reader                   { return f.input }

func TestRunnerDocker(t *testing.T) {
	env := common.NewEnvironment(func(e *common.Environment) {})
	dkr := NewDocker(env, drivers.Config{})
	ctx := context.Background()

	task := &taskDockerTest{"test-docker", nil, nil}

	cookie, err := dkr.Prepare(ctx, task)
	if err != nil {
		t.Fatalf("Couldn't prepare task: %v", err)
	}
	defer cookie.Close()

	result, err := cookie.Run(ctx)
	if err != nil {
		t.Fatal(err)
	}

	if result.Status() != "success" {
		t.Fatal("Test should successfully run the image")
	}
}

func TestRunnerDockerStdin(t *testing.T) {
	env := common.NewEnvironment(func(e *common.Environment) {})
	dkr := NewDocker(env, drivers.Config{})
	ctx := context.Background()

	input := `{"name": "test"}`
	var output bytes.Buffer

	task := &taskDockerTest{"test-docker-stdin", bytes.NewBufferString(input), &output}

	cookie, err := dkr.Prepare(ctx, task)
	if err != nil {
		t.Fatalf("Couldn't prepare task: %v", err)
	}
	defer cookie.Close()

	result, err := cookie.Run(ctx)
	if err != nil {
		t.Fatal(err)
	}

	if result.Status() != "success" {
		t.Error("Test should successfully run the image")
	}

	expect := "Hello test!"
	got := output.String()
	if !strings.Contains(got, expect) {
		t.Errorf("Test expected output to contain '%s', got '%s'", expect, got)
	}
}

func TestConfigLoadMemory(t *testing.T) {
	if err := os.Setenv("MEMORY_PER_JOB", "128M"); err != nil {
		t.Fatalf("Could not set MEMORY_PER_JOB: %v", err)
	}

	var conf drivers.Config
	if err := envconfig.Init(&conf); err != nil {
		t.Fatalf("Could not read config: %v", err)
	}

	if conf.Memory != 128*1024*1024 {
		t.Fatalf("Memory read from config should match 128M, got %d", conf.Memory)
	}
}