we had this _almost_ right, in that we were trying, but we weren't masking the error from the user response for any error we don't intend to show. this also adds a stack trace to any internal server error, so that we might be able to track them down in the future (looking at you, 'context deadline exceeded').

in addition, this adds a new `models.APIError` interface which all of the errors in `models` now implement; it can be caught easily and added to easily. the front end no longer does status rewriting based on api errors: when we get a non-nil error we call `handleResponse(c, err)` with it, and if it's a proper api error we return it to the user with the right status code, otherwise we log a stack trace and return `internal server error`. this cleans up a lot of the front end code.

also rewrites a start-task 'context deadline exceeded' as a timeout. with iw we had async tasks, so we could start the clock later and it didn't matter; now sync tasks sometimes time out while just making docker calls, and we want the task status to show up as timed out. we may want to also catch this further up, but this seems like the right thing to do here.

removes the squishing together of errors. that was weird; we now return the first error so it can be used with the new error interface.

removed a lot of 5xx errors that really should have been 4xx errors. changed some of the 400 errors to 409s, since they come from sending conflicting info rather than a malformed request. removed unused / useless errors (many were only used for logging and didn't provide any context; with stack traces we don't need as much context in the logs).
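To make the error-handling contract described above concrete, here is a minimal sketch of how a `models.APIError` interface and a `handleResponse` helper could fit together. The `apiError` type, `NewAPIError`, and the `http.ResponseWriter`-based signature are assumptions for illustration only; the actual fn code passes its own handler context as `c`.

package sketch

import (
	"net/http"

	"github.com/Sirupsen/logrus"
)

// APIError pairs an error with the HTTP status code it should be returned with.
type APIError interface {
	error
	Code() int
}

type apiError struct {
	code int
	msg  string
}

func (e *apiError) Code() int     { return e.code }
func (e *apiError) Error() string { return e.msg }

// NewAPIError builds an error that is safe to show to the user.
func NewAPIError(code int, msg string) APIError { return &apiError{code: code, msg: msg} }

// handleResponse returns known API errors to the caller with their status code
// and masks everything else as a 500, logging the underlying error (with a
// stack trace if the error carries one, e.g. via pkg/errors' %+v formatting).
func handleResponse(w http.ResponseWriter, err error) {
	if apiErr, ok := err.(APIError); ok {
		http.Error(w, apiErr.Error(), apiErr.Code())
		return
	}
	logrus.Errorf("internal error: %+v", err)
	http.Error(w, "internal server error", http.StatusInternalServerError)
}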
638 lines · 22 KiB · Go
package docker

import (
	"context"
	"crypto/tls"
	"errors"
	"fmt"
	"io"
	"net"
	"net/http"
	"net/url"
	"os"
	"path"
	"strings"
	"time"

	"github.com/Sirupsen/logrus"
	manifest "github.com/docker/distribution/manifest/schema1"
	"github.com/fsouza/go-dockerclient"
	"github.com/heroku/docker-registry-client/registry"
	"gitlab-odx.oracle.com/odx/functions/api/runner/common"
	"gitlab-odx.oracle.com/odx/functions/api/runner/common/stats"
	"gitlab-odx.oracle.com/odx/functions/api/runner/drivers"
)

const hubURL = "https://registry.hub.docker.com"

var registryClient = &http.Client{
	Transport: &http.Transport{
		Dial: (&net.Dialer{
			Timeout:   10 * time.Second,
			KeepAlive: 2 * time.Minute,
		}).Dial,
		TLSClientConfig: &tls.Config{
			ClientSessionCache: tls.NewLRUClientSessionCache(8192),
		},
		TLSHandshakeTimeout:   10 * time.Second,
		MaxIdleConnsPerHost:   32, // TODO tune; we will likely be making lots of requests to same place
		Proxy:                 http.ProxyFromEnvironment,
		IdleConnTimeout:       90 * time.Second,
		MaxIdleConns:          512,
		ExpectContinueTimeout: 1 * time.Second,
	},
}

// A drivers.ContainerTask should implement the Auther interface if it would
// like to use not-necessarily-public docker images for any or all task
// invocations.
type Auther interface {
	// DockerAuth should return docker auth credentials that will authenticate
	// against a docker registry for a given drivers.ContainerTask.Image(). An
	// error may be returned, which will cause the task not to be run; this can
	// be useful for an implementer to do things like testing auth
	// configurations before returning them, e.g. if the implementer would like
	// to impose certain restrictions on images or if credentials must be
	// acquired right before runtime and there's an error doing so. If these
	// credentials don't work, the docker pull will fail and the task will be
	// set to error status.
	DockerAuth() (docker.AuthConfiguration, error)
}
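
// The sketch below is illustrative only and is not part of the original
// driver: it shows one way a task wrapper could satisfy Auther with fixed
// registry credentials. The type and field names are assumptions made for
// the example.
type staticAuthTask struct {
	drivers.ContainerTask
	auth docker.AuthConfiguration
}

// DockerAuth returns the stored credentials, refusing the task up front if
// they are obviously incomplete (see the Auther docs above).
func (t staticAuthTask) DockerAuth() (docker.AuthConfiguration, error) {
	if t.auth.Username == "" && t.auth.ServerAddress == "" {
		return docker.AuthConfiguration{}, errors.New("no registry credentials configured")
	}
	return t.auth, nil
}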

type runResult struct {
	error
	status string
	start  time.Time
}

func (r *runResult) Error() string {
	if r.error == nil {
		return ""
	}
	return r.error.Error()
}

func (r *runResult) Status() string       { return r.status }
func (r *runResult) UserVisible() bool    { return common.IsUserVisibleError(r.error) }
func (r *runResult) StartTime() time.Time { return r.start }

type DockerDriver struct {
	conf     drivers.Config
	docker   dockerClient // retries on *docker.Client, restricts ad hoc *docker.Client usage / retries
	hostname string

	*common.Environment
}

// implements drivers.Driver
func NewDocker(env *common.Environment, conf drivers.Config) *DockerDriver {
	hostname, err := os.Hostname()
	if err != nil {
		logrus.WithError(err).Fatal("couldn't resolve hostname")
	}

	return &DockerDriver{
		conf:        conf,
		docker:      newClient(env),
		hostname:    hostname,
		Environment: env,
	}
}

// CheckRegistry returns a Sizer that can be used to check the size of an image
// if the returned error is nil; a nil error also means authentication with the
// given credentials succeeded. If the configuration does not specify a
// config.ServerAddress, https://hub.docker.com will be tried. CheckRegistry is
// a package level method since rkt can also use docker images and we may be
// interested in using rkt without a docker driver configured; it also means we
// don't have to tote around a driver in any tasker that may be interested in
// registry information (2/2 cases thus far).
func CheckRegistry(ctx context.Context, image string, config docker.AuthConfiguration) (Sizer, error) {
	ctx, log := common.LoggerWithFields(ctx, logrus.Fields{"stack": "CheckRegistry"})
	registry, repo, tag := drivers.ParseImage(image)

	reg, err := registryForConfig(ctx, config, registry)
	if err != nil {
		return nil, err
	}

	mani, err := reg.Manifest(repo, tag)
	if err != nil {
		log.WithFields(logrus.Fields{"username": config.Username, "server": config.ServerAddress, "image": image}).WithError(err).Error("Credentials not authorized, trying next.")
		//if !isAuthError(err) {
		//	// TODO we might retry this, since if this was the registry that was supposed to
		//	// auth the task will erroneously be set to 'error'
		//}

		return nil, err
	}

	return &sizer{mani, reg, repo}, nil
}
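
// Illustrative only (not part of the original file): a typical caller would
// use CheckRegistry to validate credentials and bound the image size before
// running a task. The helper name and the maxBytes limit are assumptions for
// the example.
func checkImageFits(ctx context.Context, image string, config docker.AuthConfiguration, maxBytes int64) error {
	sz, err := CheckRegistry(ctx, image, config)
	if err != nil {
		return err // manifest fetch failed; the credentials were not accepted
	}
	size, err := sz.Size()
	if err != nil {
		return err
	}
	if size > maxBytes {
		return common.UserError(fmt.Errorf("image '%s' is %d bytes, over the %d byte limit", image, size, maxBytes))
	}
	return nil
}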

// Sizer returns size information. This interface is liable to contain more
// than a size at some point, change as needed.
type Sizer interface {
	Size() (int64, error)
}

type sizer struct {
	mani *manifest.SignedManifest
	reg  *registry.Registry
	repo string
}

func (s *sizer) Size() (int64, error) {
	var sum int64
	for _, r := range s.mani.References() {
		desc, err := s.reg.LayerMetadata(s.repo, r.Digest)
		if err != nil {
			return 0, err
		}
		sum += desc.Size
	}
	return sum, nil
}

func registryURL(ctx context.Context, addr string) (string, error) {
	log := common.Logger(ctx)
	if addr == "" || strings.Contains(addr, "hub.docker.com") || strings.Contains(addr, "index.docker.io") {
		return hubURL, nil
	}

	url, err := url.Parse(addr)
	if err != nil {
		// TODO we could error the task out from this with a user error but since
		// we have a list of auths to check, just return the error so as to be
		// skipped... horrible api as it is
		log.WithFields(logrus.Fields{"auth_addr": addr}).WithError(err).Error("error parsing server address url, skipping")
		return "", err
	}

	if url.Scheme == "" {
		url.Scheme = "https"
	}
	url.Path = strings.TrimSuffix(url.Path, "/")
	url.Path = strings.TrimPrefix(url.Path, "/v2")
	url.Path = strings.TrimPrefix(url.Path, "/v1") // just try this, if it fails it fails, not supporting v1
	return url.String(), nil
}

func isAuthError(err error) bool {
	// AARGH!
	if urlError, ok := err.(*url.Error); ok {
		if httpError, ok := urlError.Err.(*registry.HttpStatusError); ok {
			if httpError.Response.StatusCode == 401 {
				return true
			}
		}
	}

	return false
}

func registryForConfig(ctx context.Context, config docker.AuthConfiguration, reg string) (*registry.Registry, error) {
	if reg == "" {
		reg = config.ServerAddress
	}

	var err error
	config.ServerAddress, err = registryURL(ctx, reg)
	if err != nil {
		return nil, err
	}

	// Use this instead of registry.New to avoid the Ping().
	transport := registry.WrapTransport(registryClient.Transport, reg, config.Username, config.Password)
	r := &registry.Registry{
		URL: config.ServerAddress,
		Client: &http.Client{
			Transport: transport,
		},
		Logf: registry.Quiet,
	}
	return r, nil
}

func (drv *DockerDriver) Prepare(ctx context.Context, task drivers.ContainerTask) (drivers.Cookie, error) {
	ctx, log := common.LoggerWithFields(ctx, logrus.Fields{"stack": "Prepare"})
	var cmd []string
	if task.Command() != "" {
		// NOTE: this is hyper-sensitive and may not be correct like this even, but it passes old tests
		// task.Command() in swapi is always "sh /mnt/task/.runtask" so fields is safe
		cmd = strings.Fields(task.Command())
		log.WithFields(logrus.Fields{"call_id": task.Id(), "cmd": cmd, "len": len(cmd)}).Debug("docker command")
	}

	envvars := make([]string, 0, len(task.EnvVars()))
	for name, val := range task.EnvVars() {
		envvars = append(envvars, name+"="+val)
	}

	containerName := newContainerID(task)
	container := docker.CreateContainerOptions{
		Name: containerName,
		Config: &docker.Config{
			Env:         envvars,
			Cmd:         cmd,
			Memory:      int64(drv.conf.Memory),
			CPUShares:   drv.conf.CPUShares,
			Hostname:    drv.hostname,
			Image:       task.Image(),
			Volumes:     map[string]struct{}{},
			Labels:      task.Labels(),
			OpenStdin:   true,
			AttachStdin: true,
			StdinOnce:   true,
		},
		HostConfig: &docker.HostConfig{},
		Context:    ctx,
	}

	volumes := task.Volumes()
	for _, mapping := range volumes {
		hostDir := mapping[0]
		containerDir := mapping[1]
		container.Config.Volumes[containerDir] = struct{}{}
		mapn := fmt.Sprintf("%s:%s", hostDir, containerDir)
		container.HostConfig.Binds = append(container.HostConfig.Binds, mapn)
		log.WithFields(logrus.Fields{"volumes": mapn, "call_id": task.Id()}).Debug("setting volumes")
	}

	if wd := task.WorkDir(); wd != "" {
		log.WithFields(logrus.Fields{"wd": wd, "call_id": task.Id()}).Debug("setting work dir")
		container.Config.WorkingDir = wd
	}

	err := drv.ensureImage(ctx, task)
	if err != nil {
		return nil, err
	}

	createTimer := drv.NewTimer("docker", "create_container", 1.0)
	_, err = drv.docker.CreateContainer(container)
	createTimer.Measure()
	if err != nil {
		// since we retry under the hood, if the container gets created and retry fails, we can just ignore error
		if err != docker.ErrContainerAlreadyExists {
			log.WithFields(logrus.Fields{"call_id": task.Id(), "command": container.Config.Cmd, "memory": container.Config.Memory,
				"cpu_shares": container.Config.CPUShares, "hostname": container.Config.Hostname, "name": container.Name,
				"image": container.Config.Image, "volumes": container.Config.Volumes, "binds": container.HostConfig.Binds, "container": containerName,
			}).WithError(err).Error("Could not create container")

			if ce := containerConfigError(err); ce != nil {
				return nil, common.UserError(fmt.Errorf("Failed to create container from task configuration '%s'", ce))
			}
			return nil, err
		}
	}

	// discard removal error
	return &cookie{id: containerName, task: task, drv: drv}, nil
}

type cookie struct {
	id   string
	task drivers.ContainerTask
	drv  *DockerDriver
}

func (c *cookie) Close() error { return c.drv.removeContainer(c.id) }

func (c *cookie) Run(ctx context.Context) (drivers.RunResult, error) {
	return c.drv.run(ctx, c.id, c.task)
}
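
// Illustrative only (not part of the original driver): the intended lifecycle
// is Prepare -> Run -> Close, and this helper just demonstrates that flow. It
// assumes drivers.Cookie exposes Run and Close, as the cookie type above
// implements; the helper name is made up for the example.
func runTaskOnce(ctx context.Context, drv *DockerDriver, task drivers.ContainerTask) (drivers.RunResult, error) {
	ck, err := drv.Prepare(ctx, task)
	if err != nil {
		return nil, err
	}
	defer ck.Close() // always remove the container, even if Run fails
	return ck.Run(ctx)
}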

func (drv *DockerDriver) removeContainer(container string) error {
	removeTimer := drv.NewTimer("docker", "remove_container", 1.0)
	defer removeTimer.Measure()
	err := drv.docker.RemoveContainer(docker.RemoveContainerOptions{
		ID: container, Force: true, RemoveVolumes: true})

	if err != nil {
		logrus.WithError(err).WithFields(logrus.Fields{"container": container}).Error("error removing container")
	}
	return nil
}

func (drv *DockerDriver) ensureImage(ctx context.Context, task drivers.ContainerTask) error {
	reg, _, _ := drivers.ParseImage(task.Image())

	// ask for docker creds before looking for image, as the tasker may need to
	// validate creds even if the image is downloaded.

	var config docker.AuthConfiguration // default, tries docker hub w/o user/pass
	if task, ok := task.(Auther); ok {
		var err error
		config, err = task.DockerAuth()
		if err != nil {
			return err
		}
	}

	if reg != "" {
		config.ServerAddress = reg
	}

	// see if we already have it, if not, pull it
	_, err := drv.docker.InspectImage(task.Image())
	if err == docker.ErrNoSuchImage {
		err = drv.pullImage(ctx, task, config)
	}

	return err
}

func (drv *DockerDriver) pullImage(ctx context.Context, task drivers.ContainerTask, config docker.AuthConfiguration) error {
	log := common.Logger(ctx)

	reg, repo, tag := drivers.ParseImage(task.Image())
	globalRepo := path.Join(reg, repo)

	pullTimer := drv.NewTimer("docker", "pull_image", 1.0)
	defer pullTimer.Measure()

	drv.Inc("docker", "pull_image_count."+stats.AsStatField(task.Image()), 1, 1)

	if reg != "" {
		config.ServerAddress = reg
	}

	var err error
	config.ServerAddress, err = registryURL(ctx, config.ServerAddress)
	if err != nil {
		return err
	}

	log.WithFields(logrus.Fields{"registry": config.ServerAddress, "username": config.Username, "image": task.Image()}).Info("Pulling image")

	err = drv.docker.PullImage(docker.PullImageOptions{Repository: globalRepo, Tag: tag, Context: ctx}, config)
	if err != nil {
		drv.Inc("task", "error.pull."+stats.AsStatField(task.Image()), 1, 1)
		log.WithFields(logrus.Fields{"registry": config.ServerAddress, "username": config.Username, "image": task.Image()}).WithError(err).Error("Failed to pull image")

		// TODO need to inspect for hub or network errors and pick.
		return common.UserError(fmt.Errorf("Failed to pull image '%s': %s", task.Image(), err))

		// TODO what about a case where credentials were good, then credentials
		// were invalidated -- do we need to keep the credential cache docker
		// driver side and after pull for this case alone?
	}

	return nil
}

// Run executes the docker container. If the task runs, a drivers.RunResult will be returned. If something fails outside the task (ie: Docker), an error will be returned.
// The docker driver will attempt to cast the task to an Auther. If that succeeds, private image support is available. See the Auther interface for how to implement this.
func (drv *DockerDriver) run(ctx context.Context, container string, task drivers.ContainerTask) (drivers.RunResult, error) {
	log := common.Logger(ctx)
	timeout := task.Timeout()

	var cancel context.CancelFunc
	if timeout <= 0 {
		ctx, cancel = context.WithCancel(ctx)
	} else {
		ctx, cancel = context.WithTimeout(ctx, timeout)
	}
	defer cancel() // do this so that after Run exits, nanny and collect stop
	var complete bool
	defer func() { complete = true }() // run before cancel is called
	ctx = context.WithValue(ctx, completeKey, &complete)

	go drv.nanny(ctx, container)
	go drv.collectStats(ctx, container, task)

	mwOut, mwErr := task.Logger()

	timer := drv.NewTimer("docker", "attach_container", 1)
	waiter, err := drv.docker.AttachToContainerNonBlocking(docker.AttachToContainerOptions{
		Container: container, OutputStream: mwOut, ErrorStream: mwErr,
		Stream: true, Logs: true, Stdout: true, Stderr: true,
		Stdin: true, InputStream: task.Input()})
	timer.Measure()
	if err != nil {
		return nil, err
	}

	start := time.Now()

	err = drv.startTask(ctx, container)
	if err != nil {
		if err == context.DeadlineExceeded {
			// if there's just a timeout making the docker calls, rewrite it as such
			return &runResult{start: start, status: drivers.StatusTimeout}, nil
		}
		return nil, err
	}

	taskTimer := drv.NewTimer("docker", "container_runtime", 1)

	// can discard error, inspect will tell us about the task and wait will retry under the hood
	drv.docker.WaitContainerWithContext(container, ctx)
	taskTimer.Measure()

	waiter.Close()
	err = waiter.Wait()
	if err != nil {
		// TODO need to make sure this error isn't just a context error or something we can ignore
		log.WithError(err).Error("attach to container returned error, task may be missing logs")
	}

	status, err := drv.status(ctx, container)
	return &runResult{
		start:  start,
		status: status,
		error:  err,
	}, nil
}

const completeKey = "complete"

// watch for cancel or timeout and kill process.
func (drv *DockerDriver) nanny(ctx context.Context, container string) {
	select {
	case <-ctx.Done():
		if *(ctx.Value(completeKey).(*bool)) {
			return
		}
		drv.cancel(container)
	}
}

func (drv *DockerDriver) cancel(container string) {
	stopTimer := drv.NewTimer("docker", "stop_container", 1.0)
	err := drv.docker.StopContainer(container, 30)
	stopTimer.Measure()
	if err != nil {
		logrus.WithError(err).WithFields(logrus.Fields{"container": container, "errType": fmt.Sprintf("%T", err)}).Error("something managed to escape our retries web, could not kill container")
	}
}

func (drv *DockerDriver) collectStats(ctx context.Context, container string, task drivers.ContainerTask) {
	log := common.Logger(ctx)
	done := make(chan bool)
	defer close(done)
	dstats := make(chan *docker.Stats, 1)
	go func() {
		// NOTE: docker automatically streams every 1s. we can skip or avg samples if we'd like but
		// the memory overhead is < 1MB for 3600 stat points so this seems fine, seems better to stream
		// (internal docker api streams) than open/close stream for 1 sample over and over.
		// must be called in goroutine, docker.Stats() blocks
		err := drv.docker.Stats(docker.StatsOptions{
			ID:     container,
			Stats:  dstats,
			Stream: true,
			Done:   done, // A flag that enables stopping the stats operation
		})

		if err != nil && err != io.ErrClosedPipe {
			log.WithError(err).WithFields(logrus.Fields{"container": container, "call_id": task.Id()}).Error("error streaming docker stats for task")
		}
	}()

	for {
		select {
		case <-ctx.Done():
			return
		case ds, ok := <-dstats:
			if !ok {
				return
			}
			task.WriteStat(cherryPick(ds))
		}
	}
}

func cherryPick(ds *docker.Stats) drivers.Stat {
	// TODO cpu % is as a % of the whole system... cpu is weird since we're sharing it
	// across a bunch of containers and it scales based on how many we're sharing with,
	// do we want users to see as a % of system?
	systemDelta := float64(ds.CPUStats.SystemCPUUsage - ds.PreCPUStats.SystemCPUUsage)
	cores := float64(len(ds.CPUStats.CPUUsage.PercpuUsage))
	var cpuUser, cpuKernel, cpuTotal float64
	if systemDelta > 0 {
		// TODO we could leave these in docker format and let hud/viz tools do this instead of us... like net is, could do same for mem, too. thoughts?
		cpuUser = (float64(ds.CPUStats.CPUUsage.UsageInUsermode-ds.PreCPUStats.CPUUsage.UsageInUsermode) / systemDelta) * cores * 100.0
		cpuKernel = (float64(ds.CPUStats.CPUUsage.UsageInKernelmode-ds.PreCPUStats.CPUUsage.UsageInKernelmode) / systemDelta) * cores * 100.0
		cpuTotal = (float64(ds.CPUStats.CPUUsage.TotalUsage-ds.PreCPUStats.CPUUsage.TotalUsage) / systemDelta) * cores * 100.0
	}

	var rx, tx float64
	for _, v := range ds.Networks {
		rx += float64(v.RxBytes)
		tx += float64(v.TxBytes)
	}

	var blkRead, blkWrite uint64
	for _, bioEntry := range ds.BlkioStats.IOServiceBytesRecursive {
		switch strings.ToLower(bioEntry.Op) {
		case "read":
			blkRead = blkRead + bioEntry.Value
		case "write":
			blkWrite = blkWrite + bioEntry.Value
		}
	}

	return drivers.Stat{
		Timestamp: ds.Read,
		Metrics: map[string]uint64{
			// source: https://godoc.org/github.com/fsouza/go-dockerclient#Stats
			// ex (for future expansion): {"read":"2016-08-03T18:08:05Z","pids_stats":{},"network":{},"networks":{"eth0":{"rx_bytes":508,"tx_packets":6,"rx_packets":6,"tx_bytes":508}},"memory_stats":{"stats":{"cache":16384,"pgpgout":281,"rss":8826880,"pgpgin":2440,"total_rss":8826880,"hierarchical_memory_limit":536870912,"total_pgfault":3809,"active_anon":8843264,"total_active_anon":8843264,"total_pgpgout":281,"total_cache":16384,"pgfault":3809,"total_pgpgin":2440},"max_usage":8953856,"usage":8953856,"limit":536870912},"blkio_stats":{"io_service_bytes_recursive":[{"major":202,"op":"Read"},{"major":202,"op":"Write"},{"major":202,"op":"Sync"},{"major":202,"op":"Async"},{"major":202,"op":"Total"}],"io_serviced_recursive":[{"major":202,"op":"Read"},{"major":202,"op":"Write"},{"major":202,"op":"Sync"},{"major":202,"op":"Async"},{"major":202,"op":"Total"}]},"cpu_stats":{"cpu_usage":{"percpu_usage":[47641874],"usage_in_usermode":30000000,"total_usage":47641874},"system_cpu_usage":8880800500000000,"throttling_data":{}},"precpu_stats":{"cpu_usage":{"percpu_usage":[44946186],"usage_in_usermode":30000000,"total_usage":44946186},"system_cpu_usage":8880799510000000,"throttling_data":{}}}
			// mostly stolen values from docker stats cli api...

			// net
			"net_rx": uint64(rx),
			"net_tx": uint64(tx),
			// mem
			"mem_limit": ds.MemoryStats.Limit,
			"mem_usage": ds.MemoryStats.Usage,
			// i/o
			"disk_read":  blkRead,
			"disk_write": blkWrite,
			// cpu
			"cpu_user":   uint64(cpuUser),
			"cpu_total":  uint64(cpuTotal),
			"cpu_kernel": uint64(cpuKernel),
		},
	}
}

// Introduces some randomness to prevent container name clashes where task ID remains the same.
func newContainerID(task drivers.ContainerTask) string {
	return fmt.Sprintf("task-%d-%s", time.Now().UnixNano(), task.Id())
}

func (drv *DockerDriver) startTask(ctx context.Context, container string) error {
	log := common.Logger(ctx)
	startTimer := drv.NewTimer("docker", "start_container", 1.0)
	log.WithFields(logrus.Fields{"container": container}).Debug("Starting container execution")
	err := drv.docker.StartContainerWithContext(container, nil, ctx)
	startTimer.Measure()
	if err != nil {
		dockerErr, ok := err.(*docker.Error)
		_, containerAlreadyRunning := err.(*docker.ContainerAlreadyRunning)
		if containerAlreadyRunning || (ok && dockerErr.Status == 304) {
			// 304=container already started -- so we can ignore error
		} else {
			return err
		}
	}
	return nil
}

func (drv *DockerDriver) status(ctx context.Context, container string) (status string, err error) {
	log := common.Logger(ctx)

	cinfo, err := drv.docker.InspectContainer(container)
	if err != nil {
		// this is pretty sad, but better to say we had an error than to not.
		// task has run to completion and logs will be uploaded, user can decide
		log.WithFields(logrus.Fields{"container": container}).WithError(err).Error("Inspecting container")
		return drivers.StatusError, err
	}

	exitCode := cinfo.State.ExitCode
	log.WithFields(logrus.Fields{
		"exit_code":          exitCode,
		"container_running":  cinfo.State.Running,
		"container_status":   cinfo.State.Status,
		"container_finished": cinfo.State.FinishedAt,
		"container_error":    cinfo.State.Error,
	}).Info("container status")

	select { // do this after inspect so we can see exit code
	case <-ctx.Done(): // check if task was canceled or timed out
		switch ctx.Err() {
		case context.DeadlineExceeded:
			return drivers.StatusTimeout, nil
		case context.Canceled:
			return drivers.StatusCancelled, nil
		}
	default:
	}

	if cinfo.State.Running {
		log.Warn("getting status of task that is still running, need to fix this")
		return drivers.StatusError, errors.New("task in running state but not timed out. weird")
	}

	switch exitCode {
	default:
		return drivers.StatusError, common.UserError(fmt.Errorf("exit code %d", exitCode))
	case 0:
		return drivers.StatusSuccess, nil
	case 137: // OOM
		drv.Inc("docker", "oom", 1, 1)
		if !cinfo.State.OOMKilled {
			// It is possible that the host itself is running out of memory and
			// the host kernel killed one of the container processes.
			// See: https://github.com/moby/moby/issues/15621
			// TODO reed: isn't an OOM an OOM? this is wasting space imo
			log.WithFields(logrus.Fields{"container": container}).Info("Setting task as OOM killed, but docker disagreed.")
			drv.Inc("docker", "possible_oom_false_alarm", 1, 1.0)
		}

		return drivers.StatusKilled, drivers.ErrOutOfMemory
	}
}