additional ctx spans / maid service (#716)

* add spans to async

* clean up / add spans to agent

* there were a few methods with multiple contexts living in the same scope
(this usually doesn't end well); flattened those out.
* loop-bound context cancels now rely on defer (the previous approach was also brittle)
* runHot had a lot of ctx shuffling; flattened that.
* added some additional spans in certain paths for extra granularity
* linked the hot launcher / run hot / wait hot spans up to _a_ root span; the
first two are really follows-from spans, but at least we can see where they
came from, as well as the containers launched over a hot launcher's lifetime
(a sketch of the linking pattern follows this list)
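
The linking boils down to this pattern -- a minimal sketch, not the repo's
exact code (the helper name and package clause are illustrative): keep the
caller's span when handing work to a long-lived goroutine, but root the new
context at context.Background() so the request's deadline and cancellation
don't follow it.

```go
package agent // illustrative

import (
	"context"

	opentracing "github.com/opentracing/opentracing-go"
)

// detachedSpan starts a span named op, parented to whatever span is on reqCtx,
// but returns a context rooted at context.Background() so a long-lived worker
// never inherits the request's deadline. hotLauncher and runHot inline this
// same pattern rather than calling a helper.
func detachedSpan(reqCtx context.Context, op string) (opentracing.Span, context.Context) {
	base := opentracing.ContextWithSpan(context.Background(), opentracing.SpanFromContext(reqCtx))
	return opentracing.StartSpanFromContext(base, op)
}
```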

I left a TODO around the FollowsFrom because OpenCensus doesn't, at least at
the moment, appear to have any notion of FollowsFrom, and it takes an extra
OpenTracing dance (we have to get the span out, start a new span with the
option, then add it back to the context... some shuffling required). Anyway, I
was on the fence about adding it at all; a rough sketch of what it would look
like is below.
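
For reference, a hedged sketch of the OpenTracing shuffle that TODO refers to
(not part of this patch; same imports as the sketch above, helper name made
up):

```go
// followsFrom starts a span named op carrying a FollowsFrom reference to the
// span already on ctx (if any), then puts the new span onto a fresh context:
// the "get the span out, start a new span with the option, add it back" dance.
func followsFrom(ctx context.Context, op string) (opentracing.Span, context.Context) {
	var opts []opentracing.StartSpanOption
	if parent := opentracing.SpanFromContext(ctx); parent != nil {
		opts = append(opts, opentracing.FollowsFrom(parent.Context()))
	}
	span := opentracing.StartSpan(op, opts...)
	return span, opentracing.ContextWithSpan(context.Background(), span)
}
```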

* resource waiters need to manage their own goroutine lifecycle

* if we get an impossible memory request, bail instead of looping forever

* handle a slippery timeout case

* still sucks, but hotLauncher no longer leaks anything, not even the time.After timer goroutines

* simplify GetResourceToken

GetCall can guard against impossible-to-allocate tasks entering the system by
returning an error instead of doling them out. This makes the GetResourceToken
logic more straightforward for callers: the contract is now simply that they
will never get a token for a task that can't run, so they must not let such
tasks into the agent in the first place (but GetCall guards this, and there's
a test for it). A hedged sketch of the guard follows.
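
Roughly what that guard amounts to -- a minimal sketch, not the actual GetCall
code; the parameter names are placeholders and the error value is borrowed
from the diff below (the real one may differ):

```go
// guardImpossibleMemory rejects a call whose memory request could never be
// satisfied on this machine, so it is never admitted and never left waiting
// forever on GetResourceToken. Assumes the models package from the diff.
func guardImpossibleMemory(callMemoryMB, machineMemoryMB uint64) error {
	if callMemoryMB > machineMemoryMB {
		return models.ErrCallTimeoutServerBusy // illustrative error choice
	}
	return nil
}
```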

Sorry, I was going to make this commit do only that, but when I went to fix up
the tests my last patch went haywire, so I fixed that too. This also at least
tries to simplify the hotLauncher loop, which will no longer leak time.After
timers (the timeouts were long, and with the signaller they were many -- I got
a stack trace :) ). It breaks the bottom half of the logic, the check for
whether we need to launch, out into its own function (checkLaunch in the diff
below), and handles the cleanup duties only in the caller instead of in 2
different select statements. Played with this a bit; no doubt further cleanup
could be done, but this _seems_ better. The non-leaky timer pattern is
sketched below.
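
The timer fix boils down to this shape -- a sketch of the pattern, not the
repo's code ("context" and "time" assumed imported): a per-iteration context
deadline that is always cancelled, instead of a time.After whose timer lingers
until it fires whenever another select case wins.

```go
// waitOrWake blocks until wake fires or timeout elapses. cancel() always runs,
// so the timer backing the deadline is released immediately rather than
// leaking the way an un-fired time.After timer does inside a long-lived loop.
func waitOrWake(ctx context.Context, timeout time.Duration, wake <-chan struct{}) {
	loopCtx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()
	select {
	case <-loopCtx.Done():
	case <-wake:
	}
}
```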

* fix vet

* add units to exported method contract docs

* oops
Author: Reed Allman (committed by GitHub)
Date:   2018-01-23 19:52:22 -08:00
Parent: ccd95b6f72
Commit: bbd50a0e02

6 changed files with 161 additions and 103 deletions

@@ -24,7 +24,6 @@ import (
// TODO async calls need to add route.Headers as well
// TODO need to shut off reads/writes in dispatch to the pipes when call times out so that
// 2 calls don't have the same container's pipes...
// TODO add spans back around container launching for hot (follows from?) + other more granular spans
// TODO handle timeouts / no response in sync & async (sync is json+503 atm, not 504, async is empty log+status)
// see also: server/runner.go wrapping the response writer there, but need to handle async too (push down?)
// TODO storing logs / call can push call over the timeout
@@ -251,7 +250,7 @@ func (a *agent) handleStatsEnd(ctx context.Context, call *call, err error) {
}
}
func statSpans(ctx context.Context, call *call) (ctxr context.Context, finish func()) {
func statSpans(ctx context.Context, call *call) (_ context.Context, finish func()) {
// agent_submit_global has no parent span because we don't want it to inherit fn_appname or fn_path
spanGlobal := opentracing.StartSpan("agent_submit_global")
@@ -305,81 +304,77 @@ func (a *agent) getSlot(ctx context.Context, call *call) (Slot, error) {
// hotLauncher is spawned in a go routine for each slot queue to monitor stats and launch hot
// containers if needed. Upon shutdown or activity timeout, hotLauncher exits and during exit,
// it destroys the slot queue.
func (a *agent) hotLauncher(ctx context.Context, callObj *call) {
func (a *agent) hotLauncher(ctx context.Context, call *call) {
// Let use 60 minutes or 2 * IdleTimeout as hot queue idle timeout, pick
// whichever is longer. If in this time, there's no activity, then
// we destroy the hot queue.
timeout := time.Duration(60) * time.Minute
idleTimeout := time.Duration(callObj.IdleTimeout) * time.Second * 2
idleTimeout := time.Duration(call.IdleTimeout) * time.Second * 2
if timeout < idleTimeout {
timeout = idleTimeout
}
logger := common.Logger(ctx)
logger.WithField("launcher_timeout", timeout).Info("Hot function launcher starting")
isAsync := callObj.Type == models.TypeAsync
// IMPORTANT: get a context that has a child span / logger but NO timeout
// TODO this is a 'FollowsFrom'
ctx = opentracing.ContextWithSpan(common.WithLogger(context.Background(), logger), opentracing.SpanFromContext(ctx))
span, ctx := opentracing.StartSpanFromContext(ctx, "agent_hot_launcher")
defer span.Finish()
for {
ctx, cancel := context.WithTimeout(ctx, timeout)
a.checkLaunch(ctx, call)
select {
case <-a.shutdown: // server shutdown
cancel()
return
case <-time.After(timeout):
if a.slotMgr.deleteSlotQueue(callObj.slots) {
case <-ctx.Done(): // timed out
cancel()
if a.slotMgr.deleteSlotQueue(call.slots) {
logger.Info("Hot function launcher timed out")
return
}
case <-callObj.slots.signaller:
case <-call.slots.signaller:
cancel()
}
}
}
curStats := callObj.slots.getStats()
isNeeded := isNewContainerNeeded(&curStats)
logger.WithFields(logrus.Fields{
"currentStats": curStats,
"isNeeded": isNeeded,
}).Debug("Hot function launcher stats")
if !isNeeded {
continue
}
func (a *agent) checkLaunch(ctx context.Context, call *call) {
curStats := call.slots.getStats()
isAsync := call.Type == models.TypeAsync
isNeeded := isNewContainerNeeded(&curStats)
common.Logger(ctx).WithFields(logrus.Fields{"currentStats": curStats, "isNeeded": isNeeded}).Debug("Hot function launcher stats")
if !isNeeded {
return
}
common.Logger(ctx).WithFields(logrus.Fields{"currentStats": curStats, "isNeeded": isNeeded}).Info("Hot function launcher starting hot container")
ctxResource, cancelResource := context.WithCancel(context.Background())
logger.WithFields(logrus.Fields{
"currentStats": curStats,
"isNeeded": isNeeded,
}).Info("Hot function launcher starting hot container")
select {
case tok, isOpen := <-a.resources.GetResourceToken(ctxResource, callObj.Memory, uint64(callObj.CPUs), isAsync):
cancelResource()
if isOpen {
a.wg.Add(1)
go func(ctx context.Context, call *call, tok ResourceToken) {
a.runHot(ctx, call, tok)
a.wg.Done()
}(ctx, callObj, tok)
} else {
// this means the resource was impossible to reserve (eg. memory size we can never satisfy)
callObj.slots.queueSlot(&hotSlot{done: make(chan struct{}), err: models.ErrCallTimeoutServerBusy})
}
case <-time.After(timeout):
cancelResource()
if a.slotMgr.deleteSlotQueue(callObj.slots) {
logger.Info("Hot function launcher timed out")
return
}
case <-a.shutdown: // server shutdown
cancelResource()
return
}
select {
case tok := <-a.resources.GetResourceToken(ctx, call.Memory, uint64(call.CPUs), isAsync):
a.wg.Add(1) // add waiter in this thread
go func() {
// NOTE: runHot will not inherit the timeout from ctx (ignore timings)
a.runHot(ctx, call, tok)
a.wg.Done()
}()
case <-ctx.Done(): // timeout
case <-a.shutdown: // server shutdown
}
}
// waitHot pings and waits for a hot container from the slot queue
func (a *agent) waitHot(ctx context.Context, call *call) (Slot, error) {
span, ctx := opentracing.StartSpanFromContext(ctx, "agent_wait_hot")
defer span.Finish()
ctxDequeuer, cancelDequeuer := context.WithCancel(ctx)
defer cancelDequeuer()
ch := call.slots.startDequeuer(ctxDequeuer)
ctx, cancel := context.WithCancel(ctx)
defer cancel() // shut down dequeuer if we grab a slot
ch := call.slots.startDequeuer(ctx)
// 1) if we can get a slot immediately, grab it.
// 2) if we don't, send a signaller every 200ms until we do.
@@ -417,24 +412,19 @@ func (a *agent) waitHot(ctx context.Context, call *call) (Slot, error) {
// launchCold waits for necessary resources to launch a new container, then
// returns the slot for that new container to run the request on.
func (a *agent) launchCold(ctx context.Context, call *call) (Slot, error) {
isAsync := call.Type == models.TypeAsync
ch := make(chan Slot)
ctxResource, cancelResource := context.WithCancel(ctx)
defer cancelResource()
span, ctx := opentracing.StartSpanFromContext(ctx, "agent_launch_cold")
defer span.Finish()
select {
case tok, isOpen := <-a.resources.GetResourceToken(ctxResource, call.Memory, uint64(call.CPUs), isAsync):
if !isOpen {
return nil, models.ErrCallTimeoutServerBusy
}
case tok := <-a.resources.GetResourceToken(ctx, call.Memory, uint64(call.CPUs), isAsync):
go a.prepCold(ctx, call, tok, ch)
case <-ctx.Done():
return nil, ctx.Err()
}
cancelResource()
// wait for launch err or a slot to open up
select {
case s := <-ch:
@@ -518,9 +508,7 @@ func (s *hotSlot) exec(ctx context.Context, call *call) error {
common.Logger(ctx).WithField("container_id", s.container.id).Info("starting call")
start := time.Now()
defer func() {
call.slots.recordLatency(SlotQueueRunner, uint64(time.Now().Sub(start).Seconds()*1000))
}()
defer func() { call.slots.recordLatency(SlotQueueRunner, uint64(time.Now().Sub(start).Seconds()*1000)) }()
// swap in the new stderr logger & stat accumulator
oldStderr := s.container.swap(call.stderr, &call.Stats)
@@ -551,6 +539,9 @@ func specialHeader(k string) bool {
}
func (a *agent) prepCold(ctx context.Context, call *call, tok ResourceToken, ch chan Slot) {
span, ctx := opentracing.StartSpanFromContext(ctx, "agent_prep_cold")
defer span.Finish()
// add additional headers to the config to shove everything into env vars for cold
for k, v := range call.Headers {
if !specialHeader(k) {
@@ -585,14 +576,13 @@ func (a *agent) prepCold(ctx context.Context, call *call, tok ResourceToken, ch
}
}
func (a *agent) runHot(ctxArg context.Context, call *call, tok ResourceToken) {
// We must be careful to only use ctxArg for logs/spans
// create a span from ctxArg but ignore the new Context
// instead we will create a new Context below and explicitly set its span
span, _ := opentracing.StartSpanFromContext(ctxArg, "docker_run_hot")
func (a *agent) runHot(ctx context.Context, call *call, tok ResourceToken) {
// IMPORTANT: get a context that has a child span / logger but NO timeout
// TODO this is a 'FollowsFrom'
ctx = opentracing.ContextWithSpan(context.Background(), opentracing.SpanFromContext(ctx))
span, ctx := opentracing.StartSpanFromContext(ctx, "agent_run_hot")
defer span.Finish()
defer tok.Close()
defer tok.Close() // IMPORTANT: this MUST get called
// TODO we have to make sure we flush these pipes or we will deadlock
stdinRead, stdinWrite := io.Pipe()
@@ -600,17 +590,6 @@ func (a *agent) runHot(ctxArg context.Context, call *call, tok ResourceToken) {
proto := protocol.New(protocol.Protocol(call.Format), stdinWrite, stdoutRead)
// we don't want to timeout in here. this is inside of a goroutine and the
// caller can timeout this Call appropriately. e.g. w/ hot if it takes 20
// minutes to pull, then timing out calls for 20 minutes and eventually
// having the image is ideal vs. never getting the image pulled.
// TODO this ctx needs to inherit logger, etc
ctx, shutdownContainer := context.WithCancel(context.Background())
defer shutdownContainer() // close this if our waiter returns
// add the span we created above to the new Context
ctx = opentracing.ContextWithSpan(ctx, span)
start := time.Now()
call.slots.enterState(SlotQueueStarter)
@@ -659,13 +638,16 @@ func (a *agent) runHot(ctxArg context.Context, call *call, tok ResourceToken) {
// buffered, in case someone has slot when waiter returns but isn't yet listening
errC := make(chan error, 1)
ctx, shutdownContainer := context.WithCancel(ctx)
defer shutdownContainer() // close this if our waiter returns, to call off slots
go func() {
defer shutdownContainer() // also close if we get an agent shutdown / idle timeout
for {
select { // make sure everything is up before trying to send slot
case <-ctx.Done(): // container shutdown
return
case <-a.shutdown: // server shutdown
shutdownContainer()
return
default: // ok
}
@@ -682,7 +664,6 @@ func (a *agent) runHot(ctxArg context.Context, call *call, tok ResourceToken) {
if call.slots.ejectSlot(s) {
call.slots.exitStateWithLatency(SlotQueueIdle, uint64(time.Now().Sub(start).Seconds()*1000))
logger.Info("Canceling inactive hot function")
shutdownContainer()
return
}
case <-ctx.Done(): // container shutdown
@@ -693,7 +674,6 @@ func (a *agent) runHot(ctxArg context.Context, call *call, tok ResourceToken) {
case <-a.shutdown: // server shutdown
if call.slots.ejectSlot(s) {
call.slots.exitStateWithLatency(SlotQueueIdle, uint64(time.Now().Sub(start).Seconds()*1000))
shutdownContainer()
return
}
}