additional ctx spans / maid service (#716)

* add spans to async

* clean up / add spans to agent

* there were a few methods which had multiple contexts which existed in the same
scope (this doesn't end well, usually), flattened those out.
* loop bound context cancels now rely on defer (also was brittle)
* runHot had a lot of ctx shuffling, flattened that.
* added some additional spans in certain paths for added granularity
* linked up the hot launcher / run hot / wait hot to _a_ root span, the first
2 are follows from spans, but at least we can see the source of these and also
can see containers launched over a hot launcher's lifetime

I left TODO around the FollowsFrom because OpenCensus doesn't, at least at the
moment, appear to have any idea of FollowsFrom and it was an extra OpenTracing
method (we have to get the span out, start a new span with the option, then
add it to the context... some shuffling required). anyway, was on the fence
about adding at least.

* resource waiters need to manage their own goroutine lifecycle

* if we get an impossible memory request, bail instead of infinite loop

* handle timeout slippery case

* still sucks, but hotLauncher doesn't leak anything. even the time.After timer goroutines

* simplify GetResourceToken

GetCall can guard against the impossible to allocate resource tasks entering
the system by erroring instead of doling them out. this makes GetResourceToken
logic more straightforward for callers, who now simply have the contract that
they won't ever get a token if they let tasks into the agent that can't run
(but GetCall guards this, and there's a test for it).

sorry, I was going to make this only do that, but when I went to fix up the
tests, my last patch went haywire so I fixed that too. this also at least
tries to simplify the hotLaunch loop, which will now no longer leak time.After
timers (which were long, and with signaller, they were many -- I got a stack
trace :) -- this breaks out the bottom half of the logic to check to see if we
need to launch into its own function, and handles the cleaning duties only in
the caller instead of in 2 different select statements. played with this a
bit, no doubt further cleaning could be done, but this _seems_ better.

* fix vet

* add units to exported method contract docs

* oops
This commit is contained in:
Reed Allman
2018-01-23 19:52:22 -08:00
committed by GitHub
parent ccd95b6f72
commit bbd50a0e02
6 changed files with 161 additions and 103 deletions

View File

@@ -24,7 +24,6 @@ import (
// TODO async calls need to add route.Headers as well
// TODO need to shut off reads/writes in dispatch to the pipes when call times out so that
// 2 calls don't have the same container's pipes...
// TODO add spans back around container launching for hot (follows from?) + other more granular spans
// TODO handle timeouts / no response in sync & async (sync is json+503 atm, not 504, async is empty log+status)
// see also: server/runner.go wrapping the response writer there, but need to handle async too (push down?)
// TODO storing logs / call can push call over the timeout
@@ -251,7 +250,7 @@ func (a *agent) handleStatsEnd(ctx context.Context, call *call, err error) {
}
}
func statSpans(ctx context.Context, call *call) (ctxr context.Context, finish func()) {
func statSpans(ctx context.Context, call *call) (_ context.Context, finish func()) {
// agent_submit_global has no parent span because we don't want it to inherit fn_appname or fn_path
spanGlobal := opentracing.StartSpan("agent_submit_global")
@@ -305,81 +304,77 @@ func (a *agent) getSlot(ctx context.Context, call *call) (Slot, error) {
// hotLauncher is spawned in a go routine for each slot queue to monitor stats and launch hot
// containers if needed. Upon shutdown or activity timeout, hotLauncher exits and during exit,
// it destroys the slot queue.
func (a *agent) hotLauncher(ctx context.Context, callObj *call) {
func (a *agent) hotLauncher(ctx context.Context, call *call) {
// Let use 60 minutes or 2 * IdleTimeout as hot queue idle timeout, pick
// whichever is longer. If in this time, there's no activity, then
// we destroy the hot queue.
timeout := time.Duration(60) * time.Minute
idleTimeout := time.Duration(callObj.IdleTimeout) * time.Second * 2
idleTimeout := time.Duration(call.IdleTimeout) * time.Second * 2
if timeout < idleTimeout {
timeout = idleTimeout
}
logger := common.Logger(ctx)
logger.WithField("launcher_timeout", timeout).Info("Hot function launcher starting")
isAsync := callObj.Type == models.TypeAsync
// IMPORTANT: get a context that has a child span / logger but NO timeout
// TODO this is a 'FollowsFrom'
ctx = opentracing.ContextWithSpan(common.WithLogger(context.Background(), logger), opentracing.SpanFromContext(ctx))
span, ctx := opentracing.StartSpanFromContext(ctx, "agent_hot_launcher")
defer span.Finish()
for {
ctx, cancel := context.WithTimeout(ctx, timeout)
a.checkLaunch(ctx, call)
select {
case <-a.shutdown: // server shutdown
cancel()
return
case <-time.After(timeout):
if a.slotMgr.deleteSlotQueue(callObj.slots) {
case <-ctx.Done(): // timed out
cancel()
if a.slotMgr.deleteSlotQueue(call.slots) {
logger.Info("Hot function launcher timed out")
return
}
case <-callObj.slots.signaller:
case <-call.slots.signaller:
cancel()
}
}
}
curStats := callObj.slots.getStats()
func (a *agent) checkLaunch(ctx context.Context, call *call) {
curStats := call.slots.getStats()
isAsync := call.Type == models.TypeAsync
isNeeded := isNewContainerNeeded(&curStats)
logger.WithFields(logrus.Fields{
"currentStats": curStats,
"isNeeded": isNeeded,
}).Debug("Hot function launcher stats")
common.Logger(ctx).WithFields(logrus.Fields{"currentStats": curStats, "isNeeded": isNeeded}).Debug("Hot function launcher stats")
if !isNeeded {
continue
return
}
ctxResource, cancelResource := context.WithCancel(context.Background())
logger.WithFields(logrus.Fields{
"currentStats": curStats,
"isNeeded": isNeeded,
}).Info("Hot function launcher starting hot container")
common.Logger(ctx).WithFields(logrus.Fields{"currentStats": curStats, "isNeeded": isNeeded}).Info("Hot function launcher starting hot container")
select {
case tok, isOpen := <-a.resources.GetResourceToken(ctxResource, callObj.Memory, uint64(callObj.CPUs), isAsync):
cancelResource()
if isOpen {
a.wg.Add(1)
go func(ctx context.Context, call *call, tok ResourceToken) {
case tok := <-a.resources.GetResourceToken(ctx, call.Memory, uint64(call.CPUs), isAsync):
a.wg.Add(1) // add waiter in this thread
go func() {
// NOTE: runHot will not inherit the timeout from ctx (ignore timings)
a.runHot(ctx, call, tok)
a.wg.Done()
}(ctx, callObj, tok)
} else {
// this means the resource was impossible to reserve (eg. memory size we can never satisfy)
callObj.slots.queueSlot(&hotSlot{done: make(chan struct{}), err: models.ErrCallTimeoutServerBusy})
}
case <-time.After(timeout):
cancelResource()
if a.slotMgr.deleteSlotQueue(callObj.slots) {
logger.Info("Hot function launcher timed out")
return
}
}()
case <-ctx.Done(): // timeout
case <-a.shutdown: // server shutdown
cancelResource()
return
}
}
}
// waitHot pings and waits for a hot container from the slot queue
func (a *agent) waitHot(ctx context.Context, call *call) (Slot, error) {
span, ctx := opentracing.StartSpanFromContext(ctx, "agent_wait_hot")
defer span.Finish()
ctxDequeuer, cancelDequeuer := context.WithCancel(ctx)
defer cancelDequeuer()
ch := call.slots.startDequeuer(ctxDequeuer)
ctx, cancel := context.WithCancel(ctx)
defer cancel() // shut down dequeuer if we grab a slot
ch := call.slots.startDequeuer(ctx)
// 1) if we can get a slot immediately, grab it.
// 2) if we don't, send a signaller every 200ms until we do.
@@ -417,24 +412,19 @@ func (a *agent) waitHot(ctx context.Context, call *call) (Slot, error) {
// launchCold waits for necessary resources to launch a new container, then
// returns the slot for that new container to run the request on.
func (a *agent) launchCold(ctx context.Context, call *call) (Slot, error) {
isAsync := call.Type == models.TypeAsync
ch := make(chan Slot)
ctxResource, cancelResource := context.WithCancel(ctx)
defer cancelResource()
span, ctx := opentracing.StartSpanFromContext(ctx, "agent_launch_cold")
defer span.Finish()
select {
case tok, isOpen := <-a.resources.GetResourceToken(ctxResource, call.Memory, uint64(call.CPUs), isAsync):
if !isOpen {
return nil, models.ErrCallTimeoutServerBusy
}
case tok := <-a.resources.GetResourceToken(ctx, call.Memory, uint64(call.CPUs), isAsync):
go a.prepCold(ctx, call, tok, ch)
case <-ctx.Done():
return nil, ctx.Err()
}
cancelResource()
// wait for launch err or a slot to open up
select {
case s := <-ch:
@@ -518,9 +508,7 @@ func (s *hotSlot) exec(ctx context.Context, call *call) error {
common.Logger(ctx).WithField("container_id", s.container.id).Info("starting call")
start := time.Now()
defer func() {
call.slots.recordLatency(SlotQueueRunner, uint64(time.Now().Sub(start).Seconds()*1000))
}()
defer func() { call.slots.recordLatency(SlotQueueRunner, uint64(time.Now().Sub(start).Seconds()*1000)) }()
// swap in the new stderr logger & stat accumulator
oldStderr := s.container.swap(call.stderr, &call.Stats)
@@ -551,6 +539,9 @@ func specialHeader(k string) bool {
}
func (a *agent) prepCold(ctx context.Context, call *call, tok ResourceToken, ch chan Slot) {
span, ctx := opentracing.StartSpanFromContext(ctx, "agent_prep_cold")
defer span.Finish()
// add additional headers to the config to shove everything into env vars for cold
for k, v := range call.Headers {
if !specialHeader(k) {
@@ -585,14 +576,13 @@ func (a *agent) prepCold(ctx context.Context, call *call, tok ResourceToken, ch
}
}
func (a *agent) runHot(ctxArg context.Context, call *call, tok ResourceToken) {
// We must be careful to only use ctxArg for logs/spans
// create a span from ctxArg but ignore the new Context
// instead we will create a new Context below and explicitly set its span
span, _ := opentracing.StartSpanFromContext(ctxArg, "docker_run_hot")
func (a *agent) runHot(ctx context.Context, call *call, tok ResourceToken) {
// IMPORTANT: get a context that has a child span / logger but NO timeout
// TODO this is a 'FollowsFrom'
ctx = opentracing.ContextWithSpan(context.Background(), opentracing.SpanFromContext(ctx))
span, ctx := opentracing.StartSpanFromContext(ctx, "agent_run_hot")
defer span.Finish()
defer tok.Close()
defer tok.Close() // IMPORTANT: this MUST get called
// TODO we have to make sure we flush these pipes or we will deadlock
stdinRead, stdinWrite := io.Pipe()
@@ -600,17 +590,6 @@ func (a *agent) runHot(ctxArg context.Context, call *call, tok ResourceToken) {
proto := protocol.New(protocol.Protocol(call.Format), stdinWrite, stdoutRead)
// we don't want to timeout in here. this is inside of a goroutine and the
// caller can timeout this Call appropriately. e.g. w/ hot if it takes 20
// minutes to pull, then timing out calls for 20 minutes and eventually
// having the image is ideal vs. never getting the image pulled.
// TODO this ctx needs to inherit logger, etc
ctx, shutdownContainer := context.WithCancel(context.Background())
defer shutdownContainer() // close this if our waiter returns
// add the span we created above to the new Context
ctx = opentracing.ContextWithSpan(ctx, span)
start := time.Now()
call.slots.enterState(SlotQueueStarter)
@@ -659,13 +638,16 @@ func (a *agent) runHot(ctxArg context.Context, call *call, tok ResourceToken) {
// buffered, in case someone has slot when waiter returns but isn't yet listening
errC := make(chan error, 1)
ctx, shutdownContainer := context.WithCancel(ctx)
defer shutdownContainer() // close this if our waiter returns, to call off slots
go func() {
defer shutdownContainer() // also close if we get an agent shutdown / idle timeout
for {
select { // make sure everything is up before trying to send slot
case <-ctx.Done(): // container shutdown
return
case <-a.shutdown: // server shutdown
shutdownContainer()
return
default: // ok
}
@@ -682,7 +664,6 @@ func (a *agent) runHot(ctxArg context.Context, call *call, tok ResourceToken) {
if call.slots.ejectSlot(s) {
call.slots.exitStateWithLatency(SlotQueueIdle, uint64(time.Now().Sub(start).Seconds()*1000))
logger.Info("Canceling inactive hot function")
shutdownContainer()
return
}
case <-ctx.Done(): // container shutdown
@@ -693,7 +674,6 @@ func (a *agent) runHot(ctxArg context.Context, call *call, tok ResourceToken) {
case <-a.shutdown: // server shutdown
if call.slots.ejectSlot(s) {
call.slots.exitStateWithLatency(SlotQueueIdle, uint64(time.Now().Sub(start).Seconds()*1000))
shutdownContainer()
return
}
}

View File

@@ -6,6 +6,7 @@ import (
"encoding/json"
"fmt"
"io"
"math"
"net/http"
"net/http/httptest"
"strconv"
@@ -495,3 +496,27 @@ func TestHTTPWithoutContentLengthWorks(t *testing.T) {
no Content-Length is set. also make sure the body makes it (and the image hasn't changed)); GLHF, got:`, resp.R.Body)
}
}
func TestGetCallReturnsResourceImpossibility(t *testing.T) {
call := &models.Call{
AppName: "yo",
Path: "/yoyo",
Image: "fnproject/fn-test-utils",
Type: "sync",
Format: "http",
Timeout: 1,
IdleTimeout: 2,
Memory: math.MaxUint64,
}
// FromModel doesn't need a datastore, for now...
ds := datastore.NewMockInit(nil, nil, nil)
a := New(NewCachedDataAccess(NewDirectDataAccess(ds, ds, new(mqs.Mock))))
defer a.Close()
_, err := a.GetCall(FromModel(call))
if err != models.ErrCallTimeoutServerBusy {
t.Fatal("did not get expected err, got: ", err)
}
}

View File

@@ -5,6 +5,7 @@ import (
"time"
"github.com/fnproject/fn/api/models"
opentracing "github.com/opentracing/opentracing-go"
"github.com/sirupsen/logrus"
)
@@ -15,14 +16,15 @@ func (a *agent) asyncDequeue() {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
// parent span here so that we can see how many async calls are running
span, ctx := opentracing.StartSpanFromContext(ctx, "agent_async_dequeue")
defer span.Finish()
for {
ctxResource, cancelResource := context.WithCancel(context.Background())
select {
case <-a.shutdown:
cancelResource()
return
case <-a.resources.WaitAsyncResource(ctxResource):
cancelResource()
case <-a.resources.WaitAsyncResource(ctx):
// TODO we _could_ return a token here to reserve the ram so that there's
// not a race between here and Submit but we're single threaded
// dequeueing and retries handled gracefully inside of Submit if we run
@@ -37,7 +39,7 @@ func (a *agent) asyncDequeue() {
if ok {
a.wg.Add(1) // need to add 1 in this thread to ensure safe shutdown
go func(model *models.Call) {
a.asyncRun(model)
a.asyncRun(ctx, model)
a.wg.Done() // can shed it after this is done, Submit will add 1 too but it's fine
}(model)
}
@@ -68,9 +70,19 @@ func (a *agent) asyncChew(ctx context.Context) <-chan *models.Call {
return ch
}
func (a *agent) asyncRun(model *models.Call) {
// TODO output / logger should be here too...
call, err := a.GetCall(FromModel(model))
func (a *agent) asyncRun(ctx context.Context, model *models.Call) {
// IMPORTANT: get a context that has a child span but NO timeout (Submit imposes timeout)
// TODO this is a 'FollowsFrom'
ctx = opentracing.ContextWithSpan(context.Background(), opentracing.SpanFromContext(ctx))
// additional enclosing context here since this isn't spawned from an http request
span, ctx := opentracing.StartSpanFromContext(ctx, "agent_async_run")
defer span.Finish()
call, err := a.GetCall(
FromModel(model),
WithContext(ctx), // NOTE: order is important
)
if err != nil {
logrus.WithError(err).Error("error getting async call")
return

View File

@@ -180,6 +180,13 @@ func WithWriter(w io.Writer) CallOpt {
}
}
func WithContext(ctx context.Context) CallOpt {
return func(a *agent, c *call) error {
c.req = c.req.WithContext(ctx)
return nil
}
}
// GetCall builds a Call that can be used to submit jobs to the agent.
//
// TODO where to put this? async and sync both call this
@@ -198,6 +205,11 @@ func (a *agent) GetCall(opts ...CallOpt) (Call, error) {
return nil, errors.New("no model or request provided for call")
}
if !a.resources.IsResourcePossible(c.Memory, uint64(c.CPUs), c.Type == models.TypeAsync) {
// if we're not going to be able to run this call on this machine, bail here.
return nil, models.ErrCallTimeoutServerBusy
}
c.da = a.da
c.ct = a

View File

@@ -13,6 +13,7 @@ import (
"strings"
"sync"
opentracing "github.com/opentracing/opentracing-go"
"github.com/sirupsen/logrus"
)
@@ -24,9 +25,21 @@ const (
// A simple resource (memory, cpu, disk, etc.) tracker for scheduling.
// TODO: add cpu, disk, network IO for future
type ResourceTracker interface {
// WaitAsyncResource returns a channel that will send once when there seem to be sufficient
// resource levels to run an async task, it is up to the implementer to create policy here.
WaitAsyncResource(ctx context.Context) chan struct{}
// returns a closed channel if the resource can never me met.
GetResourceToken(ctx context.Context, memory uint64, cpuQuota uint64, isAsync bool) <-chan ResourceToken
// GetResourceToken returns a channel to wait for a resource token on. If the provided context is canceled,
// the channel will never receive anything. If it is not possible to fulfill this resource, the channel
// will never receive anything (use IsResourcePossible). If a resource token is available for the provided
// resource parameters, it will otherwise be sent once on the returned channel. The channel is never closed.
// Memory is expected to be provided in MB units.
GetResourceToken(ctx context.Context, memory, cpuQuota uint64, isAsync bool) <-chan ResourceToken
// IsResourcePossible returns whether it's possible to fulfill the requested resources on this
// machine. It must be called before GetResourceToken or GetResourceToken may hang.
// Memory is expected to be provided in MB units.
IsResourcePossible(memory, cpuQuota uint64, isAsync bool) bool
}
type resourceTracker struct {
@@ -100,7 +113,9 @@ func (a *resourceTracker) isResourceAvailableLocked(memory uint64, cpuQuota uint
}
// is this request possible to meet? If no, fail quick
func (a *resourceTracker) isResourcePossible(memory uint64, cpuQuota uint64, isAsync bool) bool {
func (a *resourceTracker) IsResourcePossible(memory uint64, cpuQuota uint64, isAsync bool) bool {
memory = memory * Mem1MB
if isAsync {
return memory <= a.ramAsyncTotal && cpuQuota <= a.cpuAsyncTotal
} else {
@@ -111,17 +126,20 @@ func (a *resourceTracker) isResourcePossible(memory uint64, cpuQuota uint64, isA
// the received token should be passed directly to launch (unconditionally), launch
// will close this token (i.e. the receiver should not call Close)
func (a *resourceTracker) GetResourceToken(ctx context.Context, memory uint64, cpuQuota uint64, isAsync bool) <-chan ResourceToken {
memory = memory * Mem1MB
ch := make(chan ResourceToken)
if !a.IsResourcePossible(memory, cpuQuota, isAsync) {
// return the channel, but never send anything.
return ch
}
c := a.cond
isWaiting := false
ch := make(chan ResourceToken)
if !a.isResourcePossible(memory, cpuQuota, isAsync) {
close(ch)
return ch
}
memory = memory * Mem1MB
// if we find a resource token, shut down the thread waiting on ctx finish.
// alternatively, if the ctx is done, wake up the cond loop.
ctx, cancel := context.WithCancel(ctx)
go func() {
<-ctx.Done()
@@ -132,7 +150,10 @@ func (a *resourceTracker) GetResourceToken(ctx context.Context, memory uint64, c
c.L.Unlock()
}()
span, ctx := opentracing.StartSpanFromContext(ctx, "agent_get_resource_token")
go func() {
defer span.Finish()
defer cancel()
c.L.Lock()
isWaiting = true
@@ -202,6 +223,10 @@ func (a *resourceTracker) WaitAsyncResource(ctx context.Context) chan struct{} {
isWaiting := false
c := a.cond
// if we find a resource token, shut down the thread waiting on ctx finish.
// alternatively, if the ctx is done, wake up the cond loop.
ctx, cancel := context.WithCancel(ctx)
go func() {
<-ctx.Done()
c.L.Lock()
@@ -211,7 +236,10 @@ func (a *resourceTracker) WaitAsyncResource(ctx context.Context) chan struct{} {
c.L.Unlock()
}()
span, ctx := opentracing.StartSpanFromContext(ctx, "agent_wait_async_resource")
go func() {
defer span.Finish()
defer cancel()
c.L.Lock()
isWaiting = true
for (a.ramAsyncUsed >= a.ramAsyncHWMark || a.cpuAsyncUsed >= a.cpuAsyncHWMark) && ctx.Err() == nil {

View File

@@ -238,8 +238,9 @@ func TestResourceGetCombo(t *testing.T) {
// impossible request
ctx, cancel := context.WithCancel(context.Background())
ch := trI.GetResourceToken(ctx, 20*1024, 20000, false)
if !isClosed(ch) {
t.Fatalf("impossible request should return closed channel")
_, err := fetchToken(ch)
if err == nil {
t.Fatalf("impossible request should never return (error here)")
}
cancel()