additional ctx spans / maid service (#716)
* add spans to async
* clean up / add spans to agent
* there were a few methods which had multiple contexts which existed in the same scope (this doesn't end well, usually), flattened those out.
* loop bound context cancels now rely on defer (also was brittle)
* runHot had a lot of ctx shuffling, flattened that.
* added some additional spans in certain paths for added granularity
* linked up the hot launcher / run hot / wait hot to _a_ root span, the first 2 are follows from spans, but at least we can see the source of these and also can see containers launched over a hot launcher's lifetime

  I left TODO around the FollowsFrom because OpenCensus doesn't, at least at the moment, appear to have any idea of FollowsFrom and it was an extra OpenTracing method (we have to get the span out, start a new span with the option, then add it to the context... some shuffling required, sketched just below). anyway, was on the fence about adding it at all.
* resource waiters need to manage their own goroutine lifecycle
* if we get an impossible memory request, bail instead of infinite loop
* handle timeout slippery case
* still sucks, but hotLauncher doesn't leak anything, even the time.After timer goroutines (see the sketch after the hotLauncher diff below)
* simplify GetResourceToken

  GetCall can guard against the impossible-to-allocate resource tasks entering the system by erroring instead of doling them out. this makes GetResourceToken logic more straightforward for callers, who now simply have the contract that they won't ever get a token if they let tasks into the agent that can't run (but GetCall guards this, and there's a test for it). sorry, I was going to make this only do that, but when I went to fix up the tests, my last patch went haywire so I fixed that too.

  this also at least tries to simplify the hotLaunch loop, which will now no longer leak time.After timers (which were long, and with signaller, they were many -- I got a stack trace :)). this breaks out the bottom half of the logic, which checks whether we need to launch, into its own function, and handles the cleaning duties only in the caller instead of in 2 different select statements. played with this a bit, no doubt further cleaning could be done, but this _seems_ better.
* fix vet
* add units to exported method contract docs
* oops
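The FollowsFrom point above is the one piece of span plumbing the patch leaves as TODOs. A minimal sketch of the shuffling it describes, using plain opentracing-go and a hypothetical helper name (nothing below is code from this patch):

```go
package example

import (
	"context"

	opentracing "github.com/opentracing/opentracing-go"
)

// followsFrom pulls the parent span out of ctx, starts a new span that merely
// FollowsFrom it, and returns a fresh context carrying the new span. The fresh
// context is deliberately detached from the caller's deadline, mirroring how
// the launcher / runHot paths want a root-ish span without inheriting timeouts.
func followsFrom(ctx context.Context, operation string) (opentracing.Span, context.Context) {
	var opts []opentracing.StartSpanOption
	if parent := opentracing.SpanFromContext(ctx); parent != nil {
		opts = append(opts, opentracing.FollowsFrom(parent.Context()))
	}
	span := opentracing.StartSpan(operation, opts...)
	return span, opentracing.ContextWithSpan(context.Background(), span)
}
```

The diff below sticks with ordinary child spans started from a rebuilt context instead, which is why the `// TODO this is a 'FollowsFrom'` comments appear in several places.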
@@ -24,7 +24,6 @@ import (
 // TODO async calls need to add route.Headers as well
 // TODO need to shut off reads/writes in dispatch to the pipes when call times out so that
 // 2 calls don't have the same container's pipes...
-// TODO add spans back around container launching for hot (follows from?) + other more granular spans
 // TODO handle timeouts / no response in sync & async (sync is json+503 atm, not 504, async is empty log+status)
 // see also: server/runner.go wrapping the response writer there, but need to handle async too (push down?)
 // TODO storing logs / call can push call over the timeout
@@ -251,7 +250,7 @@ func (a *agent) handleStatsEnd(ctx context.Context, call *call, err error) {
 	}
 }
 
-func statSpans(ctx context.Context, call *call) (ctxr context.Context, finish func()) {
+func statSpans(ctx context.Context, call *call) (_ context.Context, finish func()) {
 	// agent_submit_global has no parent span because we don't want it to inherit fn_appname or fn_path
 	spanGlobal := opentracing.StartSpan("agent_submit_global")
 
@@ -305,81 +304,77 @@ func (a *agent) getSlot(ctx context.Context, call *call) (Slot, error) {
 // hotLauncher is spawned in a go routine for each slot queue to monitor stats and launch hot
 // containers if needed. Upon shutdown or activity timeout, hotLauncher exits and during exit,
 // it destroys the slot queue.
-func (a *agent) hotLauncher(ctx context.Context, callObj *call) {
+func (a *agent) hotLauncher(ctx context.Context, call *call) {
 
 	// Let use 60 minutes or 2 * IdleTimeout as hot queue idle timeout, pick
 	// whichever is longer. If in this time, there's no activity, then
 	// we destroy the hot queue.
 	timeout := time.Duration(60) * time.Minute
-	idleTimeout := time.Duration(callObj.IdleTimeout) * time.Second * 2
+	idleTimeout := time.Duration(call.IdleTimeout) * time.Second * 2
 	if timeout < idleTimeout {
 		timeout = idleTimeout
 	}
 
 	logger := common.Logger(ctx)
 	logger.WithField("launcher_timeout", timeout).Info("Hot function launcher starting")
-	isAsync := callObj.Type == models.TypeAsync
+
+	// IMPORTANT: get a context that has a child span / logger but NO timeout
+	// TODO this is a 'FollowsFrom'
+	ctx = opentracing.ContextWithSpan(common.WithLogger(context.Background(), logger), opentracing.SpanFromContext(ctx))
+	span, ctx := opentracing.StartSpanFromContext(ctx, "agent_hot_launcher")
+	defer span.Finish()
 
 	for {
+		ctx, cancel := context.WithTimeout(ctx, timeout)
+		a.checkLaunch(ctx, call)
 
 		select {
 		case <-a.shutdown: // server shutdown
+			cancel()
 			return
-		case <-time.After(timeout):
-			if a.slotMgr.deleteSlotQueue(callObj.slots) {
+		case <-ctx.Done(): // timed out
+			cancel()
+			if a.slotMgr.deleteSlotQueue(call.slots) {
 				logger.Info("Hot function launcher timed out")
 				return
 			}
-		case <-callObj.slots.signaller:
+		case <-call.slots.signaller:
+			cancel()
 		}
+	}
+}
 
-		curStats := callObj.slots.getStats()
-		isNeeded := isNewContainerNeeded(&curStats)
-		logger.WithFields(logrus.Fields{
-			"currentStats": curStats,
-			"isNeeded":     isNeeded,
-		}).Debug("Hot function launcher stats")
-		if !isNeeded {
-			continue
-		}
+func (a *agent) checkLaunch(ctx context.Context, call *call) {
+	curStats := call.slots.getStats()
+	isAsync := call.Type == models.TypeAsync
+	isNeeded := isNewContainerNeeded(&curStats)
+	common.Logger(ctx).WithFields(logrus.Fields{"currentStats": curStats, "isNeeded": isNeeded}).Debug("Hot function launcher stats")
+	if !isNeeded {
+		return
+	}
+	common.Logger(ctx).WithFields(logrus.Fields{"currentStats": curStats, "isNeeded": isNeeded}).Info("Hot function launcher starting hot container")
 
-		ctxResource, cancelResource := context.WithCancel(context.Background())
-		logger.WithFields(logrus.Fields{
-			"currentStats": curStats,
-			"isNeeded":     isNeeded,
-		}).Info("Hot function launcher starting hot container")
-
-		select {
-		case tok, isOpen := <-a.resources.GetResourceToken(ctxResource, callObj.Memory, uint64(callObj.CPUs), isAsync):
-			cancelResource()
-			if isOpen {
-				a.wg.Add(1)
-				go func(ctx context.Context, call *call, tok ResourceToken) {
-					a.runHot(ctx, call, tok)
-					a.wg.Done()
-				}(ctx, callObj, tok)
-			} else {
-				// this means the resource was impossible to reserve (eg. memory size we can never satisfy)
-				callObj.slots.queueSlot(&hotSlot{done: make(chan struct{}), err: models.ErrCallTimeoutServerBusy})
-			}
-		case <-time.After(timeout):
-			cancelResource()
-			if a.slotMgr.deleteSlotQueue(callObj.slots) {
-				logger.Info("Hot function launcher timed out")
-				return
-			}
-		case <-a.shutdown: // server shutdown
-			cancelResource()
-			return
-		}
+	select {
+	case tok := <-a.resources.GetResourceToken(ctx, call.Memory, uint64(call.CPUs), isAsync):
+		a.wg.Add(1) // add waiter in this thread
+		go func() {
+			// NOTE: runHot will not inherit the timeout from ctx (ignore timings)
+			a.runHot(ctx, call, tok)
+			a.wg.Done()
+		}()
+	case <-ctx.Done(): // timeout
+	case <-a.shutdown: // server shutdown
 	}
 }
 
 // waitHot pings and waits for a hot container from the slot queue
 func (a *agent) waitHot(ctx context.Context, call *call) (Slot, error) {
+	span, ctx := opentracing.StartSpanFromContext(ctx, "agent_wait_hot")
+	defer span.Finish()
 
-	ctxDequeuer, cancelDequeuer := context.WithCancel(ctx)
-	defer cancelDequeuer()
-	ch := call.slots.startDequeuer(ctxDequeuer)
+	ctx, cancel := context.WithCancel(ctx)
+	defer cancel() // shut down dequeuer if we grab a slot
+
+	ch := call.slots.startDequeuer(ctx)
 
 	// 1) if we can get a slot immediately, grab it.
 	// 2) if we don't, send a signaller every 200ms until we do.
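The per-iteration `context.WithTimeout` plus the explicit `cancel()` calls in the hunk above are what replace the old `case <-time.After(timeout)`. A standalone sketch of that pattern (hypothetical `work` channel and `loop` function, not code from this patch): a `time.After` timer created on every pass of a long-lived select stays pending until it fires even when another case wins, whereas canceling a timeout context releases its timer immediately.

```go
package example

import (
	"context"
	"time"
)

// loop waits for work with an idle timeout. Building the timeout as a
// cancellable context per iteration means the timer is released as soon as
// cancel runs; a bare time.After(timeout) here would leave a pending timer
// behind on every iteration where the work case wins the select.
func loop(work <-chan struct{}, timeout time.Duration) {
	for {
		ctx, cancel := context.WithTimeout(context.Background(), timeout)
		select {
		case <-work:
			cancel() // release the timer right away
		case <-ctx.Done():
			cancel()
			return // idle for too long, shut down
		}
	}
}
```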
@@ -417,24 +412,19 @@ func (a *agent) waitHot(ctx context.Context, call *call) (Slot, error) {
 // launchCold waits for necessary resources to launch a new container, then
 // returns the slot for that new container to run the request on.
 func (a *agent) launchCold(ctx context.Context, call *call) (Slot, error) {
 
 	isAsync := call.Type == models.TypeAsync
 	ch := make(chan Slot)
-	ctxResource, cancelResource := context.WithCancel(ctx)
-	defer cancelResource()
+	span, ctx := opentracing.StartSpanFromContext(ctx, "agent_launch_cold")
+	defer span.Finish()
 
 	select {
-	case tok, isOpen := <-a.resources.GetResourceToken(ctxResource, call.Memory, uint64(call.CPUs), isAsync):
-		if !isOpen {
-			return nil, models.ErrCallTimeoutServerBusy
-		}
+	case tok := <-a.resources.GetResourceToken(ctx, call.Memory, uint64(call.CPUs), isAsync):
 		go a.prepCold(ctx, call, tok, ch)
 	case <-ctx.Done():
 		return nil, ctx.Err()
 	}
 
-	cancelResource()
-
 	// wait for launch err or a slot to open up
 	select {
 	case s := <-ch:
@@ -518,9 +508,7 @@ func (s *hotSlot) exec(ctx context.Context, call *call) error {
 	common.Logger(ctx).WithField("container_id", s.container.id).Info("starting call")
 
 	start := time.Now()
-	defer func() {
-		call.slots.recordLatency(SlotQueueRunner, uint64(time.Now().Sub(start).Seconds()*1000))
-	}()
+	defer func() { call.slots.recordLatency(SlotQueueRunner, uint64(time.Now().Sub(start).Seconds()*1000)) }()
 
 	// swap in the new stderr logger & stat accumulator
 	oldStderr := s.container.swap(call.stderr, &call.Stats)
@@ -551,6 +539,9 @@ func specialHeader(k string) bool {
 }
 
 func (a *agent) prepCold(ctx context.Context, call *call, tok ResourceToken, ch chan Slot) {
+	span, ctx := opentracing.StartSpanFromContext(ctx, "agent_prep_cold")
+	defer span.Finish()
+
 	// add additional headers to the config to shove everything into env vars for cold
 	for k, v := range call.Headers {
 		if !specialHeader(k) {
@@ -585,14 +576,13 @@ func (a *agent) prepCold(ctx context.Context, call *call, tok ResourceToken, ch
 	}
 }
 
-func (a *agent) runHot(ctxArg context.Context, call *call, tok ResourceToken) {
-	// We must be careful to only use ctxArg for logs/spans
-	// create a span from ctxArg but ignore the new Context
-	// instead we will create a new Context below and explicitly set its span
-	span, _ := opentracing.StartSpanFromContext(ctxArg, "docker_run_hot")
+func (a *agent) runHot(ctx context.Context, call *call, tok ResourceToken) {
+	// IMPORTANT: get a context that has a child span / logger but NO timeout
+	// TODO this is a 'FollowsFrom'
+	ctx = opentracing.ContextWithSpan(context.Background(), opentracing.SpanFromContext(ctx))
+	span, ctx := opentracing.StartSpanFromContext(ctx, "agent_run_hot")
 	defer span.Finish()
-	defer tok.Close()
+	defer tok.Close() // IMPORTANT: this MUST get called
 
 	// TODO we have to make sure we flush these pipes or we will deadlock
 	stdinRead, stdinWrite := io.Pipe()
@@ -600,17 +590,6 @@ func (a *agent) runHot(ctxArg context.Context, call *call, tok ResourceToken) {
 
 	proto := protocol.New(protocol.Protocol(call.Format), stdinWrite, stdoutRead)
 
-	// we don't want to timeout in here. this is inside of a goroutine and the
-	// caller can timeout this Call appropriately. e.g. w/ hot if it takes 20
-	// minutes to pull, then timing out calls for 20 minutes and eventually
-	// having the image is ideal vs. never getting the image pulled.
-	// TODO this ctx needs to inherit logger, etc
-	ctx, shutdownContainer := context.WithCancel(context.Background())
-	defer shutdownContainer() // close this if our waiter returns
-
-	// add the span we created above to the new Context
-	ctx = opentracing.ContextWithSpan(ctx, span)
-
 	start := time.Now()
 	call.slots.enterState(SlotQueueStarter)
 
@@ -659,13 +638,16 @@ func (a *agent) runHot(ctxArg context.Context, call *call, tok ResourceToken) {
 	// buffered, in case someone has slot when waiter returns but isn't yet listening
 	errC := make(chan error, 1)
 
+	ctx, shutdownContainer := context.WithCancel(ctx)
+	defer shutdownContainer() // close this if our waiter returns, to call off slots
 	go func() {
+		defer shutdownContainer() // also close if we get an agent shutdown / idle timeout
+
 		for {
 			select { // make sure everything is up before trying to send slot
 			case <-ctx.Done(): // container shutdown
 				return
 			case <-a.shutdown: // server shutdown
-				shutdownContainer()
 				return
 			default: // ok
 			}
@@ -682,7 +664,6 @@ func (a *agent) runHot(ctxArg context.Context, call *call, tok ResourceToken) {
 				if call.slots.ejectSlot(s) {
 					call.slots.exitStateWithLatency(SlotQueueIdle, uint64(time.Now().Sub(start).Seconds()*1000))
 					logger.Info("Canceling inactive hot function")
-					shutdownContainer()
 					return
 				}
 			case <-ctx.Done(): // container shutdown
@@ -693,7 +674,6 @@ func (a *agent) runHot(ctxArg context.Context, call *call, tok ResourceToken) {
 			case <-a.shutdown: // server shutdown
 				if call.slots.ejectSlot(s) {
 					call.slots.exitStateWithLatency(SlotQueueIdle, uint64(time.Now().Sub(start).Seconds()*1000))
-					shutdownContainer()
 					return
 				}
 			}
@@ -6,6 +6,7 @@ import (
 	"encoding/json"
 	"fmt"
 	"io"
+	"math"
 	"net/http"
 	"net/http/httptest"
 	"strconv"
@@ -495,3 +496,27 @@ func TestHTTPWithoutContentLengthWorks(t *testing.T) {
 	no Content-Length is set. also make sure the body makes it (and the image hasn't changed)); GLHF, got:`, resp.R.Body)
 	}
 }
+
+func TestGetCallReturnsResourceImpossibility(t *testing.T) {
+	call := &models.Call{
+		AppName:     "yo",
+		Path:        "/yoyo",
+		Image:       "fnproject/fn-test-utils",
+		Type:        "sync",
+		Format:      "http",
+		Timeout:     1,
+		IdleTimeout: 2,
+		Memory:      math.MaxUint64,
+	}
+
+	// FromModel doesn't need a datastore, for now...
+	ds := datastore.NewMockInit(nil, nil, nil)
+
+	a := New(NewCachedDataAccess(NewDirectDataAccess(ds, ds, new(mqs.Mock))))
+	defer a.Close()
+
+	_, err := a.GetCall(FromModel(call))
+	if err != models.ErrCallTimeoutServerBusy {
+		t.Fatal("did not get expected err, got: ", err)
+	}
+}
@@ -5,6 +5,7 @@ import (
 	"time"
 
 	"github.com/fnproject/fn/api/models"
+	opentracing "github.com/opentracing/opentracing-go"
 	"github.com/sirupsen/logrus"
 )
 
@@ -15,14 +16,15 @@ func (a *agent) asyncDequeue() {
 	ctx, cancel := context.WithCancel(context.Background())
 	defer cancel()
 
+	// parent span here so that we can see how many async calls are running
+	span, ctx := opentracing.StartSpanFromContext(ctx, "agent_async_dequeue")
+	defer span.Finish()
+
 	for {
-		ctxResource, cancelResource := context.WithCancel(context.Background())
 		select {
 		case <-a.shutdown:
-			cancelResource()
 			return
-		case <-a.resources.WaitAsyncResource(ctxResource):
-			cancelResource()
+		case <-a.resources.WaitAsyncResource(ctx):
 			// TODO we _could_ return a token here to reserve the ram so that there's
 			// not a race between here and Submit but we're single threaded
 			// dequeueing and retries handled gracefully inside of Submit if we run
@@ -37,7 +39,7 @@ func (a *agent) asyncDequeue() {
 			if ok {
 				a.wg.Add(1) // need to add 1 in this thread to ensure safe shutdown
 				go func(model *models.Call) {
-					a.asyncRun(model)
+					a.asyncRun(ctx, model)
 					a.wg.Done() // can shed it after this is done, Submit will add 1 too but it's fine
 				}(model)
 			}
@@ -68,9 +70,19 @@ func (a *agent) asyncChew(ctx context.Context) <-chan *models.Call {
 	return ch
 }
 
-func (a *agent) asyncRun(model *models.Call) {
-	// TODO output / logger should be here too...
-	call, err := a.GetCall(FromModel(model))
+func (a *agent) asyncRun(ctx context.Context, model *models.Call) {
+	// IMPORTANT: get a context that has a child span but NO timeout (Submit imposes timeout)
+	// TODO this is a 'FollowsFrom'
+	ctx = opentracing.ContextWithSpan(context.Background(), opentracing.SpanFromContext(ctx))
+
+	// additional enclosing context here since this isn't spawned from an http request
+	span, ctx := opentracing.StartSpanFromContext(ctx, "agent_async_run")
+	defer span.Finish()
+
+	call, err := a.GetCall(
+		FromModel(model),
+		WithContext(ctx), // NOTE: order is important
+	)
 	if err != nil {
 		logrus.WithError(err).Error("error getting async call")
 		return
@@ -180,6 +180,13 @@ func WithWriter(w io.Writer) CallOpt {
 	}
 }
 
+func WithContext(ctx context.Context) CallOpt {
+	return func(a *agent, c *call) error {
+		c.req = c.req.WithContext(ctx)
+		return nil
+	}
+}
+
 // GetCall builds a Call that can be used to submit jobs to the agent.
 //
 // TODO where to put this? async and sync both call this
@@ -198,6 +205,11 @@ func (a *agent) GetCall(opts ...CallOpt) (Call, error) {
 		return nil, errors.New("no model or request provided for call")
 	}
 
+	if !a.resources.IsResourcePossible(c.Memory, uint64(c.CPUs), c.Type == models.TypeAsync) {
+		// if we're not going to be able to run this call on this machine, bail here.
+		return nil, models.ErrCallTimeoutServerBusy
+	}
+
 	c.da = a.da
 	c.ct = a
 
@@ -13,6 +13,7 @@ import (
 	"strings"
 	"sync"
 
+	opentracing "github.com/opentracing/opentracing-go"
 	"github.com/sirupsen/logrus"
 )
 
@@ -24,9 +25,21 @@ const (
 // A simple resource (memory, cpu, disk, etc.) tracker for scheduling.
 // TODO: add cpu, disk, network IO for future
 type ResourceTracker interface {
+	// WaitAsyncResource returns a channel that will send once when there seem to be sufficient
+	// resource levels to run an async task, it is up to the implementer to create policy here.
 	WaitAsyncResource(ctx context.Context) chan struct{}
-	// returns a closed channel if the resource can never me met.
-	GetResourceToken(ctx context.Context, memory uint64, cpuQuota uint64, isAsync bool) <-chan ResourceToken
+
+	// GetResourceToken returns a channel to wait for a resource token on. If the provided context is canceled,
+	// the channel will never receive anything. If it is not possible to fulfill this resource, the channel
+	// will never receive anything (use IsResourcePossible). If a resource token is available for the provided
+	// resource parameters, it will otherwise be sent once on the returned channel. The channel is never closed.
+	// Memory is expected to be provided in MB units.
+	GetResourceToken(ctx context.Context, memory, cpuQuota uint64, isAsync bool) <-chan ResourceToken
+
+	// IsResourcePossible returns whether it's possible to fulfill the requested resources on this
+	// machine. It must be called before GetResourceToken or GetResourceToken may hang.
+	// Memory is expected to be provided in MB units.
+	IsResourcePossible(memory, cpuQuota uint64, isAsync bool) bool
 }
 
 type resourceTracker struct {
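Read together with the GetCall change further up, the contract documented above seems to be: callers screen requests with IsResourcePossible (GetCall now does this at admission time) and only then wait on the token channel, selecting on their own context for cancellation. A sketch of a caller honoring that contract, with a hypothetical helper name; ResourceTracker, ResourceToken and models.ErrCallTimeoutServerBusy are the types shown in this diff and the snippet assumes it lives in the same package:

```go
// acquireToken is a hypothetical caller-side helper, not part of this patch.
func acquireToken(ctx context.Context, rt ResourceTracker, memMB, cpuQuota uint64, isAsync bool) (ResourceToken, error) {
	// Screen first: for an impossible request the channel below would simply
	// never receive anything.
	if !rt.IsResourcePossible(memMB, cpuQuota, isAsync) {
		return nil, models.ErrCallTimeoutServerBusy
	}
	select {
	case tok := <-rt.GetResourceToken(ctx, memMB, cpuQuota, isAsync):
		return tok, nil // receiver should not Close; the launch path does
	case <-ctx.Done():
		return nil, ctx.Err()
	}
}
```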
@@ -100,7 +113,9 @@ func (a *resourceTracker) isResourceAvailableLocked(memory uint64, cpuQuota uint
 }
 
 // is this request possible to meet? If no, fail quick
-func (a *resourceTracker) isResourcePossible(memory uint64, cpuQuota uint64, isAsync bool) bool {
+func (a *resourceTracker) IsResourcePossible(memory uint64, cpuQuota uint64, isAsync bool) bool {
+	memory = memory * Mem1MB
+
 	if isAsync {
 		return memory <= a.ramAsyncTotal && cpuQuota <= a.cpuAsyncTotal
 	} else {
@@ -111,17 +126,20 @@ func (a *resourceTracker) isResourcePossible(memory uint64, cpuQuota uint64, isA
 // the received token should be passed directly to launch (unconditionally), launch
 // will close this token (i.e. the receiver should not call Close)
 func (a *resourceTracker) GetResourceToken(ctx context.Context, memory uint64, cpuQuota uint64, isAsync bool) <-chan ResourceToken {
-	memory = memory * Mem1MB
+	ch := make(chan ResourceToken)
+	if !a.IsResourcePossible(memory, cpuQuota, isAsync) {
+		// return the channel, but never send anything.
+		return ch
+	}
+
 	c := a.cond
 	isWaiting := false
-	ch := make(chan ResourceToken)
 
-	if !a.isResourcePossible(memory, cpuQuota, isAsync) {
-		close(ch)
-		return ch
-	}
+	memory = memory * Mem1MB
+
+	// if we find a resource token, shut down the thread waiting on ctx finish.
+	// alternatively, if the ctx is done, wake up the cond loop.
+	ctx, cancel := context.WithCancel(ctx)
 
 	go func() {
 		<-ctx.Done()
@@ -132,7 +150,10 @@ func (a *resourceTracker) GetResourceToken(ctx context.Context, memory uint64, c
 		c.L.Unlock()
 	}()
 
+	span, ctx := opentracing.StartSpanFromContext(ctx, "agent_get_resource_token")
 	go func() {
+		defer span.Finish()
+		defer cancel()
 		c.L.Lock()
 
 		isWaiting = true
@@ -202,6 +223,10 @@ func (a *resourceTracker) WaitAsyncResource(ctx context.Context) chan struct{} {
 	isWaiting := false
 	c := a.cond
 
+	// if we find a resource token, shut down the thread waiting on ctx finish.
+	// alternatively, if the ctx is done, wake up the cond loop.
+	ctx, cancel := context.WithCancel(ctx)
+
 	go func() {
 		<-ctx.Done()
 		c.L.Lock()
@@ -211,7 +236,10 @@ func (a *resourceTracker) WaitAsyncResource(ctx context.Context) chan struct{} {
 		c.L.Unlock()
 	}()
 
+	span, ctx := opentracing.StartSpanFromContext(ctx, "agent_wait_async_resource")
 	go func() {
+		defer span.Finish()
+		defer cancel()
 		c.L.Lock()
 		isWaiting = true
 		for (a.ramAsyncUsed >= a.ramAsyncHWMark || a.cpuAsyncUsed >= a.cpuAsyncHWMark) && ctx.Err() == nil {
@@ -238,8 +238,9 @@ func TestResourceGetCombo(t *testing.T) {
 	// impossible request
 	ctx, cancel := context.WithCancel(context.Background())
 	ch := trI.GetResourceToken(ctx, 20*1024, 20000, false)
-	if !isClosed(ch) {
-		t.Fatalf("impossible request should return closed channel")
+	_, err := fetchToken(ch)
+	if err == nil {
+		t.Fatalf("impossible request should never return (error here)")
 	}
 
 	cancel()
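The `fetchToken` helper used above is not shown in this diff. Given how the test calls it, a plausible (purely hypothetical) shape is a bounded wait that returns an error when no token arrives, assuming the test file's usual `time` and `errors` imports:

```go
// hypothetical sketch only; the real test helper may differ
func fetchToken(ch <-chan ResourceToken) (ResourceToken, error) {
	select {
	case tok := <-ch:
		return tok, nil
	case <-time.After(500 * time.Millisecond):
		return nil, errors.New("timed out waiting for resource token")
	}
}
```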