Mirror of https://github.com/fnproject/fn.git (synced 2022-10-28 21:29:17 +03:00)
* fn: sync.WaitGroup replacement with common.WaitGroup

  agent/lb_agent/pure_runner have been using sync.WaitGroup with incorrect
  semantics. Switch these components to the new common.WaitGroup, which
  provides some handy functionality for common graceful-shutdown cases.
  From https://golang.org/pkg/sync/#WaitGroup:

  "Note that calls with a positive delta that occur when the counter is zero
  must happen before a Wait. Calls with a negative delta, or calls with a
  positive delta that start when the counter is greater than zero, may happen
  at any time. Typically this means the calls to Add should execute before the
  statement creating the goroutine or other event to be waited for. If a
  WaitGroup is reused to wait for several independent sets of events, new Add
  calls must happen after all previous Wait calls have returned."

  HandleCallEnd introduces some complexity to the shutdowns; this is currently
  handled by calling AddSession(2) initially and letting HandleCallEnd()
  decrement by -1 in addition to the -1 decrement in Submit(). The lb_agent
  shutdown sequence, and particularly the timeouts with the runner pool, needs
  another look/revision, but that is outside the scope of this commit.

* fn: lb-agent wg share

* fn: no need to +2 in Submit with defer. Removed the defer since
  handleCallEnd already has this responsibility.
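For context, here is a minimal sketch of the session/closer shutdown pattern the commit describes. It uses only AddSession and Closer, the two common.WaitGroup methods visible in the file that follows; NewWaitGroup and CloseGroup are assumed entry points from the rest of the fn common package, and the worker loop itself is hypothetical.

package main

import (
	"fmt"
	"time"

	"github.com/fnproject/fn/api/common"
)

// worker is a hypothetical loop that holds one session for its lifetime.
// Closer() yields a channel that is closed once shutdown begins.
func worker(wg *common.WaitGroup, id int) {
	defer wg.AddSession(-1) // release the session reserved by the spawner
	for {
		select {
		case <-wg.Closer():
			return
		case <-time.After(100 * time.Millisecond):
			fmt.Println("worker", id, "tick")
		}
	}
}

func main() {
	wg := common.NewWaitGroup() // assumed constructor
	for i := 0; i < 3; i++ {
		// Reserve a session before spawning the goroutine, per the
		// sync.WaitGroup rule quoted above. AddSession returns false once
		// shutdown has begun, so no new work can start after close.
		if !wg.AddSession(1) {
			return
		}
		go worker(wg, i)
	}
	time.Sleep(300 * time.Millisecond)
	wg.CloseGroup() // assumed: signals Closer() and waits for sessions to drain
}

The AddSession(2) arrangement around HandleCallEnd is the same idea with split ownership: when two independent completion paths (Submit and HandleCallEnd) will each release one session, reserve both up front.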
129 lines · 3.4 KiB · Go
package agent

import (
	"context"
	"time"

	"github.com/fnproject/fn/api/common"
	"github.com/fnproject/fn/api/models"
	"github.com/sirupsen/logrus"
	"go.opencensus.io/tag"
	"go.opencensus.io/trace"
)

func (a *agent) asyncDequeue() {
	// this is just so we can hang up the dequeue request if we get shut down
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// parent span here so that we can see how many async calls are running
	ctx, span := trace.StartSpan(ctx, "agent_async_dequeue")
	defer span.End()

	for {
		select {
		case <-a.shutWg.Closer(): // shutting down: release our session and stop dequeueing
			a.shutWg.AddSession(-1)
			return
		case <-a.resources.WaitAsyncResource(ctx):
			// TODO we _could_ return a token here to reserve the RAM so that
			// there's no race between here and Submit, but we're single-threaded
			// dequeueing, and retries are handled gracefully inside of Submit if
			// we run out of RAM, so...
		}

		// we think we can get a cookie now, so go get a cookie
		select {
		case <-a.shutWg.Closer():
			a.shutWg.AddSession(-1)
			return
		case model, ok := <-a.asyncChew(ctx):
			if ok {
				go func(model *models.Call) {
					a.asyncRun(ctx, model)
					a.shutWg.AddSession(-1)
				}(model)

				// WARNING: tricky. We reserve another session for the next iteration of the loop
				if !a.shutWg.AddSession(1) {
					return
				}
			}
		}
	}
}

func (a *agent) asyncChew(ctx context.Context) <-chan *models.Call {
	ch := make(chan *models.Call, 1)

	go func() {
		ctx, cancel := context.WithTimeout(ctx, a.cfg.AsyncChewPoll)
		defer cancel()

		call, err := a.da.Dequeue(ctx)
		if call != nil {
			ch <- call
		} else { // call is nil / error
			if err != nil && err != context.DeadlineExceeded {
				logrus.WithError(err).Error("error fetching queued calls")
			}
			// queue may be empty / unavailable
			time.Sleep(1 * time.Second) // back off a little before sending the no-cookie message
			close(ch)
		}
	}()

	return ch
}

func (a *agent) asyncRun(ctx context.Context, model *models.Call) {
	// IMPORTANT: get a context that has a child span but NO timeout (Submit imposes the timeout)
	// TODO this is a 'FollowsFrom'
	ctx = common.BackgroundContext(ctx)

	// since async doesn't come in through the normal request path,
	// we've gotta add tags here for stats to come out properly.
	appKey, err := tag.NewKey("fn_appname")
	if err != nil {
		logrus.Fatal(err)
	}
	pathKey, err := tag.NewKey("fn_path")
	if err != nil {
		logrus.Fatal(err)
	}
	ctx, err = tag.New(ctx,
		tag.Insert(appKey, model.AppID),
		tag.Insert(pathKey, model.Path),
	)
	if err != nil {
		logrus.Fatal(err)
	}

	// additional enclosing context here since this isn't spawned from an http request
	ctx, span := trace.StartSpan(ctx, "agent_async_run")
	defer span.End()

	call, err := a.GetCall(
		FromModel(model),
		WithContext(ctx), // NOTE: order is important
	)
	if err != nil {
		logrus.WithError(err).Error("error getting async call")
		return
	}

	// TODO if the task is cold and doesn't require reading STDIN, it could
	// run, but we may not listen for output since the task timed out. these
	// are at-least-once semantics, which is really preferable to at-most-once,
	// so let's do it for now

	err = a.Submit(call)
	if err != nil {
		// NOTE: these could be errors / timeouts from the call itself that
		// we're logging here (i.e. not our fault), but it's likely better to
		// log these than to suppress them, so...
		id := call.Model().ID
		logrus.WithFields(logrus.Fields{"id": id}).WithError(err).Error("error running async call")
	}
}