Files
fn-serverless/api/agent/async.go
Tolga Ceylan e53d23afc9 fn: sync.WaitGroup replacement common.WaitGroup (#937)
* fn: sync.WaitGroup replacement common.WaitGroup

agent/lb_agent/pure_runner has been incorrectly using
sync.WaitGroup semantics. Switching these components to
use the new common.WaitGroup() that provides a few handy
functionality for common graceful shutdown cases.

From https://golang.org/pkg/sync/#WaitGroup,
    "Note that calls with a positive delta that occur when the counter
     is zero must happen before a Wait. Calls with a negative delta,
     or calls with a positive delta that start when the counter is
     greater than zero, may happen at any time. Typically this means
     the calls to Add should execute before the statement creating
     the goroutine or other event to be waited for. If a WaitGroup
     is reused to wait for several independent sets of events,
     new Add calls must happen after all previous Wait calls have
     returned."

HandleCallEnd introduces some complexity to the shutdowns, but this
is currently handled by AddSession(2) initially and letting the
HandleCallEnd() when to decrement by -1 in addition to decrement -1 in
Submit().

lb_agent shutdown sequence and particularly timeouts with runner pool
needs another look/revision, but this is outside of the scope of this
commit.

* fn: lb-agent wg share

* fn: no need to +2 in Submit with defer.

Removed defer since handleCallEnd already has
this responsibility.
2018-04-12 11:33:01 -07:00

129 lines
3.4 KiB
Go

package agent
import (
"context"
"time"
"github.com/fnproject/fn/api/common"
"github.com/fnproject/fn/api/models"
"github.com/sirupsen/logrus"
"go.opencensus.io/tag"
"go.opencensus.io/trace"
)
func (a *agent) asyncDequeue() {
// this is just so we can hang up the dequeue request if we get shut down
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
// parent span here so that we can see how many async calls are running
ctx, span := trace.StartSpan(ctx, "agent_async_dequeue")
defer span.End()
for {
select {
case <-a.shutWg.Closer():
a.shutWg.AddSession(-1)
return
case <-a.resources.WaitAsyncResource(ctx):
// TODO we _could_ return a token here to reserve the ram so that there's
// not a race between here and Submit but we're single threaded
// dequeueing and retries handled gracefully inside of Submit if we run
// out of RAM so..
}
// we think we can get a cookie now, so go get a cookie
select {
case <-a.shutWg.Closer():
a.shutWg.AddSession(-1)
return
case model, ok := <-a.asyncChew(ctx):
if ok {
go func(model *models.Call) {
a.asyncRun(ctx, model)
a.shutWg.AddSession(-1)
}(model)
// WARNING: tricky. We reserve another session for next iteration of the loop
if !a.shutWg.AddSession(1) {
return
}
}
}
}
}
func (a *agent) asyncChew(ctx context.Context) <-chan *models.Call {
ch := make(chan *models.Call, 1)
go func() {
ctx, cancel := context.WithTimeout(ctx, a.cfg.AsyncChewPoll)
defer cancel()
call, err := a.da.Dequeue(ctx)
if call != nil {
ch <- call
} else { // call is nil / error
if err != nil && err != context.DeadlineExceeded {
logrus.WithError(err).Error("error fetching queued calls")
}
// queue may be empty / unavailable
time.Sleep(1 * time.Second) // backoff a little before sending no cookie message
close(ch)
}
}()
return ch
}
func (a *agent) asyncRun(ctx context.Context, model *models.Call) {
// IMPORTANT: get a context that has a child span but NO timeout (Submit imposes timeout)
// TODO this is a 'FollowsFrom'
ctx = common.BackgroundContext(ctx)
// since async doesn't come in through the normal request path,
// we've gotta add tags here for stats to come out properly.
appKey, err := tag.NewKey("fn_appname")
if err != nil {
logrus.Fatal(err)
}
pathKey, err := tag.NewKey("fn_path")
if err != nil {
logrus.Fatal(err)
}
ctx, err = tag.New(ctx,
tag.Insert(appKey, model.AppID),
tag.Insert(pathKey, model.Path),
)
if err != nil {
logrus.Fatal(err)
}
// additional enclosing context here since this isn't spawned from an http request
ctx, span := trace.StartSpan(ctx, "agent_async_run")
defer span.End()
call, err := a.GetCall(
FromModel(model),
WithContext(ctx), // NOTE: order is important
)
if err != nil {
logrus.WithError(err).Error("error getting async call")
return
}
// TODO if the task is cold and doesn't require reading STDIN, it could
// run but we may not listen for output since the task timed out. these
// are at least once semantics, which is really preferable to at most
// once, so let's do it for now
err = a.Submit(call)
if err != nil {
// NOTE: these could be errors / timeouts from the call that we're
// logging here (i.e. not our fault), but it's likely better to log
// these than suppress them so...
id := call.Model().ID
logrus.WithFields(logrus.Fields{"id": id}).WithError(err).Error("error running async call")
}
}