Mirror of https://github.com/fnproject/fn.git, synced 2022-10-28 21:29:17 +03:00
fn: sync.WaitGroup replacement common.WaitGroup (#937)

* fn: sync.WaitGroup replacement common.WaitGroup

  agent/lb_agent/pure_runner have been using sync.WaitGroup with incorrect
  semantics. This switches those components to the new common.WaitGroup,
  which provides some handy functionality for common graceful-shutdown
  cases. From https://golang.org/pkg/sync/#WaitGroup:

  "Note that calls with a positive delta that occur when the counter is
  zero must happen before a Wait. Calls with a negative delta, or calls
  with a positive delta that start when the counter is greater than zero,
  may happen at any time. Typically this means the calls to Add should
  execute before the statement creating the goroutine or other event to
  be waited for. If a WaitGroup is reused to wait for several independent
  sets of events, new Add calls must happen after all previous Wait calls
  have returned."

  HandleCallEnd introduces some complexity to the shutdowns; this is
  currently handled by calling AddSession(2) initially and letting
  HandleCallEnd() decide when to decrement by -1, in addition to the -1
  decrement in Submit(). The lb_agent shutdown sequence, and particularly
  its timeouts against the runner pool, needs another look/revision, but
  that is outside the scope of this commit.

* fn: lb-agent wg share

* fn: no need to +2 in Submit with defer. Removed the defer, since
  handleCallEnd already has this responsibility.
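The session semantics that replace sync.WaitGroup are easier to see in code. The sketch below is a hypothetical, minimal implementation assuming only the API visible in this diff (NewWaitGroup, AddSession returning false once shutdown has begun, Closer, CloseGroup); the real common.WaitGroup in fnproject/fn may differ internally.

// Hypothetical minimal sketch of a session-counting wait group with the
// semantics described above; not the actual fnproject/fn implementation.
package main

import (
	"fmt"
	"sync"
)

type WaitGroup struct {
	mu       sync.Mutex
	cond     *sync.Cond
	sessions int           // number of in-flight sessions
	closer   chan struct{} // closed once shutdown begins
	closed   bool
}

func NewWaitGroup() *WaitGroup {
	wg := &WaitGroup{closer: make(chan struct{})}
	wg.cond = sync.NewCond(&wg.mu)
	return wg
}

// AddSession adjusts the session count. Unlike sync.WaitGroup.Add, a
// positive delta is refused (returns false) once shutdown has started,
// which removes the Add-vs-Wait ordering constraint quoted above.
func (wg *WaitGroup) AddSession(delta int) bool {
	wg.mu.Lock()
	defer wg.mu.Unlock()
	if wg.closed && delta > 0 {
		return false
	}
	wg.sessions += delta
	if wg.sessions <= 0 {
		wg.cond.Broadcast()
	}
	return true
}

// Closer returns a channel that is closed when shutdown begins; callers
// select on it the way the agent used to select on its shutdown channel.
func (wg *WaitGroup) Closer() <-chan struct{} {
	return wg.closer
}

// CloseGroup begins shutdown (rejecting new sessions) and blocks until
// all outstanding sessions have decremented back to zero.
func (wg *WaitGroup) CloseGroup() {
	wg.mu.Lock()
	defer wg.mu.Unlock()
	if !wg.closed {
		wg.closed = true
		close(wg.closer)
	}
	for wg.sessions > 0 {
		wg.cond.Wait()
	}
}

func main() {
	wg := NewWaitGroup()
	if wg.AddSession(1) { // mirrors Submit(): refuse work once closing
		go func() {
			defer wg.AddSession(-1) // mirrors handleCallEnd
			fmt.Println("handling a call")
		}()
	}
	wg.CloseGroup() // mirrors Close(): drain sessions, then reject new ones
	fmt.Println("new session accepted after close:", wg.AddSession(1)) // false
}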
@@ -112,21 +112,22 @@ type agent struct {
 	resources ResourceTracker

 	// used to track running calls / safe shutdown
-	wg           sync.WaitGroup // TODO rename
+	shutWg       *common.WaitGroup
 	shutonce     sync.Once
 	shutdown     chan struct{}

 	callEndCount int64
 }

 // New creates an Agent that executes functions locally as Docker containers.
 func New(da DataAccess) Agent {
-	a := createAgent(da, true).(*agent)
-	a.wg.Add(1)
+	a := createAgent(da, true, nil).(*agent)
+	if !a.shutWg.AddSession(1) {
+		logrus.Fatalf("cannot start agent, unable to add session")
+	}
 	go a.asyncDequeue() // safe shutdown can nanny this fine
 	return a
 }

-func createAgent(da DataAccess, withDocker bool) Agent {
+func createAgent(da DataAccess, withDocker bool, withShutWg *common.WaitGroup) Agent {
 	cfg, err := NewAgentConfig()
 	if err != nil {
 		logrus.WithError(err).Fatalf("error in agent config cfg=%+v", cfg)
@@ -147,6 +148,9 @@ func createAgent(da DataAccess, withDocker bool) Agent {
 	} else {
 		driver = mock.New()
 	}
+	if withShutWg == nil {
+		withShutWg = common.NewWaitGroup()
+	}

 	a := &agent{
 		cfg: *cfg,
@@ -154,7 +158,7 @@ func createAgent(da DataAccess, withDocker bool) Agent {
 		driver:    driver,
 		slotMgr:   NewSlotQueueMgr(),
 		resources: NewResourceTracker(cfg),
 		shutdown:  make(chan struct{}),
+		shutWg:    withShutWg,
 	}

 	// TODO assert that agent doesn't get started for API nodes up above ?
@@ -176,25 +180,23 @@ func (a *agent) Enqueue(ctx context.Context, call *models.Call) error {
 func (a *agent) Close() error {
 	var err error

+	// wait for ongoing sessions
+	a.shutWg.CloseGroup()
+
 	a.shutonce.Do(func() {
 		// now close docker layer
 		if a.driver != nil {
 			err = a.driver.Close()
 		}
 		close(a.shutdown)
 	})

-	a.wg.Wait()
 	return err
 }

 func (a *agent) Submit(callI Call) error {
-	a.wg.Add(1)
-	defer a.wg.Done()
-
-	select {
-	case <-a.shutdown:
+	if !a.shutWg.AddSession(1) {
 		return models.ErrCallTimeoutServerBusy
-	default:
 	}

 	call := callI.(*call)
@@ -254,15 +256,24 @@ func (a *agent) submit(ctx context.Context, call *call) error {
 }

 func (a *agent) scheduleCallEnd(fn func()) {
-	a.wg.Add(1)
-	atomic.AddInt64(&a.callEndCount, 1)
 	go func() {
 		fn()
-		atomic.AddInt64(&a.callEndCount, -1)
-		a.wg.Done()
+		a.shutWg.AddSession(-1)
 	}()
 }

+func (a *agent) finalizeCallEnd(ctx context.Context, err error, isRetriable, isScheduled bool) error {
+	// if scheduled in background, let scheduleCallEnd() handle
+	// the shutWg group, otherwise decrement here.
+	if !isScheduled {
+		a.shutWg.AddSession(-1)
+	}
+	handleStatsEnd(ctx, err)
+	return transformTimeout(err, isRetriable)
+}
+
 func (a *agent) handleCallEnd(ctx context.Context, call *call, slot Slot, err error, isCommitted bool) error {

 	// For hot-containers, slot close is a simple channel close... No need
@@ -284,9 +295,7 @@ func (a *agent) handleCallEnd(ctx context.Context, call *call, slot Slot, err er
 		call.End(ctx, err)
 		cancel()
 	})
-
-	handleStatsEnd(ctx, err)
-	return transformTimeout(err, false)
+	return a.finalizeCallEnd(ctx, err, false, true)
 }

 // The call did not succeed. And it is retriable. We close the slot
@@ -296,10 +305,10 @@ func (a *agent) handleCallEnd(ctx context.Context, call *call, slot Slot, err er
 		a.scheduleCallEnd(func() {
 			slot.Close(common.BackgroundContext(ctx)) // (no timeout)
 		})
+		return a.finalizeCallEnd(ctx, err, true, true)
 	}

-	handleStatsDequeue(ctx, err)
-	return transformTimeout(err, true)
+	return a.finalizeCallEnd(ctx, err, true, false)
 }

 func transformTimeout(e error, isRetriable bool) error {
@@ -400,7 +409,7 @@ func (a *agent) hotLauncher(ctx context.Context, call *call) {
 	a.checkLaunch(ctx, call)

 	select {
-	case <-a.shutdown: // server shutdown
+	case <-a.shutWg.Closer(): // server shutdown
 		cancel()
 		return
 	case <-ctx.Done(): // timed out
@@ -431,17 +440,22 @@ func (a *agent) checkLaunch(ctx context.Context, call *call) {
 	select {
 	case tok := <-a.resources.GetResourceToken(ctx, call.Memory, uint64(call.CPUs), isAsync):
-		a.wg.Add(1) // add waiter in this thread
-		go func() {
-			// NOTE: runHot will not inherit the timeout from ctx (ignore timings)
-			a.runHot(ctx, call, tok, state)
-			a.wg.Done()
-		}()
+		if a.shutWg.AddSession(1) {
+			go func() {
+				// NOTE: runHot will not inherit the timeout from ctx (ignore timings)
+				a.runHot(ctx, call, tok, state)
+				a.shutWg.AddSession(-1)
+			}()
+			return
+		}
+		if tok != nil {
+			tok.Close()
+		}
 	case <-ctx.Done(): // timeout
-		state.UpdateState(ctx, ContainerStateDone, call.slots)
-	case <-a.shutdown: // server shutdown
-		state.UpdateState(ctx, ContainerStateDone, call.slots)
+	case <-a.shutWg.Closer(): // server shutdown
 	}

+	state.UpdateState(ctx, ContainerStateDone, call.slots)
 }

 // waitHot pings and waits for a hot container from the slot queue
@@ -471,7 +485,7 @@ func (a *agent) waitHot(ctx context.Context, call *call) (Slot, error) {
 		// we failed to take ownership of the token (eg. container idle timeout) => try again
 	case <-ctx.Done():
 		return nil, ctx.Err()
-	case <-a.shutdown: // server shutdown
+	case <-a.shutWg.Closer(): // server shutdown
 		return nil, models.ErrCallTimeoutServerBusy
 	case <-time.After(sleep):
 		// ping dequeuer again
@@ -735,7 +749,7 @@ func (a *agent) runHot(ctx context.Context, call *call, tok ResourceToken, state
 	select { // make sure everything is up before trying to send slot
 	case <-ctx.Done(): // container shutdown
 		return
-	case <-a.shutdown: // server shutdown
+	case <-a.shutWg.Closer(): // server shutdown
 		return
 	default: // ok
 	}
@@ -808,7 +822,7 @@ func (a *agent) runHotReq(ctx context.Context, call *call, state ContainerState,
 	select {
 	case <-s.trigger: // slot already consumed
 	case <-ctx.Done(): // container shutdown
-	case <-a.shutdown: // server shutdown
+	case <-a.shutWg.Closer(): // server shutdown
 	case <-idleTimer.C:
 	case <-freezeTimer.C:
 		if !isFrozen {