fn: sync.WaitGroup replacement common.WaitGroup (#937)

* fn: sync.WaitGroup replacement common.WaitGroup

agent/lb_agent/pure_runner have been using sync.WaitGroup with
incorrect semantics. This switches these components to the new
common.WaitGroup, which provides some handy functionality for
common graceful shutdown cases.
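
Roughly, the pattern these components follow now is (a minimal
sketch; the method names come from api/common as used in this diff,
and their semantics are inferred from that usage):

    wg := common.NewWaitGroup()

    // Request path: AddSession returns false once shutdown has begun,
    // so new work is rejected instead of racing the final wait.
    if !wg.AddSession(1) {
        return models.ErrCallTimeoutServerBusy
    }
    defer wg.AddSession(-1)

    // Long-running goroutines watch Closer() instead of a shutdown channel.
    select {
    case <-wg.Closer():
        return
    default:
    }

    // Shutdown path: CloseGroup() rejects further sessions and blocks
    // until all existing sessions are released; CloseGroupNB() does the
    // same but hands back a channel to wait on.
    wg.CloseGroup()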

From https://golang.org/pkg/sync/#WaitGroup,
    "Note that calls with a positive delta that occur when the counter
     is zero must happen before a Wait. Calls with a negative delta,
     or calls with a positive delta that start when the counter is
     greater than zero, may happen at any time. Typically this means
     the calls to Add should execute before the statement creating
     the goroutine or other event to be waited for. If a WaitGroup
     is reused to wait for several independent sets of events,
     new Add calls must happen after all previous Wait calls have
     returned."

handleCallEnd() introduces some complexity to the shutdowns, but this
is currently handled by calling AddSession(2) initially and letting
handleCallEnd() decide when to decrement by 1, in addition to the
decrement by 1 in Submit().

The lb_agent shutdown sequence, and particularly the timeouts with the
runner pool, need another look/revision, but this is outside the scope
of this commit.

* fn: lb-agent wg share

* fn: no need to +2 in Submit with defer.

Removed defer since handleCallEnd already has
this responsibility.
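
With this last change, the session taken in Submit() is released
exactly once on the call-end path. Simplified from the diff below:

    // Submit acquires the session; handleCallEnd() owns the release.
    if !a.shutWg.AddSession(1) {
        return models.ErrCallTimeoutServerBusy
    }

    // finalizeCallEnd releases it directly for the synchronous case...
    if !isScheduled {
        a.shutWg.AddSession(-1)
    }

    // ...or scheduleCallEnd hands the same session to a background
    // goroutine, which releases it once the call-end cleanup finishes.
    go func() {
        fn()
        a.shutWg.AddSession(-1)
    }()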
Author: Tolga Ceylan
Date: 2018-04-12 11:33:01 -07:00
Committed by: GitHub
Parent: f350b2ca48
Commit: e53d23afc9
7 changed files with 298 additions and 99 deletions

View File

@@ -112,21 +112,22 @@ type agent struct {
resources ResourceTracker
// used to track running calls / safe shutdown
wg sync.WaitGroup // TODO rename
shutWg *common.WaitGroup
shutonce sync.Once
shutdown chan struct{}
callEndCount int64
}
// New creates an Agent that executes functions locally as Docker containers.
func New(da DataAccess) Agent {
a := createAgent(da, true).(*agent)
a.wg.Add(1)
a := createAgent(da, true, nil).(*agent)
if !a.shutWg.AddSession(1) {
logrus.Fatalf("cannot start agent, unable to add session")
}
go a.asyncDequeue() // safe shutdown can nanny this fine
return a
}
func createAgent(da DataAccess, withDocker bool) Agent {
func createAgent(da DataAccess, withDocker bool, withShutWg *common.WaitGroup) Agent {
cfg, err := NewAgentConfig()
if err != nil {
logrus.WithError(err).Fatalf("error in agent config cfg=%+v", cfg)
@@ -147,6 +148,9 @@ func createAgent(da DataAccess, withDocker bool) Agent {
} else {
driver = mock.New()
}
if withShutWg == nil {
withShutWg = common.NewWaitGroup()
}
a := &agent{
cfg: *cfg,
@@ -154,7 +158,7 @@ func createAgent(da DataAccess, withDocker bool) Agent {
driver: driver,
slotMgr: NewSlotQueueMgr(),
resources: NewResourceTracker(cfg),
shutdown: make(chan struct{}),
shutWg: withShutWg,
}
// TODO assert that agent doesn't get started for API nodes up above ?
@@ -176,25 +180,23 @@ func (a *agent) Enqueue(ctx context.Context, call *models.Call) error {
func (a *agent) Close() error {
var err error
// wait for ongoing sessions
a.shutWg.CloseGroup()
a.shutonce.Do(func() {
// now close docker layer
if a.driver != nil {
err = a.driver.Close()
}
close(a.shutdown)
})
a.wg.Wait()
return err
}
func (a *agent) Submit(callI Call) error {
a.wg.Add(1)
defer a.wg.Done()
select {
case <-a.shutdown:
if !a.shutWg.AddSession(1) {
return models.ErrCallTimeoutServerBusy
default:
}
call := callI.(*call)
@@ -254,15 +256,24 @@ func (a *agent) submit(ctx context.Context, call *call) error {
}
func (a *agent) scheduleCallEnd(fn func()) {
a.wg.Add(1)
atomic.AddInt64(&a.callEndCount, 1)
go func() {
fn()
atomic.AddInt64(&a.callEndCount, -1)
a.wg.Done()
a.shutWg.AddSession(-1)
}()
}
func (a *agent) finalizeCallEnd(ctx context.Context, err error, isRetriable, isScheduled bool) error {
// if scheduled in background, let scheduleCallEnd() handle
// the shutWg group, otherwise decrement here.
if !isScheduled {
a.shutWg.AddSession(-1)
}
handleStatsEnd(ctx, err)
return transformTimeout(err, isRetriable)
}
func (a *agent) handleCallEnd(ctx context.Context, call *call, slot Slot, err error, isCommitted bool) error {
// For hot-containers, slot close is a simple channel close... No need
@@ -284,9 +295,7 @@ func (a *agent) handleCallEnd(ctx context.Context, call *call, slot Slot, err er
call.End(ctx, err)
cancel()
})
handleStatsEnd(ctx, err)
return transformTimeout(err, false)
return a.finalizeCallEnd(ctx, err, false, true)
}
// The call did not succeed. And it is retriable. We close the slot
@@ -296,10 +305,10 @@ func (a *agent) handleCallEnd(ctx context.Context, call *call, slot Slot, err er
a.scheduleCallEnd(func() {
slot.Close(common.BackgroundContext(ctx)) // (no timeout)
})
return a.finalizeCallEnd(ctx, err, true, true)
}
handleStatsDequeue(ctx, err)
return transformTimeout(err, true)
return a.finalizeCallEnd(ctx, err, true, false)
}
func transformTimeout(e error, isRetriable bool) error {
@@ -400,7 +409,7 @@ func (a *agent) hotLauncher(ctx context.Context, call *call) {
a.checkLaunch(ctx, call)
select {
case <-a.shutdown: // server shutdown
case <-a.shutWg.Closer(): // server shutdown
cancel()
return
case <-ctx.Done(): // timed out
@@ -431,17 +440,22 @@ func (a *agent) checkLaunch(ctx context.Context, call *call) {
select {
case tok := <-a.resources.GetResourceToken(ctx, call.Memory, uint64(call.CPUs), isAsync):
a.wg.Add(1) // add waiter in this thread
go func() {
// NOTE: runHot will not inherit the timeout from ctx (ignore timings)
a.runHot(ctx, call, tok, state)
a.wg.Done()
}()
if a.shutWg.AddSession(1) {
go func() {
// NOTE: runHot will not inherit the timeout from ctx (ignore timings)
a.runHot(ctx, call, tok, state)
a.shutWg.AddSession(-1)
}()
return
}
if tok != nil {
tok.Close()
}
case <-ctx.Done(): // timeout
state.UpdateState(ctx, ContainerStateDone, call.slots)
case <-a.shutdown: // server shutdown
state.UpdateState(ctx, ContainerStateDone, call.slots)
case <-a.shutWg.Closer(): // server shutdown
}
state.UpdateState(ctx, ContainerStateDone, call.slots)
}
// waitHot pings and waits for a hot container from the slot queue
@@ -471,7 +485,7 @@ func (a *agent) waitHot(ctx context.Context, call *call) (Slot, error) {
// we failed to take ownership of the token (eg. container idle timeout) => try again
case <-ctx.Done():
return nil, ctx.Err()
case <-a.shutdown: // server shutdown
case <-a.shutWg.Closer(): // server shutdown
return nil, models.ErrCallTimeoutServerBusy
case <-time.After(sleep):
// ping dequeuer again
@@ -735,7 +749,7 @@ func (a *agent) runHot(ctx context.Context, call *call, tok ResourceToken, state
select { // make sure everything is up before trying to send slot
case <-ctx.Done(): // container shutdown
return
case <-a.shutdown: // server shutdown
case <-a.shutWg.Closer(): // server shutdown
return
default: // ok
}
@@ -808,7 +822,7 @@ func (a *agent) runHotReq(ctx context.Context, call *call, state ContainerState,
select {
case <-s.trigger: // slot already consumed
case <-ctx.Done(): // container shutdown
case <-a.shutdown: // server shutdown
case <-a.shutWg.Closer(): // server shutdown
case <-idleTimer.C:
case <-freezeTimer.C:
if !isFrozen {

View File

@@ -12,8 +12,6 @@ import (
)
func (a *agent) asyncDequeue() {
defer a.wg.Done() // we can treat this thread like one big task and get safe shutdown fo free
// this is just so we can hang up the dequeue request if we get shut down
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
@@ -24,7 +22,8 @@ func (a *agent) asyncDequeue() {
for {
select {
case <-a.shutdown:
case <-a.shutWg.Closer():
a.shutWg.AddSession(-1)
return
case <-a.resources.WaitAsyncResource(ctx):
// TODO we _could_ return a token here to reserve the ram so that there's
@@ -35,15 +34,20 @@ func (a *agent) asyncDequeue() {
// we think we can get a cookie now, so go get a cookie
select {
case <-a.shutdown:
case <-a.shutWg.Closer():
a.shutWg.AddSession(-1)
return
case model, ok := <-a.asyncChew(ctx):
if ok {
a.wg.Add(1) // need to add 1 in this thread to ensure safe shutdown
go func(model *models.Call) {
a.asyncRun(ctx, model)
a.wg.Done() // can shed it after this is done, Submit will add 1 too but it's fine
a.shutWg.AddSession(-1)
}(model)
// WARNING: tricky. We reserve another session for next iteration of the loop
if !a.shutWg.AddSession(1) {
return
}
}
}
}

View File

@@ -2,12 +2,12 @@ package agent
import (
"context"
"sync"
"time"
"github.com/sirupsen/logrus"
"go.opencensus.io/trace"
"github.com/fnproject/fn/api/common"
"github.com/fnproject/fn/api/models"
pool "github.com/fnproject/fn/api/runnerpool"
"github.com/fnproject/fn/fnext"
@@ -28,19 +28,19 @@ type lbAgent struct {
delegatedAgent Agent
rp pool.RunnerPool
placer pool.Placer
wg sync.WaitGroup // Needs a good name
shutdown chan struct{}
shutWg *common.WaitGroup
}
// NewLBAgent creates an Agent that knows how to load-balance function calls
// across a group of runner nodes.
func NewLBAgent(da DataAccess, rp pool.RunnerPool, p pool.Placer) (Agent, error) {
agent := createAgent(da, false)
wg := common.NewWaitGroup()
agent := createAgent(da, false, wg)
a := &lbAgent{
delegatedAgent: agent,
rp: rp,
placer: p,
shutWg: wg,
}
return a, nil
}
@@ -63,18 +63,31 @@ func (a *lbAgent) GetCall(opts ...CallOpt) (Call, error) {
}
func (a *lbAgent) Close() error {
// we should really be passing the server's context here
// start closing the front gate first
ch := a.shutWg.CloseGroupNB()
// delegated agent shutdown next, blocks here...
err1 := a.delegatedAgent.Close()
if err1 != nil {
logrus.WithError(err1).Warn("Delegated agent shutdown error")
}
// finally shutdown the runner pool
ctx, cancel := context.WithTimeout(context.Background(), runnerPoolShutdownTimeout)
defer cancel()
close(a.shutdown)
a.rp.Shutdown(ctx)
err := a.delegatedAgent.Close()
a.wg.Wait()
if err != nil {
return err
err2 := a.rp.Shutdown(ctx)
if err2 != nil {
logrus.WithError(err2).Warn("Runner pool shutdown error")
}
return nil
// gate-on front-gate, should be completed if delegated agent & runner pool is gone.
<-ch
if err1 != nil {
return err1
}
return err2
}
func GetGroupID(call *models.Call) string {
@@ -90,13 +103,8 @@ func GetGroupID(call *models.Call) string {
}
func (a *lbAgent) Submit(callI Call) error {
a.wg.Add(1)
defer a.wg.Done()
select {
case <-a.shutdown:
if !a.shutWg.AddSession(1) {
return models.ErrCallTimeoutServerBusy
default:
}
call := callI.(*call)

View File

@@ -671,7 +671,7 @@ func DefaultPureRunner(cancel context.CancelFunc, addr string, da DataAccess, ce
}
func NewPureRunner(cancel context.CancelFunc, addr string, da DataAccess, cert string, key string, ca string, gate CapacityGate) (Agent, error) {
a := createAgent(da, true)
a := createAgent(da, true, nil)
var pr *pureRunner
var err error
if cert != "" && key != "" && ca != "" {

View File

@@ -3,22 +3,26 @@ package agent
import (
"context"
"encoding/json"
"errors"
"io"
"sync"
"time"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials"
pb "github.com/fnproject/fn/api/agent/grpc"
"github.com/fnproject/fn/api/common"
pool "github.com/fnproject/fn/api/runnerpool"
"github.com/fnproject/fn/grpcutil"
"github.com/sirupsen/logrus"
)
var (
ErrorRunnerClosed = errors.New("Runner is closed")
)
type gRPCRunner struct {
// Need a WaitGroup of TryExec in flight
wg sync.WaitGroup
shutWg *common.WaitGroup
address string
conn *grpc.ClientConn
client pb.RunnerProtocolClient
@@ -31,6 +35,7 @@ func SecureGRPCRunnerFactory(addr, runnerCertCN string, pki *pool.PKIData) (pool
}
return &gRPCRunner{
shutWg: common.NewWaitGroup(),
address: addr,
conn: conn,
client: client,
@@ -43,7 +48,7 @@ func (r *gRPCRunner) Close(ctx context.Context) error {
err := make(chan error, 1)
go func() {
defer close(err)
r.wg.Wait()
r.shutWg.CloseGroup()
err <- r.conn.Close()
}()
@@ -86,8 +91,10 @@ func (r *gRPCRunner) Address() string {
func (r *gRPCRunner) TryExec(ctx context.Context, call pool.RunnerCall) (bool, error) {
logrus.WithField("runner_addr", r.address).Debug("Attempting to place call")
r.wg.Add(1)
defer r.wg.Done()
if !r.shutWg.AddSession(1) {
return true, ErrorRunnerClosed
}
defer r.shutWg.AddSession(-1)
// extract the call's model data to pass on to the pure runner
modelJSON, err := json.Marshal(call.Model())