Mirror of https://github.com/fnproject/fn.git, synced 2022-10-28 21:29:17 +03:00
fn: replace sync.WaitGroup with common.WaitGroup (#937)
* fn: replace sync.WaitGroup with common.WaitGroup

  agent/lb_agent/pure_runner have been using sync.WaitGroup with incorrect semantics. Switch these components to the new common.WaitGroup, which provides some handy functionality for common graceful-shutdown cases. From https://golang.org/pkg/sync/#WaitGroup:

  "Note that calls with a positive delta that occur when the counter is zero must happen before a Wait. Calls with a negative delta, or calls with a positive delta that start when the counter is greater than zero, may happen at any time. Typically this means the calls to Add should execute before the statement creating the goroutine or other event to be waited for. If a WaitGroup is reused to wait for several independent sets of events, new Add calls must happen after all previous Wait calls have returned."

  HandleCallEnd introduces some complexity to the shutdowns; this is currently handled by calling AddSession(2) initially and letting HandleCallEnd() decrement by -1 in addition to the -1 decrement in Submit(). The lb_agent shutdown sequence, and in particular the timeouts against the runner pool, need another look/revision, but that is outside the scope of this commit.

* fn: lb-agent wg share

* fn: no need to +2 in Submit with defer; removed the defer since handleCallEnd already has this responsibility.
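To make the new semantics concrete, here is a minimal, hypothetical sketch of the pattern this commit adopts. It is not code from the repository; it only relies on the common.WaitGroup API visible in the diff below (NewWaitGroup, AddSession, Closer, CloseGroup/CloseGroupNB) and the import path github.com/fnproject/fn/api/common. Unlike sync.WaitGroup, AddSession may be called at any time relative to CloseGroup; a false return is the signal to reject new work.

```go
package main

import (
	"fmt"
	"time"

	"github.com/fnproject/fn/api/common"
)

// submit mimics the shape of the agent's Submit(): refuse new work once a
// shutdown has started, and release the session when the call finishes.
func submit(wg *common.WaitGroup, id int) error {
	if !wg.AddSession(1) {
		// group is closing: reject, much like Submit returning ErrCallTimeoutServerBusy
		return fmt.Errorf("call %d rejected: shutting down", id)
	}
	defer wg.AddSession(-1)

	// long-running work can also watch wg.Closer() to bail out early
	select {
	case <-time.After(50 * time.Millisecond):
	case <-wg.Closer():
	}
	return nil
}

func main() {
	wg := common.NewWaitGroup()

	for i := 0; i < 3; i++ {
		go submit(wg, i)
	}

	// CloseGroupNB flips the group into its closing state immediately and
	// returns a channel that is closed once every session has been released;
	// CloseGroup() is the blocking variant used by agent.Close().
	done := wg.CloseGroupNB()
	<-done
	fmt.Println("all sessions drained")
}
```

agent.Submit() and gRPCRunner.TryExec() in the diff follow this same shape, except that the agent hands the AddSession(-1) over to handleCallEnd/finalizeCallEnd instead of using a defer.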
@@ -112,21 +112,22 @@ type agent struct {
 	resources ResourceTracker
 
 	// used to track running calls / safe shutdown
-	wg       sync.WaitGroup // TODO rename
+	shutWg   *common.WaitGroup
 	shutonce sync.Once
-	shutdown chan struct{}
 	callEndCount int64
 }
 
 // New creates an Agent that executes functions locally as Docker containers.
 func New(da DataAccess) Agent {
-	a := createAgent(da, true).(*agent)
-	a.wg.Add(1)
+	a := createAgent(da, true, nil).(*agent)
+	if !a.shutWg.AddSession(1) {
+		logrus.Fatalf("cannot start agent, unable to add session")
+	}
 	go a.asyncDequeue() // safe shutdown can nanny this fine
 	return a
 }
 
-func createAgent(da DataAccess, withDocker bool) Agent {
+func createAgent(da DataAccess, withDocker bool, withShutWg *common.WaitGroup) Agent {
 	cfg, err := NewAgentConfig()
 	if err != nil {
 		logrus.WithError(err).Fatalf("error in agent config cfg=%+v", cfg)
@@ -147,6 +148,9 @@ func createAgent(da DataAccess, withDocker bool) Agent {
 	} else {
 		driver = mock.New()
 	}
+	if withShutWg == nil {
+		withShutWg = common.NewWaitGroup()
+	}
 
 	a := &agent{
 		cfg:       *cfg,
@@ -154,7 +158,7 @@ func createAgent(da DataAccess, withDocker bool) Agent {
 		driver:    driver,
 		slotMgr:   NewSlotQueueMgr(),
 		resources: NewResourceTracker(cfg),
-		shutdown:  make(chan struct{}),
+		shutWg:    withShutWg,
 	}
 
 	// TODO assert that agent doesn't get started for API nodes up above ?
@@ -176,25 +180,23 @@ func (a *agent) Enqueue(ctx context.Context, call *models.Call) error {
 
 func (a *agent) Close() error {
 	var err error
 
+	// wait for ongoing sessions
+	a.shutWg.CloseGroup()
+
 	a.shutonce.Do(func() {
+		// now close docker layer
 		if a.driver != nil {
 			err = a.driver.Close()
 		}
-		close(a.shutdown)
 	})
-	a.wg.Wait()
+
 	return err
 }
 
 func (a *agent) Submit(callI Call) error {
-	a.wg.Add(1)
-	defer a.wg.Done()
-
-	select {
-	case <-a.shutdown:
+	if !a.shutWg.AddSession(1) {
 		return models.ErrCallTimeoutServerBusy
-	default:
 	}
 
 	call := callI.(*call)
@@ -254,15 +256,24 @@ func (a *agent) submit(ctx context.Context, call *call) error {
 }
 
 func (a *agent) scheduleCallEnd(fn func()) {
-	a.wg.Add(1)
 	atomic.AddInt64(&a.callEndCount, 1)
 	go func() {
 		fn()
 		atomic.AddInt64(&a.callEndCount, -1)
-		a.wg.Done()
+		a.shutWg.AddSession(-1)
 	}()
 }
 
+func (a *agent) finalizeCallEnd(ctx context.Context, err error, isRetriable, isScheduled bool) error {
+	// if scheduled in background, let scheduleCallEnd() handle
+	// the shutWg group, otherwise decrement here.
+	if !isScheduled {
+		a.shutWg.AddSession(-1)
+	}
+	handleStatsEnd(ctx, err)
+	return transformTimeout(err, isRetriable)
+}
+
 func (a *agent) handleCallEnd(ctx context.Context, call *call, slot Slot, err error, isCommitted bool) error {
 
 	// For hot-containers, slot close is a simple channel close... No need
@@ -284,9 +295,7 @@ func (a *agent) handleCallEnd(ctx context.Context, call *call, slot Slot, err er
 			call.End(ctx, err)
 			cancel()
 		})
-
-		handleStatsEnd(ctx, err)
-		return transformTimeout(err, false)
+		return a.finalizeCallEnd(ctx, err, false, true)
 	}
 
 	// The call did not succeed. And it is retriable. We close the slot
@@ -296,10 +305,10 @@ func (a *agent) handleCallEnd(ctx context.Context, call *call, slot Slot, err er
 		a.scheduleCallEnd(func() {
 			slot.Close(common.BackgroundContext(ctx)) // (no timeout)
 		})
+		return a.finalizeCallEnd(ctx, err, true, true)
 	}
 
-	handleStatsDequeue(ctx, err)
-	return transformTimeout(err, true)
+	return a.finalizeCallEnd(ctx, err, true, false)
 }
 
 func transformTimeout(e error, isRetriable bool) error {
@@ -400,7 +409,7 @@ func (a *agent) hotLauncher(ctx context.Context, call *call) {
 		a.checkLaunch(ctx, call)
 
 		select {
-		case <-a.shutdown: // server shutdown
+		case <-a.shutWg.Closer(): // server shutdown
 			cancel()
 			return
 		case <-ctx.Done(): // timed out
@@ -431,17 +440,22 @@ func (a *agent) checkLaunch(ctx context.Context, call *call) {
 
 	select {
 	case tok := <-a.resources.GetResourceToken(ctx, call.Memory, uint64(call.CPUs), isAsync):
-		a.wg.Add(1) // add waiter in this thread
+		if a.shutWg.AddSession(1) {
 			go func() {
 				// NOTE: runHot will not inherit the timeout from ctx (ignore timings)
 				a.runHot(ctx, call, tok, state)
-				a.wg.Done()
+				a.shutWg.AddSession(-1)
 			}()
+			return
+		}
+		if tok != nil {
+			tok.Close()
+		}
 	case <-ctx.Done(): // timeout
-		state.UpdateState(ctx, ContainerStateDone, call.slots)
-	case <-a.shutdown: // server shutdown
-		state.UpdateState(ctx, ContainerStateDone, call.slots)
+	case <-a.shutWg.Closer(): // server shutdown
 	}
+
+	state.UpdateState(ctx, ContainerStateDone, call.slots)
 }
 
 // waitHot pings and waits for a hot container from the slot queue
@@ -471,7 +485,7 @@ func (a *agent) waitHot(ctx context.Context, call *call) (Slot, error) {
 		// we failed to take ownership of the token (eg. container idle timeout) => try again
 		case <-ctx.Done():
 			return nil, ctx.Err()
-		case <-a.shutdown: // server shutdown
+		case <-a.shutWg.Closer(): // server shutdown
 			return nil, models.ErrCallTimeoutServerBusy
 		case <-time.After(sleep):
 			// ping dequeuer again
@@ -735,7 +749,7 @@ func (a *agent) runHot(ctx context.Context, call *call, tok ResourceToken, state
 	select { // make sure everything is up before trying to send slot
 	case <-ctx.Done(): // container shutdown
 		return
-	case <-a.shutdown: // server shutdown
+	case <-a.shutWg.Closer(): // server shutdown
 		return
 	default: // ok
 	}
@@ -808,7 +822,7 @@ func (a *agent) runHotReq(ctx context.Context, call *call, state ContainerState,
 	select {
 	case <-s.trigger: // slot already consumed
 	case <-ctx.Done(): // container shutdown
-	case <-a.shutdown: // server shutdown
+	case <-a.shutWg.Closer(): // server shutdown
 	case <-idleTimer.C:
 	case <-freezeTimer.C:
 		if !isFrozen {

@@ -12,8 +12,6 @@ import (
 )
 
 func (a *agent) asyncDequeue() {
-	defer a.wg.Done() // we can treat this thread like one big task and get safe shutdown fo free
-
 	// this is just so we can hang up the dequeue request if we get shut down
 	ctx, cancel := context.WithCancel(context.Background())
 	defer cancel()
@@ -24,7 +22,8 @@ func (a *agent) asyncDequeue() {
 
 	for {
 		select {
-		case <-a.shutdown:
+		case <-a.shutWg.Closer():
+			a.shutWg.AddSession(-1)
 			return
 		case <-a.resources.WaitAsyncResource(ctx):
 			// TODO we _could_ return a token here to reserve the ram so that there's
@@ -35,15 +34,20 @@ func (a *agent) asyncDequeue() {
 
 			// we think we can get a cookie now, so go get a cookie
 			select {
-			case <-a.shutdown:
+			case <-a.shutWg.Closer():
+				a.shutWg.AddSession(-1)
 				return
 			case model, ok := <-a.asyncChew(ctx):
 				if ok {
-					a.wg.Add(1) // need to add 1 in this thread to ensure safe shutdown
 					go func(model *models.Call) {
 						a.asyncRun(ctx, model)
-						a.wg.Done() // can shed it after this is done, Submit will add 1 too but it's fine
+						a.shutWg.AddSession(-1)
 					}(model)
+
+					// WARNING: tricky. We reserve another session for next iteration of the loop
+					if !a.shutWg.AddSession(1) {
+						return
+					}
 				}
 			}
 		}

@@ -2,12 +2,12 @@ package agent
 
 import (
 	"context"
-	"sync"
 	"time"
 
 	"github.com/sirupsen/logrus"
 	"go.opencensus.io/trace"
 
+	"github.com/fnproject/fn/api/common"
 	"github.com/fnproject/fn/api/models"
 	pool "github.com/fnproject/fn/api/runnerpool"
 	"github.com/fnproject/fn/fnext"
@@ -28,19 +28,19 @@ type lbAgent struct {
 	delegatedAgent Agent
 	rp             pool.RunnerPool
 	placer         pool.Placer
-	wg             sync.WaitGroup // Needs a good name
-	shutdown       chan struct{}
+	shutWg         *common.WaitGroup
 }
 
 // NewLBAgent creates an Agent that knows how to load-balance function calls
 // across a group of runner nodes.
 func NewLBAgent(da DataAccess, rp pool.RunnerPool, p pool.Placer) (Agent, error) {
-	agent := createAgent(da, false)
+	wg := common.NewWaitGroup()
+	agent := createAgent(da, false, wg)
 	a := &lbAgent{
 		delegatedAgent: agent,
 		rp:             rp,
 		placer:         p,
+		shutWg:         wg,
 	}
 	return a, nil
 }
@@ -63,18 +63,31 @@ func (a *lbAgent) GetCall(opts ...CallOpt) (Call, error) {
 }
 
 func (a *lbAgent) Close() error {
-	// we should really be passing the server's context here
+
+	// start closing the front gate first
+	ch := a.shutWg.CloseGroupNB()
+
+	// delegated agent shutdown next, blocks here...
+	err1 := a.delegatedAgent.Close()
+	if err1 != nil {
+		logrus.WithError(err1).Warn("Delegated agent shutdown error")
+	}
+
+	// finally shutdown the runner pool
 	ctx, cancel := context.WithTimeout(context.Background(), runnerPoolShutdownTimeout)
 	defer cancel()
-
-	close(a.shutdown)
-	a.rp.Shutdown(ctx)
-	err := a.delegatedAgent.Close()
-	a.wg.Wait()
-	if err != nil {
-		return err
+	err2 := a.rp.Shutdown(ctx)
+	if err2 != nil {
+		logrus.WithError(err2).Warn("Runner pool shutdown error")
 	}
-	return nil
+
+	// gate-on front-gate, should be completed if delegated agent & runner pool is gone.
+	<-ch
+
+	if err1 != nil {
+		return err1
+	}
+	return err2
 }
 
 func GetGroupID(call *models.Call) string {
@@ -90,13 +103,8 @@ func GetGroupID(call *models.Call) string {
 }
 
 func (a *lbAgent) Submit(callI Call) error {
-	a.wg.Add(1)
-	defer a.wg.Done()
-
-	select {
-	case <-a.shutdown:
+	if !a.shutWg.AddSession(1) {
 		return models.ErrCallTimeoutServerBusy
-	default:
 	}
 
 	call := callI.(*call)

@@ -671,7 +671,7 @@ func DefaultPureRunner(cancel context.CancelFunc, addr string, da DataAccess, ce
 }
 
 func NewPureRunner(cancel context.CancelFunc, addr string, da DataAccess, cert string, key string, ca string, gate CapacityGate) (Agent, error) {
-	a := createAgent(da, true)
+	a := createAgent(da, true, nil)
 	var pr *pureRunner
 	var err error
 	if cert != "" && key != "" && ca != "" {

@@ -3,22 +3,26 @@ package agent
 import (
 	"context"
 	"encoding/json"
+	"errors"
 	"io"
-	"sync"
 	"time"
 
 	"google.golang.org/grpc"
 	"google.golang.org/grpc/credentials"
 
 	pb "github.com/fnproject/fn/api/agent/grpc"
+	"github.com/fnproject/fn/api/common"
 	pool "github.com/fnproject/fn/api/runnerpool"
 	"github.com/fnproject/fn/grpcutil"
 	"github.com/sirupsen/logrus"
 )
 
+var (
+	ErrorRunnerClosed = errors.New("Runner is closed")
+)
+
 type gRPCRunner struct {
-	// Need a WaitGroup of TryExec in flight
-	wg      sync.WaitGroup
+	shutWg  *common.WaitGroup
 	address string
 	conn    *grpc.ClientConn
 	client  pb.RunnerProtocolClient
@@ -31,6 +35,7 @@ func SecureGRPCRunnerFactory(addr, runnerCertCN string, pki *pool.PKIData) (pool
 	}
 
 	return &gRPCRunner{
+		shutWg:  common.NewWaitGroup(),
 		address: addr,
 		conn:    conn,
 		client:  client,
@@ -43,7 +48,7 @@ func (r *gRPCRunner) Close(ctx context.Context) error {
 	err := make(chan error, 1)
 	go func() {
 		defer close(err)
-		r.wg.Wait()
+		r.shutWg.CloseGroup()
 		err <- r.conn.Close()
 	}()
 
@@ -86,8 +91,10 @@ func (r *gRPCRunner) Address() string {
 
 func (r *gRPCRunner) TryExec(ctx context.Context, call pool.RunnerCall) (bool, error) {
 	logrus.WithField("runner_addr", r.address).Debug("Attempting to place call")
-	r.wg.Add(1)
-	defer r.wg.Done()
+	if !r.shutWg.AddSession(1) {
+		return true, ErrorRunnerClosed
+	}
+	defer r.shutWg.AddSession(-1)
 
 	// extract the call's model data to pass on to the pure runner
 	modelJSON, err := json.Marshal(call.Model())

@@ -1,6 +1,7 @@
 package common
 
 import (
+	"fmt"
 	"math"
 	"sync"
 )
@@ -8,17 +9,20 @@ import (
 /*
 	WaitGroup is used to manage and wait for a collection of
 	sessions. It is similar to sync.WaitGroup, but
-	AddSession/RmSession/WaitClose session is not only thread
+	AddSession/CloseGroup session is not only thread
 	safe but can be executed in any order unlike sync.WaitGroup.
 
 	Once a shutdown is initiated via CloseGroup(), add/rm
 	operations will still function correctly, where
-	AddSession would return false error.
-	In this state, CloseGroup() blocks until sessions get drained
-	via RmSession() calls.
+	AddSession would return false. In this state,
+	CloseGroup() blocks until sessions get drained
+	via remove operations.
 
-	It is an error to call RmSession without a corresponding
-	successful AddSession.
+	It is an error to call AddSession() with invalid values.
+	For example, if current session count is 1, AddSession
+	can only add more or subtract 1 from this. Caller needs
+	to make sure addition/subtraction math is correct when
+	using WaitGroup.
 
 	Example usage:
 
@@ -26,11 +30,11 @@ import (
 
 	for item := range(items) {
 		go func(item string) {
-			if !group.AddSession() {
+			if !group.AddSession(1) {
 				// group may be closing or full
 				return
 			}
-			defer group.RmSession()
+			defer group.AddSession(-1)
 
 			// do stuff
 		}(item)
@@ -42,57 +46,95 @@ import (
 
 type WaitGroup struct {
 	cond     *sync.Cond
+	closer   chan struct{}
 	isClosed bool
 	sessions uint64
 }
 
 func NewWaitGroup() *WaitGroup {
 	return &WaitGroup{
 		cond:   sync.NewCond(new(sync.Mutex)),
+		closer: make(chan struct{}),
 	}
 }
 
-func (r *WaitGroup) AddSession() bool {
+// Closer returns a channel that is closed if
+// WaitGroup is in closing state
+func (r *WaitGroup) Closer() chan struct{} {
+	return r.closer
+}
+
+// AddSession manipulates the session counter by
+// adding or subtracting the delta value. Incrementing
+// the session counter is not possible and will set
+// return value to false if a close was initiated.
+// It's callers responsibility to make sure addition and
+// subtraction math is correct.
+func (r *WaitGroup) AddSession(delta int64) bool {
 	r.cond.L.Lock()
 	defer r.cond.L.Unlock()
 
-	if r.isClosed {
-		return false
-	}
-	if r.sessions == math.MaxUint64 {
-		return false
-	}
-
-	r.sessions++
+	if delta >= 0 {
+		// we cannot add if we are being shutdown
+		if r.isClosed {
+			return false
+		}
+
+		incr := uint64(delta)
+
+		// we have maxed out
+		if r.sessions == math.MaxUint64-incr {
+			return false
+		}
+
+		r.sessions += incr
+	} else {
+		decr := uint64(-delta)
+
+		// illegal operation, it's callers responsibility
+		// to make sure subtraction and addition math is correct.
+		if r.sessions < decr {
+			panic(fmt.Sprintf("common.WaitGroup misuse sum=%d decr=%d isClosed=%v",
+				r.sessions, decr, r.isClosed))
+		}
+
+		r.sessions -= decr
+
+		// subtractions need to notify CloseGroup
+		r.cond.Broadcast()
+	}
 	return true
 }
 
-func (r *WaitGroup) RmSession() {
-	r.cond.L.Lock()
-
-	if r.sessions == 0 {
-		panic("WaitGroup misuse: no sessions to remove")
-	}
-
-	r.sessions--
-	r.cond.Broadcast()
-
-	r.cond.L.Unlock()
-}
-
+// CloseGroup initiates a close and blocks until
+// session counter becomes zero.
 func (r *WaitGroup) CloseGroup() {
 	r.cond.L.Lock()
 
-	r.isClosed = true
-	for r.sessions > 0 {
+	if !r.isClosed {
+		r.isClosed = true
+		close(r.closer)
+	}
+
+	for r.sessions != 0 {
 		r.cond.Wait()
 	}
 
 	r.cond.L.Unlock()
 }
 
+// CloseGroupNB is non-blocking version of CloseGroup
+// which returns a channel that can be waited on.
 func (r *WaitGroup) CloseGroupNB() chan struct{} {
+
+	// set to closing state immediately
+	r.cond.L.Lock()
+	if !r.isClosed {
+		r.isClosed = true
+		close(r.closer)
+	}
+	r.cond.L.Unlock()
+
 	closer := make(chan struct{})
 
 	go func() {

api/common/wait_utils_test.go (new file, 124 lines)
@@ -0,0 +1,124 @@
+package common
+
+import (
+	"testing"
+)
+
+func isClosed(ch chan struct{}) bool {
+	select {
+	case <-ch:
+		return true
+	default:
+	}
+	return false
+}
+
+func TestWaitGroupEmpty(t *testing.T) {
+
+	wg := NewWaitGroup()
+
+	if !wg.AddSession(0) {
+		t.Fatalf("Add 0 should not fail")
+	}
+
+	if isClosed(wg.Closer()) {
+		t.Fatalf("Should not be closed yet")
+	}
+
+	done := wg.CloseGroupNB()
+
+	// gate-on close
+	wg.CloseGroup()
+
+	if !isClosed(wg.Closer()) {
+		t.Fatalf("Should be closing state")
+	}
+
+	if isClosed(done) {
+		t.Fatalf("NB Chan I should be closed")
+	}
+
+	done = wg.CloseGroupNB()
+	if isClosed(done) {
+		t.Fatalf("NB Chan II should be closed")
+	}
+}
+
+func TestWaitGroupSingle(t *testing.T) {
+
+	wg := NewWaitGroup()
+
+	if isClosed(wg.Closer()) {
+		t.Fatalf("Should not be closing state yet")
+	}
+
+	if !wg.AddSession(1) {
+		t.Fatalf("Add 1 should not fail")
+	}
+
+	if isClosed(wg.Closer()) {
+		t.Fatalf("Should not be closing state yet")
+	}
+
+	if !wg.AddSession(-1) {
+		t.Fatalf("Add -1 should not fail")
+	}
+
+	// sum should be zero now.
+
+	if !wg.AddSession(2) {
+		t.Fatalf("Add 2 should not fail")
+	}
+
+	// sum is 2 now
+	// initiate shutdown
+	done := wg.CloseGroupNB()
+
+	if isClosed(done) {
+		t.Fatalf("NB Chan should not be closed yet, since sum is 2")
+	}
+
+	if !wg.AddSession(-1) {
+		t.Fatalf("Add -1 should not fail")
+	}
+	if wg.AddSession(1) {
+		t.Fatalf("Add 1 should fail (we are shutting down)")
+	}
+	if !isClosed(wg.Closer()) {
+		t.Fatalf("Should be closing state")
+	}
+
+	// sum is 1 now
+
+	if isClosed(done) {
+		t.Fatalf("NB Chan should not be closed yet, since sum is 1")
+	}
+
+	if wg.AddSession(0) {
+		t.Fatalf("Add 0 should fail (considered positive number and we are closing)")
+	}
+
+	if wg.AddSession(100) {
+		t.Fatalf("Add 100 should fail (we are shutting down)")
+	}
+
+	if !isClosed(wg.Closer()) {
+		t.Fatalf("Should be closing state")
+	}
+
+	if !wg.AddSession(-1) {
+		t.Fatalf("Add -1 should not fail")
+	}
+
+	// sum is 0 now
+	<-done
+
+	if !isClosed(done) {
+		t.Fatalf("NB Chan should be closed, since sum is 0")
+	}
+
+	if !isClosed(wg.Closer()) {
+		t.Fatalf("Should be closing state")
+	}
+
+}