fn: adding hot launcher eviction waiting (#1257)

If checkLaunch triggers evictions, it must wait
for these evictions to complete before returning.
Returning prematurely from checkLaunch causes
checkLaunch to be called again by the hot launcher.
That repeated call then receives an out of
capacity error, which results in a 503.

The evictor is also improved with this PR and it
provides a slice of channels to wait on if evictions
are taking place.

Eviction token deletion is performed *after*
the resource token is closed to ensure that once an
eviction is done, the resource token is also free.
This commit is contained in:
Tolga Ceylan
2018-10-01 16:16:29 -07:00
committed by GitHub
parent 5a407dc3bd
commit f132bba3fb
3 changed files with 146 additions and 93 deletions

View File

@@ -451,6 +451,8 @@ func (a *agent) checkLaunch(ctx context.Context, call *call, notifyChan chan err
mem := call.Memory + uint64(call.TmpFsSize)
var notifyChans []chan struct{}
// WARNING: Tricky flow below. We are here because: isNewContainerNeeded is true,
// in other words, we need to launch a new container at this time due to high load.
//
@@ -471,9 +473,13 @@ func (a *agent) checkLaunch(ctx context.Context, call *call, notifyChan chan err
select {
case tok := <-a.resources.GetResourceToken(ctx, mem, call.CPUs, isNB):
if tok != nil && tok.Error() != nil {
// before returning error response, as a last resort, try evicting idle containers.
if tok.Error() != CapacityFull || !a.evictor.PerformEviction(call.slotHashId, mem, uint64(call.CPUs)) {
if tok.Error() != CapacityFull {
tryNotify(notifyChan, tok.Error())
} else {
notifyChans = a.evictor.PerformEviction(call.slotHashId, mem, uint64(call.CPUs))
if len(notifyChans) == 0 {
tryNotify(notifyChan, tok.Error())
}
}
} else if a.shutWg.AddSession(1) {
go func() {
@@ -492,12 +498,25 @@ func (a *agent) checkLaunch(ctx context.Context, call *call, notifyChan chan err
// same timer to assume that we waited for cpu/mem long enough. Let's try to evict an
// idle container.
case <-time.After(a.cfg.HotPoll):
a.evictor.PerformEviction(call.slotHashId, mem, uint64(call.CPUs))
notifyChans = a.evictor.PerformEviction(call.slotHashId, mem, uint64(call.CPUs))
case <-ctx.Done(): // timeout
case <-a.shutWg.Closer(): // server shutdown
}
state.UpdateState(ctx, ContainerStateDone, call.slots)
defer state.UpdateState(ctx, ContainerStateDone, call.slots)
// IMPORTANT: we wait here for any possible evictions to finalize. Otherwise
// hotLauncher could call checkLaunch again and cause a capacity full (http 503)
// error.
for _, wait := range notifyChans {
select {
case <-wait:
case <-ctx.Done(): // timeout
return
case <-a.shutWg.Closer(): // server shutdown
return
}
}
}
// waitHot pings and waits for a hot container from the slot queue
@@ -936,6 +955,12 @@ func (a *agent) runHot(ctx context.Context, call *call, tok ResourceToken, state
ctx, span := trace.StartSpan(ctx, "agent_run_hot")
defer span.End()
// IMPORTANT: evict token is deleted *after* resource token in defer statements below.
// This ordering allows resource token to be freed first, which means once evict token
// is deleted, eviction is considered to be completed.
evictor := a.evictor.CreateEvictToken(call.slotHashId, call.Memory+uint64(call.TmpFsSize), uint64(call.CPUs))
defer a.evictor.DeleteEvictToken(evictor)
statsUtilization(ctx, a.resources.GetUtilization())
defer func() {
statsUtilization(ctx, a.resources.GetUtilization())
@@ -1039,7 +1064,7 @@ func (a *agent) runHot(ctx context.Context, call *call, tok ResourceToken, state
udsClient: udsClient,
containerSpan: trace.FromContext(ctx).SpanContext(),
}
if !a.runHotReq(ctx, call, state, logger, cookie, slot) {
if !a.runHotReq(ctx, call, state, logger, cookie, slot, evictor) {
return
}
// wait for this call to finish
@@ -1141,7 +1166,7 @@ func inotifyAwait(ctx context.Context, iofsDir string) error {
// runHotReq enqueues a free slot to slot queue manager and watches various timers and the consumer until
// the slot is consumed. A return value of false means, the container should shutdown and no subsequent
// calls should be made to this function.
func (a *agent) runHotReq(ctx context.Context, call *call, state ContainerState, logger logrus.FieldLogger, cookie drivers.Cookie, slot *hotSlot) bool {
func (a *agent) runHotReq(ctx context.Context, call *call, state ContainerState, logger logrus.FieldLogger, cookie drivers.Cookie, slot *hotSlot, evictor *EvictToken) bool {
var err error
isFrozen := false
@@ -1149,10 +1174,9 @@ func (a *agent) runHotReq(ctx context.Context, call *call, state ContainerState,
freezeTimer := time.NewTimer(a.cfg.FreezeIdle)
idleTimer := time.NewTimer(time.Duration(call.IdleTimeout) * time.Second)
evictor := a.evictor.GetEvictor(call.ID, call.slotHashId, call.Memory+uint64(call.TmpFsSize), uint64(call.CPUs))
defer func() {
a.evictor.UnregisterEvictor(evictor)
evictor.SetEvictable(false)
freezeTimer.Stop()
idleTimer.Stop()
// log if any error is encountered
@@ -1161,7 +1185,7 @@ func (a *agent) runHotReq(ctx context.Context, call *call, state ContainerState,
}
}()
a.evictor.RegisterEvictor(evictor)
evictor.SetEvictable(true)
state.UpdateState(ctx, ContainerStateIdle, call.slots)
s := call.slots.queueSlot(slot)
@@ -1189,7 +1213,7 @@ func (a *agent) runHotReq(ctx context.Context, call *call, state ContainerState,
break
}
a.evictor.UnregisterEvictor(evictor)
evictor.SetEvictable(false)
// if we can acquire token, that means we are here due to
// abort/shutdown/timeout, attempt to acquire and terminate,

View File

@@ -2,15 +2,19 @@ package agent
import (
"sync"
"sync/atomic"
"github.com/fnproject/fn/api/id"
"github.com/sirupsen/logrus"
)
// Evictor For Agent
// Agent hot containers can register themselves as evictable using
// Register/Unregister calls. If a hot container registers itself,
// a starved request can call PerformEviction() to scan the eligible
// Agent hot containers register themselves to the evictor system.
// A starved request can call PerformEviction() to scan the evictable
// hot containers and if a number of these can be evicted to satisfy
// memory+cpu needs of the starved request, then those hot-containers
// are evicted (which is signalled using their channel.)
// are evicted.
type tokenKey struct {
id string
@@ -20,23 +24,24 @@ type tokenKey struct {
}
type EvictToken struct {
key tokenKey
C chan struct{}
key tokenKey
evictable uint32
C chan struct{}
DoneChan chan struct{}
}
type Evictor interface {
// Create an eviction token to be used in register/unregister functions
GetEvictor(id, slotId string, mem, cpu uint64) *EvictToken
// CreateEvictToken creates an eviction token to be used in evictor tracking. Returns
// an eviction token.
CreateEvictToken(slotId string, mem, cpu uint64) *EvictToken
// register an eviction token with evictor system
RegisterEvictor(token *EvictToken)
// DeleteEvictToken deletes an eviction token from evictor system
DeleteEvictToken(token *EvictToken)
// unregister an eviction token from evictor system
UnregisterEvictor(token *EvictToken)
// perform eviction to satisfy resource requirements of the call
// returns true if evictions were performed to satisfy the requirements.
PerformEviction(slotId string, mem, cpu uint64) bool
// PerformEviction performs evictions to satisfy cpu & mem arguments
// and returns a slice of channels for evictions performed. The callers
// can wait on these channels to ensure evictions are completed.
PerformEviction(slotId string, mem, cpu uint64) []chan struct{}
}
type evictor struct {
@@ -62,6 +67,15 @@ func (tok *EvictToken) isEvicted() bool {
return false
}
// SetEvictable marks the token as evictable (true) or not evictable (false).
// The flag is kept as a uint32 (1 or 0) and written with an atomic store;
// PerformEviction reads it back with atomic.LoadUint32 when scanning tokens.
func (tok *EvictToken) SetEvictable(isEvictable bool) {
	// Zero value means "not evictable".
	var flag uint32
	if isEvictable {
		flag = 1
	}
	atomic.StoreUint32(&tok.evictable, flag)
}
func (tok *EvictToken) isEligible() bool {
// if no resource limits are in place, then this
// function is not eligible.
@@ -71,39 +85,42 @@ func (tok *EvictToken) isEligible() bool {
return true
}
func (e *evictor) GetEvictor(id, slotId string, mem, cpu uint64) *EvictToken {
func (e *evictor) CreateEvictToken(slotId string, mem, cpu uint64) *EvictToken {
key := tokenKey{
id: id,
id: id.New().String(),
slotId: slotId,
memory: mem,
cpu: cpu,
}
return &EvictToken{
key: key,
C: make(chan struct{}),
token := &EvictToken{
key: key,
C: make(chan struct{}),
DoneChan: make(chan struct{}),
}
}
func (e *evictor) RegisterEvictor(token *EvictToken) {
if !token.isEligible() || token.isEvicted() {
return
if !token.isEligible() {
return token
}
e.lock.Lock()
// be paranoid, do not register if it's already there
_, ok := e.tokens[token.key.id]
if !ok {
e.tokens[token.key.id] = token
e.slots = append(e.slots, token.key)
if ok {
logrus.Fatalf("id collusion key=%+v", key)
}
e.tokens[token.key.id] = token
e.slots = append(e.slots, token.key)
e.lock.Unlock()
return token
}
func (e *evictor) UnregisterEvictor(token *EvictToken) {
if !token.isEligible() || token.isEvicted() {
func (e *evictor) DeleteEvictToken(token *EvictToken) {
if !token.isEligible() {
return
}
@@ -118,14 +135,18 @@ func (e *evictor) UnregisterEvictor(token *EvictToken) {
delete(e.tokens, token.key.id)
e.lock.Unlock()
close(token.DoneChan)
}
func (e *evictor) PerformEviction(slotId string, mem, cpu uint64) bool {
func (e *evictor) PerformEviction(slotId string, mem, cpu uint64) []chan struct{} {
var notifyChans []chan struct{}
// if no resources are defined for this function, then
// we don't know what to do here. We cannot evict anyone
// in this case.
if mem == 0 && cpu == 0 {
return false
return notifyChans
}
// Our eviction sum so far
@@ -134,7 +155,7 @@ func (e *evictor) PerformEviction(slotId string, mem, cpu uint64) bool {
isSatisfied := false
var keys []string
var chans []chan struct{}
var completionChans []chan struct{}
e.lock.Lock()
@@ -143,6 +164,10 @@ func (e *evictor) PerformEviction(slotId string, mem, cpu uint64) bool {
if slotId == val.slotId {
continue
}
// descend into map to verify evictable state
if atomic.LoadUint32(&e.tokens[val.id].evictable) == 0 {
continue
}
totalMemory += val.memory
totalCpu += val.cpu
@@ -158,7 +183,9 @@ func (e *evictor) PerformEviction(slotId string, mem, cpu uint64) bool {
// If we can satisfy the need, then let's commit/perform eviction
if isSatisfied {
chans = make([]chan struct{}, 0, len(keys))
notifyChans = make([]chan struct{}, 0, len(keys))
completionChans = make([]chan struct{}, 0, len(keys))
idx := 0
for _, id := range keys {
@@ -171,16 +198,18 @@ func (e *evictor) PerformEviction(slotId string, mem, cpu uint64) bool {
}
}
chans = append(chans, e.tokens[id].C)
notifyChans = append(notifyChans, e.tokens[id].C)
completionChans = append(completionChans, e.tokens[id].DoneChan)
delete(e.tokens, id)
}
}
e.lock.Unlock()
for _, ch := range chans {
for _, ch := range notifyChans {
close(ch)
}
return isSatisfied
return completionChans
}

View File

@@ -4,30 +4,30 @@ import (
"testing"
)
func getACall(id, slot string, mem, cpu int) (string, string, uint64, uint64) {
return id, slot, uint64(mem), uint64(cpu)
// getACall is a test helper that returns the slot name unchanged together
// with the memory and CPU amounts converted from int to uint64, the types
// taken by CreateEvictToken and PerformEviction.
func getACall(slot string, mem, cpu int) (string, uint64, uint64) {
	memory := uint64(mem)
	cpus := uint64(cpu)
	return slot, memory, cpus
}
func TestEvictorSimple01(t *testing.T) {
evictor := NewEvictor()
slotId := "slot1"
id1, _, mem1, cpu1 := getACall("id1", slotId, 1, 100)
id2, _, mem2, cpu2 := getACall("id2", slotId, 1, 100)
_, mem1, cpu1 := getACall(slotId, 1, 100)
_, mem2, cpu2 := getACall(slotId, 1, 100)
token1 := evictor.GetEvictor(id1, slotId, mem1, cpu1)
token2 := evictor.GetEvictor(id2, slotId, mem2, cpu2)
token1 := evictor.CreateEvictToken(slotId, mem1, cpu1)
token2 := evictor.CreateEvictToken(slotId, mem2, cpu2)
evictor.RegisterEvictor(token1)
evictor.RegisterEvictor(token2)
token1.SetEvictable(true)
token2.SetEvictable(true)
if evictor.PerformEviction(slotId, mem1, cpu1) {
if len(evictor.PerformEviction(slotId, mem1, cpu1)) > 0 {
t.Fatalf("We should not be able to self evict")
}
if evictor.PerformEviction("foo", 0, 0) {
if len(evictor.PerformEviction("foo", 0, 0)) > 0 {
t.Fatalf("We should not be able to evict: zero cpu/mem")
}
if evictor.PerformEviction("foo", 1, 300) {
if len(evictor.PerformEviction("foo", 1, 300)) > 0 {
t.Fatalf("We should not be able to evict (resource not enough)")
}
@@ -38,7 +38,7 @@ func TestEvictorSimple01(t *testing.T) {
t.Fatalf("should not be evicted")
}
if !evictor.PerformEviction("foo", 1, 100) {
if len(evictor.PerformEviction("foo", 1, 100)) != 1 {
t.Fatalf("We should be able to evict")
}
@@ -49,44 +49,44 @@ func TestEvictorSimple01(t *testing.T) {
t.Fatalf("should not be evicted")
}
evictor.UnregisterEvictor(token1)
evictor.UnregisterEvictor(token2)
evictor.DeleteEvictToken(token1)
evictor.DeleteEvictToken(token2)
}
func TestEvictorSimple02(t *testing.T) {
evictor := NewEvictor()
id1, slotId1, mem1, cpu1 := getACall("id1", "slot1", 1, 100)
id2, slotId2, mem2, cpu2 := getACall("id2", "slot1", 1, 100)
slotId1, mem1, cpu1 := getACall("slot1", 1, 100)
slotId2, mem2, cpu2 := getACall("slot1", 1, 100)
token1 := evictor.GetEvictor(id1, slotId1, mem1, cpu1)
token2 := evictor.GetEvictor(id2, slotId2, mem2, cpu2)
token1 := evictor.CreateEvictToken(slotId1, mem1, cpu1)
token2 := evictor.CreateEvictToken(slotId2, mem2, cpu2)
// add/rm/add
evictor.RegisterEvictor(token1)
evictor.UnregisterEvictor(token1)
evictor.RegisterEvictor(token1)
token1.SetEvictable(true)
token1.SetEvictable(false)
token1.SetEvictable(true)
// add/rm
evictor.RegisterEvictor(token2)
evictor.UnregisterEvictor(token2)
token2.SetEvictable(true)
token2.SetEvictable(false)
if evictor.PerformEviction(slotId1, mem1, cpu1) {
if len(evictor.PerformEviction(slotId1, mem1, cpu1)) > 0 {
t.Fatalf("We should not be able to self evict")
}
if evictor.PerformEviction("foo", 0, 0) {
if len(evictor.PerformEviction("foo", 0, 0)) > 0 {
t.Fatalf("We should not be able to evict: zero cpu/mem")
}
if token1.isEvicted() {
t.Fatalf("should not be evicted")
}
evictor.UnregisterEvictor(token1)
token1.SetEvictable(false)
// not registered... but should be OK
evictor.UnregisterEvictor(token2)
token2.SetEvictable(false)
if evictor.PerformEviction("foo", mem1, cpu1) {
if len(evictor.PerformEviction("foo", mem1, cpu1)) > 0 {
t.Fatalf("We should not be able to evict (unregistered)")
}
if token1.isEvicted() {
@@ -102,22 +102,22 @@ func TestEvictorSimple03(t *testing.T) {
taboo := "foo"
slotId := "slot1"
id0, slotId0, mem0, cpu0 := getACall("id0", taboo, 1, 100)
id1, _, mem1, cpu1 := getACall("id1", slotId, 1, 100)
id2, _, mem2, cpu2 := getACall("id2", slotId, 1, 100)
id3, _, mem3, cpu3 := getACall("id3", slotId, 1, 100)
slotId0, mem0, cpu0 := getACall(taboo, 1, 100)
_, mem1, cpu1 := getACall(slotId, 1, 100)
_, mem2, cpu2 := getACall(slotId, 1, 100)
_, mem3, cpu3 := getACall(slotId, 1, 100)
token0 := evictor.GetEvictor(id0, slotId0, mem0, cpu0)
token1 := evictor.GetEvictor(id1, slotId, mem1, cpu1)
token2 := evictor.GetEvictor(id2, slotId, mem2, cpu2)
token3 := evictor.GetEvictor(id3, slotId, mem3, cpu3)
token0 := evictor.CreateEvictToken(slotId0, mem0, cpu0)
token1 := evictor.CreateEvictToken(slotId, mem1, cpu1)
token2 := evictor.CreateEvictToken(slotId, mem2, cpu2)
token3 := evictor.CreateEvictToken(slotId, mem3, cpu3)
evictor.RegisterEvictor(token0)
evictor.RegisterEvictor(token1)
evictor.RegisterEvictor(token2)
evictor.RegisterEvictor(token3)
token0.SetEvictable(true)
token1.SetEvictable(true)
token2.SetEvictable(true)
token3.SetEvictable(true)
if !evictor.PerformEviction(taboo, 1, 200) {
if len(evictor.PerformEviction(taboo, 1, 200)) == 0 {
t.Fatalf("We should be able to evict")
}
@@ -136,8 +136,8 @@ func TestEvictorSimple03(t *testing.T) {
t.Fatalf("should not be evicted")
}
evictor.UnregisterEvictor(token0)
evictor.UnregisterEvictor(token1)
evictor.UnregisterEvictor(token2)
evictor.UnregisterEvictor(token3)
evictor.DeleteEvictToken(token0)
evictor.DeleteEvictToken(token1)
evictor.DeleteEvictToken(token2)
evictor.DeleteEvictToken(token3)
}