fn: adding hot launcher eviction waiting (#1257)

If checkLaunch triggers evictions, it must wait
for these evictions to complete before returning.
Returning prematurely from checkLaunch causes the
hot launcher to call checkLaunch again. That second
call receives an out-of-capacity error, which
results in a 503.

This PR also improves the evictor: PerformEviction
now returns a slice of channels that callers can wait
on while evictions are in progress.

The eviction token is deleted *after* the resource
token is closed, which ensures that once an eviction
is done, the resource token has also been freed.
This commit is contained in:
Tolga Ceylan
2018-10-01 16:16:29 -07:00
committed by GitHub
parent 5a407dc3bd
commit f132bba3fb
3 changed files with 146 additions and 93 deletions

View File

@@ -451,6 +451,8 @@ func (a *agent) checkLaunch(ctx context.Context, call *call, notifyChan chan err
mem := call.Memory + uint64(call.TmpFsSize) mem := call.Memory + uint64(call.TmpFsSize)
var notifyChans []chan struct{}
// WARNING: Tricky flow below. We are here because: isNewContainerNeeded is true, // WARNING: Tricky flow below. We are here because: isNewContainerNeeded is true,
// in other words, we need to launch a new container at this time due to high load. // in other words, we need to launch a new container at this time due to high load.
// //
@@ -471,9 +473,13 @@ func (a *agent) checkLaunch(ctx context.Context, call *call, notifyChan chan err
select { select {
case tok := <-a.resources.GetResourceToken(ctx, mem, call.CPUs, isNB): case tok := <-a.resources.GetResourceToken(ctx, mem, call.CPUs, isNB):
if tok != nil && tok.Error() != nil { if tok != nil && tok.Error() != nil {
// before returning error response, as a last resort, try evicting idle containers. if tok.Error() != CapacityFull {
if tok.Error() != CapacityFull || !a.evictor.PerformEviction(call.slotHashId, mem, uint64(call.CPUs)) {
tryNotify(notifyChan, tok.Error()) tryNotify(notifyChan, tok.Error())
} else {
notifyChans = a.evictor.PerformEviction(call.slotHashId, mem, uint64(call.CPUs))
if len(notifyChans) == 0 {
tryNotify(notifyChan, tok.Error())
}
} }
} else if a.shutWg.AddSession(1) { } else if a.shutWg.AddSession(1) {
go func() { go func() {
@@ -492,12 +498,25 @@ func (a *agent) checkLaunch(ctx context.Context, call *call, notifyChan chan err
// same timer to assume that we waited for cpu/mem long enough. Let's try to evict an // same timer to assume that we waited for cpu/mem long enough. Let's try to evict an
// idle container. // idle container.
case <-time.After(a.cfg.HotPoll): case <-time.After(a.cfg.HotPoll):
a.evictor.PerformEviction(call.slotHashId, mem, uint64(call.CPUs)) notifyChans = a.evictor.PerformEviction(call.slotHashId, mem, uint64(call.CPUs))
case <-ctx.Done(): // timeout case <-ctx.Done(): // timeout
case <-a.shutWg.Closer(): // server shutdown case <-a.shutWg.Closer(): // server shutdown
} }
state.UpdateState(ctx, ContainerStateDone, call.slots) defer state.UpdateState(ctx, ContainerStateDone, call.slots)
// IMPORTANT: we wait here for any possible evictions to finalize. Otherwise
// hotLauncher could call checkLaunch again and cause a capacity full (http 503)
// error.
for _, wait := range notifyChans {
select {
case <-wait:
case <-ctx.Done(): // timeout
return
case <-a.shutWg.Closer(): // server shutdown
return
}
}
} }
// waitHot pings and waits for a hot container from the slot queue // waitHot pings and waits for a hot container from the slot queue
@@ -936,6 +955,12 @@ func (a *agent) runHot(ctx context.Context, call *call, tok ResourceToken, state
ctx, span := trace.StartSpan(ctx, "agent_run_hot") ctx, span := trace.StartSpan(ctx, "agent_run_hot")
defer span.End() defer span.End()
// IMPORTANT: evict token is deleted *after* resource token in defer statements below.
// This ordering allows resource token to be freed first, which means once evict token
// is deleted, eviction is considered to be completed.
evictor := a.evictor.CreateEvictToken(call.slotHashId, call.Memory+uint64(call.TmpFsSize), uint64(call.CPUs))
defer a.evictor.DeleteEvictToken(evictor)
statsUtilization(ctx, a.resources.GetUtilization()) statsUtilization(ctx, a.resources.GetUtilization())
defer func() { defer func() {
statsUtilization(ctx, a.resources.GetUtilization()) statsUtilization(ctx, a.resources.GetUtilization())
@@ -1039,7 +1064,7 @@ func (a *agent) runHot(ctx context.Context, call *call, tok ResourceToken, state
udsClient: udsClient, udsClient: udsClient,
containerSpan: trace.FromContext(ctx).SpanContext(), containerSpan: trace.FromContext(ctx).SpanContext(),
} }
if !a.runHotReq(ctx, call, state, logger, cookie, slot) { if !a.runHotReq(ctx, call, state, logger, cookie, slot, evictor) {
return return
} }
// wait for this call to finish // wait for this call to finish
@@ -1141,7 +1166,7 @@ func inotifyAwait(ctx context.Context, iofsDir string) error {
// runHotReq enqueues a free slot to slot queue manager and watches various timers and the consumer until // runHotReq enqueues a free slot to slot queue manager and watches various timers and the consumer until
// the slot is consumed. A return value of false means, the container should shutdown and no subsequent // the slot is consumed. A return value of false means, the container should shutdown and no subsequent
// calls should be made to this function. // calls should be made to this function.
func (a *agent) runHotReq(ctx context.Context, call *call, state ContainerState, logger logrus.FieldLogger, cookie drivers.Cookie, slot *hotSlot) bool { func (a *agent) runHotReq(ctx context.Context, call *call, state ContainerState, logger logrus.FieldLogger, cookie drivers.Cookie, slot *hotSlot, evictor *EvictToken) bool {
var err error var err error
isFrozen := false isFrozen := false
@@ -1149,10 +1174,9 @@ func (a *agent) runHotReq(ctx context.Context, call *call, state ContainerState,
freezeTimer := time.NewTimer(a.cfg.FreezeIdle) freezeTimer := time.NewTimer(a.cfg.FreezeIdle)
idleTimer := time.NewTimer(time.Duration(call.IdleTimeout) * time.Second) idleTimer := time.NewTimer(time.Duration(call.IdleTimeout) * time.Second)
evictor := a.evictor.GetEvictor(call.ID, call.slotHashId, call.Memory+uint64(call.TmpFsSize), uint64(call.CPUs))
defer func() { defer func() {
a.evictor.UnregisterEvictor(evictor) evictor.SetEvictable(false)
freezeTimer.Stop() freezeTimer.Stop()
idleTimer.Stop() idleTimer.Stop()
// log if any error is encountered // log if any error is encountered
@@ -1161,7 +1185,7 @@ func (a *agent) runHotReq(ctx context.Context, call *call, state ContainerState,
} }
}() }()
a.evictor.RegisterEvictor(evictor) evictor.SetEvictable(true)
state.UpdateState(ctx, ContainerStateIdle, call.slots) state.UpdateState(ctx, ContainerStateIdle, call.slots)
s := call.slots.queueSlot(slot) s := call.slots.queueSlot(slot)
@@ -1189,7 +1213,7 @@ func (a *agent) runHotReq(ctx context.Context, call *call, state ContainerState,
break break
} }
a.evictor.UnregisterEvictor(evictor) evictor.SetEvictable(false)
// if we can acquire token, that means we are here due to // if we can acquire token, that means we are here due to
// abort/shutdown/timeout, attempt to acquire and terminate, // abort/shutdown/timeout, attempt to acquire and terminate,

View File

@@ -2,15 +2,19 @@ package agent
import ( import (
"sync" "sync"
"sync/atomic"
"github.com/fnproject/fn/api/id"
"github.com/sirupsen/logrus"
) )
// Evictor For Agent // Evictor For Agent
// Agent hot containers can register themselves as evictable using // Agent hot containers register themselves to the evictor system.
// Register/Unregister calls. If a hot container registers itself, // A starved request can call PerformEviction() to scan the evictable
// a starved request can call PerformEviction() to scan the eligible
// hot containers and if a number of these can be evicted to satisfy // hot containers and if a number of these can be evicted to satisfy
// memory+cpu needs of the starved request, then those hot-containers // memory+cpu needs of the starved request, then those hot-containers
// are evicted (which is signalled using their channel.) // are evicted.
type tokenKey struct { type tokenKey struct {
id string id string
@@ -20,23 +24,24 @@ type tokenKey struct {
} }
type EvictToken struct { type EvictToken struct {
key tokenKey key tokenKey
C chan struct{} evictable uint32
C chan struct{}
DoneChan chan struct{}
} }
type Evictor interface { type Evictor interface {
// Create an eviction token to be used in register/unregister functions // CreateEvictToken creates an eviction token to be used in evictor tracking. Returns
GetEvictor(id, slotId string, mem, cpu uint64) *EvictToken // an eviction token.
CreateEvictToken(slotId string, mem, cpu uint64) *EvictToken
// register an eviction token with evictor system // DeleteEvictToken deletes an eviction token from evictor system
RegisterEvictor(token *EvictToken) DeleteEvictToken(token *EvictToken)
// unregister an eviction token from evictor system // PerformEviction performs evictions to satisfy cpu & mem arguments
UnregisterEvictor(token *EvictToken) // and returns a slice of channels for evictions performed. The callers
// can wait on these channels to ensure evictions are completed.
// perform eviction to satisfy resource requirements of the call PerformEviction(slotId string, mem, cpu uint64) []chan struct{}
// returns true if evictions were performed to satisfy the requirements.
PerformEviction(slotId string, mem, cpu uint64) bool
} }
type evictor struct { type evictor struct {
@@ -62,6 +67,15 @@ func (tok *EvictToken) isEvicted() bool {
return false return false
} }
// SetEvictable marks the token as evictable (true) or not evictable (false).
//
// The flag is written with an atomic store, so it can be updated without
// holding the evictor lock. PerformEviction reads it with an atomic load and
// skips any token whose flag is zero.
func (tok *EvictToken) SetEvictable(isEvictable bool) {
	// The flag is stored as a uint32 (0 = not evictable, 1 = evictable) so it
	// can be used with the sync/atomic functions.
	var val uint32
	if isEvictable {
		val = 1
	}
	atomic.StoreUint32(&tok.evictable, val)
}
func (tok *EvictToken) isEligible() bool { func (tok *EvictToken) isEligible() bool {
// if no resource limits are in place, then this // if no resource limits are in place, then this
// function is not eligible. // function is not eligible.
@@ -71,39 +85,42 @@ func (tok *EvictToken) isEligible() bool {
return true return true
} }
func (e *evictor) GetEvictor(id, slotId string, mem, cpu uint64) *EvictToken { func (e *evictor) CreateEvictToken(slotId string, mem, cpu uint64) *EvictToken {
key := tokenKey{ key := tokenKey{
id: id, id: id.New().String(),
slotId: slotId, slotId: slotId,
memory: mem, memory: mem,
cpu: cpu, cpu: cpu,
} }
return &EvictToken{ token := &EvictToken{
key: key, key: key,
C: make(chan struct{}), C: make(chan struct{}),
DoneChan: make(chan struct{}),
} }
}
func (e *evictor) RegisterEvictor(token *EvictToken) { if !token.isEligible() {
if !token.isEligible() || token.isEvicted() { return token
return
} }
e.lock.Lock() e.lock.Lock()
// be paranoid, do not register if it's already there
_, ok := e.tokens[token.key.id] _, ok := e.tokens[token.key.id]
if !ok { if ok {
e.tokens[token.key.id] = token logrus.Fatalf("id collusion key=%+v", key)
e.slots = append(e.slots, token.key)
} }
e.tokens[token.key.id] = token
e.slots = append(e.slots, token.key)
e.lock.Unlock() e.lock.Unlock()
return token
} }
func (e *evictor) UnregisterEvictor(token *EvictToken) { func (e *evictor) DeleteEvictToken(token *EvictToken) {
if !token.isEligible() || token.isEvicted() { if !token.isEligible() {
return return
} }
@@ -118,14 +135,18 @@ func (e *evictor) UnregisterEvictor(token *EvictToken) {
delete(e.tokens, token.key.id) delete(e.tokens, token.key.id)
e.lock.Unlock() e.lock.Unlock()
close(token.DoneChan)
} }
func (e *evictor) PerformEviction(slotId string, mem, cpu uint64) bool { func (e *evictor) PerformEviction(slotId string, mem, cpu uint64) []chan struct{} {
var notifyChans []chan struct{}
// if no resources are defined for this function, then // if no resources are defined for this function, then
// we don't know what to do here. We cannot evict anyone // we don't know what to do here. We cannot evict anyone
// in this case. // in this case.
if mem == 0 && cpu == 0 { if mem == 0 && cpu == 0 {
return false return notifyChans
} }
// Our eviction sum so far // Our eviction sum so far
@@ -134,7 +155,7 @@ func (e *evictor) PerformEviction(slotId string, mem, cpu uint64) bool {
isSatisfied := false isSatisfied := false
var keys []string var keys []string
var chans []chan struct{} var completionChans []chan struct{}
e.lock.Lock() e.lock.Lock()
@@ -143,6 +164,10 @@ func (e *evictor) PerformEviction(slotId string, mem, cpu uint64) bool {
if slotId == val.slotId { if slotId == val.slotId {
continue continue
} }
// descend into map to verify evictable state
if atomic.LoadUint32(&e.tokens[val.id].evictable) == 0 {
continue
}
totalMemory += val.memory totalMemory += val.memory
totalCpu += val.cpu totalCpu += val.cpu
@@ -158,7 +183,9 @@ func (e *evictor) PerformEviction(slotId string, mem, cpu uint64) bool {
// If we can satisfy the need, then let's commit/perform eviction // If we can satisfy the need, then let's commit/perform eviction
if isSatisfied { if isSatisfied {
chans = make([]chan struct{}, 0, len(keys)) notifyChans = make([]chan struct{}, 0, len(keys))
completionChans = make([]chan struct{}, 0, len(keys))
idx := 0 idx := 0
for _, id := range keys { for _, id := range keys {
@@ -171,16 +198,18 @@ func (e *evictor) PerformEviction(slotId string, mem, cpu uint64) bool {
} }
} }
chans = append(chans, e.tokens[id].C) notifyChans = append(notifyChans, e.tokens[id].C)
completionChans = append(completionChans, e.tokens[id].DoneChan)
delete(e.tokens, id) delete(e.tokens, id)
} }
} }
e.lock.Unlock() e.lock.Unlock()
for _, ch := range chans { for _, ch := range notifyChans {
close(ch) close(ch)
} }
return isSatisfied return completionChans
} }

View File

@@ -4,30 +4,30 @@ import (
"testing" "testing"
) )
func getACall(id, slot string, mem, cpu int) (string, string, uint64, uint64) { func getACall(slot string, mem, cpu int) (string, uint64, uint64) {
return id, slot, uint64(mem), uint64(cpu) return slot, uint64(mem), uint64(cpu)
} }
func TestEvictorSimple01(t *testing.T) { func TestEvictorSimple01(t *testing.T) {
evictor := NewEvictor() evictor := NewEvictor()
slotId := "slot1" slotId := "slot1"
id1, _, mem1, cpu1 := getACall("id1", slotId, 1, 100) _, mem1, cpu1 := getACall(slotId, 1, 100)
id2, _, mem2, cpu2 := getACall("id2", slotId, 1, 100) _, mem2, cpu2 := getACall(slotId, 1, 100)
token1 := evictor.GetEvictor(id1, slotId, mem1, cpu1) token1 := evictor.CreateEvictToken(slotId, mem1, cpu1)
token2 := evictor.GetEvictor(id2, slotId, mem2, cpu2) token2 := evictor.CreateEvictToken(slotId, mem2, cpu2)
evictor.RegisterEvictor(token1) token1.SetEvictable(true)
evictor.RegisterEvictor(token2) token2.SetEvictable(true)
if evictor.PerformEviction(slotId, mem1, cpu1) { if len(evictor.PerformEviction(slotId, mem1, cpu1)) > 0 {
t.Fatalf("We should not be able to self evict") t.Fatalf("We should not be able to self evict")
} }
if evictor.PerformEviction("foo", 0, 0) { if len(evictor.PerformEviction("foo", 0, 0)) > 0 {
t.Fatalf("We should not be able to evict: zero cpu/mem") t.Fatalf("We should not be able to evict: zero cpu/mem")
} }
if evictor.PerformEviction("foo", 1, 300) { if len(evictor.PerformEviction("foo", 1, 300)) > 0 {
t.Fatalf("We should not be able to evict (resource not enough)") t.Fatalf("We should not be able to evict (resource not enough)")
} }
@@ -38,7 +38,7 @@ func TestEvictorSimple01(t *testing.T) {
t.Fatalf("should not be evicted") t.Fatalf("should not be evicted")
} }
if !evictor.PerformEviction("foo", 1, 100) { if len(evictor.PerformEviction("foo", 1, 100)) != 1 {
t.Fatalf("We should be able to evict") t.Fatalf("We should be able to evict")
} }
@@ -49,44 +49,44 @@ func TestEvictorSimple01(t *testing.T) {
t.Fatalf("should not be evicted") t.Fatalf("should not be evicted")
} }
evictor.UnregisterEvictor(token1) evictor.DeleteEvictToken(token1)
evictor.UnregisterEvictor(token2) evictor.DeleteEvictToken(token2)
} }
func TestEvictorSimple02(t *testing.T) { func TestEvictorSimple02(t *testing.T) {
evictor := NewEvictor() evictor := NewEvictor()
id1, slotId1, mem1, cpu1 := getACall("id1", "slot1", 1, 100) slotId1, mem1, cpu1 := getACall("slot1", 1, 100)
id2, slotId2, mem2, cpu2 := getACall("id2", "slot1", 1, 100) slotId2, mem2, cpu2 := getACall("slot1", 1, 100)
token1 := evictor.GetEvictor(id1, slotId1, mem1, cpu1) token1 := evictor.CreateEvictToken(slotId1, mem1, cpu1)
token2 := evictor.GetEvictor(id2, slotId2, mem2, cpu2) token2 := evictor.CreateEvictToken(slotId2, mem2, cpu2)
// add/rm/add // add/rm/add
evictor.RegisterEvictor(token1) token1.SetEvictable(true)
evictor.UnregisterEvictor(token1) token1.SetEvictable(false)
evictor.RegisterEvictor(token1) token1.SetEvictable(true)
// add/rm // add/rm
evictor.RegisterEvictor(token2) token2.SetEvictable(true)
evictor.UnregisterEvictor(token2) token2.SetEvictable(false)
if evictor.PerformEviction(slotId1, mem1, cpu1) { if len(evictor.PerformEviction(slotId1, mem1, cpu1)) > 0 {
t.Fatalf("We should not be able to self evict") t.Fatalf("We should not be able to self evict")
} }
if evictor.PerformEviction("foo", 0, 0) { if len(evictor.PerformEviction("foo", 0, 0)) > 0 {
t.Fatalf("We should not be able to evict: zero cpu/mem") t.Fatalf("We should not be able to evict: zero cpu/mem")
} }
if token1.isEvicted() { if token1.isEvicted() {
t.Fatalf("should not be evicted") t.Fatalf("should not be evicted")
} }
evictor.UnregisterEvictor(token1) token1.SetEvictable(false)
// not registered... but should be OK // not registered... but should be OK
evictor.UnregisterEvictor(token2) token2.SetEvictable(false)
if evictor.PerformEviction("foo", mem1, cpu1) { if len(evictor.PerformEviction("foo", mem1, cpu1)) > 0 {
t.Fatalf("We should not be able to evict (unregistered)") t.Fatalf("We should not be able to evict (unregistered)")
} }
if token1.isEvicted() { if token1.isEvicted() {
@@ -102,22 +102,22 @@ func TestEvictorSimple03(t *testing.T) {
taboo := "foo" taboo := "foo"
slotId := "slot1" slotId := "slot1"
id0, slotId0, mem0, cpu0 := getACall("id0", taboo, 1, 100) slotId0, mem0, cpu0 := getACall(taboo, 1, 100)
id1, _, mem1, cpu1 := getACall("id1", slotId, 1, 100) _, mem1, cpu1 := getACall(slotId, 1, 100)
id2, _, mem2, cpu2 := getACall("id2", slotId, 1, 100) _, mem2, cpu2 := getACall(slotId, 1, 100)
id3, _, mem3, cpu3 := getACall("id3", slotId, 1, 100) _, mem3, cpu3 := getACall(slotId, 1, 100)
token0 := evictor.GetEvictor(id0, slotId0, mem0, cpu0) token0 := evictor.CreateEvictToken(slotId0, mem0, cpu0)
token1 := evictor.GetEvictor(id1, slotId, mem1, cpu1) token1 := evictor.CreateEvictToken(slotId, mem1, cpu1)
token2 := evictor.GetEvictor(id2, slotId, mem2, cpu2) token2 := evictor.CreateEvictToken(slotId, mem2, cpu2)
token3 := evictor.GetEvictor(id3, slotId, mem3, cpu3) token3 := evictor.CreateEvictToken(slotId, mem3, cpu3)
evictor.RegisterEvictor(token0) token0.SetEvictable(true)
evictor.RegisterEvictor(token1) token1.SetEvictable(true)
evictor.RegisterEvictor(token2) token2.SetEvictable(true)
evictor.RegisterEvictor(token3) token3.SetEvictable(true)
if !evictor.PerformEviction(taboo, 1, 200) { if len(evictor.PerformEviction(taboo, 1, 200)) == 0 {
t.Fatalf("We should be able to evict") t.Fatalf("We should be able to evict")
} }
@@ -136,8 +136,8 @@ func TestEvictorSimple03(t *testing.T) {
t.Fatalf("should not be evicted") t.Fatalf("should not be evicted")
} }
evictor.UnregisterEvictor(token0) evictor.DeleteEvictToken(token0)
evictor.UnregisterEvictor(token1) evictor.DeleteEvictToken(token1)
evictor.UnregisterEvictor(token2) evictor.DeleteEvictToken(token2)
evictor.UnregisterEvictor(token3) evictor.DeleteEvictToken(token3)
} }