mirror of
https://github.com/fnproject/fn.git
synced 2022-10-28 21:29:17 +03:00
fn: adding hot launcher eviction waiting (#1257)
If checkLaunch triggers evictions, it must wait for these evictions to complete before returning. Returning prematurely from checkLaunch causes the hot launcher to call checkLaunch again; that second call then receives an out-of-capacity error, which results in a 503. This PR also improves the evictor: PerformEviction now returns a slice of channels that callers can wait on while evictions are taking place. Eviction token deletion is performed *after* the resource token is closed, to ensure that once an eviction is done, the resource token is also free.
This commit is contained in:
@@ -451,6 +451,8 @@ func (a *agent) checkLaunch(ctx context.Context, call *call, notifyChan chan err
|
||||
|
||||
mem := call.Memory + uint64(call.TmpFsSize)
|
||||
|
||||
var notifyChans []chan struct{}
|
||||
|
||||
// WARNING: Tricky flow below. We are here because: isNewContainerNeeded is true,
|
||||
// in other words, we need to launch a new container at this time due to high load.
|
||||
//
|
||||
@@ -471,9 +473,13 @@ func (a *agent) checkLaunch(ctx context.Context, call *call, notifyChan chan err
|
||||
select {
|
||||
case tok := <-a.resources.GetResourceToken(ctx, mem, call.CPUs, isNB):
|
||||
if tok != nil && tok.Error() != nil {
|
||||
// before returning error response, as a last resort, try evicting idle containers.
|
||||
if tok.Error() != CapacityFull || !a.evictor.PerformEviction(call.slotHashId, mem, uint64(call.CPUs)) {
|
||||
if tok.Error() != CapacityFull {
|
||||
tryNotify(notifyChan, tok.Error())
|
||||
} else {
|
||||
notifyChans = a.evictor.PerformEviction(call.slotHashId, mem, uint64(call.CPUs))
|
||||
if len(notifyChans) == 0 {
|
||||
tryNotify(notifyChan, tok.Error())
|
||||
}
|
||||
}
|
||||
} else if a.shutWg.AddSession(1) {
|
||||
go func() {
|
||||
@@ -492,12 +498,25 @@ func (a *agent) checkLaunch(ctx context.Context, call *call, notifyChan chan err
|
||||
// same timer to assume that we waited for cpu/mem long enough. Let's try to evict an
|
||||
// idle container.
|
||||
case <-time.After(a.cfg.HotPoll):
|
||||
a.evictor.PerformEviction(call.slotHashId, mem, uint64(call.CPUs))
|
||||
notifyChans = a.evictor.PerformEviction(call.slotHashId, mem, uint64(call.CPUs))
|
||||
case <-ctx.Done(): // timeout
|
||||
case <-a.shutWg.Closer(): // server shutdown
|
||||
}
|
||||
|
||||
state.UpdateState(ctx, ContainerStateDone, call.slots)
|
||||
defer state.UpdateState(ctx, ContainerStateDone, call.slots)
|
||||
|
||||
// IMPORTANT: we wait here for any possible evictions to finalize. Otherwise
|
||||
// hotLauncher could call checkLaunch again and cause a capacity full (http 503)
|
||||
// error.
|
||||
for _, wait := range notifyChans {
|
||||
select {
|
||||
case <-wait:
|
||||
case <-ctx.Done(): // timeout
|
||||
return
|
||||
case <-a.shutWg.Closer(): // server shutdown
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// waitHot pings and waits for a hot container from the slot queue
|
||||
@@ -936,6 +955,12 @@ func (a *agent) runHot(ctx context.Context, call *call, tok ResourceToken, state
|
||||
ctx, span := trace.StartSpan(ctx, "agent_run_hot")
|
||||
defer span.End()
|
||||
|
||||
// IMPORTANT: evict token is deleted *after* resource token in defer statements below.
|
||||
// This ordering allows resource token to be freed first, which means once evict token
|
||||
// is deleted, eviction is considered to be completed.
|
||||
evictor := a.evictor.CreateEvictToken(call.slotHashId, call.Memory+uint64(call.TmpFsSize), uint64(call.CPUs))
|
||||
defer a.evictor.DeleteEvictToken(evictor)
|
||||
|
||||
statsUtilization(ctx, a.resources.GetUtilization())
|
||||
defer func() {
|
||||
statsUtilization(ctx, a.resources.GetUtilization())
|
||||
@@ -1039,7 +1064,7 @@ func (a *agent) runHot(ctx context.Context, call *call, tok ResourceToken, state
|
||||
udsClient: udsClient,
|
||||
containerSpan: trace.FromContext(ctx).SpanContext(),
|
||||
}
|
||||
if !a.runHotReq(ctx, call, state, logger, cookie, slot) {
|
||||
if !a.runHotReq(ctx, call, state, logger, cookie, slot, evictor) {
|
||||
return
|
||||
}
|
||||
// wait for this call to finish
|
||||
@@ -1141,7 +1166,7 @@ func inotifyAwait(ctx context.Context, iofsDir string) error {
|
||||
// runHotReq enqueues a free slot to slot queue manager and watches various timers and the consumer until
|
||||
// the slot is consumed. A return value of false means, the container should shutdown and no subsequent
|
||||
// calls should be made to this function.
|
||||
func (a *agent) runHotReq(ctx context.Context, call *call, state ContainerState, logger logrus.FieldLogger, cookie drivers.Cookie, slot *hotSlot) bool {
|
||||
func (a *agent) runHotReq(ctx context.Context, call *call, state ContainerState, logger logrus.FieldLogger, cookie drivers.Cookie, slot *hotSlot, evictor *EvictToken) bool {
|
||||
|
||||
var err error
|
||||
isFrozen := false
|
||||
@@ -1149,10 +1174,9 @@ func (a *agent) runHotReq(ctx context.Context, call *call, state ContainerState,
|
||||
|
||||
freezeTimer := time.NewTimer(a.cfg.FreezeIdle)
|
||||
idleTimer := time.NewTimer(time.Duration(call.IdleTimeout) * time.Second)
|
||||
evictor := a.evictor.GetEvictor(call.ID, call.slotHashId, call.Memory+uint64(call.TmpFsSize), uint64(call.CPUs))
|
||||
|
||||
defer func() {
|
||||
a.evictor.UnregisterEvictor(evictor)
|
||||
evictor.SetEvictable(false)
|
||||
freezeTimer.Stop()
|
||||
idleTimer.Stop()
|
||||
// log if any error is encountered
|
||||
@@ -1161,7 +1185,7 @@ func (a *agent) runHotReq(ctx context.Context, call *call, state ContainerState,
|
||||
}
|
||||
}()
|
||||
|
||||
a.evictor.RegisterEvictor(evictor)
|
||||
evictor.SetEvictable(true)
|
||||
state.UpdateState(ctx, ContainerStateIdle, call.slots)
|
||||
|
||||
s := call.slots.queueSlot(slot)
|
||||
@@ -1189,7 +1213,7 @@ func (a *agent) runHotReq(ctx context.Context, call *call, state ContainerState,
|
||||
break
|
||||
}
|
||||
|
||||
a.evictor.UnregisterEvictor(evictor)
|
||||
evictor.SetEvictable(false)
|
||||
|
||||
// if we can acquire token, that means we are here due to
|
||||
// abort/shutdown/timeout, attempt to acquire and terminate,
|
||||
|
||||
@@ -2,15 +2,19 @@ package agent
|
||||
|
||||
import (
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
|
||||
"github.com/fnproject/fn/api/id"
|
||||
|
||||
"github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
// Evictor For Agent
|
||||
// Agent hot containers can register themselves as evictable using
|
||||
// Register/Unregister calls. If a hot container registers itself,
|
||||
// a starved request can call PerformEviction() to scan the eligible
|
||||
// Agent hot containers register themselves to the evictor system.
|
||||
// A starved request can call PerformEviction() to scan the evictable
|
||||
// hot containers and if a number of these can be evicted to satisfy
|
||||
// memory+cpu needs of the starved request, then those hot-containers
|
||||
// are evicted (which is signalled using their channel.)
|
||||
// are evicted.
|
||||
|
||||
type tokenKey struct {
|
||||
id string
|
||||
@@ -20,23 +24,24 @@ type tokenKey struct {
|
||||
}
|
||||
|
||||
type EvictToken struct {
|
||||
key tokenKey
|
||||
C chan struct{}
|
||||
key tokenKey
|
||||
evictable uint32
|
||||
C chan struct{}
|
||||
DoneChan chan struct{}
|
||||
}
|
||||
|
||||
type Evictor interface {
|
||||
// Create an eviction token to be used in register/unregister functions
|
||||
GetEvictor(id, slotId string, mem, cpu uint64) *EvictToken
|
||||
// CreateEvictToken creates an eviction token to be used in evictor tracking. Returns
|
||||
// an eviction token.
|
||||
CreateEvictToken(slotId string, mem, cpu uint64) *EvictToken
|
||||
|
||||
// register an eviction token with evictor system
|
||||
RegisterEvictor(token *EvictToken)
|
||||
// DeleteEvictToken deletes an eviction token from evictor system
|
||||
DeleteEvictToken(token *EvictToken)
|
||||
|
||||
// unregister an eviction token from evictor system
|
||||
UnregisterEvictor(token *EvictToken)
|
||||
|
||||
// perform eviction to satisfy resource requirements of the call
|
||||
// returns true if evictions were performed to satisfy the requirements.
|
||||
PerformEviction(slotId string, mem, cpu uint64) bool
|
||||
// PerformEviction performs evictions to satisfy cpu & mem arguments
|
||||
// and returns a slice of channels for evictions performed. The callers
|
||||
// can wait on these channel to ensure evictions are completed.
|
||||
PerformEviction(slotId string, mem, cpu uint64) []chan struct{}
|
||||
}
|
||||
|
||||
type evictor struct {
|
||||
@@ -62,6 +67,15 @@ func (tok *EvictToken) isEvicted() bool {
|
||||
return false
|
||||
}
|
||||
|
||||
func (token *EvictToken) SetEvictable(isEvictable bool) {
|
||||
val := uint32(0)
|
||||
if isEvictable {
|
||||
val = 1
|
||||
}
|
||||
|
||||
atomic.StoreUint32(&token.evictable, val)
|
||||
}
|
||||
|
||||
func (tok *EvictToken) isEligible() bool {
|
||||
// if no resource limits are in place, then this
|
||||
// function is not eligible.
|
||||
@@ -71,39 +85,42 @@ func (tok *EvictToken) isEligible() bool {
|
||||
return true
|
||||
}
|
||||
|
||||
func (e *evictor) GetEvictor(id, slotId string, mem, cpu uint64) *EvictToken {
|
||||
func (e *evictor) CreateEvictToken(slotId string, mem, cpu uint64) *EvictToken {
|
||||
|
||||
key := tokenKey{
|
||||
id: id,
|
||||
id: id.New().String(),
|
||||
slotId: slotId,
|
||||
memory: mem,
|
||||
cpu: cpu,
|
||||
}
|
||||
|
||||
return &EvictToken{
|
||||
key: key,
|
||||
C: make(chan struct{}),
|
||||
token := &EvictToken{
|
||||
key: key,
|
||||
C: make(chan struct{}),
|
||||
DoneChan: make(chan struct{}),
|
||||
}
|
||||
}
|
||||
|
||||
func (e *evictor) RegisterEvictor(token *EvictToken) {
|
||||
if !token.isEligible() || token.isEvicted() {
|
||||
return
|
||||
if !token.isEligible() {
|
||||
return token
|
||||
}
|
||||
|
||||
e.lock.Lock()
|
||||
|
||||
// be paranoid, do not register if it's already there
|
||||
_, ok := e.tokens[token.key.id]
|
||||
if !ok {
|
||||
e.tokens[token.key.id] = token
|
||||
e.slots = append(e.slots, token.key)
|
||||
if ok {
|
||||
logrus.Fatalf("id collusion key=%+v", key)
|
||||
}
|
||||
|
||||
e.tokens[token.key.id] = token
|
||||
e.slots = append(e.slots, token.key)
|
||||
|
||||
e.lock.Unlock()
|
||||
|
||||
return token
|
||||
}
|
||||
|
||||
func (e *evictor) UnregisterEvictor(token *EvictToken) {
|
||||
if !token.isEligible() || token.isEvicted() {
|
||||
func (e *evictor) DeleteEvictToken(token *EvictToken) {
|
||||
if !token.isEligible() {
|
||||
return
|
||||
}
|
||||
|
||||
@@ -118,14 +135,18 @@ func (e *evictor) UnregisterEvictor(token *EvictToken) {
|
||||
delete(e.tokens, token.key.id)
|
||||
|
||||
e.lock.Unlock()
|
||||
|
||||
close(token.DoneChan)
|
||||
}
|
||||
|
||||
func (e *evictor) PerformEviction(slotId string, mem, cpu uint64) bool {
|
||||
func (e *evictor) PerformEviction(slotId string, mem, cpu uint64) []chan struct{} {
|
||||
var notifyChans []chan struct{}
|
||||
|
||||
// if no resources are defined for this function, then
|
||||
// we don't know what to do here. We cannot evict anyone
|
||||
// in this case.
|
||||
if mem == 0 && cpu == 0 {
|
||||
return false
|
||||
return notifyChans
|
||||
}
|
||||
|
||||
// Our eviction sum so far
|
||||
@@ -134,7 +155,7 @@ func (e *evictor) PerformEviction(slotId string, mem, cpu uint64) bool {
|
||||
isSatisfied := false
|
||||
|
||||
var keys []string
|
||||
var chans []chan struct{}
|
||||
var completionChans []chan struct{}
|
||||
|
||||
e.lock.Lock()
|
||||
|
||||
@@ -143,6 +164,10 @@ func (e *evictor) PerformEviction(slotId string, mem, cpu uint64) bool {
|
||||
if slotId == val.slotId {
|
||||
continue
|
||||
}
|
||||
// descend into map to verify evictable state
|
||||
if atomic.LoadUint32(&e.tokens[val.id].evictable) == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
totalMemory += val.memory
|
||||
totalCpu += val.cpu
|
||||
@@ -158,7 +183,9 @@ func (e *evictor) PerformEviction(slotId string, mem, cpu uint64) bool {
|
||||
// If we can satisfy the need, then let's commit/perform eviction
|
||||
if isSatisfied {
|
||||
|
||||
chans = make([]chan struct{}, 0, len(keys))
|
||||
notifyChans = make([]chan struct{}, 0, len(keys))
|
||||
completionChans = make([]chan struct{}, 0, len(keys))
|
||||
|
||||
idx := 0
|
||||
for _, id := range keys {
|
||||
|
||||
@@ -171,16 +198,18 @@ func (e *evictor) PerformEviction(slotId string, mem, cpu uint64) bool {
|
||||
}
|
||||
}
|
||||
|
||||
chans = append(chans, e.tokens[id].C)
|
||||
notifyChans = append(notifyChans, e.tokens[id].C)
|
||||
completionChans = append(completionChans, e.tokens[id].DoneChan)
|
||||
|
||||
delete(e.tokens, id)
|
||||
}
|
||||
}
|
||||
|
||||
e.lock.Unlock()
|
||||
|
||||
for _, ch := range chans {
|
||||
for _, ch := range notifyChans {
|
||||
close(ch)
|
||||
}
|
||||
|
||||
return isSatisfied
|
||||
return completionChans
|
||||
}
|
||||
|
||||
@@ -4,30 +4,30 @@ import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func getACall(id, slot string, mem, cpu int) (string, string, uint64, uint64) {
|
||||
return id, slot, uint64(mem), uint64(cpu)
|
||||
func getACall(slot string, mem, cpu int) (string, uint64, uint64) {
|
||||
return slot, uint64(mem), uint64(cpu)
|
||||
}
|
||||
|
||||
func TestEvictorSimple01(t *testing.T) {
|
||||
evictor := NewEvictor()
|
||||
|
||||
slotId := "slot1"
|
||||
id1, _, mem1, cpu1 := getACall("id1", slotId, 1, 100)
|
||||
id2, _, mem2, cpu2 := getACall("id2", slotId, 1, 100)
|
||||
_, mem1, cpu1 := getACall(slotId, 1, 100)
|
||||
_, mem2, cpu2 := getACall(slotId, 1, 100)
|
||||
|
||||
token1 := evictor.GetEvictor(id1, slotId, mem1, cpu1)
|
||||
token2 := evictor.GetEvictor(id2, slotId, mem2, cpu2)
|
||||
token1 := evictor.CreateEvictToken(slotId, mem1, cpu1)
|
||||
token2 := evictor.CreateEvictToken(slotId, mem2, cpu2)
|
||||
|
||||
evictor.RegisterEvictor(token1)
|
||||
evictor.RegisterEvictor(token2)
|
||||
token1.SetEvictable(true)
|
||||
token2.SetEvictable(true)
|
||||
|
||||
if evictor.PerformEviction(slotId, mem1, cpu1) {
|
||||
if len(evictor.PerformEviction(slotId, mem1, cpu1)) > 0 {
|
||||
t.Fatalf("We should not be able to self evict")
|
||||
}
|
||||
if evictor.PerformEviction("foo", 0, 0) {
|
||||
if len(evictor.PerformEviction("foo", 0, 0)) > 0 {
|
||||
t.Fatalf("We should not be able to evict: zero cpu/mem")
|
||||
}
|
||||
if evictor.PerformEviction("foo", 1, 300) {
|
||||
if len(evictor.PerformEviction("foo", 1, 300)) > 0 {
|
||||
t.Fatalf("We should not be able to evict (resource not enough)")
|
||||
}
|
||||
|
||||
@@ -38,7 +38,7 @@ func TestEvictorSimple01(t *testing.T) {
|
||||
t.Fatalf("should not be evicted")
|
||||
}
|
||||
|
||||
if !evictor.PerformEviction("foo", 1, 100) {
|
||||
if len(evictor.PerformEviction("foo", 1, 100)) != 1 {
|
||||
t.Fatalf("We should be able to evict")
|
||||
}
|
||||
|
||||
@@ -49,44 +49,44 @@ func TestEvictorSimple01(t *testing.T) {
|
||||
t.Fatalf("should not be evicted")
|
||||
}
|
||||
|
||||
evictor.UnregisterEvictor(token1)
|
||||
evictor.UnregisterEvictor(token2)
|
||||
evictor.DeleteEvictToken(token1)
|
||||
evictor.DeleteEvictToken(token2)
|
||||
}
|
||||
|
||||
func TestEvictorSimple02(t *testing.T) {
|
||||
evictor := NewEvictor()
|
||||
|
||||
id1, slotId1, mem1, cpu1 := getACall("id1", "slot1", 1, 100)
|
||||
id2, slotId2, mem2, cpu2 := getACall("id2", "slot1", 1, 100)
|
||||
slotId1, mem1, cpu1 := getACall("slot1", 1, 100)
|
||||
slotId2, mem2, cpu2 := getACall("slot1", 1, 100)
|
||||
|
||||
token1 := evictor.GetEvictor(id1, slotId1, mem1, cpu1)
|
||||
token2 := evictor.GetEvictor(id2, slotId2, mem2, cpu2)
|
||||
token1 := evictor.CreateEvictToken(slotId1, mem1, cpu1)
|
||||
token2 := evictor.CreateEvictToken(slotId2, mem2, cpu2)
|
||||
|
||||
// add/rm/add
|
||||
evictor.RegisterEvictor(token1)
|
||||
evictor.UnregisterEvictor(token1)
|
||||
evictor.RegisterEvictor(token1)
|
||||
token1.SetEvictable(true)
|
||||
token1.SetEvictable(false)
|
||||
token1.SetEvictable(true)
|
||||
|
||||
// add/rm
|
||||
evictor.RegisterEvictor(token2)
|
||||
evictor.UnregisterEvictor(token2)
|
||||
token2.SetEvictable(true)
|
||||
token2.SetEvictable(false)
|
||||
|
||||
if evictor.PerformEviction(slotId1, mem1, cpu1) {
|
||||
if len(evictor.PerformEviction(slotId1, mem1, cpu1)) > 0 {
|
||||
t.Fatalf("We should not be able to self evict")
|
||||
}
|
||||
if evictor.PerformEviction("foo", 0, 0) {
|
||||
if len(evictor.PerformEviction("foo", 0, 0)) > 0 {
|
||||
t.Fatalf("We should not be able to evict: zero cpu/mem")
|
||||
}
|
||||
if token1.isEvicted() {
|
||||
t.Fatalf("should not be evicted")
|
||||
}
|
||||
|
||||
evictor.UnregisterEvictor(token1)
|
||||
token1.SetEvictable(false)
|
||||
|
||||
// not registered... but should be OK
|
||||
evictor.UnregisterEvictor(token2)
|
||||
token2.SetEvictable(false)
|
||||
|
||||
if evictor.PerformEviction("foo", mem1, cpu1) {
|
||||
if len(evictor.PerformEviction("foo", mem1, cpu1)) > 0 {
|
||||
t.Fatalf("We should not be able to evict (unregistered)")
|
||||
}
|
||||
if token1.isEvicted() {
|
||||
@@ -102,22 +102,22 @@ func TestEvictorSimple03(t *testing.T) {
|
||||
|
||||
taboo := "foo"
|
||||
slotId := "slot1"
|
||||
id0, slotId0, mem0, cpu0 := getACall("id0", taboo, 1, 100)
|
||||
id1, _, mem1, cpu1 := getACall("id1", slotId, 1, 100)
|
||||
id2, _, mem2, cpu2 := getACall("id2", slotId, 1, 100)
|
||||
id3, _, mem3, cpu3 := getACall("id3", slotId, 1, 100)
|
||||
slotId0, mem0, cpu0 := getACall(taboo, 1, 100)
|
||||
_, mem1, cpu1 := getACall(slotId, 1, 100)
|
||||
_, mem2, cpu2 := getACall(slotId, 1, 100)
|
||||
_, mem3, cpu3 := getACall(slotId, 1, 100)
|
||||
|
||||
token0 := evictor.GetEvictor(id0, slotId0, mem0, cpu0)
|
||||
token1 := evictor.GetEvictor(id1, slotId, mem1, cpu1)
|
||||
token2 := evictor.GetEvictor(id2, slotId, mem2, cpu2)
|
||||
token3 := evictor.GetEvictor(id3, slotId, mem3, cpu3)
|
||||
token0 := evictor.CreateEvictToken(slotId0, mem0, cpu0)
|
||||
token1 := evictor.CreateEvictToken(slotId, mem1, cpu1)
|
||||
token2 := evictor.CreateEvictToken(slotId, mem2, cpu2)
|
||||
token3 := evictor.CreateEvictToken(slotId, mem3, cpu3)
|
||||
|
||||
evictor.RegisterEvictor(token0)
|
||||
evictor.RegisterEvictor(token1)
|
||||
evictor.RegisterEvictor(token2)
|
||||
evictor.RegisterEvictor(token3)
|
||||
token0.SetEvictable(true)
|
||||
token1.SetEvictable(true)
|
||||
token2.SetEvictable(true)
|
||||
token3.SetEvictable(true)
|
||||
|
||||
if !evictor.PerformEviction(taboo, 1, 200) {
|
||||
if len(evictor.PerformEviction(taboo, 1, 200)) == 0 {
|
||||
t.Fatalf("We should be able to evict")
|
||||
}
|
||||
|
||||
@@ -136,8 +136,8 @@ func TestEvictorSimple03(t *testing.T) {
|
||||
t.Fatalf("should not be evicted")
|
||||
}
|
||||
|
||||
evictor.UnregisterEvictor(token0)
|
||||
evictor.UnregisterEvictor(token1)
|
||||
evictor.UnregisterEvictor(token2)
|
||||
evictor.UnregisterEvictor(token3)
|
||||
evictor.DeleteEvictToken(token0)
|
||||
evictor.DeleteEvictToken(token1)
|
||||
evictor.DeleteEvictToken(token2)
|
||||
evictor.DeleteEvictToken(token3)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user