fn: agent eviction revisited (#1131)
* fn: agent eviction revisited
Previously, the hot-container eviction logic used the number of
waiters for cpu/mem resources to decide whether to evict a container.
An ejection ticker woke up its associated container every 1 sec to
reassess system load based on the waiter count. However, this does not
work for a non-blocking agent, since there are no waiters in
non-blocking mode.
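To make that concrete, here is a toy illustration of the problem, not fn code: resource waiters are only counted while a request blocks for a cpu/mem token, and a non-blocking token grab never blocks, so an eviction rule keyed on the waiter count never fires. All names here (waiters, getTokenBlocking, getTokenNonBlocking) are invented for the demo.

package main

import (
	"fmt"
	"sync/atomic"
)

var waiters int64 // stand-in for a resource tracker's waiter count

// getTokenBlocking models the blocking agent: the request parks on the
// token channel and is visible as a waiter until capacity frees up.
func getTokenBlocking(tokens chan struct{}) {
	atomic.AddInt64(&waiters, 1)
	<-tokens
	atomic.AddInt64(&waiters, -1)
}

// getTokenNonBlocking models the non-blocking agent: if no token is
// available right now, give up immediately and never register as a waiter.
func getTokenNonBlocking(tokens chan struct{}) bool {
	select {
	case <-tokens:
		return true
	default:
		return false
	}
}

func main() {
	tokens := make(chan struct{}) // zero capacity: the machine is full
	got := getTokenNonBlocking(tokens)
	// The old ejection ticker keyed eviction on a count like this one, so in
	// non-blocking mode it stayed at zero and idle containers were never evicted.
	fmt.Println("got token:", got, "waiters:", atomic.LoadInt64(&waiters))
}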
Background on blocking versus non-blocking agents:
*) A blocking agent holds a request until the request is serviced or
the client times out. It assumes the request can eventually be
serviced once idle containers eject themselves or busy containers
finish their work.
*) Non-blocking mode tries to limit this wait time. However, the
non-blocking agent has never been truly non-blocking; it simply means
that we only make a request wait if we take some action in the system.
Non-blocking agents are configured with a much higher hot-poll
frequency to make the system more responsive and to handle cases where
a too-busy event is missed by the request. This can happen because the
communication between the hot launcher and waiting requests is not 1-1
and is lossy: another request can arrive for the same slot queue and
receive the too-busy response before the original request does.
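The lossy part is visible in the shape of tryNotify, which appears in the checkLaunch hunk below and is presumably a non-blocking send: a too-busy signal that no waiter (or the wrong waiter) is ready to receive is simply dropped. A minimal self-contained sketch of that idiom, with a buffered channel standing in for a waiting request; this is illustrative only, not the agent's actual helper.

package main

import (
	"errors"
	"fmt"
)

// tryNotify sketches the launcher-side notification: deliver err if some
// waiter can take it right now, otherwise drop it on the floor.
func tryNotify(ch chan error, err error) {
	select {
	case ch <- err:
	default: // no receiver / buffer full: the signal is lost
	}
}

func main() {
	notify := make(chan error, 1)             // stands in for a waiting request
	tryNotify(notify, errors.New("too busy")) // delivered into the buffer
	tryNotify(notify, errors.New("too busy")) // dropped: nobody consumed the first one yet
	// Whichever request reads first gets the one delivered signal; the other
	// never hears anything, which is why a high hot-poll frequency is needed
	// as a backstop.
	fmt.Println(<-notify)
}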
This change introduces an evictor with which each hot container can
register itself once it has been idle for more than 1 second. Upon
registration, these idle containers become eligible for eviction.
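A minimal sketch of such an evictor, assuming the method names that appear in the diff below (GetEvictor, RegisterEvictor, UnregisterEvictor, PerformEviction) and a token whose channel C is closed to signal eviction. The single-container policy and the "never evict a container from the requesting slot queue" rule are assumptions made for illustration; the real Evictor in the agent package may differ in detail.

package main

import (
	"fmt"
	"sync"
)

// EvictToken is what an idle hot container registers; C is closed when the
// container has been picked for eviction. (Hypothetical shape, not fn's type.)
type EvictToken struct {
	id       string
	slotKey  string
	mem, cpu uint64
	C        chan struct{}
}

// evictorSketch tracks the currently evictable (idle) containers.
type evictorSketch struct {
	mu     sync.Mutex
	tokens []*EvictToken
}

func (e *evictorSketch) GetEvictor(id, slotKey string, mem, cpu uint64) *EvictToken {
	return &EvictToken{id: id, slotKey: slotKey, mem: mem, cpu: cpu, C: make(chan struct{})}
}

// RegisterEvictor marks a container as eligible for eviction (idle too long).
func (e *evictorSketch) RegisterEvictor(t *EvictToken) {
	e.mu.Lock()
	e.tokens = append(e.tokens, t)
	e.mu.Unlock()
}

// UnregisterEvictor withdraws a container that got work again or is exiting anyway.
func (e *evictorSketch) UnregisterEvictor(t *EvictToken) {
	e.mu.Lock()
	for i, cand := range e.tokens {
		if cand == t {
			e.tokens = append(e.tokens[:i], e.tokens[i+1:]...)
			break
		}
	}
	e.mu.Unlock()
}

// PerformEviction tries to free mem/cpu for slotKey by evicting an idle
// container that belongs to a different slot queue. It reports whether it
// evicted anything, so the caller knows whether waiting longer is worthwhile.
func (e *evictorSketch) PerformEviction(slotKey string, mem, cpu uint64) bool {
	e.mu.Lock()
	defer e.mu.Unlock()
	for i, t := range e.tokens {
		if t.slotKey != slotKey && t.mem >= mem && t.cpu >= cpu {
			e.tokens = append(e.tokens[:i], e.tokens[i+1:]...)
			close(t.C) // wake the idle container so it tears itself down
			return true
		}
	}
	return false
}

func main() {
	ev := &evictorSketch{}

	// An idle container serving slot queue "sq-A" registers itself.
	tok := ev.GetEvictor("call-1", "sq-A", 256, 1)
	ev.RegisterEvictor(tok)

	// A launcher for slot queue "sq-B" is out of capacity and, as a last
	// resort, asks for an eviction before reporting too-busy.
	fmt.Println("evicted for sq-B:", ev.PerformEviction("sq-B", 256, 1))

	// The idle container's select loop observes the closed channel and exits.
	_, open := <-tok.C
	fmt.Println("sq-A token channel still open:", open)
}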
In the hot-container launcher, in non-blocking mode, we now attempt an
eviction before emitting a too-busy response. If the eviction succeeds,
we wait some more. This can result in requests waiting longer than they
used to, but only if a container was actually evicted. In blocking
mode, the hot launcher uses the hot-poll period to decide that a
request has waited too long, and then an eviction is triggered.
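In blocking mode, the same hot-poll timer that paces waiting requests doubles as the "waited long enough" signal. A toy loop showing the shape of that retry-then-evict cycle, with an invented hotPoll duration and a fake capacity channel; it is not the checkLaunch code itself, which follows in the diff below.

package main

import (
	"fmt"
	"time"
)

func main() {
	hotPoll := 100 * time.Millisecond // stand-in for the agent's hot-poll period
	capacity := make(chan struct{})   // a token arrives here when cpu/mem frees up

	// Pretend an eviction elsewhere frees capacity after ~250ms.
	go func() {
		time.Sleep(250 * time.Millisecond)
		capacity <- struct{}{}
	}()

	for attempt := 1; ; attempt++ {
		select {
		case <-capacity:
			fmt.Println("got resources on attempt", attempt, "- launching hot container")
			return
		case <-time.After(hotPoll):
			// We have now waited a full hot-poll period for cpu/mem; treat
			// that as "waited too long" and try to evict an idle container.
			fmt.Println("attempt", attempt, ": still waiting, trying an eviction")
		}
	}
}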
@@ -97,6 +97,7 @@ type agent struct {
 	driver drivers.Driver

 	slotMgr *slotQueueMgr
+	evictor Evictor
 	// track usage
 	resources ResourceTracker

@@ -129,6 +130,7 @@ func New(da CallHandler, options ...AgentOption) Agent {
 	a.shutWg = common.NewWaitGroup()
 	a.da = da
 	a.slotMgr = NewSlotQueueMgr()
+	a.evictor = NewEvictor()

 	// Allow overriding config
 	for _, option := range options {
@@ -421,7 +423,12 @@ func (a *agent) getSlot(ctx context.Context, call *call) (Slot, error) {
 	if protocol.IsStreamable(protocol.Protocol(call.Format)) {
 		// For hot requests, we use a long lived slot queue, which we use to manage hot containers
 		var isNew bool
-		call.slots, isNew = a.slotMgr.getSlotQueue(call)
+
+		if call.slotHashId == "" {
+			call.slotHashId = getSlotQueueKey(call)
+		}
+
+		call.slots, isNew = a.slotMgr.getSlotQueue(call.slotHashId)
 		call.requestState.UpdateState(ctx, RequestStateWait, call.slots)
 		if isNew {
 			go a.hotLauncher(ctx, call)
@@ -501,7 +508,7 @@ func (a *agent) checkLaunch(ctx context.Context, call *call, notifyChan chan err
 	state := NewContainerState()
 	state.UpdateState(ctx, ContainerStateWait, call.slots)

-	common.Logger(ctx).WithFields(logrus.Fields{"currentStats": call.slots.getStats(), "isNeeded": isNeeded}).Info("Hot function launcher starting hot container")
+	common.Logger(ctx).WithFields(logrus.Fields{"currentStats": call.slots.getStats(), "isNeeded": isNeeded}).Debug("Hot function launcher attempting to start a container")

 	mem := call.Memory + uint64(call.TmpFsSize)

@@ -525,18 +532,27 @@ func (a *agent) checkLaunch(ctx context.Context, call *call, notifyChan chan err
 	select {
 	case tok := <-a.resources.GetResourceToken(ctx, mem, uint64(call.CPUs), isAsync, isNB):
 		if tok != nil && tok.Error() != nil {
-			tryNotify(notifyChan, tok.Error())
+			// before returning error response, as a last resort, try evicting idle containers.
+			if tok.Error() != CapacityFull || !a.evictor.PerformEviction(call.slotHashId, mem, uint64(call.CPUs)) {
+				tryNotify(notifyChan, tok.Error())
+			}
 		} else if a.shutWg.AddSession(1) {
 			go func() {
 				// NOTE: runHot will not inherit the timeout from ctx (ignore timings)
 				a.runHot(ctx, call, tok, state)
 				a.shutWg.DoneSession()
 			}()
 			// early return (do not allow container state to switch to ContainerStateDone)
 			return
 		}
 		if tok != nil {
 			tok.Close()
 		}
+	// Request routines are polling us with this a.cfg.HotPoll frequency. We can use this
+	// same timer to assume that we waited for cpu/mem long enough. Let's try to evict an
+	// idle container.
+	case <-time.After(a.cfg.HotPoll):
+		a.evictor.PerformEviction(call.slotHashId, mem, uint64(call.CPUs))
 	case <-ctx.Done(): // timeout
 	case <-a.shutWg.Closer(): // server shutdown
 	}
@@ -913,14 +929,15 @@ func (a *agent) runHotReq(ctx context.Context, call *call, state ContainerState,

 	var err error
 	isFrozen := false
+	isEvictable := false

 	freezeTimer := time.NewTimer(a.cfg.FreezeIdle)
 	idleTimer := time.NewTimer(time.Duration(call.IdleTimeout) * time.Second)
-	ejectTicker := time.NewTicker(a.cfg.EjectIdle)
+	ejectTimer := time.NewTimer(a.cfg.EjectIdle)

 	defer freezeTimer.Stop()
 	defer idleTimer.Stop()
-	defer ejectTicker.Stop()
+	defer ejectTimer.Stop()

 	// log if any error is encountered
 	defer func() {
@@ -938,6 +955,8 @@ func (a *agent) runHotReq(ctx context.Context, call *call, state ContainerState,
 		isFrozen = true
 	}

+	evictor := a.evictor.GetEvictor(call.ID, call.slotHashId, call.Memory+uint64(call.TmpFsSize), uint64(call.CPUs))
+
 	state.UpdateState(ctx, ContainerStateIdle, call.slots)
 	s := call.slots.queueSlot(slot)

@@ -956,19 +975,21 @@ func (a *agent) runHotReq(ctx context.Context, call *call, state ContainerState,
 				isFrozen = true
 			}
 			continue
-		case <-ejectTicker.C:
-			// if someone is waiting for resource in our slot queue, we must not terminate,
-			// otherwise, see if other slot queues have resource waiters that are blocked.
-			stats := call.slots.getStats()
-			if stats.containerStates[ContainerStateWait] > 0 ||
-				a.resources.GetResourceTokenWaiterCount() <= 0 {
-				continue
-			}
+		case <-evictor.C:
+			logger.Debug("attempting hot function eject")
+		case <-ejectTimer.C:
+			// we've been idle too long, now we are ejectable
+			a.evictor.RegisterEvictor(evictor)
+			isEvictable = true
+			continue
 		}
 		break
 	}

+	if isEvictable {
+		a.evictor.UnregisterEvictor(evictor)
+	}

 	// if we can acquire token, that means we are here due to
 	// abort/shutdown/timeout, attempt to acquire and terminate,
 	// otherwise continue processing the request
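To tie the pieces together, here is a self-contained toy of the idle-loop shape in runHotReq after this change: the container waits for work, freezes after FreezeIdle, registers with the evictor after EjectIdle, and exits either on IdleTimeout or when the evictor closes its token channel. All durations and channels below are invented for the demo and are not fn's defaults.

package main

import (
	"fmt"
	"time"
)

func main() {
	work := make(chan string)     // stands in for the slot queue handing us a request
	evictC := make(chan struct{}) // stands in for the evictor token's C channel

	freezeTimer := time.NewTimer(50 * time.Millisecond) // FreezeIdle (made up)
	ejectTimer := time.NewTimer(100 * time.Millisecond) // EjectIdle (made up)
	idleTimer := time.NewTimer(400 * time.Millisecond)  // IdleTimeout (made up)
	defer freezeTimer.Stop()
	defer ejectTimer.Stop()
	defer idleTimer.Stop()

	// Simulate PerformEviction choosing this container shortly after it became evictable.
	go func() {
		time.Sleep(200 * time.Millisecond)
		close(evictC)
	}()

	for {
		select {
		case req := <-work:
			fmt.Println("unfreeze and serve:", req)
			continue
		case <-freezeTimer.C:
			fmt.Println("idle: freeze the container (keep it warm)")
			continue
		case <-ejectTimer.C:
			fmt.Println("idle too long: register with the evictor, now evictable")
			continue
		case <-evictC:
			fmt.Println("evicted: another slot queue needed the resources")
		case <-idleTimer.C:
			fmt.Println("idle timeout: shut down")
		}
		break
	}
}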