fn: agent eviction revisited (#1131)

* fn: agent eviction revisited

Previously, the hot-container eviction logic used the
number of waiters on cpu/mem resources to decide whether
to evict a container. An ejection ticker would wake up
its associated container every 1 sec to reassess system
load based on the waiter count. However, this does not
work for the non-blocking agent, since there are no
waiters in non-blocking mode.
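
For reference, the old per-container check looked roughly
like the sketch below (condensed from the ejectTicker case
removed in the diff; waitersHere and globalWaiters are
stand-ins for the slot-queue stats and the ResourceTracker
waiter count used by the real code):

    package oldeject

    import "time"

    // ejectLoop is a condensed stand-in for the removed ejectTicker logic in
    // runHotReq: every EjectIdle tick the idle container re-checks system load
    // and only ejects itself when other slot queues have blocked resource waiters.
    func ejectLoop(ejectIdle time.Duration, waitersHere, globalWaiters func() int, done <-chan struct{}) bool {
        ticker := time.NewTicker(ejectIdle)
        defer ticker.Stop()
        for {
            select {
            case <-done:
                return false
            case <-ticker.C:
                // never terminate while requests wait on our own slot queue,
                // and only terminate if some other queue is starved for cpu/mem
                if waitersHere() > 0 || globalWaiters() <= 0 {
                    continue
                }
                return true // eject this idle container
            }
        }
    }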

Background on blocking versus non-blocking agent:
    *) The blocking agent holds a request until the
    request is serviced or the client times out. It
    assumes the request can eventually be serviced once
    idle containers eject themselves or busy containers
    finish their work.
    *) Non-blocking mode tries to limit this wait time.
    However, the non-blocking agent has never been truly
    non-blocking; it simply means we only make a request
    wait if we take some action in the system.
    Non-blocking agents are configured with a much
    higher hot-poll frequency to make the system more
    responsive and to handle cases where a too-busy
    event is missed by the request. This is because the
    communication between the hot launcher and waiting
    requests is not 1-1 and is lossy: another request
    arriving on the same slot queue can receive the
    too-busy response before the original request does
    (see the toy example after this list).
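
To illustrate the lossy signaling, here is a self-contained
toy (not fn code): one too-busy error is emitted for the
slot queue, but whichever waiting request polls first
consumes it, so the other request has to fall back to its
hot-poll timer:

    package main

    import (
        "errors"
        "fmt"
        "time"
    )

    // Toy illustration only (not fn code): the launcher emits a single too-busy
    // signal for a slot queue, but two requests are waiting on it. Whichever
    // request receives first wins; the other misses the signal and has to rely
    // on its hot-poll timer, which is why non-blocking agents poll more often.
    func main() {
        tooBusy := make(chan error, 1)

        for i := 1; i <= 2; i++ {
            go func(id int) {
                select {
                case err := <-tooBusy:
                    fmt.Printf("request %d received: %v\n", id, err)
                case <-time.After(100 * time.Millisecond):
                    fmt.Printf("request %d missed the signal, falls back to hot poll\n", id)
                }
            }(i)
        }

        tooBusy <- errors.New("too busy")
        time.Sleep(200 * time.Millisecond)
    }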

Introducing an evictor with which each hot container can
register itself once it has been idle for more than
1 second. Upon registration, these idle containers become
eligible for eviction.
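
A minimal sketch of such an evictor is below. The method
and field names mirror the diff, but this is an
illustrative sketch, not the actual implementation (the
real one tracks resources more carefully and may evict
more than one container to satisfy a request):

    package evict

    import "sync"

    // evictor is the per-container registration token; C is signaled when the
    // container should shut itself down.
    type evictor struct {
        id         string
        slotHashId string
        mem, cpu   uint64
        C          chan struct{}
    }

    // Evictor keeps the set of currently idle, evictable hot containers.
    type Evictor struct {
        mu   sync.Mutex
        idle []*evictor // registration order: oldest-idle first
    }

    func NewEvictor() *Evictor { return &Evictor{} }

    func (e *Evictor) GetEvictor(id, slotHashId string, mem, cpu uint64) *evictor {
        return &evictor{id: id, slotHashId: slotHashId, mem: mem, cpu: cpu, C: make(chan struct{}, 1)}
    }

    func (e *Evictor) RegisterEvictor(v *evictor) {
        e.mu.Lock()
        defer e.mu.Unlock()
        e.idle = append(e.idle, v)
    }

    func (e *Evictor) UnregisterEvictor(v *evictor) {
        e.mu.Lock()
        defer e.mu.Unlock()
        for i, cand := range e.idle {
            if cand == v {
                e.idle = append(e.idle[:i], e.idle[i+1:]...)
                return
            }
        }
    }

    // PerformEviction wakes an idle container from a *different* slot queue
    // whose resources would cover the mem/cpu this caller is waiting for.
    // It returns true if a container was evicted.
    func (e *Evictor) PerformEviction(slotHashId string, mem, cpu uint64) bool {
        e.mu.Lock()
        defer e.mu.Unlock()
        for i, cand := range e.idle {
            if cand.slotHashId != slotHashId && cand.mem >= mem && cand.cpu >= cpu {
                cand.C <- struct{}{} // non-blocking: C has capacity 1
                e.idle = append(e.idle[:i], e.idle[i+1:]...)
                return true
            }
        }
        return false
    }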

In the hot container launcher, in non-blocking mode, we
now attempt an eviction before emitting a too-busy
response. If the eviction succeeds, we wait some more.
This can result in requests waiting longer than they
used to, but only if a container was actually evicted.
In blocking mode, the hot launcher uses the hot-poll
period to decide that a request has waited too long,
and then triggers an eviction.
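
Schematically (a compilable stand-in for the checkLaunch
change shown in the diff below; tryEvict and notifyTooBusy
stand in for evictor.PerformEviction and tryNotify):

    package launcher

    // onResourceError mirrors the non-blocking branch: when the resource token
    // comes back with a capacity error, first try to evict an idle container;
    // only if that fails is the too-busy error propagated to the waiting request.
    func onResourceError(capacityFull bool, tryEvict func() bool, notifyTooBusy func()) {
        if capacityFull && tryEvict() {
            return // freed some room; let the request keep waiting
        }
        notifyTooBusy()
    }

    // onHotPoll mirrors the blocking-mode path: each expiry of the hot-poll
    // timer is treated as "this request has waited long enough", and an
    // eviction attempt is made before going back to waiting for resources.
    func onHotPoll(tryEvict func() bool) {
        tryEvict()
    }
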
Author: Tolga Ceylan
Date: 2018-07-19 15:04:15 -07:00 (committed by GitHub)
Parent: 8e373005a0
Commit: 1258baeb7f
6 changed files with 365 additions and 40 deletions

@@ -97,6 +97,7 @@ type agent struct {
driver drivers.Driver
slotMgr *slotQueueMgr
evictor Evictor
// track usage
resources ResourceTracker
@@ -129,6 +130,7 @@ func New(da CallHandler, options ...AgentOption) Agent {
a.shutWg = common.NewWaitGroup()
a.da = da
a.slotMgr = NewSlotQueueMgr()
a.evictor = NewEvictor()
// Allow overriding config
for _, option := range options {
@@ -421,7 +423,12 @@ func (a *agent) getSlot(ctx context.Context, call *call) (Slot, error) {
if protocol.IsStreamable(protocol.Protocol(call.Format)) {
// For hot requests, we use a long lived slot queue, which we use to manage hot containers
var isNew bool
call.slots, isNew = a.slotMgr.getSlotQueue(call)
if call.slotHashId == "" {
call.slotHashId = getSlotQueueKey(call)
}
call.slots, isNew = a.slotMgr.getSlotQueue(call.slotHashId)
call.requestState.UpdateState(ctx, RequestStateWait, call.slots)
if isNew {
go a.hotLauncher(ctx, call)
@@ -501,7 +508,7 @@ func (a *agent) checkLaunch(ctx context.Context, call *call, notifyChan chan err
state := NewContainerState()
state.UpdateState(ctx, ContainerStateWait, call.slots)
common.Logger(ctx).WithFields(logrus.Fields{"currentStats": call.slots.getStats(), "isNeeded": isNeeded}).Info("Hot function launcher starting hot container")
common.Logger(ctx).WithFields(logrus.Fields{"currentStats": call.slots.getStats(), "isNeeded": isNeeded}).Debug("Hot function launcher attempting to start a container")
mem := call.Memory + uint64(call.TmpFsSize)
@@ -525,18 +532,27 @@ func (a *agent) checkLaunch(ctx context.Context, call *call, notifyChan chan err
select {
case tok := <-a.resources.GetResourceToken(ctx, mem, uint64(call.CPUs), isAsync, isNB):
if tok != nil && tok.Error() != nil {
tryNotify(notifyChan, tok.Error())
// before returning error response, as a last resort, try evicting idle containers.
if tok.Error() != CapacityFull || !a.evictor.PerformEviction(call.slotHashId, mem, uint64(call.CPUs)) {
tryNotify(notifyChan, tok.Error())
}
} else if a.shutWg.AddSession(1) {
go func() {
// NOTE: runHot will not inherit the timeout from ctx (ignore timings)
a.runHot(ctx, call, tok, state)
a.shutWg.DoneSession()
}()
// early return (do not allow container state to switch to ContainerStateDone)
return
}
if tok != nil {
tok.Close()
}
// Request routines are polling us with this a.cfg.HotPoll frequency. We can use this
// same timer to assume that we waited for cpu/mem long enough. Let's try to evict an
// idle container.
case <-time.After(a.cfg.HotPoll):
a.evictor.PerformEviction(call.slotHashId, mem, uint64(call.CPUs))
case <-ctx.Done(): // timeout
case <-a.shutWg.Closer(): // server shutdown
}
@@ -913,14 +929,15 @@ func (a *agent) runHotReq(ctx context.Context, call *call, state ContainerState,
var err error
isFrozen := false
isEvictable := false
freezeTimer := time.NewTimer(a.cfg.FreezeIdle)
idleTimer := time.NewTimer(time.Duration(call.IdleTimeout) * time.Second)
ejectTicker := time.NewTicker(a.cfg.EjectIdle)
ejectTimer := time.NewTimer(a.cfg.EjectIdle)
defer freezeTimer.Stop()
defer idleTimer.Stop()
defer ejectTicker.Stop()
defer ejectTimer.Stop()
// log if any error is encountered
defer func() {
@@ -938,6 +955,8 @@ func (a *agent) runHotReq(ctx context.Context, call *call, state ContainerState,
isFrozen = true
}
evictor := a.evictor.GetEvictor(call.ID, call.slotHashId, call.Memory+uint64(call.TmpFsSize), uint64(call.CPUs))
state.UpdateState(ctx, ContainerStateIdle, call.slots)
s := call.slots.queueSlot(slot)
@@ -956,19 +975,21 @@ func (a *agent) runHotReq(ctx context.Context, call *call, state ContainerState,
isFrozen = true
}
continue
case <-ejectTicker.C:
// if someone is waiting for resource in our slot queue, we must not terminate,
// otherwise, see if other slot queues have resource waiters that are blocked.
stats := call.slots.getStats()
if stats.containerStates[ContainerStateWait] > 0 ||
a.resources.GetResourceTokenWaiterCount() <= 0 {
continue
}
case <-evictor.C:
logger.Debug("attempting hot function eject")
case <-ejectTimer.C:
// we've been idle too long, now we are ejectable
a.evictor.RegisterEvictor(evictor)
isEvictable = true
continue
}
break
}
if isEvictable {
a.evictor.UnregisterEvictor(evictor)
}
// if we can acquire token, that means we are here due to
// abort/shutdown/timeout, attempt to acquire and terminate,
// otherwise continue processing the request