fn: agent eviction revisited (#1131)

* fn: agent eviction revisited

Previously, the hot-container eviction logic used the
number of waiters on cpu/mem resources to decide whether
to evict a container. An ejection ticker would wake up
its associated container every 1 sec to reassess system
load based on the waiter count. However, this does not
work for the non-blocking agent, since there are no
waiters in non-blocking mode.
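
For reference, the old per-container check looked roughly
like the sketch below (condensed from the ejectTicker case
removed in the diff; waitersHere and globalWaiters are
stand-ins for the slot-queue stats and the ResourceTracker
waiter count used by the real code):

    package oldeject

    import "time"

    // ejectLoop is a condensed stand-in for the removed ejectTicker logic in
    // runHotReq: every EjectIdle tick the idle container re-checks system load
    // and only ejects itself when other slot queues have blocked resource waiters.
    func ejectLoop(ejectIdle time.Duration, waitersHere, globalWaiters func() int, done <-chan struct{}) bool {
        ticker := time.NewTicker(ejectIdle)
        defer ticker.Stop()
        for {
            select {
            case <-done:
                return false
            case <-ticker.C:
                // never terminate while requests wait on our own slot queue,
                // and only terminate if some other queue is starved for cpu/mem
                if waitersHere() > 0 || globalWaiters() <= 0 {
                    continue
                }
                return true // eject this idle container
            }
        }
    }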

Background on blocking versus non-blocking agent:
    *) The blocking agent holds a request until the
    request is serviced or the client times out. It
    assumes the request can eventually be serviced once
    idle containers eject themselves or busy containers
    finish their work.
    *) Non-blocking mode tries to limit this wait time.
    However, the non-blocking agent has never been truly
    non-blocking; it simply means we only make a request
    wait if we take some action in the system.
    Non-blocking agents are configured with a much
    higher hot-poll frequency to make the system more
    responsive and to handle cases where a too-busy
    event is missed by the request. This is because the
    communication between the hot launcher and waiting
    requests is not 1-1 and is lossy: another request
    arriving on the same slot queue can receive the
    too-busy response before the original request does
    (see the toy example after this list).
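
To illustrate the lossy signaling, here is a self-contained
toy (not fn code): one too-busy error is emitted for the
slot queue, but whichever waiting request polls first
consumes it, so the other request has to fall back to its
hot-poll timer:

    package main

    import (
        "errors"
        "fmt"
        "time"
    )

    // Toy illustration only (not fn code): the launcher emits a single too-busy
    // signal for a slot queue, but two requests are waiting on it. Whichever
    // request receives first wins; the other misses the signal and has to rely
    // on its hot-poll timer, which is why non-blocking agents poll more often.
    func main() {
        tooBusy := make(chan error, 1)

        for i := 1; i <= 2; i++ {
            go func(id int) {
                select {
                case err := <-tooBusy:
                    fmt.Printf("request %d received: %v\n", id, err)
                case <-time.After(100 * time.Millisecond):
                    fmt.Printf("request %d missed the signal, falls back to hot poll\n", id)
                }
            }(i)
        }

        tooBusy <- errors.New("too busy")
        time.Sleep(200 * time.Millisecond)
    }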

Introducing an evictor with which each hot container can
register itself once it has been idle for more than
1 second. Upon registration, these idle containers become
eligible for eviction.
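
A minimal sketch of such an evictor is below. The method
and field names mirror the diff, but this is an
illustrative sketch, not the actual implementation (the
real one tracks resources more carefully and may evict
more than one container to satisfy a request):

    package evict

    import "sync"

    // evictor is the per-container registration token; C is signaled when the
    // container should shut itself down.
    type evictor struct {
        id         string
        slotHashId string
        mem, cpu   uint64
        C          chan struct{}
    }

    // Evictor keeps the set of currently idle, evictable hot containers.
    type Evictor struct {
        mu   sync.Mutex
        idle []*evictor // registration order: oldest-idle first
    }

    func NewEvictor() *Evictor { return &Evictor{} }

    func (e *Evictor) GetEvictor(id, slotHashId string, mem, cpu uint64) *evictor {
        return &evictor{id: id, slotHashId: slotHashId, mem: mem, cpu: cpu, C: make(chan struct{}, 1)}
    }

    func (e *Evictor) RegisterEvictor(v *evictor) {
        e.mu.Lock()
        defer e.mu.Unlock()
        e.idle = append(e.idle, v)
    }

    func (e *Evictor) UnregisterEvictor(v *evictor) {
        e.mu.Lock()
        defer e.mu.Unlock()
        for i, cand := range e.idle {
            if cand == v {
                e.idle = append(e.idle[:i], e.idle[i+1:]...)
                return
            }
        }
    }

    // PerformEviction wakes an idle container from a *different* slot queue
    // whose resources would cover the mem/cpu this caller is waiting for.
    // It returns true if a container was evicted.
    func (e *Evictor) PerformEviction(slotHashId string, mem, cpu uint64) bool {
        e.mu.Lock()
        defer e.mu.Unlock()
        for i, cand := range e.idle {
            if cand.slotHashId != slotHashId && cand.mem >= mem && cand.cpu >= cpu {
                cand.C <- struct{}{} // non-blocking: C has capacity 1
                e.idle = append(e.idle[:i], e.idle[i+1:]...)
                return true
            }
        }
        return false
    }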

In the hot container launcher, in non-blocking mode, we
now attempt an eviction before emitting a too-busy
response. If the eviction succeeds, we wait some more.
This can result in requests waiting longer than they
used to, but only if a container was actually evicted.
In blocking mode, the hot launcher uses the hot-poll
period to decide that a request has waited too long,
and then triggers an eviction.
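
Schematically (a compilable stand-in for the checkLaunch
change shown in the diff below; tryEvict and notifyTooBusy
stand in for evictor.PerformEviction and tryNotify):

    package launcher

    // onResourceError mirrors the non-blocking branch: when the resource token
    // comes back with a capacity error, first try to evict an idle container;
    // only if that fails is the too-busy error propagated to the waiting request.
    func onResourceError(capacityFull bool, tryEvict func() bool, notifyTooBusy func()) {
        if capacityFull && tryEvict() {
            return // freed some room; let the request keep waiting
        }
        notifyTooBusy()
    }

    // onHotPoll mirrors the blocking-mode path: each expiry of the hot-poll
    // timer is treated as "this request has waited long enough", and an
    // eviction attempt is made before going back to waiting for resources.
    func onHotPoll(tryEvict func() bool) {
        tryEvict()
    }
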
Author: Tolga Ceylan
Date: 2018-07-19 15:04:15 -07:00 (committed by GitHub)
Parent: 8e373005a0
Commit: 1258baeb7f
6 changed files with 365 additions and 40 deletions

@@ -97,6 +97,7 @@ type agent struct {
driver drivers.Driver
slotMgr *slotQueueMgr
evictor Evictor
// track usage
resources ResourceTracker
@@ -129,6 +130,7 @@ func New(da CallHandler, options ...AgentOption) Agent {
a.shutWg = common.NewWaitGroup()
a.da = da
a.slotMgr = NewSlotQueueMgr()
a.evictor = NewEvictor()
// Allow overriding config
for _, option := range options {
@@ -421,7 +423,12 @@ func (a *agent) getSlot(ctx context.Context, call *call) (Slot, error) {
if protocol.IsStreamable(protocol.Protocol(call.Format)) {
// For hot requests, we use a long lived slot queue, which we use to manage hot containers
var isNew bool
call.slots, isNew = a.slotMgr.getSlotQueue(call)
if call.slotHashId == "" {
call.slotHashId = getSlotQueueKey(call)
}
call.slots, isNew = a.slotMgr.getSlotQueue(call.slotHashId)
call.requestState.UpdateState(ctx, RequestStateWait, call.slots)
if isNew {
go a.hotLauncher(ctx, call)
@@ -501,7 +508,7 @@ func (a *agent) checkLaunch(ctx context.Context, call *call, notifyChan chan err
state := NewContainerState()
state.UpdateState(ctx, ContainerStateWait, call.slots)
common.Logger(ctx).WithFields(logrus.Fields{"currentStats": call.slots.getStats(), "isNeeded": isNeeded}).Info("Hot function launcher starting hot container")
common.Logger(ctx).WithFields(logrus.Fields{"currentStats": call.slots.getStats(), "isNeeded": isNeeded}).Debug("Hot function launcher attempting to start a container")
mem := call.Memory + uint64(call.TmpFsSize)
@@ -525,18 +532,27 @@ func (a *agent) checkLaunch(ctx context.Context, call *call, notifyChan chan err
select {
case tok := <-a.resources.GetResourceToken(ctx, mem, uint64(call.CPUs), isAsync, isNB):
if tok != nil && tok.Error() != nil {
tryNotify(notifyChan, tok.Error())
// before returning error response, as a last resort, try evicting idle containers.
if tok.Error() != CapacityFull || !a.evictor.PerformEviction(call.slotHashId, mem, uint64(call.CPUs)) {
tryNotify(notifyChan, tok.Error())
}
} else if a.shutWg.AddSession(1) {
go func() {
// NOTE: runHot will not inherit the timeout from ctx (ignore timings)
a.runHot(ctx, call, tok, state)
a.shutWg.DoneSession()
}()
// early return (do not allow container state to switch to ContainerStateDone)
return
}
if tok != nil {
tok.Close()
}
// Request routines are polling us with this a.cfg.HotPoll frequency. We can use this
// same timer to assume that we waited for cpu/mem long enough. Let's try to evict an
// idle container.
case <-time.After(a.cfg.HotPoll):
a.evictor.PerformEviction(call.slotHashId, mem, uint64(call.CPUs))
case <-ctx.Done(): // timeout
case <-a.shutWg.Closer(): // server shutdown
}
@@ -913,14 +929,15 @@ func (a *agent) runHotReq(ctx context.Context, call *call, state ContainerState,
var err error
isFrozen := false
isEvictable := false
freezeTimer := time.NewTimer(a.cfg.FreezeIdle)
idleTimer := time.NewTimer(time.Duration(call.IdleTimeout) * time.Second)
ejectTicker := time.NewTicker(a.cfg.EjectIdle)
ejectTimer := time.NewTimer(a.cfg.EjectIdle)
defer freezeTimer.Stop()
defer idleTimer.Stop()
defer ejectTicker.Stop()
defer ejectTimer.Stop()
// log if any error is encountered
defer func() {
@@ -938,6 +955,8 @@ func (a *agent) runHotReq(ctx context.Context, call *call, state ContainerState,
isFrozen = true
}
evictor := a.evictor.GetEvictor(call.ID, call.slotHashId, call.Memory+uint64(call.TmpFsSize), uint64(call.CPUs))
state.UpdateState(ctx, ContainerStateIdle, call.slots)
s := call.slots.queueSlot(slot)
@@ -956,19 +975,21 @@ func (a *agent) runHotReq(ctx context.Context, call *call, state ContainerState,
isFrozen = true
}
continue
case <-ejectTicker.C:
// if someone is waiting for resource in our slot queue, we must not terminate,
// otherwise, see if other slot queues have resource waiters that are blocked.
stats := call.slots.getStats()
if stats.containerStates[ContainerStateWait] > 0 ||
a.resources.GetResourceTokenWaiterCount() <= 0 {
continue
}
case <-evictor.C:
logger.Debug("attempting hot function eject")
case <-ejectTimer.C:
// we've been idle too long, now we are ejectable
a.evictor.RegisterEvictor(evictor)
isEvictable = true
continue
}
break
}
if isEvictable {
a.evictor.UnregisterEvictor(evictor)
}
// if we can acquire token, that means we are here due to
// abort/shutdown/timeout, attempt to acquire and terminate,
// otherwise continue processing the request