fn: agent slot improvements (#704)

*) Stopped using the previous/current latency stats; they
were not working as expected. Fresh starts usually keep
these stats at zero for a long time, and the initial
samples are high due to downloads, caches, etc.

*) New state to track: containers that are idle. In other
words, containers that have an unused token in the slot
queue.

*) Removed latency counts since they are no longer used in
the container start decision. This simplifies the logs.

*) Simplified isNewContainerNeeded() to use the idle count
to estimate effective waiters, and removed the speculative
latency-based logic and the progress-check comparison.
In the agent, the delayed signalling in waitHot()
compensates for these changes. The estimate can
occasionally be wrong, but it should correct itself at the
next 200 msec signal. (A condensed sketch of the new
decision follows below.)
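
In essence, the new launch decision reduces to the condensed sketch
below. The standalone helper needNewContainer and its plain-integer
arguments are illustrative only; the real function takes a
*slotQueueStats, as in the diff further down.

// Illustrative sketch: idle containers are expected to absorb waiters
// immediately, so only the excess ("effective waiters") can justify a
// new launch, and in-flight starts are counted against that excess.
func needNewContainer(idlers, starters, waiters uint64) bool {
    effectiveWaiters := uint64(0)
    if idlers < waiters {
        effectiveWaiters = waiters - idlers
    }
    if effectiveWaiters == 0 {
        return false
    }
    // while containers are starting, do not launch more than effective waiters
    if starters > 0 && starters >= effectiveWaiters {
        return false
    }
    return true
}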
Author: Tolga Ceylan
Date:   2018-01-19 12:35:52 -08:00 (committed by GitHub)
Parent: f2b15299d9
Commit: 8c31e47c01
3 changed files with 60 additions and 117 deletions


@@ -319,7 +319,6 @@ func (a *agent) hotLauncher(ctx context.Context, callObj *call) {
     logger := common.Logger(ctx)
     logger.WithField("launcher_timeout", timeout).Info("Hot function launcher starting")
     isAsync := callObj.Type == models.TypeAsync
-    prevStats := callObj.slots.getStats()

     for {
         select {
@@ -334,11 +333,10 @@ func (a *agent) hotLauncher(ctx context.Context, callObj *call) {
         }

         curStats := callObj.slots.getStats()
-        isNeeded := isNewContainerNeeded(&curStats, &prevStats)
-        prevStats = curStats
+        isNeeded := isNewContainerNeeded(&curStats)
         logger.WithFields(logrus.Fields{
             "currentStats": curStats,
-            "previousStats": curStats,
+            "isNeeded": isNeeded,
         }).Debug("Hot function launcher stats")
         if !isNeeded {
             continue
@@ -346,8 +344,8 @@ func (a *agent) hotLauncher(ctx context.Context, callObj *call) {
         ctxResource, cancelResource := context.WithCancel(context.Background())

         logger.WithFields(logrus.Fields{
             "currentStats": curStats,
-            "previousStats": curStats,
+            "isNeeded": isNeeded,
         }).Info("Hot function launcher starting hot container")

         select {
@@ -673,22 +671,28 @@ func (a *agent) runHot(ctxArg context.Context, call *call, tok ResourceToken) {
     }

     done := make(chan struct{})
+    start := time.Now()
+    call.slots.enterState(SlotQueueIdle)
     s := call.slots.queueSlot(&hotSlot{done, proto, errC, container, nil})

     select {
     case <-s.trigger:
+        call.slots.exitStateWithLatency(SlotQueueIdle, uint64(time.Now().Sub(start).Seconds()*1000))
     case <-time.After(time.Duration(call.IdleTimeout) * time.Second):
         if call.slots.ejectSlot(s) {
+            call.slots.exitStateWithLatency(SlotQueueIdle, uint64(time.Now().Sub(start).Seconds()*1000))
             logger.Info("Canceling inactive hot function")
             shutdownContainer()
             return
         }
     case <-ctx.Done(): // container shutdown
         if call.slots.ejectSlot(s) {
+            call.slots.exitStateWithLatency(SlotQueueIdle, uint64(time.Now().Sub(start).Seconds()*1000))
             return
         }
     case <-a.shutdown: // server shutdown
         if call.slots.ejectSlot(s) {
+            call.slots.exitStateWithLatency(SlotQueueIdle, uint64(time.Now().Sub(start).Seconds()*1000))
             shutdownContainer()
             return
         }
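
For context, a hypothetical model of the bookkeeping behind the
enterState/exitStateWithLatency pair used above (not part of this diff;
it assumes the SlotQueueMetricType constants from the slot-queue changes
below): enterState presumably bumps the per-state counter that
isNewContainerNeeded reads, while exitStateWithLatency decrements it and
feeds the observed idle time into the smoothed latency.

// Hypothetical sketch only; locking and error handling omitted.
type statsSketch struct {
    states    [SlotQueueLast]uint64
    latencies [SlotQueueLast]uint64
}

func (s *statsSketch) enterState(idx SlotQueueMetricType) { s.states[idx]++ }

func (s *statsSketch) exitStateWithLatency(idx SlotQueueMetricType, latencyMs uint64) {
    s.states[idx]--
    // same 0.5 smoothing as recordLatencyLocked in the diff below
    s.latencies[idx] = (s.latencies[idx]*5 + latencyMs*5) / 10
}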


@@ -30,17 +30,17 @@ type slotQueueMgr struct {
 type SlotQueueMetricType int

 const (
-    SlotQueueRunner SlotQueueMetricType = iota
-    SlotQueueStarter
-    SlotQueueWaiter
+    SlotQueueRunner SlotQueueMetricType = iota // container is running
+    SlotQueueStarter                           // container is launching
+    SlotQueueWaiter                            // requests are waiting
+    SlotQueueIdle                              // hot container is running, but idle (free tokens)
     SlotQueueLast
 )

 // counters per state and moving avg of time spent in each state
 type slotQueueStats struct {
     states [SlotQueueLast]uint64
-    latencyCount [SlotQueueLast]uint64
-    latencies [SlotQueueLast]uint64
+    latencies [SlotQueueLast]uint64
 }

 type slotToken struct {
@@ -195,59 +195,29 @@ func (a *slotQueue) getStats() slotQueueStats {
     return out
 }

-func isNewContainerNeeded(cur, prev *slotQueueStats) bool {
-    waiters := cur.states[SlotQueueWaiter]
-    if waiters == 0 {
-        return false
-    }
-
-    // while a container is starting, do not start more than waiters
-    starters := cur.states[SlotQueueStarter]
-    if starters >= waiters {
-        return false
-    }
-
-    // no executors? We need to spin up a container quickly
-    executors := starters + cur.states[SlotQueueRunner]
-    if executors == 0 {
-        return true
-    }
-
-    // This means we are not making any progress and stats are
-    // not being refreshed quick enough. We err on side
-    // of new container here.
-    isEqual := true
-    for idx, _ := range cur.latencies {
-        if prev.latencies[idx] != cur.latencies[idx] {
-            isEqual = false
-            break
-        }
-    }
-    if isEqual {
-        return true
-    }
-
-    // WARNING: Below is a few heuristics that are
-    // speculative, which may (and will) likely need
-    // adjustments.
-    runLat := cur.latencies[SlotQueueRunner]
-    waitLat := cur.latencies[SlotQueueWaiter]
-    startLat := cur.latencies[SlotQueueStarter]
-
-    // this determines the aggresiveness of the container launch.
-    if executors > 0 && runLat/executors*2 < waitLat {
-        return true
-    }
-    if runLat < waitLat {
-        return true
-    }
-    if startLat < waitLat {
-        return true
-    }
-
-    return false
+func isNewContainerNeeded(cur *slotQueueStats) bool {
+    idlers := cur.states[SlotQueueIdle]
+    starters := cur.states[SlotQueueStarter]
+    waiters := cur.states[SlotQueueWaiter]
+
+    // we expect idle containers to immediately pick up
+    // any waiters. We assume non-idle containers busy.
+    effectiveWaiters := uint64(0)
+    if idlers < waiters {
+        effectiveWaiters = waiters - idlers
+    }
+
+    if effectiveWaiters == 0 {
+        return false
+    }
+
+    // if containers are starting, do not start more than effective waiters
+    if starters > 0 && starters >= effectiveWaiters {
+        return false
+    }
+
+    return true
 }

 func (a *slotQueue) enterState(metricIdx SlotQueueMetricType) {
@@ -270,14 +240,7 @@ func (a *slotQueue) recordLatencyLocked(metricIdx SlotQueueMetricType, latency u
     // 0.5 is a high value to age older observations fast while filtering
     // some noise. For our purposes, newer observations are much more important
     // than older, but we still would like to low pass some noise.
-    // first samples are ignored.
-    if a.stats.latencyCount[metricIdx] != 0 {
-        a.stats.latencies[metricIdx] = (a.stats.latencies[metricIdx]*5 + latency*5) / 10
-    }
-    a.stats.latencyCount[metricIdx] += 1
-    if a.stats.latencyCount[metricIdx] == 0 {
-        a.stats.latencyCount[metricIdx] += 1
-    }
+    a.stats.latencies[metricIdx] = (a.stats.latencies[metricIdx]*5 + latency*5) / 10
 }

 func (a *slotQueue) recordLatency(metricIdx SlotQueueMetricType, latency uint64) {
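
For reference, the simplified update above is a plain exponential
moving average with alpha = 0.5. A minimal standalone sketch with
hypothetical millisecond samples shows how a high first observation
(e.g. an initial image download) ages out within a few samples:

package main

import "fmt"

// ewma applies the same update as the new recordLatencyLocked:
// keep half of the previous estimate and half of the new sample.
func ewma(prev, sample uint64) uint64 {
    return (prev*5 + sample*5) / 10
}

func main() {
    lat := uint64(0)
    for _, sample := range []uint64{200, 40, 40, 40} { // hypothetical samples
        lat = ewma(lat, sample)
        fmt.Println(lat) // prints 100, 70, 55, 47
    }
}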


@@ -174,74 +174,50 @@ func TestSlotQueueBasic2(t *testing.T) {
     }
 }

-func statsHelperSet(runC, startC, waitC, runL, startL, waitL uint64) slotQueueStats {
+func statsHelperSet(runC, startC, waitC, idleC uint64) slotQueueStats {
     return slotQueueStats{
-        states: [SlotQueueLast]uint64{runC, startC, waitC},
-        latencies: [SlotQueueLast]uint64{runL, startL, waitL},
+        states: [SlotQueueLast]uint64{runC, startC, waitC, idleC},
     }
 }

 func TestSlotNewContainerLogic1(t *testing.T) {
     var cur slotQueueStats
-    var prev slotQueueStats

-    cur = statsHelperSet(0, 0, 0, 0, 0, 0)
-    prev = statsHelperSet(0, 0, 0, 0, 0, 0)
+    cur = statsHelperSet(0, 0, 0, 0)

-    // CASE I: There's no one waiting despite cur == prev
-    if isNewContainerNeeded(&cur, &prev) {
-        t.Fatalf("Should not need a new container cur: %#v prev: %#v", cur, prev)
+    // CASE: There's no one waiting
+    if isNewContainerNeeded(&cur) {
+        t.Fatalf("Should not need a new container cur: %#v", cur)
     }

-    // CASE II: There are starters >= waiters
-    cur = statsHelperSet(0, 10, 1, 0, 0, 0)
-    prev = statsHelperSet(0, 10, 1, 0, 0, 0)
-    if isNewContainerNeeded(&cur, &prev) {
-        t.Fatalf("Should not need a new container cur: %#v prev: %#v", cur, prev)
+    // CASE: There are starters >= waiters
+    cur = statsHelperSet(1, 10, 10, 0)
+    if isNewContainerNeeded(&cur) {
+        t.Fatalf("Should not need a new container cur: %#v", cur)
     }

-    // CASE III: no executors
-    cur = statsHelperSet(0, 0, 1, 0, 0, 0)
-    prev = statsHelperSet(0, 0, 1, 0, 0, 0)
-    if !isNewContainerNeeded(&cur, &prev) {
-        t.Fatalf("Should need a new container cur: %#v prev: %#v", cur, prev)
+    // CASE: There are starters < waiters
+    cur = statsHelperSet(1, 5, 10, 0)
+    if !isNewContainerNeeded(&cur) {
+        t.Fatalf("Should need a new container cur: %#v", cur)
     }

-    // CASE IV: cur == prev same, progress has stalled, with waiters and
-    // small num of executors
-    cur = statsHelperSet(2, 0, 10, 0, 0, 0)
-    prev = statsHelperSet(2, 0, 10, 0, 0, 0)
-    if !isNewContainerNeeded(&cur, &prev) {
-        t.Fatalf("Should need a new container cur: %#v prev: %#v", cur, prev)
+    // CASE: effective waiters 0 (idle = waiter = 10)
+    cur = statsHelperSet(11, 0, 10, 10)
+    if isNewContainerNeeded(&cur) {
+        t.Fatalf("Should not need a new container cur: %#v", cur)
     }

-    // CASE V: cur != prev, runLat/executors*2 < waitLat
-    // Let's make cur and prev unequal to prevent blocked progress detection
-    cur = statsHelperSet(2, 0, 10, 12, 100, 13)
-    prev = statsHelperSet(2, 0, 10, 12, 101, 13)
-    if !isNewContainerNeeded(&cur, &prev) {
-        t.Fatalf("Should need a new container cur: %#v prev: %#v", cur, prev)
+    // CASE: effective waiters > 0 (idle = 5 waiter = 10)
+    cur = statsHelperSet(11, 0, 10, 5)
+    if !isNewContainerNeeded(&cur) {
+        t.Fatalf("Should need a new container cur: %#v", cur)
     }

-    // CASE VI: cur != prev, runLat < waitLat
-    cur = statsHelperSet(1, 0, 10, 12, 100, 14)
-    prev = statsHelperSet(1, 0, 10, 12, 101, 14)
-    if !isNewContainerNeeded(&cur, &prev) {
-        t.Fatalf("Should need a new container cur: %#v prev: %#v", cur, prev)
-    }
-
-    // CAST VII: cur != prev, startLat < waitLat
-    cur = statsHelperSet(1, 0, 10, 2, 10, 20)
-    prev = statsHelperSet(1, 0, 10, 1, 11, 20)
-    if !isNewContainerNeeded(&cur, &prev) {
-        t.Fatalf("Should need a new container cur: %#v prev: %#v", cur, prev)
-    }
-
-    // CAST VIII: cur != prev, fallback
-    cur = statsHelperSet(1, 0, 10, 2, 10, 2)
-    prev = statsHelperSet(1, 0, 10, 1, 11, 2)
-    if isNewContainerNeeded(&cur, &prev) {
-        t.Fatalf("Should not need a new container cur: %#v prev: %#v", cur, prev)
+    // CASE: no executors, but 1 waiter
+    cur = statsHelperSet(0, 0, 1, 0)
+    if !isNewContainerNeeded(&cur) {
+        t.Fatalf("Should need a new container cur: %#v", cur)
     }
 }