fn: agent slot improvements (#704)

*) Stopped using the previous/current latency stats; they
were not working as expected. Fresh starts usually keep
these stats at zero for a long time, and the initial
samples are high due to downloads, caches, etc.

*) New state to track: containers that are idle. In other
words, containers that have an unused token in the slot
queue.

*) Removed latency counts since they are no longer used in
the container start decision. This simplifies the logs.

*) Simplified isNewContainerNeeded() to use the idle count
to estimate effective waiters, and removed the speculative
latency-based logic and the progress-check comparison.
In the agent, the delayed signalling in waitHot()
compensates for these changes. The estimate can
occasionally be wrong, but it should correct itself at the
next 200 msec signal. (A condensed sketch of the new
decision follows below.)
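
In essence, the new launch decision reduces to the condensed sketch
below. The standalone helper needNewContainer and its plain-integer
arguments are illustrative only; the real function takes a
*slotQueueStats, as in the diff further down.

// Illustrative sketch: idle containers are expected to absorb waiters
// immediately, so only the excess ("effective waiters") can justify a
// new launch, and in-flight starts are counted against that excess.
func needNewContainer(idlers, starters, waiters uint64) bool {
    effectiveWaiters := uint64(0)
    if idlers < waiters {
        effectiveWaiters = waiters - idlers
    }
    if effectiveWaiters == 0 {
        return false
    }
    // while containers are starting, do not launch more than effective waiters
    if starters > 0 && starters >= effectiveWaiters {
        return false
    }
    return true
}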
Author: Tolga Ceylan
Date:   2018-01-19 12:35:52 -08:00 (committed by GitHub)
Parent: f2b15299d9
Commit: 8c31e47c01
3 changed files with 60 additions and 117 deletions


@@ -319,7 +319,6 @@ func (a *agent) hotLauncher(ctx context.Context, callObj *call) {
     logger := common.Logger(ctx)
     logger.WithField("launcher_timeout", timeout).Info("Hot function launcher starting")
     isAsync := callObj.Type == models.TypeAsync
-    prevStats := callObj.slots.getStats()

     for {
         select {
@@ -334,11 +333,10 @@ func (a *agent) hotLauncher(ctx context.Context, callObj *call) {
         }

         curStats := callObj.slots.getStats()
-        isNeeded := isNewContainerNeeded(&curStats, &prevStats)
-        prevStats = curStats
+        isNeeded := isNewContainerNeeded(&curStats)
         logger.WithFields(logrus.Fields{
             "currentStats": curStats,
-            "previousStats": curStats,
+            "isNeeded": isNeeded,
         }).Debug("Hot function launcher stats")
         if !isNeeded {
             continue
@@ -346,8 +344,8 @@ func (a *agent) hotLauncher(ctx context.Context, callObj *call) {
         ctxResource, cancelResource := context.WithCancel(context.Background())

         logger.WithFields(logrus.Fields{
             "currentStats": curStats,
-            "previousStats": curStats,
+            "isNeeded": isNeeded,
         }).Info("Hot function launcher starting hot container")

         select {
@@ -673,22 +671,28 @@ func (a *agent) runHot(ctxArg context.Context, call *call, tok ResourceToken) {
     }

     done := make(chan struct{})
+    start := time.Now()
+    call.slots.enterState(SlotQueueIdle)
     s := call.slots.queueSlot(&hotSlot{done, proto, errC, container, nil})

     select {
     case <-s.trigger:
+        call.slots.exitStateWithLatency(SlotQueueIdle, uint64(time.Now().Sub(start).Seconds()*1000))
     case <-time.After(time.Duration(call.IdleTimeout) * time.Second):
         if call.slots.ejectSlot(s) {
+            call.slots.exitStateWithLatency(SlotQueueIdle, uint64(time.Now().Sub(start).Seconds()*1000))
             logger.Info("Canceling inactive hot function")
             shutdownContainer()
             return
         }
     case <-ctx.Done(): // container shutdown
         if call.slots.ejectSlot(s) {
+            call.slots.exitStateWithLatency(SlotQueueIdle, uint64(time.Now().Sub(start).Seconds()*1000))
             return
         }
     case <-a.shutdown: // server shutdown
         if call.slots.ejectSlot(s) {
+            call.slots.exitStateWithLatency(SlotQueueIdle, uint64(time.Now().Sub(start).Seconds()*1000))
             shutdownContainer()
             return
         }
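
For context, a hypothetical model of the bookkeeping behind the
enterState/exitStateWithLatency pair used above (not part of this diff;
it assumes the SlotQueueMetricType constants from the slot-queue changes
below): enterState presumably bumps the per-state counter that
isNewContainerNeeded reads, while exitStateWithLatency decrements it and
feeds the observed idle time into the smoothed latency.

// Hypothetical sketch only; locking and error handling omitted.
type statsSketch struct {
    states    [SlotQueueLast]uint64
    latencies [SlotQueueLast]uint64
}

func (s *statsSketch) enterState(idx SlotQueueMetricType) { s.states[idx]++ }

func (s *statsSketch) exitStateWithLatency(idx SlotQueueMetricType, latencyMs uint64) {
    s.states[idx]--
    // same 0.5 smoothing as recordLatencyLocked in the diff below
    s.latencies[idx] = (s.latencies[idx]*5 + latencyMs*5) / 10
}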


@@ -30,17 +30,17 @@ type slotQueueMgr struct {
 type SlotQueueMetricType int

 const (
-    SlotQueueRunner SlotQueueMetricType = iota
-    SlotQueueStarter
-    SlotQueueWaiter
+    SlotQueueRunner SlotQueueMetricType = iota // container is running
+    SlotQueueStarter                           // container is launching
+    SlotQueueWaiter                            // requests are waiting
+    SlotQueueIdle                              // hot container is running, but idle (free tokens)
     SlotQueueLast
 )

 // counters per state and moving avg of time spent in each state
 type slotQueueStats struct {
     states [SlotQueueLast]uint64
-    latencyCount [SlotQueueLast]uint64
-    latencies [SlotQueueLast]uint64
+    latencies [SlotQueueLast]uint64
 }

 type slotToken struct {
@@ -195,59 +195,29 @@ func (a *slotQueue) getStats() slotQueueStats {
     return out
 }

-func isNewContainerNeeded(cur, prev *slotQueueStats) bool {
-    waiters := cur.states[SlotQueueWaiter]
-    if waiters == 0 {
-        return false
-    }
-
-    // while a container is starting, do not start more than waiters
-    starters := cur.states[SlotQueueStarter]
-    if starters >= waiters {
-        return false
-    }
-
-    // no executors? We need to spin up a container quickly
-    executors := starters + cur.states[SlotQueueRunner]
-    if executors == 0 {
-        return true
-    }
-
-    // This means we are not making any progress and stats are
-    // not being refreshed quick enough. We err on side
-    // of new container here.
-    isEqual := true
-    for idx, _ := range cur.latencies {
-        if prev.latencies[idx] != cur.latencies[idx] {
-            isEqual = false
-            break
-        }
-    }
-    if isEqual {
-        return true
-    }
-
-    // WARNING: Below is a few heuristics that are
-    // speculative, which may (and will) likely need
-    // adjustments.
-    runLat := cur.latencies[SlotQueueRunner]
-    waitLat := cur.latencies[SlotQueueWaiter]
-    startLat := cur.latencies[SlotQueueStarter]
-
-    // this determines the aggresiveness of the container launch.
-    if executors > 0 && runLat/executors*2 < waitLat {
-        return true
-    }
-    if runLat < waitLat {
-        return true
-    }
-    if startLat < waitLat {
-        return true
-    }
-
-    return false
+func isNewContainerNeeded(cur *slotQueueStats) bool {
+    idlers := cur.states[SlotQueueIdle]
+    starters := cur.states[SlotQueueStarter]
+    waiters := cur.states[SlotQueueWaiter]
+
+    // we expect idle containers to immediately pick up
+    // any waiters. We assume non-idle containers busy.
+    effectiveWaiters := uint64(0)
+    if idlers < waiters {
+        effectiveWaiters = waiters - idlers
+    }
+
+    if effectiveWaiters == 0 {
+        return false
+    }
+
+    // if containers are starting, do not start more than effective waiters
+    if starters > 0 && starters >= effectiveWaiters {
+        return false
+    }
+
+    return true
 }

 func (a *slotQueue) enterState(metricIdx SlotQueueMetricType) {
@@ -270,14 +240,7 @@ func (a *slotQueue) recordLatencyLocked(metricIdx SlotQueueMetricType, latency u
     // 0.5 is a high value to age older observations fast while filtering
     // some noise. For our purposes, newer observations are much more important
     // than older, but we still would like to low pass some noise.
-    // first samples are ignored.
-    if a.stats.latencyCount[metricIdx] != 0 {
-        a.stats.latencies[metricIdx] = (a.stats.latencies[metricIdx]*5 + latency*5) / 10
-    }
-    a.stats.latencyCount[metricIdx] += 1
-    if a.stats.latencyCount[metricIdx] == 0 {
-        a.stats.latencyCount[metricIdx] += 1
-    }
+    a.stats.latencies[metricIdx] = (a.stats.latencies[metricIdx]*5 + latency*5) / 10
 }

 func (a *slotQueue) recordLatency(metricIdx SlotQueueMetricType, latency uint64) {
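
For reference, the simplified update above is a plain exponential
moving average with alpha = 0.5. A minimal standalone sketch with
hypothetical millisecond samples shows how a high first observation
(e.g. an initial image download) ages out within a few samples:

package main

import "fmt"

// ewma applies the same update as the new recordLatencyLocked:
// keep half of the previous estimate and half of the new sample.
func ewma(prev, sample uint64) uint64 {
    return (prev*5 + sample*5) / 10
}

func main() {
    lat := uint64(0)
    for _, sample := range []uint64{200, 40, 40, 40} { // hypothetical samples
        lat = ewma(lat, sample)
        fmt.Println(lat) // prints 100, 70, 55, 47
    }
}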


@@ -174,74 +174,50 @@ func TestSlotQueueBasic2(t *testing.T) {
     }
 }

-func statsHelperSet(runC, startC, waitC, runL, startL, waitL uint64) slotQueueStats {
+func statsHelperSet(runC, startC, waitC, idleC uint64) slotQueueStats {
     return slotQueueStats{
-        states: [SlotQueueLast]uint64{runC, startC, waitC},
-        latencies: [SlotQueueLast]uint64{runL, startL, waitL},
+        states: [SlotQueueLast]uint64{runC, startC, waitC, idleC},
     }
 }

 func TestSlotNewContainerLogic1(t *testing.T) {
     var cur slotQueueStats
-    var prev slotQueueStats

-    cur = statsHelperSet(0, 0, 0, 0, 0, 0)
-    prev = statsHelperSet(0, 0, 0, 0, 0, 0)
+    cur = statsHelperSet(0, 0, 0, 0)

-    // CASE I: There's no one waiting despite cur == prev
-    if isNewContainerNeeded(&cur, &prev) {
-        t.Fatalf("Should not need a new container cur: %#v prev: %#v", cur, prev)
+    // CASE: There's no one waiting
+    if isNewContainerNeeded(&cur) {
+        t.Fatalf("Should not need a new container cur: %#v", cur)
     }

-    // CASE II: There are starters >= waiters
-    cur = statsHelperSet(0, 10, 1, 0, 0, 0)
-    prev = statsHelperSet(0, 10, 1, 0, 0, 0)
-    if isNewContainerNeeded(&cur, &prev) {
-        t.Fatalf("Should not need a new container cur: %#v prev: %#v", cur, prev)
+    // CASE: There are starters >= waiters
+    cur = statsHelperSet(1, 10, 10, 0)
+    if isNewContainerNeeded(&cur) {
+        t.Fatalf("Should not need a new container cur: %#v", cur)
     }

-    // CASE III: no executors
-    cur = statsHelperSet(0, 0, 1, 0, 0, 0)
-    prev = statsHelperSet(0, 0, 1, 0, 0, 0)
-    if !isNewContainerNeeded(&cur, &prev) {
-        t.Fatalf("Should need a new container cur: %#v prev: %#v", cur, prev)
+    // CASE: There are starters < waiters
+    cur = statsHelperSet(1, 5, 10, 0)
+    if !isNewContainerNeeded(&cur) {
+        t.Fatalf("Should need a new container cur: %#v", cur)
     }

-    // CASE IV: cur == prev same, progress has stalled, with waiters and
-    // small num of executors
-    cur = statsHelperSet(2, 0, 10, 0, 0, 0)
-    prev = statsHelperSet(2, 0, 10, 0, 0, 0)
-    if !isNewContainerNeeded(&cur, &prev) {
-        t.Fatalf("Should need a new container cur: %#v prev: %#v", cur, prev)
+    // CASE: effective waiters 0 (idle = waiter = 10)
+    cur = statsHelperSet(11, 0, 10, 10)
+    if isNewContainerNeeded(&cur) {
+        t.Fatalf("Should not need a new container cur: %#v", cur)
     }

-    // CASE V: cur != prev, runLat/executors*2 < waitLat
-    // Let's make cur and prev unequal to prevent blocked progress detection
-    cur = statsHelperSet(2, 0, 10, 12, 100, 13)
-    prev = statsHelperSet(2, 0, 10, 12, 101, 13)
-    if !isNewContainerNeeded(&cur, &prev) {
-        t.Fatalf("Should need a new container cur: %#v prev: %#v", cur, prev)
+    // CASE: effective waiters > 0 (idle = 5 waiter = 10)
+    cur = statsHelperSet(11, 0, 10, 5)
+    if !isNewContainerNeeded(&cur) {
+        t.Fatalf("Should need a new container cur: %#v", cur)
     }

-    // CASE VI: cur != prev, runLat < waitLat
-    cur = statsHelperSet(1, 0, 10, 12, 100, 14)
-    prev = statsHelperSet(1, 0, 10, 12, 101, 14)
-    if !isNewContainerNeeded(&cur, &prev) {
-        t.Fatalf("Should need a new container cur: %#v prev: %#v", cur, prev)
-    }
-
-    // CAST VII: cur != prev, startLat < waitLat
-    cur = statsHelperSet(1, 0, 10, 2, 10, 20)
-    prev = statsHelperSet(1, 0, 10, 1, 11, 20)
-    if !isNewContainerNeeded(&cur, &prev) {
-        t.Fatalf("Should need a new container cur: %#v prev: %#v", cur, prev)
-    }
-
-    // CAST VIII: cur != prev, fallback
-    cur = statsHelperSet(1, 0, 10, 2, 10, 2)
-    prev = statsHelperSet(1, 0, 10, 1, 11, 2)
-    if isNewContainerNeeded(&cur, &prev) {
-        t.Fatalf("Should not need a new container cur: %#v prev: %#v", cur, prev)
+    // CASE: no executors, but 1 waiter
+    cur = statsHelperSet(0, 0, 1, 0)
+    if !isNewContainerNeeded(&cur) {
+        t.Fatalf("Should need a new container cur: %#v", cur)
     }
 }