fn: new container launch adjustments (#677)

*) revert the executor vs. wait queue size comparison. It was too
   aggressive and, with the stall check below, is no longer necessary.
*) the new container logic now checks whether stats are constant; if
   so, we assume the system is stalled (e.g. running functions that
   take a long time), which means we need to make progress and spin
   up a new container.
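
For illustration only, a minimal standalone sketch of the stall check described above. This is not the commit's code (the real implementation is in the isNewContainerNeeded diff below); latencySnapshot is an assumed stand-in for the per-state latency counters carried by slotQueueStats.

package main

import "fmt"

// latencySnapshot stands in for the per-state latency counters
// of slotQueueStats (one counter per slot-queue state).
type latencySnapshot [3]uint64

// statsStalled reports whether two consecutive snapshots are identical,
// i.e. no call has changed state since the last check. The launcher
// treats that as "stalled: err on the side of a new container".
func statsStalled(prev, cur latencySnapshot) bool {
	for i := range cur {
		if prev[i] != cur[i] {
			return false
		}
	}
	return true
}

func main() {
	prev := latencySnapshot{5, 7, 9}
	cur := latencySnapshot{5, 7, 9}
	fmt.Println(statsStalled(prev, cur)) // true: nothing moved, assume a stall
	cur[1]++                             // some call made progress
	fmt.Println(statsStalled(prev, cur)) // false: no need to force a launch
}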
Tolga Ceylan, 2018-01-11 14:09:21 -08:00 (committed by GitHub)
parent 91d282eb4e, commit db159e595f
3 changed files with 64 additions and 50 deletions


@@ -306,6 +306,7 @@ func (a *agent) hotLauncher(ctx context.Context, callObj *call) {
 	logger := common.Logger(ctx)
 	logger.WithField("launcher_timeout", timeout).Info("Hot function launcher starting")
 	isAsync := callObj.Type == models.TypeAsync
+	prevStats := callObj.slots.getStats()
 
 	for {
 		select {
@@ -319,14 +320,22 @@ func (a *agent) hotLauncher(ctx context.Context, callObj *call) {
 		case <-callObj.slots.signaller:
 		}
-		isNeeded, stats := callObj.slots.isNewContainerNeeded()
-		logger.WithField("stats", stats).Debug("Hot function launcher stats")
+		curStats := callObj.slots.getStats()
+		isNeeded := isNewContainerNeeded(&curStats, &prevStats)
+		prevStats = curStats
+		logger.WithFields(logrus.Fields{
+			"currentStats":  curStats,
+			"previousStats": curStats,
+		}).Debug("Hot function launcher stats")
 		if !isNeeded {
 			continue
 		}
 
 		resourceCtx, cancel := context.WithCancel(context.Background())
 
-		logger.WithField("stats", stats).Info("Hot function launcher starting hot container")
+		logger.WithFields(logrus.Fields{
+			"currentStats":  curStats,
+			"previousStats": curStats,
+		}).Info("Hot function launcher starting hot container")
 
 		select {
 		case tok, isOpen := <-a.resources.GetResourceToken(resourceCtx, callObj.Memory, isAsync):


@@ -189,60 +189,59 @@ func (a *slotQueue) getStats() slotQueueStats {
 	return out
 }
 
-func (a *slotQueue) isNewContainerNeeded() (bool, slotQueueStats) {
-	stats := a.getStats()
-	waiters := stats.states[SlotQueueWaiter]
+func isNewContainerNeeded(cur, prev *slotQueueStats) bool {
+	waiters := cur.states[SlotQueueWaiter]
 	if waiters == 0 {
-		return false, stats
+		return false
 	}
 
 	// while a container is starting, do not start more than waiters
-	starters := stats.states[SlotQueueStarter]
+	starters := cur.states[SlotQueueStarter]
 	if starters >= waiters {
-		return false, stats
+		return false
 	}
 
-	// this is a bit aggresive and assumes that we only
-	// want to queue as much as num of containers.
-	executors := starters + stats.states[SlotQueueRunner]
-	if executors < waiters {
-		return true, stats
+	// no executors? We need to spin up a container quickly
+	executors := starters + cur.states[SlotQueueRunner]
+	if executors == 0 {
+		return true
 	}
 
+	// This means we are not making any progress and stats are
+	// not being refreshed quick enough. We err on side
+	// of new container here.
+	isEqual := true
+	for idx, _ := range cur.latencies {
+		if prev.latencies[idx] != cur.latencies[idx] {
+			isEqual = false
+			break
+		}
+	}
+	if isEqual {
+		return true
+	}
+
 	// WARNING: Below is a few heuristics that are
 	// speculative, which may (and will) likely need
 	// adjustments.
 
-	// WARNING: latencies below are updated after a call
-	// switches to/from different states. Do not assume
-	// the metrics below are always up-to-date. For example,
-	// a sudden burst of incoming requests will increase
-	// waiter count but not necessarily wait latency until
-	// those requests switch from waiter state.
-
-	runLat := stats.latencies[SlotQueueRunner]
-	waitLat := stats.latencies[SlotQueueWaiter]
-	startLat := stats.latencies[SlotQueueStarter]
+	runLat := cur.latencies[SlotQueueRunner]
+	waitLat := cur.latencies[SlotQueueWaiter]
+	startLat := cur.latencies[SlotQueueStarter]
 
-	// no wait latency? No need to spin up new container
-	if waitLat == 0 {
-		return false, stats
-	}
-
 	// this determines the aggresiveness of the container launch.
-	if runLat/executors*2 < waitLat {
-		return true, stats
+	if executors > 0 && runLat/executors*2 < waitLat {
+		return true
 	}
 	if runLat < waitLat {
-		return true, stats
+		return true
 	}
 	if startLat < waitLat {
-		return true, stats
+		return true
 	}
-	return false, stats
+	return false
 }
 
 func (a *slotQueue) enterState(metricIdx SlotQueueMetricType) {
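
To make the launch heuristic above concrete, a rough worked example of the runLat/executors*2 < waitLat check. The numbers are invented, and plain uint64 variables stand in for the cumulative per-state latency counters held in slotQueueStats; this is a sketch, not part of the commit.

package main

import "fmt"

func main() {
	// Hypothetical numbers: 2 executors have accumulated 200ms in the
	// runner state, while waiters have accumulated 450ms of wait latency.
	executors := uint64(2)
	runLat := uint64(200)  // cumulative runner latency, ms
	waitLat := uint64(450) // cumulative waiter latency, ms

	// 200/2*2 = 200 < 450: waiting costs more than roughly twice the
	// per-executor run time, so this heuristic alone would trigger
	// a new hot container launch.
	fmt.Println(executors > 0 && runLat/executors*2 < waitLat) // true
}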


@@ -152,18 +152,6 @@ func TestSlotQueueBasic1(t *testing.T) {
 		}
 	case <-time.After(time.Duration(500) * time.Millisecond):
 	}
-
-	stats1 := obj.getStats()
-	isNeeded, stats2 := obj.isNewContainerNeeded()
-
-	if stats1 != stats2 {
-		t.Fatalf("Faulty stats %#v != %#v", stats1, stats2)
-	}
-
-	// there are no waiters.
-	if isNeeded {
-		t.Fatalf("Shouldn't need a container")
-	}
 }
 
 func TestSlotQueueBasic2(t *testing.T) {
@@ -173,9 +161,6 @@ func TestSlotQueueBasic2(t *testing.T) {
 	if !obj.isIdle() {
 		t.Fatalf("Should be idle")
 	}
-	if ok, _ := obj.isNewContainerNeeded(); ok {
-		t.Fatalf("Should not need a new container")
-	}
 
 	outChan, cancel := obj.startDequeuer(context.Background())
 	select {
@@ -187,6 +172,27 @@ func TestSlotQueueBasic2(t *testing.T) {
 	cancel()
 }
 
+func TestSlotNewContainerLogic1(t *testing.T) {
+	cur := slotQueueStats{}
+
+	cur.states[SlotQueueRunner] = 10
+	cur.states[SlotQueueWaiter] = 1
+
+	prev := cur
+
+	if !isNewContainerNeeded(&cur, &prev) {
+		t.Fatalf("Should need a new container cur: %#v prev: %#v", cur, prev)
+	}
+
+	prev.latencies[SlotQueueRunner] = 1
+	prev.latencies[SlotQueueWaiter] = 1
+	prev.latencies[SlotQueueStarter] = 1
+
+	if isNewContainerNeeded(&cur, &prev) {
+		t.Fatalf("Should not need a new container cur: %#v prev: %#v", cur, prev)
+	}
+}
+
 func TestSlotQueueBasic3(t *testing.T) {
 	slotName := "test3"