fn: paused and evicted container stats (#1209)

* fn: paused and evicted container stats

With this change, stats now report the paused container state
as well as incidents of containers exiting due to eviction.

* fn: update/document state transitions in state tracker

There is no case of a transition moving from done to waiting; that
allowance must be deprecated behavior.
This commit is contained in:
Tolga Ceylan
2018-09-13 16:24:26 -07:00
committed by GitHub
parent ede5b93c34
commit 4dcdb7d982
4 changed files with 48 additions and 13 deletions

View File

@@ -898,13 +898,18 @@ func (a *agent) runHotReq(ctx context.Context, call *call, state ContainerState,
return false return false
} }
isFrozen = true isFrozen = true
state.UpdateState(ctx, ContainerStatePaused, call.slots)
} }
evictor := a.evictor.GetEvictor(call.ID, call.slotHashId, call.Memory+uint64(call.TmpFsSize), uint64(call.CPUs)) evictor := a.evictor.GetEvictor(call.ID, call.slotHashId, call.Memory+uint64(call.TmpFsSize), uint64(call.CPUs))
state.UpdateState(ctx, ContainerStateIdle, call.slots) if !isFrozen {
state.UpdateState(ctx, ContainerStateIdle, call.slots)
}
s := call.slots.queueSlot(slot) s := call.slots.queueSlot(slot)
isEvictEvent := false
for { for {
select { select {
case <-s.trigger: // slot already consumed case <-s.trigger: // slot already consumed
@@ -918,10 +923,12 @@ func (a *agent) runHotReq(ctx context.Context, call *call, state ContainerState,
return false return false
} }
isFrozen = true isFrozen = true
state.UpdateState(ctx, ContainerStatePaused, call.slots)
} }
continue continue
case <-evictor.C: case <-evictor.C:
logger.Debug("attempting hot function eject") logger.Debug("attempting hot function eject")
isEvictEvent = true
case <-ejectTimer.C: case <-ejectTimer.C:
// we've been idle too long, now we are ejectable // we've been idle too long, now we are ejectable
a.evictor.RegisterEvictor(evictor) a.evictor.RegisterEvictor(evictor)
@@ -940,6 +947,9 @@ func (a *agent) runHotReq(ctx context.Context, call *call, state ContainerState,
// otherwise continue processing the request // otherwise continue processing the request
if call.slots.acquireSlot(s) { if call.slots.acquireSlot(s) {
slot.Close(ctx) slot.Close(ctx)
if isEvictEvent {
statsContainerEvicted(ctx)
}
return false return false
} }

View File

@@ -163,6 +163,7 @@ func (a *slotQueue) isIdle() bool {
a.stats.containerStates[ContainerStateWait] == 0 && a.stats.containerStates[ContainerStateWait] == 0 &&
a.stats.containerStates[ContainerStateStart] == 0 && a.stats.containerStates[ContainerStateStart] == 0 &&
a.stats.containerStates[ContainerStateIdle] == 0 && a.stats.containerStates[ContainerStateIdle] == 0 &&
a.stats.containerStates[ContainerStatePaused] == 0 &&
a.stats.containerStates[ContainerStateBusy] == 0 a.stats.containerStates[ContainerStateBusy] == 0
a.statsLock.Unlock() a.statsLock.Unlock()
@@ -180,7 +181,7 @@ func (a *slotQueue) getStats() slotQueueStats {
func isNewContainerNeeded(cur *slotQueueStats) bool { func isNewContainerNeeded(cur *slotQueueStats) bool {
idleWorkers := cur.containerStates[ContainerStateIdle] idleWorkers := cur.containerStates[ContainerStateIdle] + cur.containerStates[ContainerStatePaused]
starters := cur.containerStates[ContainerStateStart] starters := cur.containerStates[ContainerStateStart]
startWaiters := cur.containerStates[ContainerStateWait] startWaiters := cur.containerStates[ContainerStateWait]

View File

@@ -47,12 +47,13 @@ const (
) )
const ( const (
ContainerStateNone ContainerStateType = iota // uninitialized ContainerStateNone ContainerStateType = iota // uninitialized
ContainerStateWait // resource (cpu + mem) waiting ContainerStateWait // resource (cpu + mem) waiting
ContainerStateStart // launching ContainerStateStart // launching
ContainerStateIdle // running idle ContainerStateIdle // running: idle but not paused
ContainerStateBusy // running busy ContainerStatePaused // running: idle but paused
ContainerStateDone // exited/failed/done ContainerStateBusy // running: busy
ContainerStateDone // exited/failed/done
ContainerStateMax ContainerStateMax
) )
@@ -61,14 +62,16 @@ var containerGaugeKeys = [ContainerStateMax]string{
"container_wait_total", "container_wait_total",
"container_start_total", "container_start_total",
"container_idle_total", "container_idle_total",
"container_paused_total",
"container_busy_total", "container_busy_total",
"container_done_total",
} }
var containerTimeKeys = [ContainerStateMax]string{ var containerTimeKeys = [ContainerStateMax]string{
"", "",
"container_wait_duration_seconds", "container_wait_duration_seconds",
"container_start_duration_seconds", "container_start_duration_seconds",
"container_idle_duration_seconds", "container_idle_duration_seconds",
"container_paused_duration_seconds",
"container_busy_duration_seconds", "container_busy_duration_seconds",
} }
@@ -101,6 +104,10 @@ func (c *requestState) UpdateState(ctx context.Context, newState RequestStateTyp
} }
} }
// isIdleState reports whether state is one of the two idle container
// states: ContainerStateIdle (idle, not paused) or ContainerStatePaused
// (idle and paused). Every other state yields false.
func isIdleState(state ContainerStateType) bool {
	switch state {
	case ContainerStateIdle, ContainerStatePaused:
		return true
	default:
		return false
	}
}
func (c *containerState) UpdateState(ctx context.Context, newState ContainerStateType, slots *slotQueue) { func (c *containerState) UpdateState(ctx context.Context, newState ContainerStateType, slots *slotQueue) {
var now time.Time var now time.Time
@@ -109,11 +116,13 @@ func (c *containerState) UpdateState(ctx context.Context, newState ContainerStat
c.lock.Lock() c.lock.Lock()
// except for 1) switching back to idle from busy (hot containers) or 2) // Only the following state transitions are allowed:
// to waiting from done, otherwise we can only move forward in states // 1) any move forward in states as per ContainerStateType order
// 2) move back: from paused to idle
// 3) move back: from busy to idle/paused
if c.state < newState || if c.state < newState ||
(c.state == ContainerStateBusy && newState == ContainerStateIdle) || (c.state == ContainerStatePaused && newState == ContainerStateIdle) ||
(c.state == ContainerStateDone && newState == ContainerStateIdle) { (c.state == ContainerStateBusy && isIdleState(newState)) {
now = time.Now() now = time.Now()
oldState = c.state oldState = c.state

View File

@@ -66,6 +66,10 @@ func statsLBAgentRunnerExecLatency(ctx context.Context, dur time.Duration) {
stats.Record(ctx, runnerExecLatencyMeasure.M(int64(dur/time.Millisecond))) stats.Record(ctx, runnerExecLatencyMeasure.M(int64(dur/time.Millisecond)))
} }
func statsContainerEvicted(ctx context.Context) {
stats.Record(ctx, containerEvictedMeasure.M(0))
}
const ( const (
// TODO we should probably prefix these with calls_ ? // TODO we should probably prefix these with calls_ ?
queuedMetricName = "queued" queuedMetricName = "queued"
@@ -77,6 +81,8 @@ const (
errorsMetricName = "errors" errorsMetricName = "errors"
serverBusyMetricName = "server_busy" serverBusyMetricName = "server_busy"
containerEvictedMetricName = "container_evictions"
// Reported By LB // Reported By LB
runnerSchedLatencyMetricName = "lb_runner_sched_latency" runnerSchedLatencyMetricName = "lb_runner_sched_latency"
runnerExecLatencyMetricName = "lb_runner_exec_latency" runnerExecLatencyMetricName = "lb_runner_exec_latency"
@@ -96,6 +102,8 @@ var (
containerGaugeMeasures = initContainerGaugeMeasures() containerGaugeMeasures = initContainerGaugeMeasures()
containerTimeMeasures = initContainerTimeMeasures() containerTimeMeasures = initContainerTimeMeasures()
containerEvictedMeasure = common.MakeMeasure(containerEvictedMetricName, "containers evicted", "")
// Reported By LB: How long does a runner scheduler wait for a committed call? eg. wait/launch/pull containers // Reported By LB: How long does a runner scheduler wait for a committed call? eg. wait/launch/pull containers
runnerSchedLatencyMeasure = common.MakeMeasure(runnerSchedLatencyMetricName, "Runner Scheduler Latency Reported By LBAgent", "msecs") runnerSchedLatencyMeasure = common.MakeMeasure(runnerSchedLatencyMetricName, "Runner Scheduler Latency Reported By LBAgent", "msecs")
// Reported By LB: Function execution time inside a container. // Reported By LB: Function execution time inside a container.
@@ -179,6 +187,13 @@ func RegisterContainerViews(tagKeys []string, latencyDist []float64) {
logrus.WithError(err).Fatal("cannot register view") logrus.WithError(err).Fatal("cannot register view")
} }
} }
err := view.Register(
common.CreateView(containerEvictedMeasure, view.Count(), tagKeys),
)
if err != nil {
logrus.WithError(err).Fatal("cannot register view")
}
} }
// initDockerMeasures initializes Docker related measures // initDockerMeasures initializes Docker related measures