From 4dcdb7d9823f1bbba4a93ad28f35aea904095e62 Mon Sep 17 00:00:00 2001 From: Tolga Ceylan Date: Thu, 13 Sep 2018 16:24:26 -0700 Subject: [PATCH] fn: paused and evicted container stats (#1209) * fn: paused and evicted container stats With this change, now stats reports paused state as well as incidents of container exit due to evictions. * fn: update/document state transitions in state tracker There's no case of a transition moving from done to waiting. This must be deprecated behavior. --- api/agent/agent.go | 12 +++++++++++- api/agent/slots.go | 3 ++- api/agent/state_trackers.go | 31 ++++++++++++++++++++----------- api/agent/stats.go | 15 +++++++++++++++ 4 files changed, 48 insertions(+), 13 deletions(-) diff --git a/api/agent/agent.go b/api/agent/agent.go index 3e62bade9..76107df36 100644 --- a/api/agent/agent.go +++ b/api/agent/agent.go @@ -898,13 +898,18 @@ func (a *agent) runHotReq(ctx context.Context, call *call, state ContainerState, return false } isFrozen = true + state.UpdateState(ctx, ContainerStatePaused, call.slots) } evictor := a.evictor.GetEvictor(call.ID, call.slotHashId, call.Memory+uint64(call.TmpFsSize), uint64(call.CPUs)) - state.UpdateState(ctx, ContainerStateIdle, call.slots) + if !isFrozen { + state.UpdateState(ctx, ContainerStateIdle, call.slots) + } s := call.slots.queueSlot(slot) + isEvictEvent := false + for { select { case <-s.trigger: // slot already consumed @@ -918,10 +923,12 @@ func (a *agent) runHotReq(ctx context.Context, call *call, state ContainerState, return false } isFrozen = true + state.UpdateState(ctx, ContainerStatePaused, call.slots) } continue case <-evictor.C: logger.Debug("attempting hot function eject") + isEvictEvent = true case <-ejectTimer.C: // we've been idle too long, now we are ejectable a.evictor.RegisterEvictor(evictor) @@ -940,6 +947,9 @@ func (a *agent) runHotReq(ctx context.Context, call *call, state ContainerState, // otherwise continue processing the request if call.slots.acquireSlot(s) { slot.Close(ctx) + if isEvictEvent { + statsContainerEvicted(ctx) + } return false } diff --git a/api/agent/slots.go b/api/agent/slots.go index a19083272..085db61cc 100644 --- a/api/agent/slots.go +++ b/api/agent/slots.go @@ -163,6 +163,7 @@ func (a *slotQueue) isIdle() bool { a.stats.containerStates[ContainerStateWait] == 0 && a.stats.containerStates[ContainerStateStart] == 0 && a.stats.containerStates[ContainerStateIdle] == 0 && + a.stats.containerStates[ContainerStatePaused] == 0 && a.stats.containerStates[ContainerStateBusy] == 0 a.statsLock.Unlock() @@ -180,7 +181,7 @@ func (a *slotQueue) getStats() slotQueueStats { func isNewContainerNeeded(cur *slotQueueStats) bool { - idleWorkers := cur.containerStates[ContainerStateIdle] + idleWorkers := cur.containerStates[ContainerStateIdle] + cur.containerStates[ContainerStatePaused] starters := cur.containerStates[ContainerStateStart] startWaiters := cur.containerStates[ContainerStateWait] diff --git a/api/agent/state_trackers.go b/api/agent/state_trackers.go index 2d73da9e3..1c99e7a8c 100644 --- a/api/agent/state_trackers.go +++ b/api/agent/state_trackers.go @@ -47,12 +47,13 @@ const ( ) const ( - ContainerStateNone ContainerStateType = iota // uninitialized - ContainerStateWait // resource (cpu + mem) waiting - ContainerStateStart // launching - ContainerStateIdle // running idle - ContainerStateBusy // running busy - ContainerStateDone // exited/failed/done + ContainerStateNone ContainerStateType = iota // uninitialized + ContainerStateWait // resource (cpu + mem) waiting + ContainerStateStart // launching + ContainerStateIdle // running: idle but not paused + ContainerStatePaused // running: idle but paused + ContainerStateBusy // running: busy + ContainerStateDone // exited/failed/done ContainerStateMax ) @@ -61,14 +62,16 @@ var containerGaugeKeys = [ContainerStateMax]string{ "container_wait_total", "container_start_total", "container_idle_total", + "container_paused_total", "container_busy_total", - "container_done_total", } + var containerTimeKeys = [ContainerStateMax]string{ "", "container_wait_duration_seconds", "container_start_duration_seconds", "container_idle_duration_seconds", + "container_paused_duration_seconds", "container_busy_duration_seconds", } @@ -101,6 +104,10 @@ func (c *requestState) UpdateState(ctx context.Context, newState RequestStateTyp } } +func isIdleState(state ContainerStateType) bool { + return state == ContainerStateIdle || state == ContainerStatePaused +} + func (c *containerState) UpdateState(ctx context.Context, newState ContainerStateType, slots *slotQueue) { var now time.Time @@ -109,11 +116,13 @@ func (c *containerState) UpdateState(ctx context.Context, newState ContainerStat c.lock.Lock() - // except for 1) switching back to idle from busy (hot containers) or 2) - // to waiting from done, otherwise we can only move forward in states + // Only the following state transitions are allowed: + // 1) any move forward in states as per ContainerStateType order + // 2) move back: from paused to idle + // 3) move back: from busy to idle/paused if c.state < newState || - (c.state == ContainerStateBusy && newState == ContainerStateIdle) || - (c.state == ContainerStateDone && newState == ContainerStateIdle) { + (c.state == ContainerStatePaused && newState == ContainerStateIdle) || + (c.state == ContainerStateBusy && isIdleState(newState)) { now = time.Now() oldState = c.state diff --git a/api/agent/stats.go b/api/agent/stats.go index 830487cbe..77f99deee 100644 --- a/api/agent/stats.go +++ b/api/agent/stats.go @@ -66,6 +66,10 @@ func statsLBAgentRunnerExecLatency(ctx context.Context, dur time.Duration) { stats.Record(ctx, runnerExecLatencyMeasure.M(int64(dur/time.Millisecond))) } +func statsContainerEvicted(ctx context.Context) { + stats.Record(ctx, containerEvictedMeasure.M(0)) +} + const ( // TODO we should probably prefix these with calls_ ? queuedMetricName = "queued" @@ -77,6 +81,8 @@ const ( errorsMetricName = "errors" serverBusyMetricName = "server_busy" + containerEvictedMetricName = "container_evictions" + // Reported By LB runnerSchedLatencyMetricName = "lb_runner_sched_latency" runnerExecLatencyMetricName = "lb_runner_exec_latency" @@ -96,6 +102,8 @@ var ( containerGaugeMeasures = initContainerGaugeMeasures() containerTimeMeasures = initContainerTimeMeasures() + containerEvictedMeasure = common.MakeMeasure(containerEvictedMetricName, "containers evicted", "") + // Reported By LB: How long does a runner scheduler wait for a committed call? eg. wait/launch/pull containers runnerSchedLatencyMeasure = common.MakeMeasure(runnerSchedLatencyMetricName, "Runner Scheduler Latency Reported By LBAgent", "msecs") // Reported By LB: Function execution time inside a container. @@ -179,6 +187,13 @@ func RegisterContainerViews(tagKeys []string, latencyDist []float64) { logrus.WithError(err).Fatal("cannot register view") } } + + err := view.Register( + common.CreateView(containerEvictedMeasure, view.Count(), tagKeys), + ) + if err != nil { + logrus.WithError(err).Fatal("cannot register view") + } } // initDockerMeasures initializes Docker related measures