mirror of
https://github.com/fnproject/fn.git
synced 2022-10-28 21:29:17 +03:00
fn: paused and evicted container stats (#1209)
* fn: paused and evicted container stats. With this change, stats now report the paused state as well as incidents of container exit due to evictions. * fn: update/document state transitions in state tracker. There is no case of a transition moving from done to waiting; this must be deprecated behavior.
This commit is contained in:
@@ -898,13 +898,18 @@ func (a *agent) runHotReq(ctx context.Context, call *call, state ContainerState,
|
||||
return false
|
||||
}
|
||||
isFrozen = true
|
||||
state.UpdateState(ctx, ContainerStatePaused, call.slots)
|
||||
}
|
||||
|
||||
evictor := a.evictor.GetEvictor(call.ID, call.slotHashId, call.Memory+uint64(call.TmpFsSize), uint64(call.CPUs))
|
||||
|
||||
state.UpdateState(ctx, ContainerStateIdle, call.slots)
|
||||
if !isFrozen {
|
||||
state.UpdateState(ctx, ContainerStateIdle, call.slots)
|
||||
}
|
||||
s := call.slots.queueSlot(slot)
|
||||
|
||||
isEvictEvent := false
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-s.trigger: // slot already consumed
|
||||
@@ -918,10 +923,12 @@ func (a *agent) runHotReq(ctx context.Context, call *call, state ContainerState,
|
||||
return false
|
||||
}
|
||||
isFrozen = true
|
||||
state.UpdateState(ctx, ContainerStatePaused, call.slots)
|
||||
}
|
||||
continue
|
||||
case <-evictor.C:
|
||||
logger.Debug("attempting hot function eject")
|
||||
isEvictEvent = true
|
||||
case <-ejectTimer.C:
|
||||
// we've been idle too long, now we are ejectable
|
||||
a.evictor.RegisterEvictor(evictor)
|
||||
@@ -940,6 +947,9 @@ func (a *agent) runHotReq(ctx context.Context, call *call, state ContainerState,
|
||||
// otherwise continue processing the request
|
||||
if call.slots.acquireSlot(s) {
|
||||
slot.Close(ctx)
|
||||
if isEvictEvent {
|
||||
statsContainerEvicted(ctx)
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
|
||||
@@ -163,6 +163,7 @@ func (a *slotQueue) isIdle() bool {
|
||||
a.stats.containerStates[ContainerStateWait] == 0 &&
|
||||
a.stats.containerStates[ContainerStateStart] == 0 &&
|
||||
a.stats.containerStates[ContainerStateIdle] == 0 &&
|
||||
a.stats.containerStates[ContainerStatePaused] == 0 &&
|
||||
a.stats.containerStates[ContainerStateBusy] == 0
|
||||
|
||||
a.statsLock.Unlock()
|
||||
@@ -180,7 +181,7 @@ func (a *slotQueue) getStats() slotQueueStats {
|
||||
|
||||
func isNewContainerNeeded(cur *slotQueueStats) bool {
|
||||
|
||||
idleWorkers := cur.containerStates[ContainerStateIdle]
|
||||
idleWorkers := cur.containerStates[ContainerStateIdle] + cur.containerStates[ContainerStatePaused]
|
||||
starters := cur.containerStates[ContainerStateStart]
|
||||
startWaiters := cur.containerStates[ContainerStateWait]
|
||||
|
||||
|
||||
@@ -47,12 +47,13 @@ const (
|
||||
)
|
||||
|
||||
const (
|
||||
ContainerStateNone ContainerStateType = iota // uninitialized
|
||||
ContainerStateWait // resource (cpu + mem) waiting
|
||||
ContainerStateStart // launching
|
||||
ContainerStateIdle // running idle
|
||||
ContainerStateBusy // running busy
|
||||
ContainerStateDone // exited/failed/done
|
||||
ContainerStateNone ContainerStateType = iota // uninitialized
|
||||
ContainerStateWait // resource (cpu + mem) waiting
|
||||
ContainerStateStart // launching
|
||||
ContainerStateIdle // running: idle but not paused
|
||||
ContainerStatePaused // running: idle but paused
|
||||
ContainerStateBusy // running: busy
|
||||
ContainerStateDone // exited/failed/done
|
||||
ContainerStateMax
|
||||
)
|
||||
|
||||
@@ -61,14 +62,16 @@ var containerGaugeKeys = [ContainerStateMax]string{
|
||||
"container_wait_total",
|
||||
"container_start_total",
|
||||
"container_idle_total",
|
||||
"container_paused_total",
|
||||
"container_busy_total",
|
||||
"container_done_total",
|
||||
}
|
||||
|
||||
var containerTimeKeys = [ContainerStateMax]string{
|
||||
"",
|
||||
"container_wait_duration_seconds",
|
||||
"container_start_duration_seconds",
|
||||
"container_idle_duration_seconds",
|
||||
"container_paused_duration_seconds",
|
||||
"container_busy_duration_seconds",
|
||||
}
|
||||
|
||||
@@ -101,6 +104,10 @@ func (c *requestState) UpdateState(ctx context.Context, newState RequestStateTyp
|
||||
}
|
||||
}
|
||||
|
||||
func isIdleState(state ContainerStateType) bool {
|
||||
return state == ContainerStateIdle || state == ContainerStatePaused
|
||||
}
|
||||
|
||||
func (c *containerState) UpdateState(ctx context.Context, newState ContainerStateType, slots *slotQueue) {
|
||||
|
||||
var now time.Time
|
||||
@@ -109,11 +116,13 @@ func (c *containerState) UpdateState(ctx context.Context, newState ContainerStat
|
||||
|
||||
c.lock.Lock()
|
||||
|
||||
// except for 1) switching back to idle from busy (hot containers) or 2)
|
||||
// to waiting from done, otherwise we can only move forward in states
|
||||
// Only the following state transitions are allowed:
|
||||
// 1) any move forward in states as per ContainerStateType order
|
||||
// 2) move back: from paused to idle
|
||||
// 3) move back: from busy to idle/paused
|
||||
if c.state < newState ||
|
||||
(c.state == ContainerStateBusy && newState == ContainerStateIdle) ||
|
||||
(c.state == ContainerStateDone && newState == ContainerStateIdle) {
|
||||
(c.state == ContainerStatePaused && newState == ContainerStateIdle) ||
|
||||
(c.state == ContainerStateBusy && isIdleState(newState)) {
|
||||
|
||||
now = time.Now()
|
||||
oldState = c.state
|
||||
|
||||
@@ -66,6 +66,10 @@ func statsLBAgentRunnerExecLatency(ctx context.Context, dur time.Duration) {
|
||||
stats.Record(ctx, runnerExecLatencyMeasure.M(int64(dur/time.Millisecond)))
|
||||
}
|
||||
|
||||
func statsContainerEvicted(ctx context.Context) {
|
||||
stats.Record(ctx, containerEvictedMeasure.M(0))
|
||||
}
|
||||
|
||||
const (
|
||||
// TODO we should probably prefix these with calls_ ?
|
||||
queuedMetricName = "queued"
|
||||
@@ -77,6 +81,8 @@ const (
|
||||
errorsMetricName = "errors"
|
||||
serverBusyMetricName = "server_busy"
|
||||
|
||||
containerEvictedMetricName = "container_evictions"
|
||||
|
||||
// Reported By LB
|
||||
runnerSchedLatencyMetricName = "lb_runner_sched_latency"
|
||||
runnerExecLatencyMetricName = "lb_runner_exec_latency"
|
||||
@@ -96,6 +102,8 @@ var (
|
||||
containerGaugeMeasures = initContainerGaugeMeasures()
|
||||
containerTimeMeasures = initContainerTimeMeasures()
|
||||
|
||||
containerEvictedMeasure = common.MakeMeasure(containerEvictedMetricName, "containers evicted", "")
|
||||
|
||||
// Reported By LB: How long does a runner scheduler wait for a committed call? eg. wait/launch/pull containers
|
||||
runnerSchedLatencyMeasure = common.MakeMeasure(runnerSchedLatencyMetricName, "Runner Scheduler Latency Reported By LBAgent", "msecs")
|
||||
// Reported By LB: Function execution time inside a container.
|
||||
@@ -179,6 +187,13 @@ func RegisterContainerViews(tagKeys []string, latencyDist []float64) {
|
||||
logrus.WithError(err).Fatal("cannot register view")
|
||||
}
|
||||
}
|
||||
|
||||
err := view.Register(
|
||||
common.CreateView(containerEvictedMeasure, view.Count(), tagKeys),
|
||||
)
|
||||
if err != nil {
|
||||
logrus.WithError(err).Fatal("cannot register view")
|
||||
}
|
||||
}
|
||||
|
||||
// initDockerMeasures initializes Docker related measures
|
||||
|
||||
Reference in New Issue
Block a user