mirror of
https://github.com/fnproject/fn.git
synced 2022-10-28 21:29:17 +03:00
fn: paused and evicted container stats (#1209)
* fn: paused and evicted container stats With this change, now stats reports paused state as well as incidents of container exit due to evictions. * fn: update/document state transitions in state tracker There's no case of a transition moving from done to waiting. This must be deprecated behavior.
This commit is contained in:
@@ -898,13 +898,18 @@ func (a *agent) runHotReq(ctx context.Context, call *call, state ContainerState,
|
|||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
isFrozen = true
|
isFrozen = true
|
||||||
|
state.UpdateState(ctx, ContainerStatePaused, call.slots)
|
||||||
}
|
}
|
||||||
|
|
||||||
evictor := a.evictor.GetEvictor(call.ID, call.slotHashId, call.Memory+uint64(call.TmpFsSize), uint64(call.CPUs))
|
evictor := a.evictor.GetEvictor(call.ID, call.slotHashId, call.Memory+uint64(call.TmpFsSize), uint64(call.CPUs))
|
||||||
|
|
||||||
state.UpdateState(ctx, ContainerStateIdle, call.slots)
|
if !isFrozen {
|
||||||
|
state.UpdateState(ctx, ContainerStateIdle, call.slots)
|
||||||
|
}
|
||||||
s := call.slots.queueSlot(slot)
|
s := call.slots.queueSlot(slot)
|
||||||
|
|
||||||
|
isEvictEvent := false
|
||||||
|
|
||||||
for {
|
for {
|
||||||
select {
|
select {
|
||||||
case <-s.trigger: // slot already consumed
|
case <-s.trigger: // slot already consumed
|
||||||
@@ -918,10 +923,12 @@ func (a *agent) runHotReq(ctx context.Context, call *call, state ContainerState,
|
|||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
isFrozen = true
|
isFrozen = true
|
||||||
|
state.UpdateState(ctx, ContainerStatePaused, call.slots)
|
||||||
}
|
}
|
||||||
continue
|
continue
|
||||||
case <-evictor.C:
|
case <-evictor.C:
|
||||||
logger.Debug("attempting hot function eject")
|
logger.Debug("attempting hot function eject")
|
||||||
|
isEvictEvent = true
|
||||||
case <-ejectTimer.C:
|
case <-ejectTimer.C:
|
||||||
// we've been idle too long, now we are ejectable
|
// we've been idle too long, now we are ejectable
|
||||||
a.evictor.RegisterEvictor(evictor)
|
a.evictor.RegisterEvictor(evictor)
|
||||||
@@ -940,6 +947,9 @@ func (a *agent) runHotReq(ctx context.Context, call *call, state ContainerState,
|
|||||||
// otherwise continue processing the request
|
// otherwise continue processing the request
|
||||||
if call.slots.acquireSlot(s) {
|
if call.slots.acquireSlot(s) {
|
||||||
slot.Close(ctx)
|
slot.Close(ctx)
|
||||||
|
if isEvictEvent {
|
||||||
|
statsContainerEvicted(ctx)
|
||||||
|
}
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -163,6 +163,7 @@ func (a *slotQueue) isIdle() bool {
|
|||||||
a.stats.containerStates[ContainerStateWait] == 0 &&
|
a.stats.containerStates[ContainerStateWait] == 0 &&
|
||||||
a.stats.containerStates[ContainerStateStart] == 0 &&
|
a.stats.containerStates[ContainerStateStart] == 0 &&
|
||||||
a.stats.containerStates[ContainerStateIdle] == 0 &&
|
a.stats.containerStates[ContainerStateIdle] == 0 &&
|
||||||
|
a.stats.containerStates[ContainerStatePaused] == 0 &&
|
||||||
a.stats.containerStates[ContainerStateBusy] == 0
|
a.stats.containerStates[ContainerStateBusy] == 0
|
||||||
|
|
||||||
a.statsLock.Unlock()
|
a.statsLock.Unlock()
|
||||||
@@ -180,7 +181,7 @@ func (a *slotQueue) getStats() slotQueueStats {
|
|||||||
|
|
||||||
func isNewContainerNeeded(cur *slotQueueStats) bool {
|
func isNewContainerNeeded(cur *slotQueueStats) bool {
|
||||||
|
|
||||||
idleWorkers := cur.containerStates[ContainerStateIdle]
|
idleWorkers := cur.containerStates[ContainerStateIdle] + cur.containerStates[ContainerStatePaused]
|
||||||
starters := cur.containerStates[ContainerStateStart]
|
starters := cur.containerStates[ContainerStateStart]
|
||||||
startWaiters := cur.containerStates[ContainerStateWait]
|
startWaiters := cur.containerStates[ContainerStateWait]
|
||||||
|
|
||||||
|
|||||||
@@ -47,12 +47,13 @@ const (
|
|||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
ContainerStateNone ContainerStateType = iota // uninitialized
|
ContainerStateNone ContainerStateType = iota // uninitialized
|
||||||
ContainerStateWait // resource (cpu + mem) waiting
|
ContainerStateWait // resource (cpu + mem) waiting
|
||||||
ContainerStateStart // launching
|
ContainerStateStart // launching
|
||||||
ContainerStateIdle // running idle
|
ContainerStateIdle // running: idle but not paused
|
||||||
ContainerStateBusy // running busy
|
ContainerStatePaused // running: idle but paused
|
||||||
ContainerStateDone // exited/failed/done
|
ContainerStateBusy // running: busy
|
||||||
|
ContainerStateDone // exited/failed/done
|
||||||
ContainerStateMax
|
ContainerStateMax
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -61,14 +62,16 @@ var containerGaugeKeys = [ContainerStateMax]string{
|
|||||||
"container_wait_total",
|
"container_wait_total",
|
||||||
"container_start_total",
|
"container_start_total",
|
||||||
"container_idle_total",
|
"container_idle_total",
|
||||||
|
"container_paused_total",
|
||||||
"container_busy_total",
|
"container_busy_total",
|
||||||
"container_done_total",
|
|
||||||
}
|
}
|
||||||
|
|
||||||
var containerTimeKeys = [ContainerStateMax]string{
|
var containerTimeKeys = [ContainerStateMax]string{
|
||||||
"",
|
"",
|
||||||
"container_wait_duration_seconds",
|
"container_wait_duration_seconds",
|
||||||
"container_start_duration_seconds",
|
"container_start_duration_seconds",
|
||||||
"container_idle_duration_seconds",
|
"container_idle_duration_seconds",
|
||||||
|
"container_paused_duration_seconds",
|
||||||
"container_busy_duration_seconds",
|
"container_busy_duration_seconds",
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -101,6 +104,10 @@ func (c *requestState) UpdateState(ctx context.Context, newState RequestStateTyp
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func isIdleState(state ContainerStateType) bool {
|
||||||
|
return state == ContainerStateIdle || state == ContainerStatePaused
|
||||||
|
}
|
||||||
|
|
||||||
func (c *containerState) UpdateState(ctx context.Context, newState ContainerStateType, slots *slotQueue) {
|
func (c *containerState) UpdateState(ctx context.Context, newState ContainerStateType, slots *slotQueue) {
|
||||||
|
|
||||||
var now time.Time
|
var now time.Time
|
||||||
@@ -109,11 +116,13 @@ func (c *containerState) UpdateState(ctx context.Context, newState ContainerStat
|
|||||||
|
|
||||||
c.lock.Lock()
|
c.lock.Lock()
|
||||||
|
|
||||||
// except for 1) switching back to idle from busy (hot containers) or 2)
|
// Only the following state transitions are allowed:
|
||||||
// to waiting from done, otherwise we can only move forward in states
|
// 1) any move forward in states as per ContainerStateType order
|
||||||
|
// 2) move back: from paused to idle
|
||||||
|
// 3) move back: from busy to idle/paused
|
||||||
if c.state < newState ||
|
if c.state < newState ||
|
||||||
(c.state == ContainerStateBusy && newState == ContainerStateIdle) ||
|
(c.state == ContainerStatePaused && newState == ContainerStateIdle) ||
|
||||||
(c.state == ContainerStateDone && newState == ContainerStateIdle) {
|
(c.state == ContainerStateBusy && isIdleState(newState)) {
|
||||||
|
|
||||||
now = time.Now()
|
now = time.Now()
|
||||||
oldState = c.state
|
oldState = c.state
|
||||||
|
|||||||
@@ -66,6 +66,10 @@ func statsLBAgentRunnerExecLatency(ctx context.Context, dur time.Duration) {
|
|||||||
stats.Record(ctx, runnerExecLatencyMeasure.M(int64(dur/time.Millisecond)))
|
stats.Record(ctx, runnerExecLatencyMeasure.M(int64(dur/time.Millisecond)))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func statsContainerEvicted(ctx context.Context) {
|
||||||
|
stats.Record(ctx, containerEvictedMeasure.M(0))
|
||||||
|
}
|
||||||
|
|
||||||
const (
|
const (
|
||||||
// TODO we should probably prefix these with calls_ ?
|
// TODO we should probably prefix these with calls_ ?
|
||||||
queuedMetricName = "queued"
|
queuedMetricName = "queued"
|
||||||
@@ -77,6 +81,8 @@ const (
|
|||||||
errorsMetricName = "errors"
|
errorsMetricName = "errors"
|
||||||
serverBusyMetricName = "server_busy"
|
serverBusyMetricName = "server_busy"
|
||||||
|
|
||||||
|
containerEvictedMetricName = "container_evictions"
|
||||||
|
|
||||||
// Reported By LB
|
// Reported By LB
|
||||||
runnerSchedLatencyMetricName = "lb_runner_sched_latency"
|
runnerSchedLatencyMetricName = "lb_runner_sched_latency"
|
||||||
runnerExecLatencyMetricName = "lb_runner_exec_latency"
|
runnerExecLatencyMetricName = "lb_runner_exec_latency"
|
||||||
@@ -96,6 +102,8 @@ var (
|
|||||||
containerGaugeMeasures = initContainerGaugeMeasures()
|
containerGaugeMeasures = initContainerGaugeMeasures()
|
||||||
containerTimeMeasures = initContainerTimeMeasures()
|
containerTimeMeasures = initContainerTimeMeasures()
|
||||||
|
|
||||||
|
containerEvictedMeasure = common.MakeMeasure(containerEvictedMetricName, "containers evicted", "")
|
||||||
|
|
||||||
// Reported By LB: How long does a runner scheduler wait for a committed call? eg. wait/launch/pull containers
|
// Reported By LB: How long does a runner scheduler wait for a committed call? eg. wait/launch/pull containers
|
||||||
runnerSchedLatencyMeasure = common.MakeMeasure(runnerSchedLatencyMetricName, "Runner Scheduler Latency Reported By LBAgent", "msecs")
|
runnerSchedLatencyMeasure = common.MakeMeasure(runnerSchedLatencyMetricName, "Runner Scheduler Latency Reported By LBAgent", "msecs")
|
||||||
// Reported By LB: Function execution time inside a container.
|
// Reported By LB: Function execution time inside a container.
|
||||||
@@ -179,6 +187,13 @@ func RegisterContainerViews(tagKeys []string, latencyDist []float64) {
|
|||||||
logrus.WithError(err).Fatal("cannot register view")
|
logrus.WithError(err).Fatal("cannot register view")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
err := view.Register(
|
||||||
|
common.CreateView(containerEvictedMeasure, view.Count(), tagKeys),
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
logrus.WithError(err).Fatal("cannot register view")
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// initDockerMeasures initializes Docker related measures
|
// initDockerMeasures initializes Docker related measures
|
||||||
|
|||||||
Reference in New Issue
Block a user