fn: uds init wait latency metric in prometheus (#1336)

* fn: uds init wait latency metric in prometheus

Adding a tracker for UDS (unix domain socket) initialization wait during container start.
This complements our existing container-state latency trackers and docker-api latency
trackers.
Author:    Tolga Ceylan
Date:      2018-11-30 14:09:01 -08:00
Committer: GitHub
Parent:    fea4e931ce
Commit:    179b5e0e4c

2 changed files with 43 additions and 4 deletions


@@ -895,16 +895,26 @@ func (a *agent) runHot(ctx context.Context, caller slotCaller, call *call, tok R
	go func() {
		defer cancel() // also close if we get an agent shutdown / idle timeout

		// We record init wait for three basic states below: "initialized", "canceled", "timedout".
		// Notice how we do not distinguish between agent-shutdown, eviction, ctx.Done, etc. This is
		// because the monitoring go-routine may pick up these events earlier and cancel the ctx.
		initStart := time.Now()

		// INIT BARRIER HERE. Wait for the initialization go-routine signal.
		select {
		case <-initialized:
			statsContainerUDSInitLatency(ctx, initStart, time.Now(), "initialized")
		case <-a.shutWg.Closer(): // agent shutdown
			statsContainerUDSInitLatency(ctx, initStart, time.Now(), "canceled")
			return
		case <-ctx.Done():
			statsContainerUDSInitLatency(ctx, initStart, time.Now(), "canceled")
			return
		case <-evictor.C: // eviction
			statsContainerUDSInitLatency(ctx, initStart, time.Now(), "canceled")
			return
		case <-time.After(a.cfg.HotStartTimeout):
			statsContainerUDSInitLatency(ctx, initStart, time.Now(), "timedout")
			tryQueueErr(models.ErrContainerInitTimeout, errQueue)
			return
		}
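For orientation, here is a condensed, self-contained sketch of the init barrier above, including the producer side that the diff does not show. It assumes the container-init go-routine signals readiness by closing the initialized channel once the UDS socket is usable; the names, the timeout value, and the omission of the shutdown/eviction cases are simplifications, and only the real code records the latency via statsContainerUDSInitLatency.

// A hypothetical, stand-alone sketch of the init barrier pattern above.
package main

import (
	"context"
	"fmt"
	"time"
)

// waitForInit returns the UDS init state plus how long the caller waited,
// which the real code feeds to statsContainerUDSInitLatency.
func waitForInit(ctx context.Context, hotStartTimeout time.Duration, initialized <-chan struct{}) (string, time.Duration) {
	start := time.Now()
	select {
	case <-initialized:
		return "initialized", time.Since(start)
	case <-ctx.Done():
		return "canceled", time.Since(start)
	case <-time.After(hotStartTimeout):
		return "timedout", time.Since(start)
	}
}

func main() {
	initialized := make(chan struct{})
	go func() {
		// Stand-in for the container-init go-routine: signal once the
		// container's UDS socket accepts connections.
		time.Sleep(10 * time.Millisecond)
		close(initialized)
	}()

	state, waited := waitForInit(context.Background(), time.Second, initialized)
	fmt.Println(state, waited) // e.g. "initialized 10.2ms"
}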


@@ -16,6 +16,7 @@ import (
var (
	containerStateKey    = common.MakeKey("container_state")
	callStatusKey        = common.MakeKey("call_status")
	containerUDSStateKey = common.MakeKey("container_uds_state")
)

func statsCalls(ctx context.Context) {
@@ -66,6 +67,22 @@ func statsLBAgentRunnerExecLatency(ctx context.Context, dur time.Duration) {
	stats.Record(ctx, runnerExecLatencyMeasure.M(int64(dur/time.Millisecond)))
}

func statsContainerUDSInitLatency(ctx context.Context, start time.Time, end time.Time, containerUDSState string) {
	if end.Before(start) {
		return
	}

	ctx, err := tag.New(ctx,
		tag.Upsert(containerUDSStateKey, containerUDSState),
	)
	if err != nil {
		logrus.Fatal(err)
	}

	dur := end.Sub(start)
	stats.Record(ctx, containerUDSInitLatencyMeasure.M(int64(dur/time.Millisecond)))
}

func statsContainerEvicted(ctx context.Context, containerState string) {
	ctx, err := tag.New(ctx,
		tag.Upsert(containerStateKey, containerState),
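For reference, the helper-based code above maps onto raw OpenCensus calls roughly as follows. This is a sketch that assumes common.MakeKey and common.MakeMeasure are thin wrappers over tag.NewKey and stats.Int64; the function name recordUDSInitLatency and the bucket bounds are illustrative, not Fn's.

// A minimal sketch using OpenCensus primitives directly.
package main

import (
	"context"
	"log"
	"time"

	"go.opencensus.io/stats"
	"go.opencensus.io/stats/view"
	"go.opencensus.io/tag"
)

var (
	udsStateKey, _   = tag.NewKey("container_uds_state")
	udsInitLatencyMs = stats.Int64("container_uds_init_latency", "container UDS Init-Wait Latency", "msecs")
)

func recordUDSInitLatency(ctx context.Context, start, end time.Time, state string) {
	if end.Before(start) {
		return // ignore bogus intervals, as the real helper does
	}
	ctx, err := tag.New(ctx, tag.Upsert(udsStateKey, state))
	if err != nil {
		log.Fatal(err)
	}
	stats.Record(ctx, udsInitLatencyMs.M(int64(end.Sub(start)/time.Millisecond)))
}

func main() {
	// A distribution (histogram) view over the measure, keyed by the state tag.
	// Bucket bounds are illustrative; Fn passes its own latencyDist slice.
	err := view.Register(&view.View{
		Name:        "container_uds_init_latency",
		Description: "container UDS Init-Wait Latency",
		Measure:     udsInitLatencyMs,
		TagKeys:     []tag.Key{udsStateKey},
		Aggregation: view.Distribution(1, 10, 100, 1000, 10000),
	})
	if err != nil {
		log.Fatal(err)
	}

	start := time.Now()
	recordUDSInitLatency(context.Background(), start, start.Add(42*time.Millisecond), "initialized")
}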
@@ -130,6 +147,7 @@ const (
	serverBusyMetricName              = "server_busy"
	containerEvictedMetricName        = "container_evictions"
	containerUDSInitLatencyMetricName = "container_uds_init_latency"
	utilCpuUsedMetricName             = "util_cpu_used"
	utilCpuAvailMetricName            = "util_cpu_avail"
@@ -161,6 +179,7 @@ var (
	utilMemAvailMeasure = common.MakeMeasure(utilMemAvailMetricName, "agent memory available", "By")

	containerEvictedMeasure        = common.MakeMeasure(containerEvictedMetricName, "containers evicted", "")
	containerUDSInitLatencyMeasure = common.MakeMeasure(containerUDSInitLatencyMetricName, "container UDS Init-Wait Latency", "msecs")

	// Reported By LB: How long does a runner scheduler wait for a committed call? eg. wait/launch/pull containers
	runnerSchedLatencyMeasure = common.MakeMeasure(runnerSchedLatencyMetricName, "Runner Scheduler Latency Reported By LBAgent", "msecs")
@@ -271,8 +290,18 @@ func RegisterContainerViews(tagKeys []string, latencyDist []float64) {
		}
	}

	// add container uds_state tag for uds-wait
	udsInitTags := make([]string, 0, len(tagKeys)+1)
	udsInitTags = append(udsInitTags, "container_uds_state")
	for _, key := range tagKeys {
		if key != "container_uds_state" {
			udsInitTags = append(udsInitTags, key)
		}
	}

	err := view.Register(
		common.CreateView(containerEvictedMeasure, view.Count(), evictTags),
		common.CreateView(containerUDSInitLatencyMeasure, view.Distribution(latencyDist...), udsInitTags),
	)
	if err != nil {
		logrus.WithError(err).Fatal("cannot register view")
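Once views like these are registered, they surface in Prometheus through an OpenCensus Prometheus exporter. The wiring below is a generic sketch only; Fn's actual exporter setup, namespace, and port live elsewhere in the server code. A distribution view typically shows up as a histogram family such as container_uds_init_latency_bucket with container_uds_state as a label.

// Generic OpenCensus-to-Prometheus wiring; namespace and port are illustrative.
package main

import (
	"log"
	"net/http"

	"go.opencensus.io/exporter/prometheus"
	"go.opencensus.io/stats/view"
)

func main() {
	pe, err := prometheus.NewExporter(prometheus.Options{Namespace: "fn"})
	if err != nil {
		log.Fatal(err)
	}
	view.RegisterExporter(pe)

	// The exporter doubles as an http.Handler; registered distribution views
	// appear as *_bucket/_sum/_count series with their tag keys as labels.
	http.Handle("/metrics", pe)
	log.Fatal(http.ListenAndServe(":8080", nil))
}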