From 179b5e0e4c30f0a16a10b94c97c724f44fc3c29a Mon Sep 17 00:00:00 2001
From: Tolga Ceylan
Date: Fri, 30 Nov 2018 14:09:01 -0800
Subject: [PATCH] fn: uds init wait latency metric in prometheus (#1336)

* fn: uds init wait latency metric in prometheus

Adding tracker for UDS initialization during container start. This
complements our existing container state latency trackers and
docker-api latency trackers.
---
 api/agent/agent.go | 10 ++++++++++
 api/agent/stats.go | 37 +++++++++++++++++++++++++++++++++----
 2 files changed, 43 insertions(+), 4 deletions(-)

diff --git a/api/agent/agent.go b/api/agent/agent.go
index c36ac7ac1..83722d598 100644
--- a/api/agent/agent.go
+++ b/api/agent/agent.go
@@ -895,16 +895,26 @@ func (a *agent) runHot(ctx context.Context, caller slotCaller, call *call, tok R
 	go func() {
 		defer cancel() // also close if we get an agent shutdown / idle timeout
 
+		// We record init wait for three basic states below: "initialized", "canceled", "timedout"
+		// Notice how we do not distinguish between agent-shutdown, eviction, ctx.Done, etc. This is
+		// because monitoring go-routine may pick these events earlier and cancel the ctx.
+		initStart := time.Now()
+
 		// INIT BARRIER HERE. Wait for the initialization go-routine signal
 		select {
 		case <-initialized:
+			statsContainerUDSInitLatency(ctx, initStart, time.Now(), "initialized")
 		case <-a.shutWg.Closer(): // agent shutdown
+			statsContainerUDSInitLatency(ctx, initStart, time.Now(), "canceled")
 			return
 		case <-ctx.Done():
+			statsContainerUDSInitLatency(ctx, initStart, time.Now(), "canceled")
 			return
 		case <-evictor.C: // eviction
+			statsContainerUDSInitLatency(ctx, initStart, time.Now(), "canceled")
 			return
 		case <-time.After(a.cfg.HotStartTimeout):
+			statsContainerUDSInitLatency(ctx, initStart, time.Now(), "timedout")
 			tryQueueErr(models.ErrContainerInitTimeout, errQueue)
 			return
 		}
diff --git a/api/agent/stats.go b/api/agent/stats.go
index 361244259..3dae5384f 100644
--- a/api/agent/stats.go
+++ b/api/agent/stats.go
@@ -14,8 +14,9 @@ import (
 )
 
 var (
-	containerStateKey = common.MakeKey("container_state")
-	callStatusKey     = common.MakeKey("call_status")
+	containerStateKey    = common.MakeKey("container_state")
+	callStatusKey        = common.MakeKey("call_status")
+	containerUDSStateKey = common.MakeKey("container_uds_state")
 )
 
 func statsCalls(ctx context.Context) {
@@ -66,6 +67,22 @@ func statsLBAgentRunnerExecLatency(ctx context.Context, dur time.Duration) {
 	stats.Record(ctx, runnerExecLatencyMeasure.M(int64(dur/time.Millisecond)))
 }
 
+func statsContainerUDSInitLatency(ctx context.Context, start time.Time, end time.Time, containerUDSState string) {
+	if end.Before(start) {
+		return
+	}
+
+	ctx, err := tag.New(ctx,
+		tag.Upsert(containerUDSStateKey, containerUDSState),
+	)
+	if err != nil {
+		logrus.Fatal(err)
+	}
+
+	dur := end.Sub(start)
+	stats.Record(ctx, containerUDSInitLatencyMeasure.M(int64(dur/time.Millisecond)))
+}
+
 func statsContainerEvicted(ctx context.Context, containerState string) {
 	ctx, err := tag.New(ctx,
 		tag.Upsert(containerStateKey, containerState),
@@ -129,7 +146,8 @@ const (
 	errorsMetricName     = "errors"
 	serverBusyMetricName = "server_busy"
 
-	containerEvictedMetricName = "container_evictions"
+	containerEvictedMetricName        = "container_evictions"
+	containerUDSInitLatencyMetricName = "container_uds_init_latency"
 
 	utilCpuUsedMetricName  = "util_cpu_used"
 	utilCpuAvailMetricName = "util_cpu_avail"
@@ -160,7 +178,8 @@ var (
 	utilMemUsedMeasure  = common.MakeMeasure(utilMemUsedMetricName, "agent memory in use", "By")
 	utilMemAvailMeasure = common.MakeMeasure(utilMemAvailMetricName, "agent memory available", "By")
 
-	containerEvictedMeasure = common.MakeMeasure(containerEvictedMetricName, "containers evicted", "")
+	containerEvictedMeasure        = common.MakeMeasure(containerEvictedMetricName, "containers evicted", "")
+	containerUDSInitLatencyMeasure = common.MakeMeasure(containerUDSInitLatencyMetricName, "container UDS Init-Wait Latency", "msecs")
 
 	// Reported By LB: How long does a runner scheduler wait for a committed call? eg. wait/launch/pull containers
 	runnerSchedLatencyMeasure = common.MakeMeasure(runnerSchedLatencyMetricName, "Runner Scheduler Latency Reported By LBAgent", "msecs")
@@ -271,8 +290,18 @@ func RegisterContainerViews(tagKeys []string, latencyDist []float64) {
 		}
 	}
 
+	// add container uds_state tag for uds-wait
+	udsInitTags := make([]string, 0, len(tagKeys)+1)
+	udsInitTags = append(udsInitTags, "container_uds_state")
+	for _, key := range tagKeys {
+		if key != "container_uds_state" {
+			udsInitTags = append(udsInitTags, key)
+		}
+	}
+
 	err := view.Register(
 		common.CreateView(containerEvictedMeasure, view.Count(), evictTags),
+		common.CreateView(containerUDSInitLatencyMeasure, view.Distribution(latencyDist...), udsInitTags),
 	)
 	if err != nil {
 		logrus.WithError(err).Fatal("cannot register view")
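
For context: the patch follows the standard OpenCensus record-and-view flow
that Fn wraps behind its common.MakeKey/common.MakeMeasure/common.CreateView
helpers. Below is a minimal, self-contained sketch of that same flow against
the raw go.opencensus.io API. The demo_* names, bucket bounds, and the
recordInitLatency helper are illustrative only (not from the Fn codebase),
and tag.MustNewKey assumes a reasonably recent opencensus-go.

package main

import (
	"context"
	"log"
	"time"

	"go.opencensus.io/stats"
	"go.opencensus.io/stats/view"
	"go.opencensus.io/tag"
)

// Illustrative stand-ins for containerUDSStateKey and
// containerUDSInitLatencyMeasure; names are hypothetical.
var (
	udsStateKey     = tag.MustNewKey("demo_uds_state")
	initLatencyMsec = stats.Int64("demo_init_latency", "container init wait latency", "msecs")
)

// recordInitLatency mirrors statsContainerUDSInitLatency: guard against a
// negative interval, tag the context with the terminal state, then record
// the elapsed wait in milliseconds.
func recordInitLatency(ctx context.Context, start, end time.Time, state string) {
	if end.Before(start) {
		return
	}
	ctx, err := tag.New(ctx, tag.Upsert(udsStateKey, state))
	if err != nil {
		log.Fatal(err)
	}
	stats.Record(ctx, initLatencyMsec.M(int64(end.Sub(start)/time.Millisecond)))
}

func main() {
	// Register a distribution view so an exporter (e.g. the Prometheus
	// exporter) can expose the latency histogram keyed by the state tag,
	// analogous to common.CreateView(..., view.Distribution(latencyDist...), udsInitTags).
	err := view.Register(&view.View{
		Name:        "demo_init_latency",
		Measure:     initLatencyMsec,
		Description: "container init wait latency",
		Aggregation: view.Distribution(1, 10, 100, 1000, 10000),
		TagKeys:     []tag.Key{udsStateKey},
	})
	if err != nil {
		log.Fatal(err)
	}

	start := time.Now()
	// ... the init barrier select{} would sit here ...
	recordInitLatency(context.Background(), start, time.Now(), "initialized")
}

The state tag ("initialized"/"canceled"/"timedout") turns one histogram into
per-outcome series, which is why RegisterContainerViews prepends
container_uds_state to the view's tag keys while de-duplicating it from the
caller-supplied list.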