From 7fbbd7534946218e1a9cd3d45e0f68c7d3ede44c Mon Sep 17 00:00:00 2001 From: Reed Allman Date: Tue, 13 Mar 2018 15:42:43 -0700 Subject: [PATCH] fn, dockerd pid collector & go collector metrics (#837) * fn, dockerd pid collector & go collector metrics the prometheus client we're using has a nice collector for process metrics and for go metrics. these are things we are very interested in operationally and recently the benevolent team at opencensus made this possible again, so this hooks it up for us with added dockerd sugar. nannying the dockerd we're using should be super useful since that thing likes to get carried away, it'll be nice to differentiate memory/cpu usage between dockerd / the host / fn. this will basically only work in a 'dind' environment, or on a linux host that is running fn outside of docker that is configured with the permissions to be able to check this. otherwise, it will simply fail. we also probably want disk i/o and net i/o information for that as well, or at least it would be interesting to differentiate from the host, but this isn't hooked up in the default collectors unfortunately. 
dockerd: ``` dockerd_process_cpu_seconds_total 520.74 dockerd_process_max_fds 1.048576e+06 dockerd_process_resident_memory_bytes 9.033728e+07 dockerd_process_start_time_seconds 1.52029677322e+09 dockerd_process_virtual_memory_bytes 1.782509568e+09 ``` fn: ``` fn_process_cpu_seconds_total 0.14 fn_process_max_fds 1024 fn_process_open_fds 12 fn_process_resident_memory_bytes 2.7348992e+07 fn_process_start_time_seconds 1.52056274238e+09 fn_process_virtual_memory_bytes 7.20068608e+08 ``` go: ``` go_gc_duration_seconds{quantile="0"} 4.4194e-05 go_gc_duration_seconds{quantile="0.25"} 9.8118e-05 go_gc_duration_seconds{quantile="0.5"} 0.000105989 go_gc_duration_seconds{quantile="0.75"} 0.000106251 go_gc_duration_seconds{quantile="1"} 0.000157864 go_gc_duration_seconds_sum 0.000512416 go_gc_duration_seconds_count 5 go_goroutines 30 go_memstats_alloc_bytes 3.897696e+06 go_memstats_alloc_bytes_total 1.2916016e+07 go_memstats_buck_hash_sys_bytes 1.45034e+06 go_memstats_frees_total 75399 go_memstats_gc_sys_bytes 450560 go_memstats_heap_alloc_bytes 3.897696e+06 go_memstats_heap_idle_bytes 868352 go_memstats_heap_inuse_bytes 5.750784e+06 go_memstats_heap_objects 29925 go_memstats_heap_released_bytes_total 0 go_memstats_heap_sys_bytes 6.619136e+06 go_memstats_last_gc_time_seconds 1.520562751182639e+09 go_memstats_lookups_total 239 go_memstats_mallocs_total 105324 go_memstats_mcache_inuse_bytes 3472 go_memstats_mcache_sys_bytes 16384 go_memstats_mspan_inuse_bytes 90592 go_memstats_mspan_sys_bytes 98304 go_memstats_next_gc_bytes 6.31304e+06 go_memstats_other_sys_bytes 710548 go_memstats_stack_inuse_bytes 720896 go_memstats_stack_sys_bytes 720896 go_memstats_sys_bytes 1.0066168e+07 ``` * cache pid until it stops working --- api/server/server.go | 71 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 69 insertions(+), 2 deletions(-) diff --git a/api/server/server.go b/api/server/server.go index c2d0f1c26..279aca869 100644 --- a/api/server/server.go +++ b/api/server/server.go 
@@ -1,11 +1,13 @@ package server import ( + "bufio" "bytes" "context" "encoding/base64" "errors" "fmt" + "io" "net" "net/http" "os" @@ -29,6 +31,7 @@ import ( "github.com/fnproject/fn/fnext" "github.com/gin-gonic/gin" zipkinhttp "github.com/openzipkin/zipkin-go/reporter/http" + promclient "github.com/prometheus/client_golang/prometheus" "github.com/sirupsen/logrus" "go.opencensus.io/exporter/prometheus" "go.opencensus.io/exporter/zipkin" @@ -518,7 +521,7 @@ func New(ctx context.Context, opts ...ServerOption) *Server { return s } -// TODO need to fix this to handle the nil case better +// TODO this should be a 'plugin' most likely func WithTracer(zipkinURL string) ServerOption { return func(ctx context.Context, s *Server) error { var ( @@ -540,9 +543,15 @@ func WithTracer(zipkinURL string) ServerOption { trace.SetDefaultSampler(trace.AlwaysSample()) } - // TODO we can keep this on *Server and unregister it in Close()... can finagle later. same for tracer + reg := promclient.NewRegistry() + reg.MustRegister(promclient.NewProcessCollector(os.Getpid(), "fn"), + promclient.NewProcessCollectorPIDFn(dockerPid(), "dockerd"), + promclient.NewGoCollector(), + ) + exporter, err := prometheus.NewExporter(prometheus.Options{ Namespace: "fn", + Registry: reg, OnError: func(err error) { logrus.WithError(err).Error("opencensus prometheus exporter err") }, }) if err != nil { @@ -555,6 +564,64 @@ func WithTracer(zipkinURL string) ServerOption { } } +// TODO plumbing considerations, we've put the S pipe next to the chandelier... +func dockerPid() func() (int, error) { + // prometheus' process collector only works on linux anyway. let them do the + // process detection, if we return an error here we just get 0 metrics and it + // does not log / blow up (that's fine!) it's also likely we hit permissions + // errors here for many installations, we want to do similar and ignore (we + // just want for prod). 
+ + var pid int + + return func() (int, error) { + if pid != 0 { + // make sure it's docker pid. + if isDockerPid("/proc/" + strconv.Itoa(pid) + "/status") { + return pid, nil + } + pid = 0 // reset to go search + } + + err := filepath.Walk("/proc", func(path string, info os.FileInfo, err error) error { + if err != nil || pid != 0 { + // we get permission errors digging around in here, ignore them and press on + return nil + } + + // /proc/<pid>/status + if strings.Count(path, "/") == 3 && strings.Contains(path, "/status") { + if isDockerPid(path) { + // extract pid from path + pid, _ = strconv.Atoi(path[6:strings.LastIndex(path, "/")]) + return io.EOF // end the search + } + } + + // keep searching + return nil + }) + if err == io.EOF { // used as sentinel + err = nil + } + return pid, err + } +} + +func isDockerPid(path string) bool { + // first line of status file is: "Name: <name>" + f, err := os.Open(path) + if err != nil { + return false + } + defer f.Close() + + // scan first line only + scanner := bufio.NewScanner(f) + scanner.Scan() + return strings.HasSuffix(scanner.Text(), "dockerd") +} + func setMachineID() { port := uint16(getEnvInt(EnvPort, DefaultPort)) addr := whoAmI().To4()