mirror of
https://github.com/fnproject/fn.git
synced 2022-10-28 21:29:17 +03:00
fn, dockerd pid collector & go collector metrics (#837)
* fn, dockerd pid collector & go collector metrics
the prometheus client we're using has a nice collector for process metrics and
for go metrics. these are things we are very interested in operationally and
recently the benevolent team at opencensus made this possible again, so this
hooks it up for us with added dockerd sugar.
nannying the dockerd we're using should be super useful since that thing likes
to get carried away, it'll be nice to differentiate memory/cpu usage between
dockerd / the host / fn. this will basically only work in a 'dind'
environment, or on a linux host that is running fn outside of docker that is
configured with the permissions to be able to check this. otherwise, it will
simply fail. we also probably want disk i/o and net i/o information for that
as well, or at least it would be interesting to differentiate from the host,
but this isn't hooked up in the default collectors unfortunately.
dockerd:
```
dockerd_process_cpu_seconds_total 520.74
dockerd_process_max_fds 1.048576e+06
dockerd_process_resident_memory_bytes 9.033728e+07
dockerd_process_start_time_seconds 1.52029677322e+09
dockerd_process_virtual_memory_bytes 1.782509568e+09
```
fn:
```
fn_process_cpu_seconds_total 0.14
fn_process_max_fds 1024
fn_process_open_fds 12
fn_process_resident_memory_bytes 2.7348992e+07
fn_process_start_time_seconds 1.52056274238e+09
fn_process_virtual_memory_bytes 7.20068608e+08
```
go:
```
go_gc_duration_seconds{quantile="0"} 4.4194e-05
go_gc_duration_seconds{quantile="0.25"} 9.8118e-05
go_gc_duration_seconds{quantile="0.5"} 0.000105989
go_gc_duration_seconds{quantile="0.75"} 0.000106251
go_gc_duration_seconds{quantile="1"} 0.000157864
go_gc_duration_seconds_sum 0.000512416
go_gc_duration_seconds_count 5
go_goroutines 30
go_memstats_alloc_bytes 3.897696e+06
go_memstats_alloc_bytes_total 1.2916016e+07
go_memstats_buck_hash_sys_bytes 1.45034e+06
go_memstats_frees_total 75399
go_memstats_gc_sys_bytes 450560
go_memstats_heap_alloc_bytes 3.897696e+06
go_memstats_heap_idle_bytes 868352
go_memstats_heap_inuse_bytes 5.750784e+06
go_memstats_heap_objects 29925
go_memstats_heap_released_bytes_total 0
go_memstats_heap_sys_bytes 6.619136e+06
go_memstats_last_gc_time_seconds 1.520562751182639e+09
go_memstats_lookups_total 239
go_memstats_mallocs_total 105324
go_memstats_mcache_inuse_bytes 3472
go_memstats_mcache_sys_bytes 16384
go_memstats_mspan_inuse_bytes 90592
go_memstats_mspan_sys_bytes 98304
go_memstats_next_gc_bytes 6.31304e+06
go_memstats_other_sys_bytes 710548
go_memstats_stack_inuse_bytes 720896
go_memstats_stack_sys_bytes 720896
go_memstats_sys_bytes 1.0066168e+07
```
* cache pid until it stops working
This commit is contained in:
@@ -1,11 +1,13 @@
|
|||||||
package server
|
package server
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bufio"
|
||||||
"bytes"
|
"bytes"
|
||||||
"context"
|
"context"
|
||||||
"encoding/base64"
|
"encoding/base64"
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"io"
|
||||||
"net"
|
"net"
|
||||||
"net/http"
|
"net/http"
|
||||||
"os"
|
"os"
|
||||||
@@ -29,6 +31,7 @@ import (
|
|||||||
"github.com/fnproject/fn/fnext"
|
"github.com/fnproject/fn/fnext"
|
||||||
"github.com/gin-gonic/gin"
|
"github.com/gin-gonic/gin"
|
||||||
zipkinhttp "github.com/openzipkin/zipkin-go/reporter/http"
|
zipkinhttp "github.com/openzipkin/zipkin-go/reporter/http"
|
||||||
|
promclient "github.com/prometheus/client_golang/prometheus"
|
||||||
"github.com/sirupsen/logrus"
|
"github.com/sirupsen/logrus"
|
||||||
"go.opencensus.io/exporter/prometheus"
|
"go.opencensus.io/exporter/prometheus"
|
||||||
"go.opencensus.io/exporter/zipkin"
|
"go.opencensus.io/exporter/zipkin"
|
||||||
@@ -518,7 +521,7 @@ func New(ctx context.Context, opts ...ServerOption) *Server {
|
|||||||
return s
|
return s
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO need to fix this to handle the nil case better
|
// TODO this should be a 'plugin' most likely
|
||||||
func WithTracer(zipkinURL string) ServerOption {
|
func WithTracer(zipkinURL string) ServerOption {
|
||||||
return func(ctx context.Context, s *Server) error {
|
return func(ctx context.Context, s *Server) error {
|
||||||
var (
|
var (
|
||||||
@@ -540,9 +543,15 @@ func WithTracer(zipkinURL string) ServerOption {
|
|||||||
trace.SetDefaultSampler(trace.AlwaysSample())
|
trace.SetDefaultSampler(trace.AlwaysSample())
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO we can keep this on *Server and unregister it in Close()... can finagle later. same for tracer
|
reg := promclient.NewRegistry()
|
||||||
|
reg.MustRegister(promclient.NewProcessCollector(os.Getpid(), "fn"),
|
||||||
|
promclient.NewProcessCollectorPIDFn(dockerPid(), "dockerd"),
|
||||||
|
promclient.NewGoCollector(),
|
||||||
|
)
|
||||||
|
|
||||||
exporter, err := prometheus.NewExporter(prometheus.Options{
|
exporter, err := prometheus.NewExporter(prometheus.Options{
|
||||||
Namespace: "fn",
|
Namespace: "fn",
|
||||||
|
Registry: reg,
|
||||||
OnError: func(err error) { logrus.WithError(err).Error("opencensus prometheus exporter err") },
|
OnError: func(err error) { logrus.WithError(err).Error("opencensus prometheus exporter err") },
|
||||||
})
|
})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -555,6 +564,64 @@ func WithTracer(zipkinURL string) ServerOption {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO plumbing considerations, we've put the S pipe next to the chandelier...
|
||||||
|
func dockerPid() func() (int, error) {
|
||||||
|
// prometheus' process collector only works on linux anyway. let them do the
|
||||||
|
// process detection, if we return an error here we just get 0 metrics and it
|
||||||
|
// does not log / blow up (that's fine!) it's also likely we hit permissions
|
||||||
|
// errors here for many installations, we want to do similar and ignore (we
|
||||||
|
// just want for prod).
|
||||||
|
|
||||||
|
var pid int
|
||||||
|
|
||||||
|
return func() (int, error) {
|
||||||
|
if pid != 0 {
|
||||||
|
// make sure it's docker pid.
|
||||||
|
if isDockerPid("/proc/" + strconv.Itoa(pid) + "/status") {
|
||||||
|
return pid, nil
|
||||||
|
}
|
||||||
|
pid = 0 // reset to go search
|
||||||
|
}
|
||||||
|
|
||||||
|
err := filepath.Walk("/proc", func(path string, info os.FileInfo, err error) error {
|
||||||
|
if err != nil || pid != 0 {
|
||||||
|
// we get permission errors digging around in here, ignore them and press on
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// /proc/<pid>/status
|
||||||
|
if strings.Count(path, "/") == 3 && strings.Contains(path, "/status") {
|
||||||
|
if isDockerPid(path) {
|
||||||
|
// extract pid from path
|
||||||
|
pid, _ = strconv.Atoi(path[6:strings.LastIndex(path, "/")])
|
||||||
|
return io.EOF // end the search
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// keep searching
|
||||||
|
return nil
|
||||||
|
})
|
||||||
|
if err == io.EOF { // used as sentinel
|
||||||
|
err = nil
|
||||||
|
}
|
||||||
|
return pid, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// isDockerPid reports whether the process status file at path (expected to be
// a /proc/<pid>/status file) describes a process named exactly "dockerd".
//
// Only the first line of the file is read. It returns false if the file
// cannot be opened, is empty, or does not start with a "Name:" field.
func isDockerPid(path string) bool {
	f, err := os.Open(path)
	if err != nil {
		return false
	}
	defer f.Close()

	// first line of status file is: "Name:\t<name>", scan that line only.
	scanner := bufio.NewScanner(f)
	if !scanner.Scan() {
		return false
	}

	line := scanner.Text()
	if !strings.HasPrefix(line, "Name:") {
		return false
	}

	// compare the whole name rather than a suffix, so that a process such as
	// "notdockerd" is not mistaken for the docker daemon.
	return strings.TrimSpace(strings.TrimPrefix(line, "Name:")) == "dockerd"
}
|
||||||
|
|
||||||
func setMachineID() {
|
func setMachineID() {
|
||||||
port := uint16(getEnvInt(EnvPort, DefaultPort))
|
port := uint16(getEnvInt(EnvPort, DefaultPort))
|
||||||
addr := whoAmI().To4()
|
addr := whoAmI().To4()
|
||||||
|
|||||||
Reference in New Issue
Block a user