mirror of
https://github.com/fnproject/fn.git
synced 2022-10-28 21:29:17 +03:00
Merge pull request #447 from fnproject/tracing_to_prometheus
Tracing to prometheus
This commit is contained in:
@@ -188,6 +188,8 @@ func (a *agent) Submit(callI Call) error {
|
||||
ctx := call.req.Context()
|
||||
|
||||
span, ctx := opentracing.StartSpanFromContext(ctx, "agent_submit")
|
||||
span.SetBaggageItem("fn_appname", callI.Model().AppName)
|
||||
span.SetBaggageItem("fn_path", callI.Model().Path)
|
||||
defer span.Finish()
|
||||
|
||||
// start the timer STAT! TODO add some wiggle room
|
||||
|
||||
60
api/server/fntracer.go
Normal file
60
api/server/fntracer.go
Normal file
@@ -0,0 +1,60 @@
|
||||
package server
|
||||
|
||||
import (
|
||||
"github.com/opentracing/opentracing-go"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// FnTracer is a custom Tracer which wraps another another tracer
|
||||
// its main purpose is to wrap the underlying Span in a FnSpan,
|
||||
// which adds some extra behaviour required for sending tracing spans to prometheus
|
||||
type FnTracer struct {
|
||||
opentracing.Tracer
|
||||
}
|
||||
|
||||
// NewFnTracer returns a new FnTracer which wraps the specified Tracer
|
||||
func NewFnTracer(t opentracing.Tracer) opentracing.Tracer {
|
||||
return &FnTracer{t}
|
||||
}
|
||||
|
||||
// FnTracer implements opentracing.Tracer
|
||||
// Override StartSpan to wrap the returned Span in a FnSpan
|
||||
func (fnt FnTracer) StartSpan(operationName string, opts ...opentracing.StartSpanOption) opentracing.Span {
|
||||
return NewFnSpan(fnt.Tracer.StartSpan(operationName, opts...))
|
||||
}
|
||||
|
||||
// FnSpan is a custom Span that wraps another span
|
||||
// which adds some extra behaviour required for sending tracing spans to prometheus
|
||||
type FnSpan struct {
|
||||
opentracing.Span
|
||||
}
|
||||
|
||||
// NewFnSpan returns a new FnSpan which wraps the specified Span
|
||||
func NewFnSpan(s opentracing.Span) opentracing.Span {
|
||||
return &FnSpan{s}
|
||||
}
|
||||
|
||||
// FnSpan implements opentracing.Span
|
||||
func (fns FnSpan) Finish() {
|
||||
fns.copyBaggageItemsToTags()
|
||||
fns.Span.Finish()
|
||||
}
|
||||
|
||||
// FnSpan implements opentracing.Span
|
||||
func (fns FnSpan) FinishWithOptions(opts opentracing.FinishOptions) {
|
||||
fns.copyBaggageItemsToTags()
|
||||
fns.Span.FinishWithOptions(opts)
|
||||
}
|
||||
|
||||
func (fns FnSpan) copyBaggageItemsToTags() {
|
||||
// copy baggage items (which are inherited from the parent) with keys starting with "fn" to tags
|
||||
// the PrometheusCollector will send these to Prometheus
|
||||
// need to do this because the collector can't access baggage items, but it can access tags
|
||||
// whereas here we can access the parent's baggage items, but not its tags
|
||||
fns.Context().ForeachBaggageItem(func(k, v string) bool {
|
||||
if strings.HasPrefix(k, "fn") {
|
||||
fns.SetTag(k, v)
|
||||
}
|
||||
return true
|
||||
})
|
||||
}
|
||||
77
api/server/prom_zip_collector.go
Normal file
77
api/server/prom_zip_collector.go
Normal file
@@ -0,0 +1,77 @@
|
||||
package server
|
||||
|
||||
import (
|
||||
"github.com/openzipkin/zipkin-go-opentracing"
|
||||
"github.com/openzipkin/zipkin-go-opentracing/thrift/gen-go/zipkincore"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// PrometheusCollector is a custom Collector
|
||||
// which sends ZipKin traces to Prometheus
|
||||
type PrometheusCollector struct {
|
||||
|
||||
// Each span name is published as a separate Histogram metric
|
||||
// Using metric names of the form fn_span_<span-name>_duration_seconds
|
||||
histogramVecMap map[string]*prometheus.HistogramVec
|
||||
}
|
||||
|
||||
// NewPrometheusCollector returns a new PrometheusCollector
|
||||
func NewPrometheusCollector() (zipkintracer.Collector, error) {
|
||||
pc := &PrometheusCollector{make(map[string]*prometheus.HistogramVec)}
|
||||
return pc, nil
|
||||
}
|
||||
|
||||
// Return the HistogramVec corresponding to the specified spanName.
|
||||
// If a HistogramVec does not already exist for specified spanName then one is created and configured with the specified labels
|
||||
// otherwise the labels parameter is ignored.
|
||||
func (pc PrometheusCollector) getHistogramVecForSpanName(spanName string, labels []string) *prometheus.HistogramVec {
|
||||
thisHistogramVec, found := pc.histogramVecMap[spanName]
|
||||
if !found {
|
||||
thisHistogramVec = prometheus.NewHistogramVec(
|
||||
prometheus.HistogramOpts{
|
||||
Name: "fn_span_" + spanName + "_duration_seconds",
|
||||
Help: "Span " + spanName + " duration, by span name",
|
||||
},
|
||||
labels,
|
||||
)
|
||||
pc.histogramVecMap[spanName] = thisHistogramVec
|
||||
prometheus.MustRegister(thisHistogramVec)
|
||||
}
|
||||
return thisHistogramVec
|
||||
}
|
||||
|
||||
// PrometheusCollector implements Collector.
|
||||
func (pc PrometheusCollector) Collect(span *zipkincore.Span) error {
|
||||
|
||||
// extract any label values from the span
|
||||
labelKeys, labelValueMap := getLabels(span)
|
||||
|
||||
pc.getHistogramVecForSpanName(span.GetName(), labelKeys).With(labelValueMap).Observe((time.Duration(span.GetDuration()) * time.Microsecond).Seconds())
|
||||
return nil
|
||||
}
|
||||
|
||||
// extract from the specified span the key/value pairs that we want to add as labels to the Prometheus metric for this span
|
||||
// returns an array of keys, and a map of key-value pairs
|
||||
func getLabels(span *zipkincore.Span) ([]string, map[string]string) {
|
||||
|
||||
var keys []string
|
||||
labelMap := make(map[string]string)
|
||||
|
||||
// extract any tags whose key starts with "fn" from the span
|
||||
binaryAnnotations := span.GetBinaryAnnotations()
|
||||
for _, thisBinaryAnnotation := range binaryAnnotations {
|
||||
key := thisBinaryAnnotation.GetKey()
|
||||
if thisBinaryAnnotation.GetAnnotationType() == zipkincore.AnnotationType_STRING && strings.HasPrefix(key, "fn") {
|
||||
keys = append(keys, key)
|
||||
value := string(thisBinaryAnnotation.GetValue()[:])
|
||||
labelMap[key] = value
|
||||
}
|
||||
}
|
||||
|
||||
return keys, labelMap
|
||||
}
|
||||
|
||||
// PrometheusCollector implements Collector.
|
||||
func (PrometheusCollector) Close() error { return nil }
|
||||
@@ -86,7 +86,7 @@ func New(ctx context.Context, ds models.Datastore, mq models.MessageQueue, logDB
|
||||
}
|
||||
|
||||
setMachineId()
|
||||
setTracer()
|
||||
s.setTracer()
|
||||
s.Router.Use(loggerWrap, traceWrap, panicWrap)
|
||||
s.bindHandlers(ctx)
|
||||
|
||||
@@ -110,6 +110,8 @@ func traceWrap(c *gin.Context) {
|
||||
// If wireContext == nil, a root span will be created.
|
||||
// TODO we should add more tags?
|
||||
serverSpan := opentracing.StartSpan("serve_http", ext.RPCServerOption(wireContext), opentracing.Tag{Key: "path", Value: c.Request.URL.Path})
|
||||
serverSpan.SetBaggageItem("fn_appname", c.Param(api.CApp))
|
||||
serverSpan.SetBaggageItem("fn_path", c.Param(api.CRoute))
|
||||
defer serverSpan.Finish()
|
||||
|
||||
ctx := opentracing.ContextWithSpan(c.Request.Context(), serverSpan)
|
||||
@@ -117,7 +119,7 @@ func traceWrap(c *gin.Context) {
|
||||
c.Next()
|
||||
}
|
||||
|
||||
func setTracer() {
|
||||
func (s *Server) setTracer() {
|
||||
var (
|
||||
debugMode = false
|
||||
serviceName = "fn-server"
|
||||
@@ -126,17 +128,29 @@ func setTracer() {
|
||||
// ex: "http://zipkin:9411/api/v1/spans"
|
||||
)
|
||||
|
||||
if zipkinHTTPEndpoint == "" {
|
||||
return
|
||||
var collector zipkintracer.Collector
|
||||
|
||||
// custom Zipkin collector to send tracing spans to Prometheus
|
||||
promCollector, promErr := NewPrometheusCollector()
|
||||
if promErr != nil {
|
||||
logrus.WithError(promErr).Fatalln("couldn't start Prometheus trace collector")
|
||||
}
|
||||
|
||||
logger := zipkintracer.LoggerFunc(func(i ...interface{}) error { logrus.Error(i...); return nil })
|
||||
|
||||
collector, err := zipkintracer.NewHTTPCollector(zipkinHTTPEndpoint, zipkintracer.HTTPLogger(logger))
|
||||
if err != nil {
|
||||
logrus.WithError(err).Fatalln("couldn't start trace collector")
|
||||
if zipkinHTTPEndpoint != "" {
|
||||
// Custom PrometheusCollector and Zipkin HTTPCollector
|
||||
httpCollector, zipErr := zipkintracer.NewHTTPCollector(zipkinHTTPEndpoint, zipkintracer.HTTPLogger(logger))
|
||||
if zipErr != nil {
|
||||
logrus.WithError(zipErr).Fatalln("couldn't start Zipkin trace collector")
|
||||
}
|
||||
tracer, err := zipkintracer.NewTracer(zipkintracer.NewRecorder(collector, debugMode, serviceHostPort, serviceName),
|
||||
collector = zipkintracer.MultiCollector{httpCollector, promCollector}
|
||||
} else {
|
||||
// Custom PrometheusCollector only
|
||||
collector = promCollector
|
||||
}
|
||||
|
||||
ziptracer, err := zipkintracer.NewTracer(zipkintracer.NewRecorder(collector, debugMode, serviceHostPort, serviceName),
|
||||
zipkintracer.ClientServerSameSpan(true),
|
||||
zipkintracer.TraceID128Bit(true),
|
||||
)
|
||||
@@ -144,7 +158,10 @@ func setTracer() {
|
||||
logrus.WithError(err).Fatalln("couldn't start tracer")
|
||||
}
|
||||
|
||||
opentracing.SetGlobalTracer(tracer)
|
||||
// wrap the Zipkin tracer in a FnTracer which will also send spans to Prometheus
|
||||
fntracer := NewFnTracer(ziptracer)
|
||||
|
||||
opentracing.SetGlobalTracer(fntracer)
|
||||
logrus.WithFields(logrus.Fields{"url": zipkinHTTPEndpoint}).Info("started tracer")
|
||||
}
|
||||
|
||||
|
||||
BIN
docs/assets/GrafanaDashboard2.png
Executable file
BIN
docs/assets/GrafanaDashboard2.png
Executable file
Binary file not shown.
|
After Width: | Height: | Size: 1007 KiB |
@@ -90,11 +90,23 @@ Create a datasource to obtain metrics from Promethesus:
|
||||
* Click **Add** and then **Save and test**
|
||||
|
||||
Import the example dashboard that displays metrics from the Fn server:
|
||||
* Click on the main menu at the top left and choose **Dashboards** and then **Home**
|
||||
* Click on **Home** at the top and then **Import dashboard**
|
||||
* Click on the main menu at the top left and choose **Dashboards** and then **Import**
|
||||
* In the dialog that opens, click **Upload .json file** and specify `fn_grafana_dashboard.json` in this example's directory.
|
||||
* Specify the Prometheus data source that you just created
|
||||
* Click **Import**
|
||||
|
||||
You should then see the dashboard shown above. Now execute some functions and see the graphs update.
|
||||
|
||||
## Tracing metrics
|
||||
|
||||
Tracing spans from the Fn server are available as Prometheus metrics. Each span represents a timed internal operation such as a function call, and has
|
||||
two main attributes: a name that describes the operation being performed (for example `docker_wait_container`), and its duration in seconds. Each span name is represented by a separate histogram metric, which has a name of the form `fn_span_<span-name>_duration_seconds`.
|
||||
|
||||
If the span is associated with a specific function invocation, the corresponding metric is given the labels `fn_app` and `fn_path` which are set to the application name and function path respectively.
|
||||
|
||||
A second example dashboard `fn_grafana_dashboard2.json` in this example's directory displays rate and duration data for any number of spans. Use the dropdown lists at the top of the dashboard to choose which tracing spans to examine.
|
||||
|
||||
In the following screenshot, the "Choose spans to display rates" dropdown has been used to select `agent_submit` and `serve_http`, and the "Choose spans to display durations" dropdown, has been used to select `agent_cold_exec`, `agent_get_slot`, `agent_submit`, `docker_create_container`, `docker_start_container` and `docker_wait_container`.
|
||||
|
||||
<img src="../../docs/assets/GrafanaDashboard2.png" width="100%">
|
||||
|
||||
|
||||
321
examples/grafana/fn_grafana_dashboard2.json
Executable file
321
examples/grafana/fn_grafana_dashboard2.json
Executable file
@@ -0,0 +1,321 @@
|
||||
{
|
||||
"__inputs": [
|
||||
{
|
||||
"name": "DS_PROMDS",
|
||||
"label": "PromDS",
|
||||
"description": "",
|
||||
"type": "datasource",
|
||||
"pluginId": "prometheus",
|
||||
"pluginName": "Prometheus"
|
||||
}
|
||||
],
|
||||
"__requires": [
|
||||
{
|
||||
"type": "grafana",
|
||||
"id": "grafana",
|
||||
"name": "Grafana",
|
||||
"version": "4.6.0-beta3"
|
||||
},
|
||||
{
|
||||
"type": "panel",
|
||||
"id": "graph",
|
||||
"name": "Graph",
|
||||
"version": ""
|
||||
},
|
||||
{
|
||||
"type": "datasource",
|
||||
"id": "prometheus",
|
||||
"name": "Prometheus",
|
||||
"version": "1.0.0"
|
||||
}
|
||||
],
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"builtIn": 1,
|
||||
"datasource": "-- Grafana --",
|
||||
"enable": true,
|
||||
"hide": true,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"type": "dashboard"
|
||||
}
|
||||
]
|
||||
},
|
||||
"editable": true,
|
||||
"gnetId": null,
|
||||
"graphTooltip": 0,
|
||||
"hideControls": false,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"refresh": "30s",
|
||||
"rows": [
|
||||
{
|
||||
"collapse": false,
|
||||
"height": 297,
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "${DS_PROMDS}",
|
||||
"description": "",
|
||||
"fill": 1,
|
||||
"id": 44,
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"avg": false,
|
||||
"current": true,
|
||||
"hideEmpty": false,
|
||||
"hideZero": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"rightSide": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [],
|
||||
"minSpan": 3,
|
||||
"nullPointMode": "null",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"repeat": "span_rate_name",
|
||||
"seriesOverrides": [],
|
||||
"spaceLength": 10,
|
||||
"span": 6,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(fn_span_[[span_rate_name]]_duration_seconds_count[1m])",
|
||||
"format": "time_series",
|
||||
"hide": false,
|
||||
"instant": false,
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{fn_path}} ({{fn_appname}})",
|
||||
"refId": "A",
|
||||
"step": 1
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "[[span_rate_name]] rate (1 min rolling avg)",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": []
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "none",
|
||||
"label": "spans/sec",
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": false,
|
||||
"title": "Dashboard Row",
|
||||
"titleSize": "h6"
|
||||
},
|
||||
{
|
||||
"collapse": false,
|
||||
"height": 250,
|
||||
"panels": [
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "${DS_PROMDS}",
|
||||
"fill": 1,
|
||||
"id": 45,
|
||||
"legend": {
|
||||
"alignAsTable": true,
|
||||
"avg": false,
|
||||
"current": true,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": true
|
||||
},
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [],
|
||||
"minSpan": 3,
|
||||
"nullPointMode": "null",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
"renderer": "flot",
|
||||
"repeat": "span_duration_name",
|
||||
"seriesOverrides": [],
|
||||
"spaceLength": 10,
|
||||
"span": 3,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(fn_span_[[span_duration_name]]_duration_seconds_sum[1m]) / rate(fn_span_[[span_duration_name]]_duration_seconds_count[1m])\n\n\n\n",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{fn_path}} ({{fn_appname}})",
|
||||
"refId": "A",
|
||||
"step": 1
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "[[span_duration_name]] duration (1min rolling avg)",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": []
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": "Seconds",
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"repeat": null,
|
||||
"repeatIteration": null,
|
||||
"repeatRowId": null,
|
||||
"showTitle": false,
|
||||
"title": "Dashboard Row",
|
||||
"titleSize": "h6"
|
||||
}
|
||||
],
|
||||
"schemaVersion": 14,
|
||||
"style": "dark",
|
||||
"tags": [],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"allValue": null,
|
||||
"current": {},
|
||||
"datasource": "${DS_PROMDS}",
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": "Choose spans to display rates",
|
||||
"multi": true,
|
||||
"name": "span_rate_name",
|
||||
"options": [],
|
||||
"query": "metrics(fn_span.*)",
|
||||
"refresh": 2,
|
||||
"regex": "fn_span_(.*)_duration_seconds_count",
|
||||
"sort": 1,
|
||||
"tagValuesQuery": "",
|
||||
"tags": [],
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
},
|
||||
{
|
||||
"allValue": null,
|
||||
"current": {},
|
||||
"datasource": "${DS_PROMDS}",
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": "Choose spans to display durations",
|
||||
"multi": true,
|
||||
"name": "span_duration_name",
|
||||
"options": [],
|
||||
"query": "metrics(fn_span.*)",
|
||||
"refresh": 2,
|
||||
"regex": "fn_span_(.*)_duration_seconds_count",
|
||||
"sort": 1,
|
||||
"tagValuesQuery": "",
|
||||
"tags": [],
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-5m",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {
|
||||
"refresh_intervals": [
|
||||
"5s",
|
||||
"10s",
|
||||
"30s",
|
||||
"1m",
|
||||
"5m",
|
||||
"15m",
|
||||
"30m",
|
||||
"1h",
|
||||
"2h",
|
||||
"1d"
|
||||
],
|
||||
"time_options": [
|
||||
"5m",
|
||||
"15m",
|
||||
"1h",
|
||||
"6h",
|
||||
"12h",
|
||||
"24h",
|
||||
"2d",
|
||||
"7d",
|
||||
"30d"
|
||||
]
|
||||
},
|
||||
"timezone": "",
|
||||
"title": "Fn tracing spans",
|
||||
"version": 23
|
||||
}
|
||||
Reference in New Issue
Block a user