Merge pull request #447 from fnproject/tracing_to_prometheus

Tracing to prometheus
This commit is contained in:
Reed Allman
2017-10-25 10:58:07 -07:00
committed by GitHub
7 changed files with 500 additions and 11 deletions

View File

@@ -188,6 +188,8 @@ func (a *agent) Submit(callI Call) error {
ctx := call.req.Context()
span, ctx := opentracing.StartSpanFromContext(ctx, "agent_submit")
span.SetBaggageItem("fn_appname", callI.Model().AppName)
span.SetBaggageItem("fn_path", callI.Model().Path)
defer span.Finish()
// start the timer STAT! TODO add some wiggle room

60
api/server/fntracer.go Normal file
View File

@@ -0,0 +1,60 @@
package server
import (
"github.com/opentracing/opentracing-go"
"strings"
)
// FnTracer is a custom Tracer which wraps another another tracer
// its main purpose is to wrap the underlying Span in a FnSpan,
// which adds some extra behaviour required for sending tracing spans to prometheus
type FnTracer struct {
opentracing.Tracer
}
// NewFnTracer returns a new FnTracer which wraps the specified Tracer
func NewFnTracer(t opentracing.Tracer) opentracing.Tracer {
return &FnTracer{t}
}
// FnTracer implements opentracing.Tracer
// Override StartSpan to wrap the returned Span in a FnSpan
func (fnt FnTracer) StartSpan(operationName string, opts ...opentracing.StartSpanOption) opentracing.Span {
return NewFnSpan(fnt.Tracer.StartSpan(operationName, opts...))
}
// FnSpan is a custom Span that wraps another span
// which adds some extra behaviour required for sending tracing spans to prometheus
type FnSpan struct {
opentracing.Span
}
// NewFnSpan returns a new FnSpan which wraps the specified Span
func NewFnSpan(s opentracing.Span) opentracing.Span {
return &FnSpan{s}
}
// FnSpan implements opentracing.Span
func (fns FnSpan) Finish() {
fns.copyBaggageItemsToTags()
fns.Span.Finish()
}
// FnSpan implements opentracing.Span
func (fns FnSpan) FinishWithOptions(opts opentracing.FinishOptions) {
fns.copyBaggageItemsToTags()
fns.Span.FinishWithOptions(opts)
}
func (fns FnSpan) copyBaggageItemsToTags() {
// copy baggage items (which are inherited from the parent) with keys starting with "fn" to tags
// the PrometheusCollector will send these to Prometheus
// need to do this because the collector can't access baggage items, but it can access tags
// whereas here we can access the parent's baggage items, but not its tags
fns.Context().ForeachBaggageItem(func(k, v string) bool {
if strings.HasPrefix(k, "fn") {
fns.SetTag(k, v)
}
return true
})
}

View File

@@ -0,0 +1,77 @@
package server
import (
"github.com/openzipkin/zipkin-go-opentracing"
"github.com/openzipkin/zipkin-go-opentracing/thrift/gen-go/zipkincore"
"github.com/prometheus/client_golang/prometheus"
"strings"
"time"
)
// PrometheusCollector is a custom Collector
// which sends ZipKin traces to Prometheus
type PrometheusCollector struct {
// Each span name is published as a separate Histogram metric
// Using metric names of the form fn_span_<span-name>_duration_seconds
histogramVecMap map[string]*prometheus.HistogramVec
}
// NewPrometheusCollector returns a new PrometheusCollector
func NewPrometheusCollector() (zipkintracer.Collector, error) {
pc := &PrometheusCollector{make(map[string]*prometheus.HistogramVec)}
return pc, nil
}
// Return the HistogramVec corresponding to the specified spanName.
// If a HistogramVec does not already exist for specified spanName then one is created and configured with the specified labels
// otherwise the labels parameter is ignored.
func (pc PrometheusCollector) getHistogramVecForSpanName(spanName string, labels []string) *prometheus.HistogramVec {
thisHistogramVec, found := pc.histogramVecMap[spanName]
if !found {
thisHistogramVec = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "fn_span_" + spanName + "_duration_seconds",
Help: "Span " + spanName + " duration, by span name",
},
labels,
)
pc.histogramVecMap[spanName] = thisHistogramVec
prometheus.MustRegister(thisHistogramVec)
}
return thisHistogramVec
}
// PrometheusCollector implements Collector.
func (pc PrometheusCollector) Collect(span *zipkincore.Span) error {
// extract any label values from the span
labelKeys, labelValueMap := getLabels(span)
pc.getHistogramVecForSpanName(span.GetName(), labelKeys).With(labelValueMap).Observe((time.Duration(span.GetDuration()) * time.Microsecond).Seconds())
return nil
}
// extract from the specified span the key/value pairs that we want to add as labels to the Prometheus metric for this span
// returns an array of keys, and a map of key-value pairs
func getLabels(span *zipkincore.Span) ([]string, map[string]string) {
var keys []string
labelMap := make(map[string]string)
// extract any tags whose key starts with "fn" from the span
binaryAnnotations := span.GetBinaryAnnotations()
for _, thisBinaryAnnotation := range binaryAnnotations {
key := thisBinaryAnnotation.GetKey()
if thisBinaryAnnotation.GetAnnotationType() == zipkincore.AnnotationType_STRING && strings.HasPrefix(key, "fn") {
keys = append(keys, key)
value := string(thisBinaryAnnotation.GetValue()[:])
labelMap[key] = value
}
}
return keys, labelMap
}
// PrometheusCollector implements Collector.
func (PrometheusCollector) Close() error { return nil }

View File

@@ -86,7 +86,7 @@ func New(ctx context.Context, ds models.Datastore, mq models.MessageQueue, logDB
}
setMachineId()
setTracer()
s.setTracer()
s.Router.Use(loggerWrap, traceWrap, panicWrap)
s.bindHandlers(ctx)
@@ -110,6 +110,8 @@ func traceWrap(c *gin.Context) {
// If wireContext == nil, a root span will be created.
// TODO we should add more tags?
serverSpan := opentracing.StartSpan("serve_http", ext.RPCServerOption(wireContext), opentracing.Tag{Key: "path", Value: c.Request.URL.Path})
serverSpan.SetBaggageItem("fn_appname", c.Param(api.CApp))
serverSpan.SetBaggageItem("fn_path", c.Param(api.CRoute))
defer serverSpan.Finish()
ctx := opentracing.ContextWithSpan(c.Request.Context(), serverSpan)
@@ -117,7 +119,7 @@ func traceWrap(c *gin.Context) {
c.Next()
}
func setTracer() {
func (s *Server) setTracer() {
var (
debugMode = false
serviceName = "fn-server"
@@ -126,17 +128,29 @@ func setTracer() {
// ex: "http://zipkin:9411/api/v1/spans"
)
if zipkinHTTPEndpoint == "" {
return
var collector zipkintracer.Collector
// custom Zipkin collector to send tracing spans to Prometheus
promCollector, promErr := NewPrometheusCollector()
if promErr != nil {
logrus.WithError(promErr).Fatalln("couldn't start Prometheus trace collector")
}
logger := zipkintracer.LoggerFunc(func(i ...interface{}) error { logrus.Error(i...); return nil })
collector, err := zipkintracer.NewHTTPCollector(zipkinHTTPEndpoint, zipkintracer.HTTPLogger(logger))
if err != nil {
logrus.WithError(err).Fatalln("couldn't start trace collector")
if zipkinHTTPEndpoint != "" {
// Custom PrometheusCollector and Zipkin HTTPCollector
httpCollector, zipErr := zipkintracer.NewHTTPCollector(zipkinHTTPEndpoint, zipkintracer.HTTPLogger(logger))
if zipErr != nil {
logrus.WithError(zipErr).Fatalln("couldn't start Zipkin trace collector")
}
tracer, err := zipkintracer.NewTracer(zipkintracer.NewRecorder(collector, debugMode, serviceHostPort, serviceName),
collector = zipkintracer.MultiCollector{httpCollector, promCollector}
} else {
// Custom PrometheusCollector only
collector = promCollector
}
ziptracer, err := zipkintracer.NewTracer(zipkintracer.NewRecorder(collector, debugMode, serviceHostPort, serviceName),
zipkintracer.ClientServerSameSpan(true),
zipkintracer.TraceID128Bit(true),
)
@@ -144,7 +158,10 @@ func setTracer() {
logrus.WithError(err).Fatalln("couldn't start tracer")
}
opentracing.SetGlobalTracer(tracer)
// wrap the Zipkin tracer in a FnTracer which will also send spans to Prometheus
fntracer := NewFnTracer(ziptracer)
opentracing.SetGlobalTracer(fntracer)
logrus.WithFields(logrus.Fields{"url": zipkinHTTPEndpoint}).Info("started tracer")
}

BIN
docs/assets/GrafanaDashboard2.png Executable file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1007 KiB

View File

@@ -90,11 +90,23 @@ Create a datasource to obtain metrics from Promethesus:
* Click **Add** and then **Save and test**
Import the example dashboard that displays metrics from the Fn server:
* Click on the main menu at the top left and choose **Dashboards** and then **Home**
* Click on **Home** at the top and then **Import dashboard**
* Click on the main menu at the top left and choose **Dashboards** and then **Import**
* In the dialog that opens, click **Upload .json file** and specify `fn_grafana_dashboard.json` in this example's directory.
* Specify the Prometheus data source that you just created
* Click **Import**
You should then see the dashboard shown above. Now execute some functions and see the graphs update.
## Tracing metrics
Tracing spans from the Fn server are available as Prometheus metrics. Each span represents a timed internal operation such as a function call, and has
two main attributes: a name that describes the operation being performed (for example `docker_wait_container`), and its duration in seconds. Each span name is represented by a separate histogram metric, which has a name of the form `fn_span_<span-name>_duration_seconds`.
If the span is associated with a specific function invocation, the corresponding metric is given the labels `fn_app` and `fn_path` which are set to the application name and function path respectively.
A second example dashboard `fn_grafana_dashboard2.json` in this example's directory displays rate and duration data for any number of spans. Use the dropdown lists at the top of the dashboard to choose which tracing spans to examine.
In the following screenshot, the "Choose spans to display rates" dropdown has been used to select `agent_submit` and `serve_http`, and the "Choose spans to display durations" dropdown, has been used to select `agent_cold_exec`, `agent_get_slot`, `agent_submit`, `docker_create_container`, `docker_start_container` and `docker_wait_container`.
<img src="../../docs/assets/GrafanaDashboard2.png" width="100%">

View File

@@ -0,0 +1,321 @@
{
"__inputs": [
{
"name": "DS_PROMDS",
"label": "PromDS",
"description": "",
"type": "datasource",
"pluginId": "prometheus",
"pluginName": "Prometheus"
}
],
"__requires": [
{
"type": "grafana",
"id": "grafana",
"name": "Grafana",
"version": "4.6.0-beta3"
},
{
"type": "panel",
"id": "graph",
"name": "Graph",
"version": ""
},
{
"type": "datasource",
"id": "prometheus",
"name": "Prometheus",
"version": "1.0.0"
}
],
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"gnetId": null,
"graphTooltip": 0,
"hideControls": false,
"id": null,
"links": [],
"refresh": "30s",
"rows": [
{
"collapse": false,
"height": 297,
"panels": [
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMDS}",
"description": "",
"fill": 1,
"id": 44,
"legend": {
"alignAsTable": true,
"avg": false,
"current": true,
"hideEmpty": false,
"hideZero": false,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"minSpan": 3,
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"repeat": "span_rate_name",
"seriesOverrides": [],
"spaceLength": 10,
"span": 6,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "rate(fn_span_[[span_rate_name]]_duration_seconds_count[1m])",
"format": "time_series",
"hide": false,
"instant": false,
"intervalFactor": 2,
"legendFormat": "{{fn_path}} ({{fn_appname}})",
"refId": "A",
"step": 1
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "[[span_rate_name]] rate (1 min rolling avg)",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "none",
"label": "spans/sec",
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": false,
"title": "Dashboard Row",
"titleSize": "h6"
},
{
"collapse": false,
"height": 250,
"panels": [
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_PROMDS}",
"fill": 1,
"id": 45,
"legend": {
"alignAsTable": true,
"avg": false,
"current": true,
"max": false,
"min": false,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"minSpan": 3,
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"repeat": "span_duration_name",
"seriesOverrides": [],
"spaceLength": 10,
"span": 3,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "rate(fn_span_[[span_duration_name]]_duration_seconds_sum[1m]) / rate(fn_span_[[span_duration_name]]_duration_seconds_count[1m])\n\n\n\n",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{fn_path}} ({{fn_appname}})",
"refId": "A",
"step": 1
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "[[span_duration_name]] duration (1min rolling avg)",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": "Seconds",
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": false,
"title": "Dashboard Row",
"titleSize": "h6"
}
],
"schemaVersion": 14,
"style": "dark",
"tags": [],
"templating": {
"list": [
{
"allValue": null,
"current": {},
"datasource": "${DS_PROMDS}",
"hide": 0,
"includeAll": false,
"label": "Choose spans to display rates",
"multi": true,
"name": "span_rate_name",
"options": [],
"query": "metrics(fn_span.*)",
"refresh": 2,
"regex": "fn_span_(.*)_duration_seconds_count",
"sort": 1,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"allValue": null,
"current": {},
"datasource": "${DS_PROMDS}",
"hide": 0,
"includeAll": false,
"label": "Choose spans to display durations",
"multi": true,
"name": "span_duration_name",
"options": [],
"query": "metrics(fn_span.*)",
"refresh": 2,
"regex": "fn_span_(.*)_duration_seconds_count",
"sort": 1,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
}
]
},
"time": {
"from": "now-5m",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"timezone": "",
"title": "Fn tracing spans",
"version": 23
}