diff --git a/api/agent/agent.go b/api/agent/agent.go index 2f6ecf681..5936bfa1f 100644 --- a/api/agent/agent.go +++ b/api/agent/agent.go @@ -188,6 +188,8 @@ func (a *agent) Submit(callI Call) error { ctx := call.req.Context() span, ctx := opentracing.StartSpanFromContext(ctx, "agent_submit") + span.SetBaggageItem("fn_appname", callI.Model().AppName) + span.SetBaggageItem("fn_path", callI.Model().Path) defer span.Finish() // start the timer STAT! TODO add some wiggle room diff --git a/api/server/fntracer.go b/api/server/fntracer.go new file mode 100644 index 000000000..ce65c6944 --- /dev/null +++ b/api/server/fntracer.go @@ -0,0 +1,60 @@ +package server + +import ( + "github.com/opentracing/opentracing-go" + "strings" +) + +// FnTracer is a custom Tracer which wraps another another tracer +// its main purpose is to wrap the underlying Span in a FnSpan, +// which adds some extra behaviour required for sending tracing spans to prometheus +type FnTracer struct { + opentracing.Tracer +} + +// NewFnTracer returns a new FnTracer which wraps the specified Tracer +func NewFnTracer(t opentracing.Tracer) opentracing.Tracer { + return &FnTracer{t} +} + +// FnTracer implements opentracing.Tracer +// Override StartSpan to wrap the returned Span in a FnSpan +func (fnt FnTracer) StartSpan(operationName string, opts ...opentracing.StartSpanOption) opentracing.Span { + return NewFnSpan(fnt.Tracer.StartSpan(operationName, opts...)) +} + +// FnSpan is a custom Span that wraps another span +// which adds some extra behaviour required for sending tracing spans to prometheus +type FnSpan struct { + opentracing.Span +} + +// NewFnSpan returns a new FnSpan which wraps the specified Span +func NewFnSpan(s opentracing.Span) opentracing.Span { + return &FnSpan{s} +} + +// FnSpan implements opentracing.Span +func (fns FnSpan) Finish() { + fns.copyBaggageItemsToTags() + fns.Span.Finish() +} + +// FnSpan implements opentracing.Span +func (fns FnSpan) FinishWithOptions(opts opentracing.FinishOptions) { + fns.copyBaggageItemsToTags() + fns.Span.FinishWithOptions(opts) +} + +func (fns FnSpan) copyBaggageItemsToTags() { + // copy baggage items (which are inherited from the parent) with keys starting with "fn" to tags + // the PrometheusCollector will send these to Prometheus + // need to do this because the collector can't access baggage items, but it can access tags + // whereas here we can access the parent's baggage items, but not its tags + fns.Context().ForeachBaggageItem(func(k, v string) bool { + if strings.HasPrefix(k, "fn") { + fns.SetTag(k, v) + } + return true + }) +} diff --git a/api/server/prom_zip_collector.go b/api/server/prom_zip_collector.go new file mode 100644 index 000000000..a8034329a --- /dev/null +++ b/api/server/prom_zip_collector.go @@ -0,0 +1,77 @@ +package server + +import ( + "github.com/openzipkin/zipkin-go-opentracing" + "github.com/openzipkin/zipkin-go-opentracing/thrift/gen-go/zipkincore" + "github.com/prometheus/client_golang/prometheus" + "strings" + "time" +) + +// PrometheusCollector is a custom Collector +// which sends ZipKin traces to Prometheus +type PrometheusCollector struct { + + // Each span name is published as a separate Histogram metric + // Using metric names of the form fn_span__duration_seconds + histogramVecMap map[string]*prometheus.HistogramVec +} + +// NewPrometheusCollector returns a new PrometheusCollector +func NewPrometheusCollector() (zipkintracer.Collector, error) { + pc := &PrometheusCollector{make(map[string]*prometheus.HistogramVec)} + return pc, nil +} + +// Return the HistogramVec corresponding to the specified spanName. +// If a HistogramVec does not already exist for specified spanName then one is created and configured with the specified labels +// otherwise the labels parameter is ignored. +func (pc PrometheusCollector) getHistogramVecForSpanName(spanName string, labels []string) *prometheus.HistogramVec { + thisHistogramVec, found := pc.histogramVecMap[spanName] + if !found { + thisHistogramVec = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "fn_span_" + spanName + "_duration_seconds", + Help: "Span " + spanName + " duration, by span name", + }, + labels, + ) + pc.histogramVecMap[spanName] = thisHistogramVec + prometheus.MustRegister(thisHistogramVec) + } + return thisHistogramVec +} + +// PrometheusCollector implements Collector. +func (pc PrometheusCollector) Collect(span *zipkincore.Span) error { + + // extract any label values from the span + labelKeys, labelValueMap := getLabels(span) + + pc.getHistogramVecForSpanName(span.GetName(), labelKeys).With(labelValueMap).Observe((time.Duration(span.GetDuration()) * time.Microsecond).Seconds()) + return nil +} + +// extract from the specified span the key/value pairs that we want to add as labels to the Prometheus metric for this span +// returns an array of keys, and a map of key-value pairs +func getLabels(span *zipkincore.Span) ([]string, map[string]string) { + + var keys []string + labelMap := make(map[string]string) + + // extract any tags whose key starts with "fn" from the span + binaryAnnotations := span.GetBinaryAnnotations() + for _, thisBinaryAnnotation := range binaryAnnotations { + key := thisBinaryAnnotation.GetKey() + if thisBinaryAnnotation.GetAnnotationType() == zipkincore.AnnotationType_STRING && strings.HasPrefix(key, "fn") { + keys = append(keys, key) + value := string(thisBinaryAnnotation.GetValue()[:]) + labelMap[key] = value + } + } + + return keys, labelMap +} + +// PrometheusCollector implements Collector. +func (PrometheusCollector) Close() error { return nil } diff --git a/api/server/server.go b/api/server/server.go index 6ea1432b7..24610a50d 100644 --- a/api/server/server.go +++ b/api/server/server.go @@ -86,7 +86,7 @@ func New(ctx context.Context, ds models.Datastore, mq models.MessageQueue, logDB } setMachineId() - setTracer() + s.setTracer() s.Router.Use(loggerWrap, traceWrap, panicWrap) s.bindHandlers(ctx) @@ -110,6 +110,8 @@ func traceWrap(c *gin.Context) { // If wireContext == nil, a root span will be created. // TODO we should add more tags? serverSpan := opentracing.StartSpan("serve_http", ext.RPCServerOption(wireContext), opentracing.Tag{Key: "path", Value: c.Request.URL.Path}) + serverSpan.SetBaggageItem("fn_appname", c.Param(api.CApp)) + serverSpan.SetBaggageItem("fn_path", c.Param(api.CRoute)) defer serverSpan.Finish() ctx := opentracing.ContextWithSpan(c.Request.Context(), serverSpan) @@ -117,7 +119,7 @@ func traceWrap(c *gin.Context) { c.Next() } -func setTracer() { +func (s *Server) setTracer() { var ( debugMode = false serviceName = "fn-server" @@ -126,17 +128,29 @@ func setTracer() { // ex: "http://zipkin:9411/api/v1/spans" ) - if zipkinHTTPEndpoint == "" { - return + var collector zipkintracer.Collector + + // custom Zipkin collector to send tracing spans to Prometheus + promCollector, promErr := NewPrometheusCollector() + if promErr != nil { + logrus.WithError(promErr).Fatalln("couldn't start Prometheus trace collector") } logger := zipkintracer.LoggerFunc(func(i ...interface{}) error { logrus.Error(i...); return nil }) - collector, err := zipkintracer.NewHTTPCollector(zipkinHTTPEndpoint, zipkintracer.HTTPLogger(logger)) - if err != nil { - logrus.WithError(err).Fatalln("couldn't start trace collector") + if zipkinHTTPEndpoint != "" { + // Custom PrometheusCollector and Zipkin HTTPCollector + httpCollector, zipErr := zipkintracer.NewHTTPCollector(zipkinHTTPEndpoint, zipkintracer.HTTPLogger(logger)) + if zipErr != nil { + logrus.WithError(zipErr).Fatalln("couldn't start Zipkin trace collector") + } + collector = zipkintracer.MultiCollector{httpCollector, promCollector} + } else { + // Custom PrometheusCollector only + collector = promCollector } - tracer, err := zipkintracer.NewTracer(zipkintracer.NewRecorder(collector, debugMode, serviceHostPort, serviceName), + + ziptracer, err := zipkintracer.NewTracer(zipkintracer.NewRecorder(collector, debugMode, serviceHostPort, serviceName), zipkintracer.ClientServerSameSpan(true), zipkintracer.TraceID128Bit(true), ) @@ -144,7 +158,10 @@ func setTracer() { logrus.WithError(err).Fatalln("couldn't start tracer") } - opentracing.SetGlobalTracer(tracer) + // wrap the Zipkin tracer in a FnTracer which will also send spans to Prometheus + fntracer := NewFnTracer(ziptracer) + + opentracing.SetGlobalTracer(fntracer) logrus.WithFields(logrus.Fields{"url": zipkinHTTPEndpoint}).Info("started tracer") } diff --git a/docs/assets/GrafanaDashboard2.png b/docs/assets/GrafanaDashboard2.png new file mode 100755 index 000000000..2483367f0 Binary files /dev/null and b/docs/assets/GrafanaDashboard2.png differ diff --git a/examples/grafana/README.md b/examples/grafana/README.md index b84044d14..327d07f60 100644 --- a/examples/grafana/README.md +++ b/examples/grafana/README.md @@ -90,11 +90,23 @@ Create a datasource to obtain metrics from Promethesus: * Click **Add** and then **Save and test** Import the example dashboard that displays metrics from the Fn server: -* Click on the main menu at the top left and choose **Dashboards** and then **Home** -* Click on **Home** at the top and then **Import dashboard** +* Click on the main menu at the top left and choose **Dashboards** and then **Import** * In the dialog that opens, click **Upload .json file** and specify `fn_grafana_dashboard.json` in this example's directory. * Specify the Prometheus data source that you just created * Click **Import** You should then see the dashboard shown above. Now execute some functions and see the graphs update. +## Tracing metrics + +Tracing spans from the Fn server are available as Prometheus metrics. Each span represents a timed internal operation such as a function call, and has +two main attributes: a name that describes the operation being performed (for example `docker_wait_container`), and its duration in seconds. Each span name is represented by a separate histogram metric, which has a name of the form `fn_span__duration_seconds`. + +If the span is associated with a specific function invocation, the corresponding metric is given the labels `fn_app` and `fn_path` which are set to the application name and function path respectively. + +A second example dashboard `fn_grafana_dashboard2.json` in this example's directory displays rate and duration data for any number of spans. Use the dropdown lists at the top of the dashboard to choose which tracing spans to examine. + +In the following screenshot, the "Choose spans to display rates" dropdown has been used to select `agent_submit` and `serve_http`, and the "Choose spans to display durations" dropdown, has been used to select `agent_cold_exec`, `agent_get_slot`, `agent_submit`, `docker_create_container`, `docker_start_container` and `docker_wait_container`. + + + diff --git a/examples/grafana/fn_grafana_dashboard2.json b/examples/grafana/fn_grafana_dashboard2.json new file mode 100755 index 000000000..529659ace --- /dev/null +++ b/examples/grafana/fn_grafana_dashboard2.json @@ -0,0 +1,321 @@ +{ + "__inputs": [ + { + "name": "DS_PROMDS", + "label": "PromDS", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "4.6.0-beta3" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "refresh": "30s", + "rows": [ + { + "collapse": false, + "height": 297, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMDS}", + "description": "", + "fill": 1, + "id": 44, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "minSpan": 3, + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "span_rate_name", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(fn_span_[[span_rate_name]]_duration_seconds_count[1m])", + "format": "time_series", + "hide": false, + "instant": false, + "intervalFactor": 2, + "legendFormat": "{{fn_path}} ({{fn_appname}})", + "refId": "A", + "step": 1 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "[[span_rate_name]] rate (1 min rolling avg)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "none", + "label": "spans/sec", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 250, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMDS}", + "fill": 1, + "id": 45, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "minSpan": 3, + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "span_duration_name", + "seriesOverrides": [], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(fn_span_[[span_duration_name]]_duration_seconds_sum[1m]) / rate(fn_span_[[span_duration_name]]_duration_seconds_count[1m])\n\n\n\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{fn_path}} ({{fn_appname}})", + "refId": "A", + "step": 1 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "[[span_duration_name]] duration (1min rolling avg)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Seconds", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "current": {}, + "datasource": "${DS_PROMDS}", + "hide": 0, + "includeAll": false, + "label": "Choose spans to display rates", + "multi": true, + "name": "span_rate_name", + "options": [], + "query": "metrics(fn_span.*)", + "refresh": 2, + "regex": "fn_span_(.*)_duration_seconds_count", + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": {}, + "datasource": "${DS_PROMDS}", + "hide": 0, + "includeAll": false, + "label": "Choose spans to display durations", + "multi": true, + "name": "span_duration_name", + "options": [], + "query": "metrics(fn_span.*)", + "refresh": 2, + "regex": "fn_span_(.*)_duration_seconds_count", + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Fn tracing spans", + "version": 23 +} \ No newline at end of file