From ac2bfd346238d81ef9d246c1cea1b96eef043ad1 Mon Sep 17 00:00:00 2001 From: Nigel Deakin Date: Thu, 11 Jan 2018 17:34:51 +0000 Subject: [PATCH] Change basic stats to use opentracing rather than Prometheus API (#671) * Change basic stats to use opentracing rather than Prometheus API directly * Just ran gofmt * Extract opentracing access for metrics to common/metrics.go * Replace quotes strings with constants where possible --- api/agent/agent.go | 40 ++-- api/agent/stats.go | 98 ++++------ api/common/metrics.go | 101 ++++++++++ api/server/prom_zip_collector.go | 206 +++++++++++++++++++-- examples/grafana/fn_grafana_dashboard.json | 34 ++-- 5 files changed, 361 insertions(+), 118 deletions(-) create mode 100644 api/common/metrics.go diff --git a/api/agent/agent.go b/api/agent/agent.go index d57fbcb5c..9d3adea03 100644 --- a/api/agent/agent.go +++ b/api/agent/agent.go @@ -16,7 +16,6 @@ import ( "github.com/fnproject/fn/api/models" "github.com/fnproject/fn/fnext" "github.com/opentracing/opentracing-go" - "github.com/opentracing/opentracing-go/log" "github.com/prometheus/client_golang/prometheus/promhttp" "github.com/sirupsen/logrus" ) @@ -174,11 +173,11 @@ func transformTimeout(e error, isRetriable bool) error { // handleStatsDequeue handles stats for dequeuing for early exit (getSlot or Start) // cases. Only timeouts can be a simple dequeue while other cases are actual errors. -func (a *agent) handleStatsDequeue(err error, callI Call) { +func (a *agent) handleStatsDequeue(ctx context.Context, err error, callI Call) { if err == context.DeadlineExceeded { - a.stats.Dequeue(callI.Model().AppName, callI.Model().Path) + a.stats.Dequeue(ctx, callI.Model().AppName, callI.Model().Path) } else { - a.stats.DequeueAndFail(callI.Model().AppName, callI.Model().Path) + a.stats.DequeueAndFail(ctx, callI.Model().AppName, callI.Model().Path) } } @@ -192,9 +191,6 @@ func (a *agent) Submit(callI Call) error { default: } - // increment queued count - a.stats.Enqueue(callI.Model().AppName, callI.Model().Path) - call := callI.(*call) ctx := call.req.Context() @@ -219,9 +215,14 @@ func (a *agent) Submit(callI Call) error { call.req = call.req.WithContext(ctxSlotWait) defer cancelSlotWait() + // increment queued count + // this is done after setting "fn_appname" and "fn_path" + a.stats.Enqueue(ctx, callI.Model().AppName, callI.Model().Path) + slot, err := a.getSlot(ctxSlotWait, call) // find ram available / running + if err != nil { - a.handleStatsDequeue(err, call) + a.handleStatsDequeue(ctx, err, call) return transformTimeout(err, true) } // TODO if the call times out & container is created, we need @@ -230,7 +231,7 @@ func (a *agent) Submit(callI Call) error { err = call.Start(ctxSlotWait) if err != nil { - a.handleStatsDequeue(err, call) + a.handleStatsDequeue(ctx, err, call) return transformTimeout(err, true) } @@ -241,7 +242,7 @@ func (a *agent) Submit(callI Call) error { defer cancelExec() // decrement queued count, increment running count - a.stats.DequeueAndStart(callI.Model().AppName, callI.Model().Path) + a.stats.DequeueAndStart(ctx, callI.Model().AppName, callI.Model().Path) err = slot.exec(ctxExec, call) // pass this error (nil or otherwise) to end directly, to store status, etc @@ -249,10 +250,10 @@ func (a *agent) Submit(callI Call) error { if err == nil { // decrement running count, increment completed count - a.stats.Complete(callI.Model().AppName, callI.Model().Path) + a.stats.Complete(ctx, callI.Model().AppName, callI.Model().Path) } else { // decrement running count, increment failed count - 
a.stats.Failed(callI.Model().AppName, callI.Model().Path) + a.stats.Failed(ctx, callI.Model().AppName, callI.Model().Path) } // TODO: we need to allocate more time to store the call + logs in case the call timed out, @@ -726,16 +727,19 @@ func (c *container) Timeout() time.Duration { return c.timeout } func (c *container) EnvVars() map[string]string { return c.env } func (c *container) Memory() uint64 { return c.memory * 1024 * 1024 } // convert MB -// Log the specified stats to a tracing span. -// Spans are not processed by the collector until the span ends, so to prevent any delay -// in processing the stats when the function is long-lived we create a new span for every call +// WriteStat publishes each metric in the specified Stats structure as a histogram metric func (c *container) WriteStat(ctx context.Context, stat drivers.Stat) { - span, ctx := opentracing.StartSpanFromContext(ctx, "docker_stats") - defer span.Finish() + + // Convert each metric value from uint64 to float64 + // and, for backward compatibility reasons, prepend each metric name with "docker_stats_fn_" + // (if we don't care about compatibility then we can remove that) + var metrics = make(map[string]float64) for key, value := range stat.Metrics { - span.LogFields(log.Uint64("fn_"+key, value)) + metrics["docker_stats_fn_"+key] = float64(value) } + common.PublishHistograms(ctx, metrics) + c.Lock() defer c.Unlock() if c.stats != nil { diff --git a/api/agent/stats.go b/api/agent/stats.go index b57ed6855..645a574f1 100644 --- a/api/agent/stats.go +++ b/api/agent/stats.go @@ -1,9 +1,9 @@ package agent import ( + "context" + "github.com/fnproject/fn/api/common" "sync" - - "github.com/prometheus/client_golang/prometheus" ) // TODO this should expose: @@ -30,8 +30,9 @@ type functionStats struct { failed uint64 } +// Stats hold the statistics for all functions combined +// and the statistics for each individual function type Stats struct { - // statistics for all functions combined Queue uint64 Running uint64 Complete uint64 @@ -40,7 +41,7 @@ type Stats struct { FunctionStatsMap map[string]*FunctionStats } -// statistics for an individual function +// FunctionStats holds the statistics for an individual function type FunctionStats struct { Queue uint64 Running uint64 @@ -48,52 +49,6 @@ type FunctionStats struct { Failed uint64 } -var ( - fnCalls = prometheus.NewCounterVec( - prometheus.CounterOpts{ - Name: "fn_api_calls", - Help: "Function calls by app and path", - }, - [](string){"app", "path"}, - ) - fnQueued = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Name: "fn_api_queued", - Help: "Queued requests by app and path", - }, - [](string){"app", "path"}, - ) - fnRunning = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Name: "fn_api_running", - Help: "Running requests by app and path", - }, - [](string){"app", "path"}, - ) - fnCompleted = prometheus.NewCounterVec( - prometheus.CounterOpts{ - Name: "fn_api_completed", - Help: "Completed requests by app and path", - }, - [](string){"app", "path"}, - ) - fnFailed = prometheus.NewCounterVec( - prometheus.CounterOpts{ - Name: "fn_api_failed", - Help: "Failed requests by path", - }, - [](string){"app", "path"}, - ) -) - -func init() { - prometheus.MustRegister(fnCalls) - prometheus.MustRegister(fnQueued) - prometheus.MustRegister(fnRunning) - prometheus.MustRegister(fnFailed) - prometheus.MustRegister(fnCompleted) -} - func (s *stats) getStatsForFunction(path string) *functionStats { if s.functionStatsMap == nil { s.functionStatsMap = make(map[string]*functionStats) @@ 
-107,80 +62,81 @@ func (s *stats) getStatsForFunction(path string) *functionStats { return thisFunctionStats } -func (s *stats) Enqueue(app string, path string) { +func (s *stats) Enqueue(ctx context.Context, app string, path string) { s.mu.Lock() s.queue++ s.getStatsForFunction(path).queue++ - fnQueued.WithLabelValues(app, path).Inc() - fnCalls.WithLabelValues(app, path).Inc() + common.IncrementGauge(ctx, queuedMetricName) + + common.IncrementCounter(ctx, callsMetricName) s.mu.Unlock() } // Call when a function has been queued but cannot be started because of an error -func (s *stats) Dequeue(app string, path string) { +func (s *stats) Dequeue(ctx context.Context, app string, path string) { s.mu.Lock() s.queue-- s.getStatsForFunction(path).queue-- - fnQueued.WithLabelValues(app, path).Dec() + common.DecrementGauge(ctx, queuedMetricName) s.mu.Unlock() } -func (s *stats) DequeueAndStart(app string, path string) { +func (s *stats) DequeueAndStart(ctx context.Context, app string, path string) { s.mu.Lock() s.queue-- s.getStatsForFunction(path).queue-- - fnQueued.WithLabelValues(app, path).Dec() + common.DecrementGauge(ctx, queuedMetricName) s.running++ s.getStatsForFunction(path).running++ - fnRunning.WithLabelValues(app, path).Inc() + common.IncrementGauge(ctx, runningSuffix) s.mu.Unlock() } -func (s *stats) Complete(app string, path string) { +func (s *stats) Complete(ctx context.Context, app string, path string) { s.mu.Lock() s.running-- s.getStatsForFunction(path).running-- - fnRunning.WithLabelValues(app, path).Dec() + common.DecrementGauge(ctx, runningSuffix) s.complete++ s.getStatsForFunction(path).complete++ - fnCompleted.WithLabelValues(app, path).Inc() + common.IncrementCounter(ctx, completedMetricName) s.mu.Unlock() } -func (s *stats) Failed(app string, path string) { +func (s *stats) Failed(ctx context.Context, app string, path string) { s.mu.Lock() s.running-- s.getStatsForFunction(path).running-- - fnRunning.WithLabelValues(app, path).Dec() + common.DecrementGauge(ctx, runningSuffix) s.failed++ s.getStatsForFunction(path).failed++ - fnFailed.WithLabelValues(app, path).Inc() + common.IncrementCounter(ctx, failedMetricName) s.mu.Unlock() } -func (s *stats) DequeueAndFail(app string, path string) { +func (s *stats) DequeueAndFail(ctx context.Context, app string, path string) { s.mu.Lock() s.queue-- s.getStatsForFunction(path).queue-- - fnQueued.WithLabelValues(app, path).Dec() + common.DecrementGauge(ctx, queuedMetricName) s.failed++ s.getStatsForFunction(path).failed++ - fnFailed.WithLabelValues(app, path).Inc() + common.IncrementCounter(ctx, failedMetricName) s.mu.Unlock() } @@ -200,3 +156,11 @@ func (s *stats) Stats() Stats { s.mu.Unlock() return stats } + +const ( + queuedMetricName = "queued" + callsMetricName = "calls" + runningSuffix = "running" + completedMetricName = "completed" + failedMetricName = "failed" +) diff --git a/api/common/metrics.go b/api/common/metrics.go new file mode 100644 index 000000000..d939ac251 --- /dev/null +++ b/api/common/metrics.go @@ -0,0 +1,101 @@ +package common + +import ( + "context" + "github.com/opentracing/opentracing-go" + "github.com/opentracing/opentracing-go/log" +) + +// IncrementGauge increments the specified gauge metric +// It does this by logging an appropriate field value to a tracing span. 
+func IncrementGauge(ctx context.Context, metric string) {
+	// The field name we use is the specified metric name prepended with FieldnamePrefixGauge to designate that it is a Prometheus gauge metric
+	// The collector will replace that prefix with "fn_" and use the result as the Prometheus metric name.
+	fieldname := FieldnamePrefixGauge + metric
+
+	// Spans are not processed by the collector until the span ends, so to prevent any delay
+	// in processing the stats when the current span is long-lived we create a new span for every call.
+	// suffix the span name with SpannameSuffixDummy to denote that it is used only to hold a metric and isn't itself of any interest
+	span, ctx := opentracing.StartSpanFromContext(ctx, fieldname+SpannameSuffixDummy)
+	defer span.Finish()
+
+	// gauge metrics are actually float64; here we log that it should be increased by +1
+	span.LogFields(log.Float64(fieldname, 1.))
+}
+
+// DecrementGauge decrements the specified gauge metric
+// It does this by logging an appropriate field value to a tracing span.
+func DecrementGauge(ctx context.Context, metric string) {
+	// The field name we use is the specified metric name prepended with FieldnamePrefixGauge to designate that it is a Prometheus gauge metric
+	// The collector will replace that prefix with "fn_" and use the result as the Prometheus metric name.
+	fieldname := FieldnamePrefixGauge + metric
+
+	// Spans are not processed by the collector until the span ends, so to prevent any delay
+	// in processing the stats when the current span is long-lived we create a new span for every call.
+	// suffix the span name with SpannameSuffixDummy to denote that it is used only to hold a metric and isn't itself of any interest
+	span, ctx := opentracing.StartSpanFromContext(ctx, fieldname+SpannameSuffixDummy)
+	defer span.Finish()
+
+	// gauge metrics are actually float64; here we log that it should be decreased by 1, i.e. a delta of -1
+	span.LogFields(log.Float64(fieldname, -1.))
+}
+
+// IncrementCounter increments the specified counter metric
+// It does this by logging an appropriate field value to a tracing span.
+func IncrementCounter(ctx context.Context, metric string) {
+	// The field name we use is the specified metric name prepended with FieldnamePrefixCounter to designate that it is a Prometheus counter metric
+	// The collector will replace that prefix with "fn_" and use the result as the Prometheus metric name.
+	fieldname := FieldnamePrefixCounter + metric
+
+	// Spans are not processed by the collector until the span ends, so to prevent any delay
+	// in processing the stats when the current span is long-lived we create a new span for every call.
+	// suffix the span name with SpannameSuffixDummy to denote that it is used only to hold a metric and isn't itself of any interest
+	span, ctx := opentracing.StartSpanFromContext(ctx, fieldname+SpannameSuffixDummy)
+	defer span.Finish()
+
+	// counter metrics are actually float64; here we log that it should be increased by +1
+	span.LogFields(log.Float64(fieldname, 1.))
+}
+
+// If required, create a scalar version of PublishHistograms that publishes a single histogram metric
+
+// PublishHistograms publishes the specified histogram metrics
+// It does this by logging appropriate field values to a tracing span.
+func PublishHistograms(ctx context.Context, metrics map[string]float64) {
+
+	// Spans are not processed by the collector until the span ends, so to prevent any delay
+	// in processing the stats when the current span is long-lived we create a new span for every call.
+ // suffix the span name with SpannameSuffixDummy to denote that it is used only to hold a metric and isn't itself of any interest + span, ctx := opentracing.StartSpanFromContext(ctx, "histogram_metrics"+SpannameSuffixDummy) + defer span.Finish() + + for key, value := range metrics { + // The field name we use is the metric name prepended with FieldnamePrefixHistogram to designate that it is a Prometheus histogram metric + // The collector will replace that prefix with "fn_" and use the result as the Prometheus metric name. + fieldname := FieldnamePrefixHistogram + key + span.LogFields(log.Float64(fieldname, value)) + } +} + +const ( + + // FnPrefix is a constant for "fn_", used as a prefix for span names, field names, Prometheus metric names and Prometheus label names + FnPrefix = "fn_" + + // FieldnamePrefixHistogram is prefixed to the name of a logged field + // to denote that it corresponds to a histogram metric + FieldnamePrefixHistogram = FnPrefix + "histogram_" + + // FieldnamePrefixCounter is prefixed to the name of a logged field + // to denote that it corresponds to a counter metric + FieldnamePrefixCounter = FnPrefix + "counter_" + + // FieldnamePrefixGauge is prefixed to the name of a logged field + // to denote that it corresponds to a gauge metric + FieldnamePrefixGauge = FnPrefix + "gauge_" + + // SpannameSuffixDummy is suffixed to the name of a tracing span + // to denote that it has been created solely for the purpose of carrying metric values + // and is not itself of any interest and should not be converted to a Prometheus duration metric + SpannameSuffixDummy = "_dummy" +) diff --git a/api/server/prom_zip_collector.go b/api/server/prom_zip_collector.go index d11f9e9f0..cc8900e70 100644 --- a/api/server/prom_zip_collector.go +++ b/api/server/prom_zip_collector.go @@ -1,6 +1,7 @@ package server import ( + "github.com/fnproject/fn/api/common" "github.com/openzipkin/zipkin-go-opentracing" "github.com/openzipkin/zipkin-go-opentracing/thrift/gen-go/zipkincore" "github.com/prometheus/client_golang/prometheus" @@ -21,6 +22,14 @@ type PrometheusCollector struct { // and the corresponding value is a HistogramVec metric used to report the duration of spans with this name to Prometheus histogramVecMap map[string]*prometheus.HistogramVec + // In this map, the key is the name of a tracing span, + // and the corresponding value is a CounterVec metric used to report the duration of spans with this name to Prometheus + counterVecMap map[string]*prometheus.CounterVec + + // In this map, the key is the name of a tracing span, + // and the corresponding value is a GaugeVec metric used to report the duration of spans with this name to Prometheus + gaugeVecMap map[string]*prometheus.GaugeVec + // In this map, the key is the name of a tracing span, // and the corresponding value is an array containing the label keys that were specified when the HistogramVec metric was created registeredLabelKeysMap map[string][]string @@ -30,6 +39,8 @@ type PrometheusCollector struct { func NewPrometheusCollector() (zipkintracer.Collector, error) { pc := &PrometheusCollector{ histogramVecMap: make(map[string]*prometheus.HistogramVec), + counterVecMap: make(map[string]*prometheus.CounterVec), + gaugeVecMap: make(map[string]*prometheus.GaugeVec), registeredLabelKeysMap: make(map[string][]string), } return pc, nil @@ -43,22 +54,51 @@ func (pc *PrometheusCollector) Collect(span *zipkincore.Span) error { // extract any label values from the span labelKeysFromSpan, labelValuesFromSpan := getLabels(span) - // get 
the HistogramVec for this span name - histogramVec, labelValuesToUse := pc.getHistogramVec( - ("fn_span_" + spanName + "_duration_seconds"), ("Span " + spanName + " duration, by span name"), labelKeysFromSpan, labelValuesFromSpan) + // report the duration of this span as a histogram + // (unless the span name ends with SpannameSuffixDummy to denote it as being purely the carrier of a metric value and so of no interest in itself) + if !strings.HasSuffix(spanName, common.SpannameSuffixDummy) { - // now report the span duration value - histogramVec.With(labelValuesToUse).Observe((time.Duration(span.GetDuration()) * time.Microsecond).Seconds()) + // get the HistogramVec for this span name + histogramVec, labelValuesToUse := pc.getHistogramVec( + ("fn_span_" + spanName + "_duration_seconds"), ("Span " + spanName + " duration, by span name"), labelKeysFromSpan, labelValuesFromSpan) - // now extract any logged metric values from the span - for key, value := range getLoggedMetrics(span) { + // now report the span duration value + histogramVec.With(labelValuesToUse).Observe((time.Duration(span.GetDuration()) * time.Microsecond).Seconds()) + + } + + // now extract any logged histogram metric values from the span + for key, value := range getLoggedHistogramMetrics(span) { // get the HistogramVec for this metric thisMetricHistogramVec, labelValuesToUse := pc.getHistogramVec( - ("fn_" + spanName + "_" + key), (spanName + " metric " + key), labelKeysFromSpan, labelValuesFromSpan) + key, ("Metric " + key), labelKeysFromSpan, labelValuesFromSpan) // now report the metric value - thisMetricHistogramVec.With(labelValuesToUse).Observe(float64(value)) + thisMetricHistogramVec.With(labelValuesToUse).Observe(value) + } + + // now extract any logged counter metric values from the span + for key, value := range getLoggedCounterMetrics(span) { + + // get the CounterVec for this metric + thisMetricCounterVec, labelValuesToUse := pc.getCounterVec( + key, ("Metric " + key), labelKeysFromSpan, labelValuesFromSpan) + + // now report the metric value + thisMetricCounterVec.With(labelValuesToUse).Add(value) + } + + // now extract any logged gauge metric values from the span + for key, value := range getLoggedGaugeMetrics(span) { + + // get the GaugeVec for this metric + thisMetricGaugeVec, labelValuesToUse := pc.getGaugeVec( + key, ("Metric " + key), labelKeysFromSpan, labelValuesFromSpan) + + // now report the metric value + thisMetricGaugeVec.With(labelValuesToUse).Add(value) + } return nil @@ -106,6 +146,90 @@ func (pc *PrometheusCollector) getHistogramVec( return histogramVec, labelValuesToUse } +// Return (and create, if necessary) a CounterVec for the specified Prometheus metric +func (pc *PrometheusCollector) getCounterVec( + metricName string, metricHelp string, labelKeysFromSpan []string, labelValuesFromSpan map[string]string) ( + *prometheus.CounterVec, map[string]string) { + + var labelValuesToUse map[string]string + + pc.lock.Lock() + defer pc.lock.Unlock() + + counterVec, found := pc.counterVecMap[metricName] + if !found { + // create a new CounterVec + counterVec = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: metricName, + Help: metricHelp, + }, + labelKeysFromSpan, + ) + pc.counterVecMap[metricName] = counterVec + pc.registeredLabelKeysMap[metricName] = labelKeysFromSpan + prometheus.MustRegister(counterVec) + labelValuesToUse = labelValuesFromSpan + } else { + // found an existing CounterVec + // need to be careful here, since we must supply the same label keys as when we first created the 
metric + // otherwise we will get a "inconsistent label cardinality" panic + // that's why we saved the original label keys in the registeredLabelKeysMap map + // so we can use that to construct a map of label key/value pairs to set on the metric + labelValuesToUse = make(map[string]string) + for _, thisRegisteredLabelKey := range pc.registeredLabelKeysMap[metricName] { + if value, found := labelValuesFromSpan[thisRegisteredLabelKey]; found { + labelValuesToUse[thisRegisteredLabelKey] = value + } else { + labelValuesToUse[thisRegisteredLabelKey] = "" + } + } + } + return counterVec, labelValuesToUse +} + +// Return (and create, if necessary) a GaugeVec for the specified Prometheus metric +func (pc *PrometheusCollector) getGaugeVec( + metricName string, metricHelp string, labelKeysFromSpan []string, labelValuesFromSpan map[string]string) ( + *prometheus.GaugeVec, map[string]string) { + + var labelValuesToUse map[string]string + + pc.lock.Lock() + defer pc.lock.Unlock() + + gaugeVec, found := pc.gaugeVecMap[metricName] + if !found { + // create a new GaugeVec + gaugeVec = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: metricName, + Help: metricHelp, + }, + labelKeysFromSpan, + ) + pc.gaugeVecMap[metricName] = gaugeVec + pc.registeredLabelKeysMap[metricName] = labelKeysFromSpan + prometheus.MustRegister(gaugeVec) + labelValuesToUse = labelValuesFromSpan + } else { + // found an existing GaugeVec + // need to be careful here, since we must supply the same label keys as when we first created the metric + // otherwise we will get a "inconsistent label cardinality" panic + // that's why we saved the original label keys in the registeredLabelKeysMap map + // so we can use that to construct a map of label key/value pairs to set on the metric + labelValuesToUse = make(map[string]string) + for _, thisRegisteredLabelKey := range pc.registeredLabelKeysMap[metricName] { + if value, found := labelValuesFromSpan[thisRegisteredLabelKey]; found { + labelValuesToUse[thisRegisteredLabelKey] = value + } else { + labelValuesToUse[thisRegisteredLabelKey] = "" + } + } + } + return gaugeVec, labelValuesToUse +} + // extract from the specified span the key/value pairs that we want to add as labels to the Prometheus metric for this span // returns an array of keys, and a map of key-value pairs func getLabels(span *zipkincore.Span) ([]string, map[string]string) { @@ -127,20 +251,70 @@ func getLabels(span *zipkincore.Span) ([]string, map[string]string) { return keys, labelMap } -// extract from the span the logged metric values, which we assume as uint64 values -func getLoggedMetrics(span *zipkincore.Span) map[string]uint64 { +// extract from the span the logged histogram metric values. 
+// These are the ones whose names start with FieldnamePrefixHistogram, +// and whose values we assume are float64 +func getLoggedHistogramMetrics(span *zipkincore.Span) map[string]float64 { - keyValueMap := make(map[string]uint64) + keyValueMap := make(map[string]float64) - // extract any annotations whose Value starts with "fn_" + // extract any annotations whose Value starts with FieldnamePrefixHistogram annotations := span.GetAnnotations() for _, thisAnnotation := range annotations { - if strings.HasPrefix(thisAnnotation.GetValue(), "fn_") { + if strings.HasPrefix(thisAnnotation.GetValue(), common.FieldnamePrefixHistogram) { keyvalue := strings.Split(thisAnnotation.GetValue(), "=") if len(keyvalue) == 2 { - if value, err := strconv.ParseUint(keyvalue[1], 10, 64); err == nil { + if value, err := strconv.ParseFloat(keyvalue[1], 64); err == nil { key := strings.TrimSpace(keyvalue[0]) - key = key[3:] // strip off leading fn_ + key = common.FnPrefix + key[len(common.FieldnamePrefixHistogram):] // strip off fieldname prefix and then prepend "fn_" to the front + keyValueMap[key] = value + } + } + } + } + return keyValueMap +} + +// extract from the span the logged counter metric values. +// These are the ones whose names start with FieldnamePrefixCounter, +// and whose values we assume are float64 +func getLoggedCounterMetrics(span *zipkincore.Span) map[string]float64 { + + keyValueMap := make(map[string]float64) + + // extract any annotations whose Value starts with FieldnamePrefixCounter + annotations := span.GetAnnotations() + for _, thisAnnotation := range annotations { + if strings.HasPrefix(thisAnnotation.GetValue(), common.FieldnamePrefixCounter) { + keyvalue := strings.Split(thisAnnotation.GetValue(), "=") + if len(keyvalue) == 2 { + if value, err := strconv.ParseFloat(keyvalue[1], 64); err == nil { + key := strings.TrimSpace(keyvalue[0]) + key = common.FnPrefix + key[len(common.FieldnamePrefixCounter):] // strip off fieldname prefix and then prepend "fn_" to the front + keyValueMap[key] = value + } + } + } + } + return keyValueMap +} + +// extract from the span the logged gauge metric values. 
+// These are the ones whose names start with FieldnamePrefixGauge, +// and whose values we assume are float64 +func getLoggedGaugeMetrics(span *zipkincore.Span) map[string]float64 { + + keyValueMap := make(map[string]float64) + + // extract any annotations whose Value starts with FieldnamePrefixGauge + annotations := span.GetAnnotations() + for _, thisAnnotation := range annotations { + if strings.HasPrefix(thisAnnotation.GetValue(), common.FieldnamePrefixGauge) { + keyvalue := strings.Split(thisAnnotation.GetValue(), "=") + if len(keyvalue) == 2 { + if value, err := strconv.ParseFloat(keyvalue[1], 64); err == nil { + key := strings.TrimSpace(keyvalue[0]) + key = common.FnPrefix + key[len(common.FieldnamePrefixGauge):] // strip off fieldname prefix and then prepend "fn_" to the front keyValueMap[key] = value } } diff --git a/examples/grafana/fn_grafana_dashboard.json b/examples/grafana/fn_grafana_dashboard.json index 89aba2976..37c93c508 100644 --- a/examples/grafana/fn_grafana_dashboard.json +++ b/examples/grafana/fn_grafana_dashboard.json @@ -116,7 +116,7 @@ "tableColumn": "", "targets": [ { - "expr": "sum(fn_api_queued)", + "expr": "sum(fn_queued)", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -193,7 +193,7 @@ "tableColumn": "", "targets": [ { - "expr": "sum(fn_api_running)", + "expr": "sum(fn_running)", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -270,7 +270,7 @@ "tableColumn": "", "targets": [ { - "expr": "sum(fn_api_completed)", + "expr": "sum(fn_completed)", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -347,7 +347,7 @@ "tableColumn": "", "targets": [ { - "expr": "sum(fn_api_failed)", + "expr": "sum(fn_failed)", "format": "time_series", "intervalFactor": 2, "legendFormat": "", @@ -413,7 +413,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(fn_api_queued)", + "expr": "sum(fn_queued)", "format": "time_series", "intervalFactor": 2, "legendFormat": "Total queued", @@ -490,7 +490,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(fn_api_running)", + "expr": "sum(fn_running)", "format": "time_series", "intervalFactor": 2, "legendFormat": "Total running", @@ -567,7 +567,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(fn_api_completed)", + "expr": "sum(fn_completed)", "format": "time_series", "intervalFactor": 2, "legendFormat": "Total completed", @@ -644,7 +644,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(fn_api_failed)", + "expr": "sum(fn_failed)", "format": "time_series", "intervalFactor": 2, "legendFormat": "Total failed", @@ -655,7 +655,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Total queued", + "title": "Total failed", "tooltip": { "shared": true, "sort": 0, @@ -738,11 +738,11 @@ "steppedLine": false, "targets": [ { - "expr": "fn_api_queued", + "expr": "fn_queued", "format": "time_series", "interval": "", "intervalFactor": 1, - "legendFormat": "{{app}} {{path}}", + "legendFormat": "{{fn_appname}} {{fn_path}}", "refId": "A", "step": 1 } @@ -822,10 +822,10 @@ "steppedLine": false, "targets": [ { - "expr": "fn_api_running", + "expr": "fn_running", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{app}} {{path}}", + "legendFormat": "{{fn_appname}} {{fn_path}}", "refId": "A", "step": 2 } @@ -904,10 +904,10 @@ "steppedLine": false, "targets": [ { - "expr": "fn_api_completed", + "expr": "fn_completed", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{app}} {{path}}", + "legendFormat": "{{fn_appname}} {{fn_path}}", "refId": 
"A", "step": 2 } @@ -986,10 +986,10 @@ "steppedLine": false, "targets": [ { - "expr": "fn_api_failed", + "expr": "fn_failed", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{app}} {{path}}", + "legendFormat": "{{fn_appname}} {{fn_path}}", "refId": "A", "step": 2 }