opentracing -> opencensus (#802)

* update vendor directory, add go.opencensus.io

* update imports

* oops

* s/opentracing/opencensus/ & remove prometheus / zipkin stuff & remove old stats

* the dep train rides again

* fix gin build

* deps from last guy

* start in on the agent metrics

* she builds

* remove tags for now, the cardinality error is fussing; subscribe instead of register

* update to a patched version of opencensus to proceed for now; TODO switch to a release

* meh

fix imports

* println debug the bad boys

* lace it with the tags

* update deps again

* fix all inconsistent cardinality errors

* add our own logger

* fix init

* fix oom measure

* remove bugged removal code

* fix s3 measures

* fix prom handler nil
Reed Allman
2018-03-05 09:35:28 -08:00
committed by GitHub
parent 924d27559c
commit 206aa3c203
5975 changed files with 158755 additions and 566592 deletions
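
Most of the diff below is a mechanical swap of the tracing API: opentracing spans become opencensus spans, and the span-per-metric helpers in the common package are replaced by opencensus measures and views. A minimal sketch of the before/after span pattern (illustrative only, not the exact code of any one file):

package agent

import (
	"context"

	"go.opencensus.io/trace"
)

// Old pattern (opentracing), removed throughout the diff:
//
//	span, ctx := opentracing.StartSpanFromContext(ctx, "agent_submit")
//	defer span.Finish()
//
// New pattern (opencensus): StartSpan returns the child context first and the
// span second, and the span is closed with End instead of Finish.
func submitSketch(ctx context.Context, work func(context.Context) error) error {
	ctx, span := trace.StartSpan(ctx, "agent_submit")
	defer span.End()
	return work(ctx)
}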

View File

@@ -3,7 +3,6 @@ package agent
import (
"context"
"io"
"net/http"
"strings"
"sync"
"time"
@@ -16,9 +15,11 @@ import (
"github.com/fnproject/fn/api/models"
"github.com/fnproject/fn/fnext"
"github.com/go-openapi/strfmt"
"github.com/opentracing/opentracing-go"
"github.com/prometheus/client_golang/prometheus/promhttp"
"github.com/sirupsen/logrus"
"go.opencensus.io/stats"
"go.opencensus.io/stats/view"
"go.opencensus.io/tag"
"go.opencensus.io/trace"
)
// TODO we should prob store async calls in db immediately since we're returning id (will 404 until post-execution)
@@ -85,8 +86,6 @@ type Agent interface {
// Close is not safe to be called from multiple threads.
io.Closer
// Return the http.Handler used to handle Prometheus metric requests
PromHandler() http.Handler
AddCallListener(fnext.CallListener)
// Enqueue is to use the agent's sweet sweet client bindings to remotely
@@ -109,9 +108,6 @@ type agent struct {
wg sync.WaitGroup // TODO rename
shutonce sync.Once
shutdown chan struct{}
// Prometheus HTTP handler
promHandler http.Handler
}
func New(da DataAccess) Agent {
@@ -128,13 +124,12 @@ func New(da DataAccess) Agent {
})
a := &agent{
cfg: *cfg,
da: da,
driver: driver,
slotMgr: NewSlotQueueMgr(),
resources: NewResourceTracker(),
shutdown: make(chan struct{}),
promHandler: promhttp.Handler(),
cfg: *cfg,
da: da,
driver: driver,
slotMgr: NewSlotQueueMgr(),
resources: NewResourceTracker(),
shutdown: make(chan struct{}),
}
// TODO assert that agent doesn't get started for API nodes up above ?
@@ -174,8 +169,8 @@ func (a *agent) Submit(callI Call) error {
call.req = call.req.WithContext(ctx)
defer cancel()
ctx, finish := statSpans(ctx, call)
defer finish()
ctx, span := trace.StartSpan(ctx, "agent_submit")
defer span.End()
err := a.submit(ctx, call)
return err
@@ -202,14 +197,15 @@ func (a *agent) endStateTrackers(ctx context.Context, call *call) {
}
func (a *agent) submit(ctx context.Context, call *call) error {
StatsEnqueue(ctx)
statsEnqueue(ctx)
// TODO can we replace state trackers with metrics?
a.startStateTrackers(ctx, call)
defer a.endStateTrackers(ctx, call)
slot, err := a.getSlot(ctx, call)
if err != nil {
a.handleStatsDequeue(ctx, call, err)
handleStatsDequeue(ctx, err)
return transformTimeout(err, true)
}
@@ -217,20 +213,19 @@ func (a *agent) submit(ctx context.Context, call *call) error {
err = call.Start(ctx)
if err != nil {
a.handleStatsDequeue(ctx, call, err)
handleStatsDequeue(ctx, err)
return transformTimeout(err, true)
}
// decrement queued count, increment running count
StatsDequeueAndStart(ctx)
statsDequeueAndStart(ctx)
// pass this error (nil or otherwise) to end directly, to store status, etc
err = slot.exec(ctx, call)
a.handleStatsEnd(ctx, call, err)
handleStatsEnd(ctx, err)
// TODO: we need to allocate more time to store the call + logs in case the call timed out,
// but this could put us over the timeout if the call did not reply yet (need better policy).
ctx = opentracing.ContextWithSpan(context.Background(), opentracing.SpanFromContext(ctx))
ctx = common.BackgroundContext(ctx)
err = call.End(ctx, err)
return transformTimeout(err, false)
}
@@ -247,54 +242,33 @@ func transformTimeout(e error, isRetriable bool) error {
// handleStatsDequeue handles stats for dequeuing for early exit (getSlot or Start)
// cases. Only timeouts can be a simple dequeue while other cases are actual errors.
func (a *agent) handleStatsDequeue(ctx context.Context, call *call, err error) {
func handleStatsDequeue(ctx context.Context, err error) {
if err == context.DeadlineExceeded {
StatsDequeue(ctx)
StatsIncrementTooBusy(ctx)
statsDequeue(ctx)
statsTooBusy(ctx)
} else {
StatsDequeueAndFail(ctx)
StatsIncrementErrors(ctx)
statsDequeueAndFail(ctx)
statsErrors(ctx)
}
}
// handleStatsEnd handles stats for after a call is ran, depending on error.
func (a *agent) handleStatsEnd(ctx context.Context, call *call, err error) {
func handleStatsEnd(ctx context.Context, err error) {
if err == nil {
// decrement running count, increment completed count
StatsComplete(ctx)
statsComplete(ctx)
} else {
// decrement running count, increment failed count
StatsFailed(ctx)
statsFailed(ctx)
// increment the timeout or errors count, as appropriate
if err == context.DeadlineExceeded {
StatsIncrementTimedout(ctx)
statsTimedout(ctx)
} else {
StatsIncrementErrors(ctx)
statsErrors(ctx)
}
}
}
func statSpans(ctx context.Context, call *call) (_ context.Context, finish func()) {
// agent_submit_global has no parent span because we don't want it to inherit fn_appname or fn_path
spanGlobal := opentracing.StartSpan("agent_submit_global")
// agent_submit_global has no parent span because we don't want it to inherit fn_path
spanApp := opentracing.StartSpan("agent_submit_app")
spanApp.SetBaggageItem("fn_appname", call.AppName)
// agent_submit has a parent span in the usual way
// it doesn't matter if it inherits fn_appname or fn_path (and we set them here in any case)
span, ctx := opentracing.StartSpanFromContext(ctx, "agent_submit")
span.SetBaggageItem("fn_appname", call.AppName)
span.SetBaggageItem("fn_path", call.Path)
return ctx, func() {
spanGlobal.Finish()
spanApp.Finish()
span.Finish()
}
}
// getSlot returns a Slot (or error) for the request to run. Depending on hot/cold
// request type, this may launch a new container or wait for other containers to become idle
// or it may wait for resources to become available to launch a new container.
@@ -303,8 +277,8 @@ func (a *agent) getSlot(ctx context.Context, call *call) (Slot, error) {
ctx, cancel := context.WithDeadline(ctx, call.slotDeadline)
defer cancel()
span, ctx := opentracing.StartSpanFromContext(ctx, "agent_get_slot")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "agent_get_slot")
defer span.End()
if protocol.IsStreamable(protocol.Protocol(call.Format)) {
// For hot requests, we use a long lived slot queue, which we use to manage hot containers
@@ -340,9 +314,9 @@ func (a *agent) hotLauncher(ctx context.Context, call *call) {
// IMPORTANT: get a context that has a child span / logger but NO timeout
// TODO this is a 'FollowsFrom'
ctx = opentracing.ContextWithSpan(common.WithLogger(context.Background(), logger), opentracing.SpanFromContext(ctx))
span, ctx := opentracing.StartSpanFromContext(ctx, "agent_hot_launcher")
defer span.Finish()
ctx = common.BackgroundContext(ctx)
ctx, span := trace.StartSpan(ctx, "agent_hot_launcher")
defer span.End()
for {
ctx, cancel := context.WithTimeout(ctx, timeout)
@@ -395,8 +369,8 @@ func (a *agent) checkLaunch(ctx context.Context, call *call) {
// waitHot pings and waits for a hot container from the slot queue
func (a *agent) waitHot(ctx context.Context, call *call) (Slot, error) {
span, ctx := opentracing.StartSpanFromContext(ctx, "agent_wait_hot")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "agent_wait_hot")
defer span.End()
ctx, cancel := context.WithCancel(ctx)
defer cancel() // shut down dequeuer if we grab a slot
@@ -442,8 +416,8 @@ func (a *agent) launchCold(ctx context.Context, call *call) (Slot, error) {
isAsync := call.Type == models.TypeAsync
ch := make(chan Slot)
span, ctx := opentracing.StartSpanFromContext(ctx, "agent_launch_cold")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "agent_launch_cold")
defer span.End()
call.containerState.UpdateState(ctx, ContainerStateWait, call.slots)
@@ -479,8 +453,8 @@ func (s *coldSlot) Error() error {
}
func (s *coldSlot) exec(ctx context.Context, call *call) error {
span, ctx := opentracing.StartSpanFromContext(ctx, "agent_cold_exec")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "agent_cold_exec")
defer span.End()
call.requestState.UpdateState(ctx, RequestStateExec, call.slots)
call.containerState.UpdateState(ctx, ContainerStateBusy, call.slots)
@@ -507,7 +481,7 @@ func (s *coldSlot) Close(ctx context.Context) error {
// call this from here so that in exec we don't have to eat container
// removal latency
// NOTE ensure container removal, no ctx timeout
ctx = opentracing.ContextWithSpan(context.Background(), opentracing.SpanFromContext(ctx))
ctx = common.BackgroundContext(ctx)
s.cookie.Close(ctx)
}
if s.tok != nil {
@@ -535,8 +509,8 @@ func (s *hotSlot) Error() error {
}
func (s *hotSlot) exec(ctx context.Context, call *call) error {
span, ctx := opentracing.StartSpanFromContext(ctx, "agent_hot_exec")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "agent_hot_exec")
defer span.End()
call.requestState.UpdateState(ctx, RequestStateExec, call.slots)
@@ -575,8 +549,8 @@ func (s *hotSlot) exec(ctx context.Context, call *call) error {
}
func (a *agent) prepCold(ctx context.Context, call *call, tok ResourceToken, ch chan Slot) {
span, ctx := opentracing.StartSpanFromContext(ctx, "agent_prep_cold")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "agent_prep_cold")
defer span.End()
call.containerState.UpdateState(ctx, ContainerStateStart, call.slots)
@@ -622,9 +596,9 @@ func (a *agent) prepCold(ctx context.Context, call *call, tok ResourceToken, ch
func (a *agent) runHot(ctx context.Context, call *call, tok ResourceToken, state ContainerState) {
// IMPORTANT: get a context that has a child span / logger but NO timeout
// TODO this is a 'FollowsFrom'
ctx = opentracing.ContextWithSpan(context.Background(), opentracing.SpanFromContext(ctx))
span, ctx := opentracing.StartSpanFromContext(ctx, "agent_run_hot")
defer span.Finish()
ctx = common.BackgroundContext(ctx)
ctx, span := trace.StartSpan(ctx, "agent_run_hot")
defer span.End()
defer tok.Close() // IMPORTANT: this MUST get called
state.UpdateState(ctx, ContainerStateStart, call.slots)
@@ -852,17 +826,10 @@ func (c *container) CPUs() uint64 { return c.cpus }
// WriteStat publishes each metric in the specified Stats structure as a histogram metric
func (c *container) WriteStat(ctx context.Context, stat drivers.Stat) {
// Convert each metric value from uint64 to float64
// and, for backward compatibility reasons, prepend each metric name with "docker_stats_fn_"
// (if we don't care about compatibility then we can remove that)
var metrics = make(map[string]float64)
for key, value := range stat.Metrics {
metrics["docker_stats_fn_"+key] = float64(value)
stats.Record(ctx, stats.FindMeasure("docker_stats_"+key).(*stats.Int64Measure).M(int64(value)))
}
common.PublishHistograms(ctx, metrics)
c.statsMu.Lock()
if c.stats != nil {
*(c.stats) = append(*(c.stats), stat)
@@ -870,6 +837,45 @@ func (c *container) WriteStat(ctx context.Context, stat drivers.Stat) {
c.statsMu.Unlock()
}
func init() {
// TODO this is nasty figure out how to use opencensus to not have to declare these
keys := []string{"net_rx", "net_tx", "mem_limit", "mem_usage", "disk_read", "disk_write", "cpu_user", "cpu_total", "cpu_kernel"}
// TODO necessary?
appKey, err := tag.NewKey("fn_appname")
if err != nil {
logrus.Fatal(err)
}
pathKey, err := tag.NewKey("fn_path")
if err != nil {
logrus.Fatal(err)
}
for _, key := range keys {
units := "bytes"
if strings.Contains(key, "cpu") {
units = "cpu"
}
dockerStatsDist, err := stats.Int64("docker_stats_"+key, "docker container stats for "+key, units)
if err != nil {
logrus.Fatal(err)
}
v, err := view.New(
"docker_stats_"+key,
"docker container stats for "+key,
[]tag.Key{appKey, pathKey},
dockerStatsDist,
view.DistributionAggregation{},
)
if err != nil {
logrus.Fatalf("cannot create view: %v", err)
}
if err := v.Subscribe(); err != nil {
logrus.Fatal(err)
}
}
}
//func (c *container) DockerAuth() (docker.AuthConfiguration, error) {
// Implementing the docker.AuthConfiguration interface.
// TODO per call could implement this stored somewhere (vs. configured on host)

View File

@@ -4,9 +4,11 @@ import (
"context"
"time"
"github.com/fnproject/fn/api/common"
"github.com/fnproject/fn/api/models"
opentracing "github.com/opentracing/opentracing-go"
"github.com/sirupsen/logrus"
"go.opencensus.io/tag"
"go.opencensus.io/trace"
)
func (a *agent) asyncDequeue() {
@@ -17,8 +19,8 @@ func (a *agent) asyncDequeue() {
defer cancel()
// parent span here so that we can see how many async calls are running
span, ctx := opentracing.StartSpanFromContext(ctx, "agent_async_dequeue")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "agent_async_dequeue")
defer span.End()
for {
select {
@@ -73,11 +75,29 @@ func (a *agent) asyncChew(ctx context.Context) <-chan *models.Call {
func (a *agent) asyncRun(ctx context.Context, model *models.Call) {
// IMPORTANT: get a context that has a child span but NO timeout (Submit imposes timeout)
// TODO this is a 'FollowsFrom'
ctx = opentracing.ContextWithSpan(context.Background(), opentracing.SpanFromContext(ctx))
ctx = common.BackgroundContext(ctx)
// since async doesn't come in through the normal request path,
// we've gotta add tags here for stats to come out properly.
appKey, err := tag.NewKey("fn_appname")
if err != nil {
logrus.Fatal(err)
}
pathKey, err := tag.NewKey("fn_path")
if err != nil {
logrus.Fatal(err)
}
ctx, err = tag.New(ctx,
tag.Insert(appKey, model.AppName),
tag.Insert(pathKey, model.Path),
)
if err != nil {
logrus.Fatal(err)
}
// additional enclosing context here since this isn't spawned from an http request
span, ctx := opentracing.StartSpanFromContext(ctx, "agent_async_run")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "agent_async_run")
defer span.End()
call, err := a.GetCall(
FromModel(model),

View File

@@ -9,12 +9,13 @@ import (
"strings"
"time"
"go.opencensus.io/trace"
"github.com/fnproject/fn/api/agent/drivers"
"github.com/fnproject/fn/api/common"
"github.com/fnproject/fn/api/id"
"github.com/fnproject/fn/api/models"
"github.com/go-openapi/strfmt"
"github.com/opentracing/opentracing-go"
"github.com/sirupsen/logrus"
)
@@ -249,8 +250,8 @@ type call struct {
func (c *call) Model() *models.Call { return c.Call }
func (c *call) Start(ctx context.Context) error {
span, ctx := opentracing.StartSpanFromContext(ctx, "agent_call_start")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "agent_call_start")
defer span.End()
// Check context timeouts, errors
if ctx.Err() != nil {
@@ -290,8 +291,8 @@ func (c *call) Start(ctx context.Context) error {
}
func (c *call) End(ctx context.Context, errIn error) error {
span, ctx := opentracing.StartSpanFromContext(ctx, "agent_call_end")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "agent_call_end")
defer span.End()
c.CompletedAt = strfmt.DateTime(time.Now())

View File

@@ -12,13 +12,15 @@ import (
"strings"
"time"
"go.opencensus.io/stats"
"go.opencensus.io/trace"
"github.com/coreos/go-semver/semver"
"github.com/fnproject/fn/api/agent/drivers"
"github.com/fnproject/fn/api/common"
"github.com/fnproject/fn/api/models"
"github.com/fsouza/go-dockerclient"
"github.com/go-openapi/strfmt"
"github.com/opentracing/opentracing-go"
"github.com/sirupsen/logrus"
)
@@ -77,7 +79,6 @@ func NewDocker(conf drivers.Config) *DockerDriver {
}
func checkDockerVersion(driver *DockerDriver, expected string) error {
info, err := driver.docker.Info(context.Background())
if err != nil {
return err
@@ -269,9 +270,9 @@ func (drv *DockerDriver) ensureImage(ctx context.Context, task drivers.Container
if task, ok := task.(Auther); ok {
var err error
span, _ := opentracing.StartSpanFromContext(ctx, "docker_auth")
_, span := trace.StartSpan(ctx, "docker_auth")
config, err = task.DockerAuth()
span.Finish()
span.End()
if err != nil {
return err
}
@@ -396,8 +397,8 @@ func (w *waitResult) Wait(ctx context.Context) (drivers.RunResult, error) {
// Repeatedly collect stats from the specified docker container until the stopSignal is closed or the context is cancelled
func (drv *DockerDriver) collectStats(ctx context.Context, stopSignal <-chan struct{}, container string, task drivers.ContainerTask) {
span, ctx := opentracing.StartSpanFromContext(ctx, "docker_collect_stats")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "docker_collect_stats")
defer span.End()
log := common.Logger(ctx)
@@ -576,7 +577,7 @@ func (w *waitResult) wait(ctx context.Context) (status string, err error) {
case 0:
return drivers.StatusSuccess, nil
case 137: // OOM
// TODO put in stats opentracing.SpanFromContext(ctx).LogFields(log.String("docker", "oom"))
stats.Record(ctx, dockerOOMMeasure.M(1))
common.Logger(ctx).Error("docker oom")
err := errors.New("container out of memory, you may want to raise route.memory for this route (default: 128MB)")
return drivers.StatusKilled, models.NewAPIError(http.StatusBadGateway, err)

View File

@@ -12,9 +12,11 @@ import (
"github.com/fnproject/fn/api/common"
"github.com/fsouza/go-dockerclient"
"github.com/opentracing/opentracing-go"
"github.com/opentracing/opentracing-go/log"
"github.com/sirupsen/logrus"
"go.opencensus.io/stats"
"go.opencensus.io/stats/view"
"go.opencensus.io/tag"
"go.opencensus.io/trace"
)
const (
@@ -98,18 +100,123 @@ type dockerWrap struct {
dockerNoTimeout *docker.Client
}
func init() {
// TODO doing this at each call site seems not the intention of the library since measurements
// need to be created and views registered. doing this up front seems painful but maybe there
// are benefits?
// TODO do we have to do this? the measurements will be tagged on the context, will they be propagated
// or we have to white list them in the view for them to show up? test...
var err error
appKey, err := tag.NewKey("fn_appname")
if err != nil {
logrus.Fatal(err)
}
pathKey, err := tag.NewKey("fn_path")
if err != nil {
logrus.Fatal(err)
}
{
dockerRetriesMeasure, err = stats.Int64("docker_api_retries", "docker api retries", "")
if err != nil {
logrus.Fatal(err)
}
v, err := view.New(
"docker_api_retries",
"number of times we've retried docker API upon failure",
[]tag.Key{appKey, pathKey},
dockerRetriesMeasure,
view.SumAggregation{},
)
if err != nil {
logrus.Fatalf("cannot create view: %v", err)
}
if err := v.Subscribe(); err != nil {
logrus.Fatal(err)
}
}
{
dockerTimeoutMeasure, err = stats.Int64("docker_api_timeout", "docker api timeouts", "")
if err != nil {
logrus.Fatal(err)
}
v, err := view.New(
"docker_api_timeout_count",
"number of times we've timed out calling docker API",
[]tag.Key{appKey, pathKey},
dockerTimeoutMeasure,
view.CountAggregation{},
)
if err != nil {
logrus.Fatalf("cannot create view: %v", err)
}
if err := v.Subscribe(); err != nil {
logrus.Fatal(err)
}
}
{
dockerErrorMeasure, err = stats.Int64("docker_api_error", "docker api errors", "")
if err != nil {
logrus.Fatal(err)
}
v, err := view.New(
"docker_api_error_count",
"number of unrecoverable errors from docker API",
[]tag.Key{appKey, pathKey},
dockerErrorMeasure,
view.CountAggregation{},
)
if err != nil {
logrus.Fatalf("cannot create view: %v", err)
}
if err := v.Subscribe(); err != nil {
logrus.Fatal(err)
}
}
{
dockerOOMMeasure, err = stats.Int64("docker_oom", "docker oom", "")
if err != nil {
logrus.Fatal(err)
}
v, err := view.New(
"docker_oom_count",
"number of docker container oom",
[]tag.Key{appKey, pathKey},
dockerOOMMeasure,
view.CountAggregation{},
)
if err != nil {
logrus.Fatalf("cannot create view: %v", err)
}
if err := v.Subscribe(); err != nil {
logrus.Fatal(err)
}
}
}
var (
// TODO it's either this or stats.FindMeasure("string").M() -- this is safer but painful
dockerRetriesMeasure *stats.Int64Measure
dockerTimeoutMeasure *stats.Int64Measure
dockerErrorMeasure *stats.Int64Measure
dockerOOMMeasure *stats.Int64Measure
)
func (d *dockerWrap) retry(ctx context.Context, logger logrus.FieldLogger, f func() error) error {
var i int
var err error
span := opentracing.SpanFromContext(ctx)
defer func() { span.LogFields(log.Int("docker_call_retries", i)) }()
defer func() { stats.Record(ctx, dockerRetriesMeasure.M(int64(i))) }()
var b common.Backoff
// 10 retries w/o change to backoff is ~13s if ops take ~0 time
for ; i < 10; i++ {
select {
case <-ctx.Done():
span.LogFields(log.String("task", "fail.docker"))
stats.Record(ctx, dockerTimeoutMeasure.M(1))
logger.WithError(ctx.Err()).Warnf("docker call timed out")
return ctx.Err()
default:
@@ -119,11 +226,10 @@ func (d *dockerWrap) retry(ctx context.Context, logger logrus.FieldLogger, f fun
if common.IsTemporary(err) || isDocker50x(err) {
logger.WithError(err).Warn("docker temporary error, retrying")
b.Sleep(ctx)
span.LogFields(log.String("task", "tmperror.docker"))
continue
}
if err != nil {
span.LogFields(log.String("task", "error.docker"))
stats.Record(ctx, dockerErrorMeasure.M(1))
}
return err
}
@@ -176,22 +282,17 @@ func filterNoSuchContainer(ctx context.Context, err error) error {
}
func (d *dockerWrap) Info(ctx context.Context) (info *docker.DockerInfo, err error) {
span, ctx := opentracing.StartSpanFromContext(ctx, "docker_server_version")
defer span.Finish()
logger := common.Logger(ctx).WithField("docker_cmd", "DockerInfo")
ctx, cancel := context.WithTimeout(ctx, retryTimeout)
defer cancel()
err = d.retry(ctx, logger, func() error {
info, err = d.docker.Info()
return err
})
return info, err
// NOTE: we're not very responsible and prometheus wasn't loved as a child, this
// threads through directly down to the docker call, skipping retries, so that we
// don't have to add tags / tracing / logger to the bare context handed to the one
// place this is called in initialization that has no context to report consistent
// stats like everything else in here. tl;dr this works, just don't use it for anything else.
return d.docker.Info()
}
func (d *dockerWrap) AttachToContainerNonBlocking(ctx context.Context, opts docker.AttachToContainerOptions) (w docker.CloseWaiter, err error) {
span, ctx := opentracing.StartSpanFromContext(ctx, "docker_attach_container")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "docker_attach_container")
defer span.End()
logger := common.Logger(ctx).WithField("docker_cmd", "AttachContainer")
ctx, cancel := context.WithTimeout(ctx, retryTimeout)
@@ -208,8 +309,8 @@ func (d *dockerWrap) AttachToContainerNonBlocking(ctx context.Context, opts dock
}
func (d *dockerWrap) WaitContainerWithContext(id string, ctx context.Context) (code int, err error) {
span, ctx := opentracing.StartSpanFromContext(ctx, "docker_wait_container")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "docker_wait_container")
defer span.End()
logger := common.Logger(ctx).WithField("docker_cmd", "WaitContainer")
err = d.retry(ctx, logger, func() error {
@@ -220,8 +321,8 @@ func (d *dockerWrap) WaitContainerWithContext(id string, ctx context.Context) (c
}
func (d *dockerWrap) StartContainerWithContext(id string, hostConfig *docker.HostConfig, ctx context.Context) (err error) {
span, ctx := opentracing.StartSpanFromContext(ctx, "docker_start_container")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "docker_start_container")
defer span.End()
logger := common.Logger(ctx).WithField("docker_cmd", "StartContainer")
err = d.retry(ctx, logger, func() error {
@@ -236,8 +337,8 @@ func (d *dockerWrap) StartContainerWithContext(id string, hostConfig *docker.Hos
}
func (d *dockerWrap) CreateContainer(opts docker.CreateContainerOptions) (c *docker.Container, err error) {
span, ctx := opentracing.StartSpanFromContext(opts.Context, "docker_create_container")
defer span.Finish()
ctx, span := trace.StartSpan(opts.Context, "docker_create_container")
defer span.End()
logger := common.Logger(ctx).WithField("docker_cmd", "CreateContainer")
err = d.retry(ctx, logger, func() error {
@@ -248,8 +349,8 @@ func (d *dockerWrap) CreateContainer(opts docker.CreateContainerOptions) (c *doc
}
func (d *dockerWrap) PullImage(opts docker.PullImageOptions, auth docker.AuthConfiguration) (err error) {
span, ctx := opentracing.StartSpanFromContext(opts.Context, "docker_pull_image")
defer span.Finish()
ctx, span := trace.StartSpan(opts.Context, "docker_pull_image")
defer span.End()
logger := common.Logger(ctx).WithField("docker_cmd", "PullImage")
err = d.retry(ctx, logger, func() error {
@@ -262,9 +363,9 @@ func (d *dockerWrap) PullImage(opts docker.PullImageOptions, auth docker.AuthCon
func (d *dockerWrap) RemoveContainer(opts docker.RemoveContainerOptions) (err error) {
// extract the span, but do not keep the context, since the enclosing context
// may be timed out, and we still want to remove the container. TODO in caller? who cares?
span, _ := opentracing.StartSpanFromContext(opts.Context, "docker_remove_container")
defer span.Finish()
ctx := opentracing.ContextWithSpan(context.Background(), span)
ctx := common.BackgroundContext(opts.Context)
ctx, span := trace.StartSpan(ctx, "docker_remove_container")
defer span.End()
ctx, cancel := context.WithTimeout(ctx, retryTimeout)
defer cancel()
@@ -278,8 +379,8 @@ func (d *dockerWrap) RemoveContainer(opts docker.RemoveContainerOptions) (err er
}
func (d *dockerWrap) PauseContainer(id string, ctx context.Context) (err error) {
span, _ := opentracing.StartSpanFromContext(ctx, "docker_pause_container")
defer span.Finish()
_, span := trace.StartSpan(ctx, "docker_pause_container")
defer span.End()
ctx, cancel := context.WithTimeout(ctx, pauseTimeout)
defer cancel()
@@ -292,8 +393,8 @@ func (d *dockerWrap) PauseContainer(id string, ctx context.Context) (err error)
}
func (d *dockerWrap) UnpauseContainer(id string, ctx context.Context) (err error) {
span, _ := opentracing.StartSpanFromContext(ctx, "docker_unpause_container")
defer span.Finish()
_, span := trace.StartSpan(ctx, "docker_unpause_container")
defer span.End()
ctx, cancel := context.WithTimeout(ctx, pauseTimeout)
defer cancel()
@@ -306,8 +407,8 @@ func (d *dockerWrap) UnpauseContainer(id string, ctx context.Context) (err error
}
func (d *dockerWrap) InspectImage(ctx context.Context, name string) (i *docker.Image, err error) {
span, ctx := opentracing.StartSpanFromContext(ctx, "docker_inspect_image")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "docker_inspect_image")
defer span.End()
ctx, cancel := context.WithTimeout(ctx, retryTimeout)
defer cancel()
@@ -320,8 +421,8 @@ func (d *dockerWrap) InspectImage(ctx context.Context, name string) (i *docker.I
}
func (d *dockerWrap) InspectContainerWithContext(container string, ctx context.Context) (c *docker.Container, err error) {
span, ctx := opentracing.StartSpanFromContext(ctx, "docker_inspect_container")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "docker_inspect_container")
defer span.End()
ctx, cancel := context.WithTimeout(ctx, retryTimeout)
defer cancel()

View File

@@ -17,7 +17,8 @@ import (
"github.com/fnproject/fn/api/agent"
"github.com/fnproject/fn/api/common"
"github.com/fnproject/fn/api/models"
opentracing "github.com/opentracing/opentracing-go"
"go.opencensus.io/plugin/ochttp/propagation/b3"
"go.opencensus.io/trace"
)
// client implements agent.DataAccess
@@ -66,16 +67,16 @@ func NewClient(u string) (agent.DataAccess, error) {
}
func (cl *client) Enqueue(ctx context.Context, c *models.Call) error {
span, ctx := opentracing.StartSpanFromContext(ctx, "hybrid_client_enqueue")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "hybrid_client_enqueue")
defer span.End()
err := cl.do(ctx, c, nil, "PUT", "runner", "async")
return err
}
func (cl *client) Dequeue(ctx context.Context) (*models.Call, error) {
span, ctx := opentracing.StartSpanFromContext(ctx, "hybrid_client_dequeue")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "hybrid_client_dequeue")
defer span.End()
var c struct {
C []*models.Call `json:"calls"`
@@ -88,16 +89,16 @@ func (cl *client) Dequeue(ctx context.Context) (*models.Call, error) {
}
func (cl *client) Start(ctx context.Context, c *models.Call) error {
span, ctx := opentracing.StartSpanFromContext(ctx, "hybrid_client_start")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "hybrid_client_start")
defer span.End()
err := cl.do(ctx, c, nil, "POST", "runner", "start")
return err
}
func (cl *client) Finish(ctx context.Context, c *models.Call, r io.Reader, async bool) error {
span, ctx := opentracing.StartSpanFromContext(ctx, "hybrid_client_end")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "hybrid_client_end")
defer span.End()
var b bytes.Buffer // TODO pool / we should multipart this?
_, err := io.Copy(&b, r)
@@ -118,8 +119,8 @@ func (cl *client) Finish(ctx context.Context, c *models.Call, r io.Reader, async
}
func (cl *client) GetApp(ctx context.Context, appName string) (*models.App, error) {
span, ctx := opentracing.StartSpanFromContext(ctx, "hybrid_client_get_app")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "hybrid_client_get_app")
defer span.End()
var a struct {
A models.App `json:"app"`
@@ -129,8 +130,8 @@ func (cl *client) GetApp(ctx context.Context, appName string) (*models.App, erro
}
func (cl *client) GetRoute(ctx context.Context, appName, route string) (*models.Route, error) {
span, ctx := opentracing.StartSpanFromContext(ctx, "hybrid_client_get_route")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "hybrid_client_get_route")
defer span.End()
// TODO trim prefix is pretty odd here eh?
var r struct {
@@ -181,8 +182,8 @@ func (cl *client) do(ctx context.Context, request, result interface{}, method st
}
func (cl *client) once(ctx context.Context, request, result interface{}, method string, url ...string) error {
span, ctx := opentracing.StartSpanFromContext(ctx, "hybrid_client_http_do")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "hybrid_client_http_do")
defer span.End()
var b bytes.Buffer // TODO pool
if request != nil {
@@ -196,12 +197,9 @@ func (cl *client) once(ctx context.Context, request, result interface{}, method
if err != nil {
return err
}
req = req.WithContext(ctx)
// shove the span headers in so that the server will continue this span
opentracing.GlobalTracer().Inject(
span.Context(),
opentracing.HTTPHeaders,
opentracing.HTTPHeadersCarrier(req.Header))
var xxx b3.HTTPFormat
xxx.SpanContextToRequest(span.SpanContext(), req)
resp, err := cl.http.Do(req)
if err != nil {

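On the receiving end (not shown in this diff), the server would use the same B3 format to pull the propagated span context back out of the request headers and continue the trace as a remote child. A hedged sketch, assuming the b3.HTTPFormat.SpanContextFromRequest and trace.StartSpanWithRemoteParent calls available in later opencensus releases; the handler name is hypothetical:

package hybrid

import (
	"net/http"

	"go.opencensus.io/plugin/ochttp/propagation/b3"
	"go.opencensus.io/trace"
)

// handleRunnerStart shows the other half of the propagation done in
// client.once: extract the B3 span context from the incoming request and
// start a span with it as remote parent.
func handleRunnerStart(w http.ResponseWriter, r *http.Request) {
	ctx := r.Context()
	var hf b3.HTTPFormat
	if sc, ok := hf.SpanContextFromRequest(r); ok {
		var span *trace.Span
		ctx, span = trace.StartSpanWithRemoteParent(ctx, "hybrid_server_start", sc)
		defer span.End()
	}
	_ = ctx // handle the request under the propagated trace context
	w.WriteHeader(http.StatusNoContent)
}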
View File

@@ -1,9 +0,0 @@
package agent
import (
"net/http"
)
func (a *agent) PromHandler() http.Handler {
return a.promHandler
}

View File

@@ -7,8 +7,9 @@ import (
"io"
"net/http"
"go.opencensus.io/trace"
"github.com/fnproject/fn/api/models"
opentracing "github.com/opentracing/opentracing-go"
)
// HTTPProtocol converts stdin/stdout streams into HTTP/1.1 compliant
@@ -23,8 +24,8 @@ type HTTPProtocol struct {
func (p *HTTPProtocol) IsStreamable() bool { return true }
func (h *HTTPProtocol) Dispatch(ctx context.Context, ci CallInfo, w io.Writer) error {
span, ctx := opentracing.StartSpanFromContext(ctx, "dispatch_http")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "dispatch_http")
defer span.End()
req := ci.Request()
@@ -36,23 +37,23 @@ func (h *HTTPProtocol) Dispatch(ctx context.Context, ci CallInfo, w io.Writer) e
req.Header.Set("FN_REQUEST_URL", ci.RequestURL())
req.Header.Set("FN_CALL_ID", ci.CallID())
span, _ = opentracing.StartSpanFromContext(ctx, "dispatch_http_write_request")
_, span = trace.StartSpan(ctx, "dispatch_http_write_request")
// req.Write handles if the user does not specify content length
err := req.Write(h.in)
span.Finish()
span.End()
if err != nil {
return err
}
span, _ = opentracing.StartSpanFromContext(ctx, "dispatch_http_read_response")
_, span = trace.StartSpan(ctx, "dispatch_http_read_response")
resp, err := http.ReadResponse(bufio.NewReader(h.out), ci.Request())
span.Finish()
span.End()
if err != nil {
return models.NewAPIError(http.StatusBadGateway, fmt.Errorf("invalid http response from function err: %v", err))
}
span, _ = opentracing.StartSpanFromContext(ctx, "dispatch_http_write_response")
defer span.Finish()
_, span = trace.StartSpan(ctx, "dispatch_http_write_response")
defer span.End()
rw, ok := w.(http.ResponseWriter)
if !ok {

View File

@@ -9,8 +9,9 @@ import (
"net/http"
"sync"
"go.opencensus.io/trace"
"github.com/fnproject/fn/api/models"
opentracing "github.com/opentracing/opentracing-go"
)
var (
@@ -87,26 +88,26 @@ func (h *JSONProtocol) writeJSONToContainer(ci CallInfo) error {
}
func (h *JSONProtocol) Dispatch(ctx context.Context, ci CallInfo, w io.Writer) error {
span, ctx := opentracing.StartSpanFromContext(ctx, "dispatch_json")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "dispatch_json")
defer span.End()
span, _ = opentracing.StartSpanFromContext(ctx, "dispatch_json_write_request")
_, span = trace.StartSpan(ctx, "dispatch_json_write_request")
err := h.writeJSONToContainer(ci)
span.Finish()
span.End()
if err != nil {
return err
}
span, _ = opentracing.StartSpanFromContext(ctx, "dispatch_json_read_response")
_, span = trace.StartSpan(ctx, "dispatch_json_read_response")
var jout jsonOut
err = json.NewDecoder(h.out).Decode(&jout)
span.Finish()
span.End()
if err != nil {
return models.NewAPIError(http.StatusBadGateway, fmt.Errorf("invalid json response from function err: %v", err))
}
span, _ = opentracing.StartSpanFromContext(ctx, "dispatch_json_write_response")
defer span.Finish()
_, span = trace.StartSpan(ctx, "dispatch_json_write_response")
defer span.End()
rw, ok := w.(http.ResponseWriter)
if !ok {

View File

@@ -13,7 +13,8 @@ import (
"strings"
"sync"
opentracing "github.com/opentracing/opentracing-go"
"go.opencensus.io/trace"
"github.com/sirupsen/logrus"
)
@@ -166,9 +167,9 @@ func (a *resourceTracker) GetResourceToken(ctx context.Context, memory uint64, c
c.L.Unlock()
}()
span, ctx := opentracing.StartSpanFromContext(ctx, "agent_get_resource_token")
ctx, span := trace.StartSpan(ctx, "agent_get_resource_token")
go func() {
defer span.Finish()
defer span.End()
defer cancel()
c.L.Lock()
@@ -254,9 +255,9 @@ func (a *resourceTracker) WaitAsyncResource(ctx context.Context) chan struct{} {
c.L.Unlock()
}()
span, ctx := opentracing.StartSpanFromContext(ctx, "agent_wait_async_resource")
ctx, span := trace.StartSpan(ctx, "agent_wait_async_resource")
go func() {
defer span.Finish()
defer span.End()
defer cancel()
c.L.Lock()
isWaiting = true

View File

@@ -5,7 +5,10 @@ import (
"sync"
"time"
"github.com/fnproject/fn/api/common"
"github.com/sirupsen/logrus"
"go.opencensus.io/stats"
"go.opencensus.io/stats/view"
"go.opencensus.io/tag"
)
type RequestStateType int
@@ -137,16 +140,76 @@ func (c *containerState) UpdateState(ctx context.Context, newState ContainerStat
// update old state stats
gaugeKey := containerGaugeKeys[oldState]
if gaugeKey != "" {
common.DecrementGauge(ctx, gaugeKey)
stats.Record(ctx, stats.FindMeasure(gaugeKey).(*stats.Int64Measure).M(-1))
}
timeKey := containerTimeKeys[oldState]
if timeKey != "" {
common.PublishElapsedTimeHistogram(ctx, timeKey, before, now)
stats.Record(ctx, stats.FindMeasure(timeKey).(*stats.Int64Measure).M(int64(now.Sub(before).Round(time.Millisecond))))
}
// update new state stats
gaugeKey = containerGaugeKeys[newState]
if gaugeKey != "" {
common.IncrementGauge(ctx, gaugeKey)
stats.Record(ctx, stats.FindMeasure(gaugeKey).(*stats.Int64Measure).M(1))
}
}
func init() {
// TODO(reed): do we have to do this? the measurements will be tagged on the context, will they be propagated
// or we have to white list them in the view for them to show up? test...
appKey, err := tag.NewKey("fn_appname")
if err != nil {
logrus.Fatal(err)
}
pathKey, err := tag.NewKey("fn_path")
if err != nil {
logrus.Fatal(err)
}
for _, key := range containerGaugeKeys {
if key == "" { // leave nil intentionally, let it panic
continue
}
measure, err := stats.Int64(key, "containers in state "+key, "")
if err != nil {
logrus.Fatal(err)
}
v, err := view.New(
key,
"containers in state "+key,
[]tag.Key{appKey, pathKey},
measure,
view.CountAggregation{},
)
if err != nil {
logrus.Fatalf("cannot create view: %v", err)
}
if err := v.Subscribe(); err != nil {
logrus.Fatal(err)
}
}
for _, key := range containerTimeKeys {
if key == "" {
continue
}
measure, err := stats.Int64(key, "time spent in container state "+key, "ms")
if err != nil {
logrus.Fatal(err)
}
v, err := view.New(
key,
"time spent in container state "+key,
[]tag.Key{appKey, pathKey},
measure,
view.DistributionAggregation{},
)
if err != nil {
logrus.Fatalf("cannot create view: %v", err)
}
if err := v.Subscribe(); err != nil {
logrus.Fatal(err)
}
}
}

View File

@@ -2,54 +2,63 @@ package agent
import (
"context"
"github.com/fnproject/fn/api/common"
"github.com/sirupsen/logrus"
"go.opencensus.io/stats"
"go.opencensus.io/stats/view"
"go.opencensus.io/tag"
)
func StatsEnqueue(ctx context.Context) {
common.IncrementGauge(ctx, queuedMetricName)
common.IncrementCounter(ctx, callsMetricName)
// TODO add some suga:
// * hot containers active
// * memory used / available
func statsEnqueue(ctx context.Context) {
stats.Record(ctx, queuedMeasure.M(1))
stats.Record(ctx, callsMeasure.M(1))
}
// Call when a function has been queued but cannot be started because of an error
func StatsDequeue(ctx context.Context) {
common.DecrementGauge(ctx, queuedMetricName)
func statsDequeue(ctx context.Context) {
stats.Record(ctx, queuedMeasure.M(-1))
}
func StatsDequeueAndStart(ctx context.Context) {
common.DecrementGauge(ctx, queuedMetricName)
common.IncrementGauge(ctx, runningMetricName)
func statsDequeueAndStart(ctx context.Context) {
stats.Record(ctx, queuedMeasure.M(-1))
stats.Record(ctx, runningMeasure.M(1))
}
func StatsComplete(ctx context.Context) {
common.DecrementGauge(ctx, runningMetricName)
common.IncrementCounter(ctx, completedMetricName)
func statsComplete(ctx context.Context) {
stats.Record(ctx, runningMeasure.M(-1))
stats.Record(ctx, completedMeasure.M(1))
}
func StatsFailed(ctx context.Context) {
common.DecrementGauge(ctx, runningMetricName)
common.IncrementCounter(ctx, failedMetricName)
func statsFailed(ctx context.Context) {
stats.Record(ctx, runningMeasure.M(-1))
stats.Record(ctx, failedMeasure.M(1))
}
func StatsDequeueAndFail(ctx context.Context) {
common.DecrementGauge(ctx, queuedMetricName)
common.IncrementCounter(ctx, failedMetricName)
func statsDequeueAndFail(ctx context.Context) {
stats.Record(ctx, queuedMeasure.M(-1))
stats.Record(ctx, failedMeasure.M(1))
}
func StatsIncrementTimedout(ctx context.Context) {
common.IncrementCounter(ctx, timedoutMetricName)
func statsTimedout(ctx context.Context) {
stats.Record(ctx, timedoutMeasure.M(1))
}
func StatsIncrementErrors(ctx context.Context) {
common.IncrementCounter(ctx, errorsMetricName)
func statsErrors(ctx context.Context) {
stats.Record(ctx, errorsMeasure.M(1))
}
func StatsIncrementTooBusy(ctx context.Context) {
common.IncrementCounter(ctx, serverBusyMetricName)
func statsTooBusy(ctx context.Context) {
stats.Record(ctx, serverBusyMeasure.M(1))
}
const (
// TODO we should probably prefix these with calls_ ?
queuedMetricName = "queued"
callsMetricName = "calls"
callsMetricName = "calls" // TODO this is a dupe of sum {complete,failed} ?
runningMetricName = "running"
completedMetricName = "completed"
failedMetricName = "failed"
@@ -57,3 +66,192 @@ const (
errorsMetricName = "errors"
serverBusyMetricName = "server_busy"
)
var (
queuedMeasure *stats.Int64Measure
callsMeasure *stats.Int64Measure // TODO this is a dupe of sum {complete,failed} ?
runningMeasure *stats.Int64Measure
completedMeasure *stats.Int64Measure
failedMeasure *stats.Int64Measure
timedoutMeasure *stats.Int64Measure
errorsMeasure *stats.Int64Measure
serverBusyMeasure *stats.Int64Measure
)
func init() {
// TODO(reed): doing this at each call site seems not the intention of the library since measurements
// need to be created and views registered. doing this up front seems painful but maybe there
// are benefits?
// TODO(reed): do we have to do this? the measurements will be tagged on the context, will they be propagated
// or we have to white list them in the view for them to show up? test...
var err error
appKey, err := tag.NewKey("fn_appname")
if err != nil {
logrus.Fatal(err)
}
pathKey, err := tag.NewKey("fn_path")
if err != nil {
logrus.Fatal(err)
}
{
queuedMeasure, err = stats.Int64(queuedMetricName, "calls currently queued against agent", "")
if err != nil {
logrus.Fatal(err)
}
v, err := view.New(
queuedMetricName,
"calls currently queued to agent",
[]tag.Key{appKey, pathKey},
queuedMeasure,
view.SumAggregation{},
)
if err != nil {
logrus.Fatalf("cannot create view: %v", err)
}
if err := v.Subscribe(); err != nil {
logrus.Fatal(err)
}
}
{
callsMeasure, err = stats.Int64(callsMetricName, "calls created in agent", "")
if err != nil {
logrus.Fatal(err)
}
v, err := view.New(
callsMetricName,
"calls created in agent",
[]tag.Key{appKey, pathKey},
callsMeasure,
view.SumAggregation{},
)
if err != nil {
logrus.Fatalf("cannot create view: %v", err)
}
if err := v.Subscribe(); err != nil {
logrus.Fatal(err)
}
}
{
runningMeasure, err = stats.Int64(runningMetricName, "calls currently running in agent", "")
if err != nil {
logrus.Fatal(err)
}
v, err := view.New(
runningMetricName,
"calls currently running in agent",
[]tag.Key{appKey, pathKey},
runningMeasure,
view.SumAggregation{},
)
if err != nil {
logrus.Fatalf("cannot create view: %v", err)
}
if err := v.Subscribe(); err != nil {
logrus.Fatal(err)
}
}
{
completedMeasure, err = stats.Int64(completedMetricName, "calls completed in agent", "")
if err != nil {
logrus.Fatal(err)
}
v, err := view.New(
completedMetricName,
"calls completed in agent",
[]tag.Key{appKey, pathKey},
completedMeasure,
view.SumAggregation{},
)
if err != nil {
logrus.Fatalf("cannot create view: %v", err)
}
if err := v.Subscribe(); err != nil {
logrus.Fatal(err)
}
}
{
failedMeasure, err = stats.Int64(failedMetricName, "calls failed in agent", "")
if err != nil {
logrus.Fatal(err)
}
v, err := view.New(
failedMetricName,
"calls failed in agent",
[]tag.Key{appKey, pathKey},
failedMeasure,
view.SumAggregation{},
)
if err != nil {
logrus.Fatalf("cannot create view: %v", err)
}
if err := v.Subscribe(); err != nil {
logrus.Fatal(err)
}
}
{
timedoutMeasure, err = stats.Int64(timedoutMetricName, "calls timed out in agent", "")
if err != nil {
logrus.Fatal(err)
}
v, err := view.New(
timedoutMetricName,
"calls timed out in agent",
[]tag.Key{appKey, pathKey},
timedoutMeasure,
view.SumAggregation{},
)
if err != nil {
logrus.Fatalf("cannot create view: %v", err)
}
if err := v.Subscribe(); err != nil {
logrus.Fatal(err)
}
}
{
errorsMeasure, err = stats.Int64(errorsMetricName, "calls errored in agent", "")
if err != nil {
logrus.Fatal(err)
}
v, err := view.New(
errorsMetricName,
"calls errored in agent",
[]tag.Key{appKey, pathKey},
errorsMeasure,
view.SumAggregation{},
)
if err != nil {
logrus.Fatalf("cannot create view: %v", err)
}
if err := v.Subscribe(); err != nil {
logrus.Fatal(err)
}
}
{
serverBusyMeasure, err = stats.Int64(serverBusyMetricName, "calls where server was too busy in agent", "")
if err != nil {
logrus.Fatal(err)
}
v, err := view.New(
serverBusyMetricName,
"calls where server was too busy in agent",
[]tag.Key{appKey, pathKey},
serverBusyMeasure,
view.SumAggregation{},
)
if err != nil {
logrus.Fatalf("cannot create view: %v", err)
}
if err := v.Subscribe(); err != nil {
logrus.Fatal(err)
}
}
}
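
The same create-measure / create-view / Subscribe boilerplate repeats for every metric above, as well as in the docker client and container-state init blocks. If the repetition becomes a problem, it could be folded into a small helper along these lines; a sketch against the same pre-release opencensus API used in this commit (mustCounterView is a hypothetical name, not part of the diff):

package agent

import (
	"github.com/sirupsen/logrus"
	"go.opencensus.io/stats"
	"go.opencensus.io/stats/view"
	"go.opencensus.io/tag"
)

// mustCounterView creates an Int64 measure, wraps it in a sum-aggregated view
// keyed on the given tags, and subscribes the view, exiting on any error:
// the same steps the init() above performs for each metric by hand.
func mustCounterView(name, desc string, keys []tag.Key) *stats.Int64Measure {
	m, err := stats.Int64(name, desc, "")
	if err != nil {
		logrus.Fatal(err)
	}
	v, err := view.New(name, desc, keys, m, view.SumAggregation{})
	if err != nil {
		logrus.Fatalf("cannot create view: %v", err)
	}
	if err := v.Subscribe(); err != nil {
		logrus.Fatal(err)
	}
	return m
}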

View File

@@ -4,6 +4,8 @@ import (
"context"
"github.com/sirupsen/logrus"
"go.opencensus.io/tag"
"go.opencensus.io/trace"
)
type contextKey string
@@ -22,11 +24,35 @@ func Logger(ctx context.Context) logrus.FieldLogger {
return l
}
// Attempt at simplifying this whole logger in the context thing
// Could even make this take a generic map, then the logger that gets returned could be used just like the stdlib too, since it's compatible
// LoggerWithFields returns a child context of the provided parent that
// contains a logger with additional fields from the parent's logger; it
// returns the new child logger as well.
func LoggerWithFields(ctx context.Context, fields logrus.Fields) (context.Context, logrus.FieldLogger) {
l := Logger(ctx)
l = l.WithFields(fields)
ctx = WithLogger(ctx, l)
return ctx, l
}
// BackgroundContext returns a context that is specifically not a child of the
// provided parent context wrt any cancellation or deadline of the parent,
// returning a context that contains all values only. At present, this is a
// best effort as there is not a great way to extract all values, known values:
// * logger
// * span
// * tags
// (TODO(reed): we could have our own context.Context implementer that stores
// all values from WithValue in a bucket we could extract more easily?)
func BackgroundContext(ctx context.Context) context.Context {
logger := Logger(ctx)
span := trace.FromContext(ctx)
tagMap := tag.FromContext(ctx)
// fresh context
ctx = context.Background()
ctx = tag.NewContext(ctx, tagMap)
ctx = trace.WithSpan(ctx, span)
ctx = WithLogger(ctx, logger)
return ctx
}
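
As used in coldSlot.Close, RemoveContainer, and the hot/async launchers above, the typical pattern is to detach before slow cleanup or background work and then attach a fresh timeout. A small sketch (removeContainer and the two-minute timeout are placeholders, not values from the diff):

package agent

import (
	"context"
	"time"

	"github.com/fnproject/fn/api/common"
)

// cleanupSketch detaches from the (possibly expired) request deadline while
// keeping the caller's logger, span, and tags, then applies its own timeout
// for the cleanup work.
func cleanupSketch(ctx context.Context, removeContainer func(context.Context) error) error {
	ctx = common.BackgroundContext(ctx) // values survive, deadline does not
	ctx, cancel := context.WithTimeout(ctx, 2*time.Minute)
	defer cancel()
	return removeContainer(ctx)
}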

View File

@@ -1,140 +0,0 @@
package common
import (
"context"
"time"
"github.com/opentracing/opentracing-go"
"github.com/opentracing/opentracing-go/log"
)
// IncrementGauge increments the specified gauge metric
// It does this by logging an appropriate field value to a tracing span.
func IncrementGauge(ctx context.Context, metric string) {
// The field name we use is the specified metric name prepended with FieldnamePrefixGauge to designate that it is a Prometheus gauge metric
// The collector will replace that prefix with "fn_" and use the result as the Prometheus metric name.
fieldname := FieldnamePrefixGauge + metric
// Spans are not processed by the collector until the span ends, so to prevent any delay
// in processing the stats when the current span is long-lived we create a new span for every call
// suffix the span name with SpannameSuffixDummy to denote that it is used only to hold a metric and isn't itself of any interest
span, ctx := opentracing.StartSpanFromContext(ctx, fieldname+SpannameSuffixDummy)
defer span.Finish()
// gauge metrics are actually float64; here we log that it should be increased by +1
span.LogFields(log.Float64(fieldname, 1.))
}
// DecrementGauge decrements the specified gauge metric
// It does this by logging an appropriate field value to a tracing span.
func DecrementGauge(ctx context.Context, metric string) {
// The field name we use is the specified metric name prepended with FieldnamePrefixGauge to designate that it is a Prometheus gauge metric
// The collector will replace that prefix with "fn_" and use the result as the Prometheus metric name.
fieldname := FieldnamePrefixGauge + metric
// Spans are not processed by the collector until the span ends, so to prevent any delay
// in processing the stats when the current span is long-lived we create a new span for every call.
// suffix the span name with SpannameSuffixDummy to denote that it is used only to hold a metric and isn't itself of any interest
span, ctx := opentracing.StartSpanFromContext(ctx, fieldname+SpannameSuffixDummy)
defer span.Finish()
// gauge metrics are actually float64; here we log that it should be increased by -1
span.LogFields(log.Float64(fieldname, -1.))
}
// IncrementCounter increments the specified counter metric
// It does this by logging an appropriate field value to a tracing span.
func IncrementCounter(ctx context.Context, metric string) {
// The field name we use is the specified metric name prepended with FieldnamePrefixCounter to designate that it is a Prometheus counter metric
// The collector will replace that prefix with "fn_" and use the result as the Prometheus metric name.
fieldname := FieldnamePrefixCounter + metric
// Spans are not processed by the collector until the span ends, so to prevent any delay
// in processing the stats when the current span is long-lived we create a new span for every call.
// suffix the span name with SpannameSuffixDummy to denote that it is used only to hold a metric and isn't itself of any interest
span, ctx := opentracing.StartSpanFromContext(ctx, fieldname+SpannameSuffixDummy)
defer span.Finish()
// counter metrics are actually float64; here we log that it should be increased by +1
span.LogFields(log.Float64(fieldname, 1.))
}
// If required, create a scalar version of PublishHistograms that publishes a single histogram metric
// PublishHistograms publishes the specifed histogram metrics
// It does this by logging appropriate field values to a tracing span
// Use this when the current tracing span is long-lived and you want the metric to be visible before it ends
func PublishHistograms(ctx context.Context, metrics map[string]float64) {
// Spans are not processed by the collector until the span ends, so to prevent any delay
// in processing the stats when the current span is long-lived we create a new span for every call.
// suffix the span name with SpannameSuffixDummy to denote that it is used only to hold a metric and isn't itself of any interest
span, ctx := opentracing.StartSpanFromContext(ctx, "histogram_metrics"+SpannameSuffixDummy)
defer span.Finish()
for key, value := range metrics {
// The field name we use is the metric name prepended with FieldnamePrefixHistogram to designate that it is a Prometheus histogram metric
// The collector will replace that prefix with "fn_" and use the result as the Prometheus metric name.
fieldname := FieldnamePrefixHistogram + key
span.LogFields(log.Float64(fieldname, value))
}
}
// PublishHistogram publishes the specifed histogram metric
// It does this by logging an appropriate field value to a tracing span
// Use this when the current tracing span is long-lived and you want the metric to be visible before it ends
func PublishHistogram(ctx context.Context, key string, value float64) {
// Spans are not processed by the collector until the span ends, so to prevent any delay
// in processing the stats when the current span is long-lived we create a new span for every call.
// suffix the span name with SpannameSuffixDummy to denote that it is used only to hold a metric and isn't itself of any interest
span, ctx := opentracing.StartSpanFromContext(ctx, "histogram_metrics"+SpannameSuffixDummy)
defer span.Finish()
// The field name we use is the metric name prepended with FieldnamePrefixHistogram to designate that it is a Prometheus histogram metric
// The collector will replace that prefix with "fn_" and use the result as the Prometheus metric name.
fieldname := FieldnamePrefixHistogram + key
span.LogFields(log.Float64(fieldname, value))
}
// PublishHistogramToSpan publishes the specifed histogram metric
// It does this by logging an appropriate field value to the specified tracing span
// Use this when you don't need to create a new tracing span
func PublishHistogramToSpan(span opentracing.Span, key string, value float64) {
// The field name we use is the metric name prepended with FieldnamePrefixHistogram to designate that it is a Prometheus histogram metric
// The collector will replace that prefix with "fn_" and use the result as the Prometheus metric name.
fieldname := FieldnamePrefixHistogram + key
span.LogFields(log.Float64(fieldname, value))
}
// PublishElapsedTimeToSpan publishes the specifed histogram elapsed time since start
// It does this by logging an appropriate field value to a tracing span
// Use this when the current tracing span is long-lived and you want the metric to be visible before it ends
func PublishElapsedTimeHistogram(ctx context.Context, key string, start, end time.Time) {
elapsed := float64(end.Sub(start).Seconds())
PublishHistogram(ctx, key, elapsed)
}
const (
// FnPrefix is a constant for "fn_", used as a prefix for span names, field names, Prometheus metric names and Prometheus label names
FnPrefix = "fn_"
// FieldnamePrefixHistogram is prefixed to the name of a logged field
// to denote that it corresponds to a histogram metric
FieldnamePrefixHistogram = FnPrefix + "histogram_"
// FieldnamePrefixCounter is prefixed to the name of a logged field
// to denote that it corresponds to a counter metric
FieldnamePrefixCounter = FnPrefix + "counter_"
// FieldnamePrefixGauge is prefixed to the name of a logged field
// to denote that it corresponds to a gauge metric
FieldnamePrefixGauge = FnPrefix + "gauge_"
// SpannameSuffixDummy is suffixed to the name of a tracing span
// to denote that it has been created solely for the purpose of carrying metric values
// and is not itself of any interest and should not be converted to a Prometheus duration metric
SpannameSuffixDummy = "_dummy"
)

View File

@@ -4,9 +4,10 @@ import (
"context"
"io"
"go.opencensus.io/trace"
"github.com/fnproject/fn/api/models"
"github.com/jmoiron/sqlx"
"github.com/opentracing/opentracing-go"
)
func MetricDS(ds models.Datastore) models.Datastore {
@@ -18,98 +19,98 @@ type metricds struct {
}
func (m *metricds) GetApp(ctx context.Context, appName string) (*models.App, error) {
span, ctx := opentracing.StartSpanFromContext(ctx, "ds_get_app")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "ds_get_app")
defer span.End()
return m.ds.GetApp(ctx, appName)
}
func (m *metricds) GetApps(ctx context.Context, filter *models.AppFilter) ([]*models.App, error) {
span, ctx := opentracing.StartSpanFromContext(ctx, "ds_get_apps")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "ds_get_apps")
defer span.End()
return m.ds.GetApps(ctx, filter)
}
func (m *metricds) InsertApp(ctx context.Context, app *models.App) (*models.App, error) {
span, ctx := opentracing.StartSpanFromContext(ctx, "ds_insert_app")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "ds_insert_app")
defer span.End()
return m.ds.InsertApp(ctx, app)
}
func (m *metricds) UpdateApp(ctx context.Context, app *models.App) (*models.App, error) {
span, ctx := opentracing.StartSpanFromContext(ctx, "ds_update_app")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "ds_update_app")
defer span.End()
return m.ds.UpdateApp(ctx, app)
}
func (m *metricds) RemoveApp(ctx context.Context, appName string) error {
span, ctx := opentracing.StartSpanFromContext(ctx, "ds_remove_app")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "ds_remove_app")
defer span.End()
return m.ds.RemoveApp(ctx, appName)
}
func (m *metricds) GetRoute(ctx context.Context, appName, routePath string) (*models.Route, error) {
span, ctx := opentracing.StartSpanFromContext(ctx, "ds_get_route")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "ds_get_route")
defer span.End()
return m.ds.GetRoute(ctx, appName, routePath)
}
func (m *metricds) GetRoutesByApp(ctx context.Context, appName string, filter *models.RouteFilter) (routes []*models.Route, err error) {
span, ctx := opentracing.StartSpanFromContext(ctx, "ds_get_routes_by_app")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "ds_get_routes_by_app")
defer span.End()
return m.ds.GetRoutesByApp(ctx, appName, filter)
}
func (m *metricds) InsertRoute(ctx context.Context, route *models.Route) (*models.Route, error) {
span, ctx := opentracing.StartSpanFromContext(ctx, "ds_insert_route")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "ds_insert_route")
defer span.End()
return m.ds.InsertRoute(ctx, route)
}
func (m *metricds) UpdateRoute(ctx context.Context, route *models.Route) (*models.Route, error) {
span, ctx := opentracing.StartSpanFromContext(ctx, "ds_update_route")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "ds_update_route")
defer span.End()
return m.ds.UpdateRoute(ctx, route)
}
func (m *metricds) RemoveRoute(ctx context.Context, appName, routePath string) error {
span, ctx := opentracing.StartSpanFromContext(ctx, "ds_remove_route")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "ds_remove_route")
defer span.End()
return m.ds.RemoveRoute(ctx, appName, routePath)
}
func (m *metricds) InsertCall(ctx context.Context, call *models.Call) error {
span, ctx := opentracing.StartSpanFromContext(ctx, "ds_insert_call")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "ds_insert_call")
defer span.End()
return m.ds.InsertCall(ctx, call)
}
func (m *metricds) UpdateCall(ctx context.Context, from *models.Call, to *models.Call) error {
span, ctx := opentracing.StartSpanFromContext(ctx, "ds_update_call")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "ds_update_call")
defer span.End()
return m.ds.UpdateCall(ctx, from, to)
}
func (m *metricds) GetCall(ctx context.Context, appName, callID string) (*models.Call, error) {
span, ctx := opentracing.StartSpanFromContext(ctx, "ds_get_call")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "ds_get_call")
defer span.End()
return m.ds.GetCall(ctx, appName, callID)
}
func (m *metricds) GetCalls(ctx context.Context, filter *models.CallFilter) ([]*models.Call, error) {
span, ctx := opentracing.StartSpanFromContext(ctx, "ds_get_calls")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "ds_get_calls")
defer span.End()
return m.ds.GetCalls(ctx, filter)
}
func (m *metricds) InsertLog(ctx context.Context, appName, callID string, callLog io.Reader) error {
span, ctx := opentracing.StartSpanFromContext(ctx, "ds_insert_log")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "ds_insert_log")
defer span.End()
return m.ds.InsertLog(ctx, appName, callID, callLog)
}
func (m *metricds) GetLog(ctx context.Context, appName, callID string) (io.Reader, error) {
span, ctx := opentracing.StartSpanFromContext(ctx, "ds_get_log")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "ds_get_log")
defer span.End()
return m.ds.GetLog(ctx, appName, callID)
}
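
For context, the wrapper pattern these hunks apply method-by-method is small enough to show in one piece. This is a minimal sketch, not taken verbatim from the repo: the metricds struct declaration, its ds field, and the package/interface names are assumptions (only the method bodies above come from the diff).

package datastore

import (
	"context"

	"github.com/fnproject/fn/api/models"
	"go.opencensus.io/trace"
)

// metricds decorates another datastore so that every call is timed by an
// opencensus span named after the operation (struct and field are assumed
// here; the real declaration sits outside this hunk).
type metricds struct {
	ds models.Datastore
}

// InsertApp starts a span, delegates to the wrapped datastore on the
// span-carrying context, and ends the span when the call returns.
func (m *metricds) InsertApp(ctx context.Context, app *models.App) (*models.App, error) {
	ctx, span := trace.StartSpan(ctx, "ds_insert_app")
	defer span.End()
	return m.ds.InsertApp(ctx, app)
}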

View File

@@ -6,6 +6,7 @@ import (
"testing"
"context"
"github.com/fnproject/fn/api/datastore/internal/datastoretest"
"github.com/fnproject/fn/api/datastore/internal/datastoreutil"
"github.com/fnproject/fn/api/models"

View File

@@ -5,6 +5,7 @@ import (
"net/url"
"context"
"github.com/fnproject/fn/api/common"
"github.com/fnproject/fn/api/datastore/sql"
"github.com/fnproject/fn/api/logs/s3"

View File

@@ -6,6 +6,7 @@ import (
"testing"
"context"
"github.com/fnproject/fn/api/datastore/sql"
logTesting "github.com/fnproject/fn/api/logs/testing"
)

View File

@@ -17,10 +17,12 @@ import (
"github.com/aws/aws-sdk-go/aws/session"
"github.com/aws/aws-sdk-go/service/s3"
"github.com/aws/aws-sdk-go/service/s3/s3manager"
"github.com/fnproject/fn/api/common"
"github.com/fnproject/fn/api/models"
"github.com/opentracing/opentracing-go"
"github.com/sirupsen/logrus"
"go.opencensus.io/stats"
"go.opencensus.io/stats/view"
"go.opencensus.io/tag"
"go.opencensus.io/trace"
)
// TODO we should encrypt these, user will have to supply a key though (or all
@@ -124,8 +126,8 @@ func path(appName, callID string) string {
}
func (s *store) InsertLog(ctx context.Context, appName, callID string, callLog io.Reader) error {
span, ctx := opentracing.StartSpanFromContext(ctx, "s3_insert_log")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "s3_insert_log")
defer span.End()
// wrap original reader in a decorator to keep track of read bytes without buffering
cr := &countingReader{r: callLog}
@@ -144,13 +146,13 @@ func (s *store) InsertLog(ctx context.Context, appName, callID string, callLog i
return fmt.Errorf("failed to write log, %v", err)
}
common.PublishHistogramToSpan(span, "s3_log_upload_size", float64(cr.count))
stats.Record(ctx, uploadSizeMeasure.M(int64(cr.count)))
return nil
}
func (s *store) GetLog(ctx context.Context, appName, callID string) (io.Reader, error) {
span, ctx := opentracing.StartSpanFromContext(ctx, "s3_get_log")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "s3_get_log")
defer span.End()
objectName := path(appName, callID)
logrus.WithFields(logrus.Fields{"bucketName": s.bucket, "key": objectName}).Debug("Downloading log")
@@ -169,6 +171,65 @@ func (s *store) GetLog(ctx context.Context, appName, callID string) (io.Reader,
return nil, fmt.Errorf("failed to read log, %v", err)
}
common.PublishHistogramToSpan(span, "s3_log_download_size", float64(size))
stats.Record(ctx, downloadSizeMeasure.M(size))
return bytes.NewReader(target.Bytes()), nil
}
var (
uploadSizeMeasure *stats.Int64Measure
downloadSizeMeasure *stats.Int64Measure
)
func init() {
// TODO(reed): do we have to do this? the measurements will be tagged on the context, but will they be propagated,
// or do we have to whitelist them in the view for them to show up? test...
var err error
appKey, err := tag.NewKey("fn_appname")
if err != nil {
logrus.Fatal(err)
}
pathKey, err := tag.NewKey("fn_path")
if err != nil {
logrus.Fatal(err)
}
{
uploadSizeMeasure, err = stats.Int64("s3_log_upload_size", "uploaded log size", "byte")
if err != nil {
logrus.Fatal(err)
}
v, err := view.New(
"s3_log_upload_size",
"uploaded log size",
[]tag.Key{appKey, pathKey},
uploadSizeMeasure,
view.DistributionAggregation{},
)
if err != nil {
logrus.Fatalf("cannot create view: %v", err)
}
if err := v.Subscribe(); err != nil {
logrus.Fatal(err)
}
}
{
downloadSizeMeasure, err = stats.Int64("s3_log_download_size", "downloaded log size", "byte")
if err != nil {
logrus.Fatal(err)
}
v, err := view.New(
"s3_log_download_size",
"downloaded log size",
[]tag.Key{appKey, pathKey},
downloadSizeMeasure,
view.DistributionAggregation{},
)
if err != nil {
logrus.Fatalf("cannot create view: %v", err)
}
if err := v.Subscribe(); err != nil {
logrus.Fatal(err)
}
}
}
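
A small, self-contained sketch of how these measures, views and tags fit together, using the same pre-release opencensus API this commit pins (stats.Int64 and view.New return errors, views are Subscribe'd): the view whitelists the fn_appname/fn_path tag keys, so a recorded value only carries those labels if the caller's context was tagged first. The measure name and tag values below are illustrative, not from the diff.

package main

import (
	"context"
	"log"

	"go.opencensus.io/stats"
	"go.opencensus.io/stats/view"
	"go.opencensus.io/tag"
)

func main() {
	appKey, err := tag.NewKey("fn_appname")
	if err != nil {
		log.Fatal(err)
	}
	pathKey, err := tag.NewKey("fn_path")
	if err != nil {
		log.Fatal(err)
	}

	// define a measure and subscribe a view that whitelists the two tag keys
	size, err := stats.Int64("example_size", "example payload size", "byte")
	if err != nil {
		log.Fatal(err)
	}
	v, err := view.New(
		"example_size",
		"example payload size",
		[]tag.Key{appKey, pathKey},
		size,
		view.DistributionAggregation{},
	)
	if err != nil {
		log.Fatal(err)
	}
	if err := v.Subscribe(); err != nil {
		log.Fatal(err)
	}

	// tag the context, then record; exporters see the value broken down by these tags
	ctx, err := tag.New(context.Background(),
		tag.Insert(appKey, "myapp"),
		tag.Insert(pathKey, "/hello"),
	)
	if err != nil {
		log.Fatal(err)
	}
	stats.Record(ctx, size.M(2048))
}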

View File

@@ -5,8 +5,9 @@ import (
"fmt"
"net/url"
"go.opencensus.io/trace"
"github.com/fnproject/fn/api/models"
"github.com/opentracing/opentracing-go"
"github.com/sirupsen/logrus"
)
@@ -43,19 +44,19 @@ type metricMQ struct {
}
func (m *metricMQ) Push(ctx context.Context, t *models.Call) (*models.Call, error) {
span, ctx := opentracing.StartSpanFromContext(ctx, "mq_push")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "mq_push")
defer span.End()
return m.mq.Push(ctx, t)
}
func (m *metricMQ) Reserve(ctx context.Context) (*models.Call, error) {
span, ctx := opentracing.StartSpanFromContext(ctx, "mq_reserve")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "mq_reserve")
defer span.End()
return m.mq.Reserve(ctx)
}
func (m *metricMQ) Delete(ctx context.Context, t *models.Call) error {
span, ctx := opentracing.StartSpanFromContext(ctx, "mq_delete")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "mq_delete")
defer span.End()
return m.mq.Delete(ctx, t)
}

View File

@@ -6,10 +6,11 @@ import (
"net/http"
"errors"
"strings"
"github.com/fnproject/fn/api"
"github.com/fnproject/fn/api/models"
"github.com/gin-gonic/gin"
"strings"
)
// note: for backward compatibility, will go away later

View File

@@ -1,60 +0,0 @@
package server
import (
"github.com/opentracing/opentracing-go"
"strings"
)
// FnTracer is a custom Tracer which wraps another tracer
// its main purpose is to wrap the underlying Span in a FnSpan,
// which adds some extra behaviour required for sending tracing spans to prometheus
type FnTracer struct {
opentracing.Tracer
}
// NewFnTracer returns a new FnTracer which wraps the specified Tracer
func NewFnTracer(t opentracing.Tracer) opentracing.Tracer {
return &FnTracer{t}
}
// FnTracer implements opentracing.Tracer
// Override StartSpan to wrap the returned Span in a FnSpan
func (fnt FnTracer) StartSpan(operationName string, opts ...opentracing.StartSpanOption) opentracing.Span {
return NewFnSpan(fnt.Tracer.StartSpan(operationName, opts...))
}
// FnSpan is a custom Span that wraps another span,
// adding some extra behaviour required for sending tracing spans to prometheus
type FnSpan struct {
opentracing.Span
}
// NewFnSpan returns a new FnSpan which wraps the specified Span
func NewFnSpan(s opentracing.Span) opentracing.Span {
return &FnSpan{s}
}
// FnSpan implements opentracing.Span
func (fns FnSpan) Finish() {
fns.copyBaggageItemsToTags()
fns.Span.Finish()
}
// FnSpan implements opentracing.Span
func (fns FnSpan) FinishWithOptions(opts opentracing.FinishOptions) {
fns.copyBaggageItemsToTags()
fns.Span.FinishWithOptions(opts)
}
func (fns FnSpan) copyBaggageItemsToTags() {
// copy baggage items (which are inherited from the parent) with keys starting with "fn" to tags
// the PrometheusCollector will send these to Prometheus
// need to do this because the collector can't access baggage items, but it can access tags
// whereas here we can access the parent's baggage items, but not its tags
fns.Context().ForeachBaggageItem(func(k, v string) bool {
if strings.HasPrefix(k, "fn") {
fns.SetTag(k, v)
}
return true
})
}

View File

@@ -13,9 +13,9 @@ import (
"github.com/fnproject/fn/fnext"
"github.com/gin-contrib/cors"
"github.com/gin-gonic/gin"
opentracing "github.com/opentracing/opentracing-go"
"github.com/opentracing/opentracing-go/ext"
"github.com/sirupsen/logrus"
"go.opencensus.io/tag"
"go.opencensus.io/trace"
)
func optionalCorsWrap(r *gin.Engine) {
@@ -37,20 +37,28 @@ func optionalCorsWrap(r *gin.Engine) {
// we should use http grr
func traceWrap(c *gin.Context) {
// try to grab a span from the request if made from another service, ignore err if not
wireContext, _ := opentracing.GlobalTracer().Extract(
opentracing.HTTPHeaders,
opentracing.HTTPHeadersCarrier(c.Request.Header))
appKey, err := tag.NewKey("fn_appname")
if err != nil {
logrus.Fatal(err)
}
pathKey, err := tag.NewKey("fn_path")
if err != nil {
logrus.Fatal(err)
}
ctx, err := tag.New(c.Request.Context(),
tag.Insert(appKey, c.Param(api.CApp)),
tag.Insert(pathKey, c.Param(api.CRoute)),
)
if err != nil {
logrus.Fatal(err)
}
// Create the span referring to the RPC client if available.
// If wireContext == nil, a root span will be created.
// TODO we should add more tags?
serverSpan := opentracing.StartSpan("serve_http", ext.RPCServerOption(wireContext), opentracing.Tag{Key: "path", Value: c.Request.URL.Path})
serverSpan.SetBaggageItem("fn_appname", c.Param(api.CApp))
serverSpan.SetBaggageItem("fn_path", c.Param(api.CRoute))
defer serverSpan.Finish()
// TODO inspect opencensus more and see if we need to define a header ourselves
// to trigger per-request spans (we will want this); we can set the sampler here per request.
ctx, serverSpan := trace.StartSpan(ctx, "serve_http")
defer serverSpan.End()
ctx := opentracing.ContextWithSpan(c.Request.Context(), serverSpan)
c.Request = c.Request.WithContext(ctx)
c.Next()
}
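
To make the intent of the new middleware concrete, here is a hypothetical downstream handler (not part of the diff): because traceWrap stores the tagged, span-carrying context back onto the request, anything started from c.Request.Context() becomes a child of serve_http and inherits the fn_appname/fn_path tags inserted above.

package server

import (
	"github.com/gin-gonic/gin"
	"go.opencensus.io/trace"
)

// exampleHandler is illustrative only: it runs after traceWrap in the chain.
func exampleHandler(c *gin.Context) {
	// child span of serve_http, carrying the request's fn tags
	ctx, span := trace.StartSpan(c.Request.Context(), "example_work")
	defer span.End()

	_ = ctx // hand ctx to the agent/datastore so their spans and stats nest under this request
}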

View File

@@ -1,327 +0,0 @@
package server
import (
"github.com/fnproject/fn/api/common"
"github.com/openzipkin/zipkin-go-opentracing"
"github.com/openzipkin/zipkin-go-opentracing/thrift/gen-go/zipkincore"
"github.com/prometheus/client_golang/prometheus"
"strconv"
"strings"
"sync"
"time"
)
// PrometheusCollector is a custom Collector
// which sends ZipKin traces to Prometheus
type PrometheusCollector struct {
lock sync.Mutex
// Each span name is published as a separate Histogram metric
// Using metric names of the form fn_span_<span-name>_duration_seconds
// In this map, the key is the name of a tracing span,
// and the corresponding value is a HistogramVec metric used to report the duration of spans with this name to Prometheus
histogramVecMap map[string]*prometheus.HistogramVec
// In this map, the key is the name of a counter metric logged on a tracing span,
// and the corresponding value is a CounterVec metric used to report that metric's values to Prometheus
counterVecMap map[string]*prometheus.CounterVec
// In this map, the key is the name of a gauge metric logged on a tracing span,
// and the corresponding value is a GaugeVec metric used to report that metric's values to Prometheus
gaugeVecMap map[string]*prometheus.GaugeVec
// In this map, the key is the name of a metric,
// and the corresponding value is an array containing the label keys that were specified when that metric was first created
registeredLabelKeysMap map[string][]string
}
// NewPrometheusCollector returns a new PrometheusCollector
func NewPrometheusCollector() (zipkintracer.Collector, error) {
pc := &PrometheusCollector{
histogramVecMap: make(map[string]*prometheus.HistogramVec),
counterVecMap: make(map[string]*prometheus.CounterVec),
gaugeVecMap: make(map[string]*prometheus.GaugeVec),
registeredLabelKeysMap: make(map[string][]string),
}
return pc, nil
}
// PrometheusCollector implements Collector.
func (pc *PrometheusCollector) Collect(span *zipkincore.Span) error {
spanName := span.GetName()
// extract any label values from the span
labelKeysFromSpan, labelValuesFromSpan := getLabels(span)
// report the duration of this span as a histogram
// (unless the span name ends with SpannameSuffixDummy to denote it as being purely the carrier of a metric value and so of no interest in itself)
if !strings.HasSuffix(spanName, common.SpannameSuffixDummy) {
// get the HistogramVec for this span name
histogramVec, labelValuesToUse := pc.getHistogramVec(
("fn_span_" + spanName + "_duration_seconds"), ("Span " + spanName + " duration, by span name"), labelKeysFromSpan, labelValuesFromSpan)
// now report the span duration value
histogramVec.With(labelValuesToUse).Observe((time.Duration(span.GetDuration()) * time.Microsecond).Seconds())
}
// now extract any logged histogram metric values from the span
for key, value := range getLoggedHistogramMetrics(span) {
// get the HistogramVec for this metric
thisMetricHistogramVec, labelValuesToUse := pc.getHistogramVec(
key, ("Metric " + key), labelKeysFromSpan, labelValuesFromSpan)
// now report the metric value
thisMetricHistogramVec.With(labelValuesToUse).Observe(value)
}
// now extract any logged counter metric values from the span
for key, value := range getLoggedCounterMetrics(span) {
// get the CounterVec for this metric
thisMetricCounterVec, labelValuesToUse := pc.getCounterVec(
key, ("Metric " + key), labelKeysFromSpan, labelValuesFromSpan)
// now report the metric value
thisMetricCounterVec.With(labelValuesToUse).Add(value)
}
// now extract any logged gauge metric values from the span
for key, value := range getLoggedGaugeMetrics(span) {
// get the GaugeVec for this metric
thisMetricGaugeVec, labelValuesToUse := pc.getGaugeVec(
key, ("Metric " + key), labelKeysFromSpan, labelValuesFromSpan)
// now report the metric value
thisMetricGaugeVec.With(labelValuesToUse).Add(value)
}
return nil
}
// Return (and create, if necessary) a HistogramVec for the specified Prometheus metric
func (pc *PrometheusCollector) getHistogramVec(
metricName string, metricHelp string, labelKeysFromSpan []string, labelValuesFromSpan map[string]string) (
*prometheus.HistogramVec, map[string]string) {
var labelValuesToUse map[string]string
pc.lock.Lock()
defer pc.lock.Unlock()
histogramVec, found := pc.histogramVecMap[metricName]
if !found {
// create a new HistogramVec
histogramVec = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: metricName,
Help: metricHelp,
},
labelKeysFromSpan,
)
pc.histogramVecMap[metricName] = histogramVec
pc.registeredLabelKeysMap[metricName] = labelKeysFromSpan
prometheus.MustRegister(histogramVec)
labelValuesToUse = labelValuesFromSpan
} else {
// found an existing HistogramVec
// need to be careful here, since we must supply the same label keys as when we first created the metric
// otherwise we will get an "inconsistent label cardinality" panic
// that's why we saved the original label keys in the registeredLabelKeysMap map
// so we can use that to construct a map of label key/value pairs to set on the metric
labelValuesToUse = make(map[string]string)
for _, thisRegisteredLabelKey := range pc.registeredLabelKeysMap[metricName] {
if value, found := labelValuesFromSpan[thisRegisteredLabelKey]; found {
labelValuesToUse[thisRegisteredLabelKey] = value
} else {
labelValuesToUse[thisRegisteredLabelKey] = ""
}
}
}
return histogramVec, labelValuesToUse
}
// Return (and create, if necessary) a CounterVec for the specified Prometheus metric
func (pc *PrometheusCollector) getCounterVec(
metricName string, metricHelp string, labelKeysFromSpan []string, labelValuesFromSpan map[string]string) (
*prometheus.CounterVec, map[string]string) {
var labelValuesToUse map[string]string
pc.lock.Lock()
defer pc.lock.Unlock()
counterVec, found := pc.counterVecMap[metricName]
if !found {
// create a new CounterVec
counterVec = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: metricName,
Help: metricHelp,
},
labelKeysFromSpan,
)
pc.counterVecMap[metricName] = counterVec
pc.registeredLabelKeysMap[metricName] = labelKeysFromSpan
prometheus.MustRegister(counterVec)
labelValuesToUse = labelValuesFromSpan
} else {
// found an existing CounterVec
// need to be careful here, since we must supply the same label keys as when we first created the metric
// otherwise we will get an "inconsistent label cardinality" panic
// that's why we saved the original label keys in the registeredLabelKeysMap map
// so we can use that to construct a map of label key/value pairs to set on the metric
labelValuesToUse = make(map[string]string)
for _, thisRegisteredLabelKey := range pc.registeredLabelKeysMap[metricName] {
if value, found := labelValuesFromSpan[thisRegisteredLabelKey]; found {
labelValuesToUse[thisRegisteredLabelKey] = value
} else {
labelValuesToUse[thisRegisteredLabelKey] = ""
}
}
}
return counterVec, labelValuesToUse
}
// Return (and create, if necessary) a GaugeVec for the specified Prometheus metric
func (pc *PrometheusCollector) getGaugeVec(
metricName string, metricHelp string, labelKeysFromSpan []string, labelValuesFromSpan map[string]string) (
*prometheus.GaugeVec, map[string]string) {
var labelValuesToUse map[string]string
pc.lock.Lock()
defer pc.lock.Unlock()
gaugeVec, found := pc.gaugeVecMap[metricName]
if !found {
// create a new GaugeVec
gaugeVec = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: metricName,
Help: metricHelp,
},
labelKeysFromSpan,
)
pc.gaugeVecMap[metricName] = gaugeVec
pc.registeredLabelKeysMap[metricName] = labelKeysFromSpan
prometheus.MustRegister(gaugeVec)
labelValuesToUse = labelValuesFromSpan
} else {
// found an existing GaugeVec
// need to be careful here, since we must supply the same label keys as when we first created the metric
// otherwise we will get an "inconsistent label cardinality" panic
// that's why we saved the original label keys in the registeredLabelKeysMap map
// so we can use that to construct a map of label key/value pairs to set on the metric
labelValuesToUse = make(map[string]string)
for _, thisRegisteredLabelKey := range pc.registeredLabelKeysMap[metricName] {
if value, found := labelValuesFromSpan[thisRegisteredLabelKey]; found {
labelValuesToUse[thisRegisteredLabelKey] = value
} else {
labelValuesToUse[thisRegisteredLabelKey] = ""
}
}
}
return gaugeVec, labelValuesToUse
}
// extract from the specified span the key/value pairs that we want to add as labels to the Prometheus metric for this span
// returns an array of keys, and a map of key-value pairs
func getLabels(span *zipkincore.Span) ([]string, map[string]string) {
var keys []string
labelMap := make(map[string]string)
// extract any tags whose key starts with "fn" from the span
binaryAnnotations := span.GetBinaryAnnotations()
for _, thisBinaryAnnotation := range binaryAnnotations {
key := thisBinaryAnnotation.GetKey()
if thisBinaryAnnotation.GetAnnotationType() == zipkincore.AnnotationType_STRING && strings.HasPrefix(key, "fn") {
keys = append(keys, key)
value := string(thisBinaryAnnotation.GetValue()[:])
labelMap[key] = value
}
}
return keys, labelMap
}
// extract from the span the logged histogram metric values.
// These are the ones whose names start with FieldnamePrefixHistogram,
// and whose values we assume are float64
func getLoggedHistogramMetrics(span *zipkincore.Span) map[string]float64 {
keyValueMap := make(map[string]float64)
// extract any annotations whose Value starts with FieldnamePrefixHistogram
annotations := span.GetAnnotations()
for _, thisAnnotation := range annotations {
if strings.HasPrefix(thisAnnotation.GetValue(), common.FieldnamePrefixHistogram) {
keyvalue := strings.Split(thisAnnotation.GetValue(), "=")
if len(keyvalue) == 2 {
if value, err := strconv.ParseFloat(keyvalue[1], 64); err == nil {
key := strings.TrimSpace(keyvalue[0])
key = common.FnPrefix + key[len(common.FieldnamePrefixHistogram):] // strip off fieldname prefix and then prepend "fn_" to the front
keyValueMap[key] = value
}
}
}
}
return keyValueMap
}
// extract from the span the logged counter metric values.
// These are the ones whose names start with FieldnamePrefixCounter,
// and whose values we assume are float64
func getLoggedCounterMetrics(span *zipkincore.Span) map[string]float64 {
keyValueMap := make(map[string]float64)
// extract any annotations whose Value starts with FieldnamePrefixCounter
annotations := span.GetAnnotations()
for _, thisAnnotation := range annotations {
if strings.HasPrefix(thisAnnotation.GetValue(), common.FieldnamePrefixCounter) {
keyvalue := strings.Split(thisAnnotation.GetValue(), "=")
if len(keyvalue) == 2 {
if value, err := strconv.ParseFloat(keyvalue[1], 64); err == nil {
key := strings.TrimSpace(keyvalue[0])
key = common.FnPrefix + key[len(common.FieldnamePrefixCounter):] // strip off fieldname prefix and then prepend "fn_" to the front
keyValueMap[key] = value
}
}
}
}
return keyValueMap
}
// extract from the span the logged gauge metric values.
// These are the ones whose names start with FieldnamePrefixGauge,
// and whose values we assume are float64
func getLoggedGaugeMetrics(span *zipkincore.Span) map[string]float64 {
keyValueMap := make(map[string]float64)
// extract any annotations whose Value starts with FieldnamePrefixGauge
annotations := span.GetAnnotations()
for _, thisAnnotation := range annotations {
if strings.HasPrefix(thisAnnotation.GetValue(), common.FieldnamePrefixGauge) {
keyvalue := strings.Split(thisAnnotation.GetValue(), "=")
if len(keyvalue) == 2 {
if value, err := strconv.ParseFloat(keyvalue[1], 64); err == nil {
key := strings.TrimSpace(keyvalue[0])
key = common.FnPrefix + key[len(common.FieldnamePrefixGauge):] // strip off fieldname prefix and then prepend "fn_" to the front
keyValueMap[key] = value
}
}
}
}
return keyValueMap
}
// PrometheusCollector implements Collector.
func (*PrometheusCollector) Close() error { return nil }

View File

@@ -1,9 +0,0 @@
package server
import (
"github.com/gin-gonic/gin"
)
func (s *Server) handlePrometheusMetrics(c *gin.Context) {
s.agent.PromHandler().ServeHTTP(c.Writer, c.Request)
}

View File

@@ -24,9 +24,13 @@ import (
"github.com/fnproject/fn/api/version"
"github.com/fnproject/fn/fnext"
"github.com/gin-gonic/gin"
opentracing "github.com/opentracing/opentracing-go"
zipkintracer "github.com/openzipkin/zipkin-go-opentracing"
zipkinhttp "github.com/openzipkin/zipkin-go/reporter/http"
"github.com/sirupsen/logrus"
"go.opencensus.io/exporter/prometheus"
"go.opencensus.io/exporter/zipkin"
"go.opencensus.io/plugin/ochttp"
"go.opencensus.io/stats/view"
"go.opencensus.io/trace"
)
const (
@@ -79,6 +83,7 @@ type Server struct {
appListeners *appListeners
rootMiddlewares []fnext.Middleware
apiMiddlewares []fnext.Middleware
promExporter *prometheus.Exporter
}
func nodeTypeFromString(value string) ServerNodeType {
@@ -226,8 +231,8 @@ func WithAgent(agent agent.Agent) ServerOption {
// New creates a new Functions server with the opts given. For convenience, users may
// prefer to use NewFromEnv but New is more flexible if needed.
func New(ctx context.Context, opts ...ServerOption) *Server {
span, ctx := opentracing.StartSpanFromContext(ctx, "server_init")
defer span.Finish()
ctx, span := trace.StartSpan(ctx, "server_init")
defer span.End()
log := common.Logger(ctx)
s := &Server{
@@ -290,50 +295,35 @@ func New(ctx context.Context, opts ...ServerOption) *Server {
func WithTracer(zipkinURL string) ServerOption {
return func(ctx context.Context, s *Server) error {
var (
debugMode = false
serviceName = "fnserver"
serviceHostPort = "localhost:8080" // meh
// TODO add server identifier to this crap
//debugMode = false
//serviceName = "fnserver"
//serviceHostPort = "localhost:8080" // meh
zipkinHTTPEndpoint = zipkinURL
// ex: "http://zipkin:9411/api/v1/spans"
// ex: "http://zipkin:9411/api/v2/spans"
)
var collector zipkintracer.Collector
// custom Zipkin collector to send tracing spans to Prometheus
promCollector, promErr := NewPrometheusCollector()
if promErr != nil {
logrus.WithError(promErr).Fatalln("couldn't start Prometheus trace collector")
}
logger := zipkintracer.LoggerFunc(func(i ...interface{}) error { logrus.Error(i...); return nil })
if zipkinHTTPEndpoint != "" {
// Custom PrometheusCollector and Zipkin HTTPCollector
httpCollector, zipErr := zipkintracer.NewHTTPCollector(zipkinHTTPEndpoint,
zipkintracer.HTTPLogger(logger), zipkintracer.HTTPMaxBacklog(1000),
)
if zipErr != nil {
logrus.WithError(zipErr).Fatalln("couldn't start Zipkin trace collector")
}
collector = zipkintracer.MultiCollector{httpCollector, promCollector}
} else {
// Custom PrometheusCollector only
collector = promCollector
reporter := zipkinhttp.NewReporter(zipkinURL, zipkinhttp.MaxBacklog(10000))
exporter := zipkin.NewExporter(reporter, nil)
trace.RegisterExporter(exporter)
logrus.WithFields(logrus.Fields{"url": zipkinHTTPEndpoint}).Info("exporting spans to zipkin")
// TODO don't do this. testing parity.
trace.SetDefaultSampler(trace.AlwaysSample())
}
ziptracer, err := zipkintracer.NewTracer(zipkintracer.NewRecorder(collector, debugMode, serviceHostPort, serviceName),
zipkintracer.ClientServerSameSpan(true),
zipkintracer.TraceID128Bit(true),
)
// TODO we can keep this on *Server and unregister it in Close()... can finagle later. same for tracer
exporter, err := prometheus.NewExporter(prometheus.Options{
Namespace: "fn",
OnError: func(err error) { logrus.WithError(err).Error("opencensus prometheus exporter err") },
})
if err != nil {
logrus.WithError(err).Fatalln("couldn't start tracer")
logrus.Fatal(err)
}
s.promExporter = exporter
view.RegisterExporter(exporter)
// wrap the Zipkin tracer in a FnTracer which will also send spans to Prometheus
fntracer := NewFnTracer(ziptracer)
opentracing.SetGlobalTracer(fntracer)
logrus.WithFields(logrus.Fields{"url": zipkinHTTPEndpoint}).Info("started tracer")
return nil
}
}
@@ -404,7 +394,8 @@ func (s *Server) startGears(ctx context.Context, cancel context.CancelFunc) {
server := http.Server{
Addr: listen,
Handler: s.Router,
Handler: &ochttp.Handler{Handler: s.Router},
// TODO we should set read/write timeouts
}
@@ -438,8 +429,11 @@ func (s *Server) bindHandlers(ctx context.Context) {
engine.GET("/", handlePing)
engine.GET("/version", handleVersion)
// TODO: move the following under v1
engine.GET("/metrics", s.handlePrometheusMetrics)
// TODO: move under v1 ?
if s.promExporter != nil {
engine.GET("/metrics", gin.WrapH(s.promExporter))
}
profilerSetup(engine, "/debug")
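
A rough wiring sketch to tie the pieces together (the import path and usage below are assumptions; only New and WithTracer come from this diff): WithTracer registers the zipkin trace exporter when a URL is supplied and always registers the prometheus stats exporter, which bindHandlers then serves at /metrics via gin.WrapH.

package main

import (
	"context"

	"github.com/fnproject/fn/api/server" // assumed import path
)

func main() {
	ctx := context.Background()
	// WithTracer wires up both exporters; an empty URL would skip zipkin
	srv := server.New(ctx, server.WithTracer("http://zipkin:9411/api/v2/spans"))
	_ = srv // once the server is started, GET /metrics exposes the opencensus prometheus exporter
}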