opentracing -> opencensus (#802)
* update vendor directory, add go.opencensus.io
* update imports
* oops
* s/opentracing/opencensus/ & remove prometheus / zipkin stuff & remove old stats
* the dep train rides again
* fix gin build
* deps from last guy
* start in on the agent metrics
* she builds
* remove tags for now, cardinality error is fussing. subscribe instead of register
* update to patched version of opencensus to proceed for now TODO switch to a release
* meh fix imports
* println debug the bad boys
* lace it with the tags
* update deps again
* fix all inconsistent cardinality errors
* add our own logger
* fix init
* fix oom measure
* remove bugged removal code
* fix s3 measures
* fix prom handler nil
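The change is mechanical throughout: every `opentracing.StartSpanFromContext` / `span.Finish()` pair becomes `trace.StartSpan` / `span.End()`, and the span-logging metric helpers become opencensus measures recorded against a tagged context. A minimal sketch of the pattern before the diff (illustrative names only; the measure/view calls follow the patched opencensus version vendored by this commit, which predates its released API):

```go
package example

import (
    "context"

    "github.com/sirupsen/logrus"
    "go.opencensus.io/stats"
    "go.opencensus.io/stats/view"
    "go.opencensus.io/tag"
    "go.opencensus.io/trace"
)

// exampleMeasure is a hypothetical counter, set up the same way the agent and
// docker measures in this diff are: create the measure, wrap it in a view,
// and subscribe the view so recorded values are exported.
var exampleMeasure *stats.Int64Measure

func init() {
    appKey, err := tag.NewKey("fn_appname")
    if err != nil {
        logrus.Fatal(err)
    }
    exampleMeasure, err = stats.Int64("example_count", "an example counter", "")
    if err != nil {
        logrus.Fatal(err)
    }
    v, err := view.New("example_count", "an example counter",
        []tag.Key{appKey}, exampleMeasure, view.SumAggregation{})
    if err != nil {
        logrus.Fatalf("cannot create view: %v", err)
    }
    // subscribe instead of register, per the commit notes, to sidestep the
    // inconsistent-cardinality errors
    if err := v.Subscribe(); err != nil {
        logrus.Fatal(err)
    }
}

func doWork(ctx context.Context) {
    // opencensus returns the derived context first and spans end with End();
    // the opentracing equivalent this replaces was:
    //   span, ctx := opentracing.StartSpanFromContext(ctx, "do_work")
    //   defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "do_work")
    defer span.End()

    // metrics are recorded against the (tagged) context rather than logged
    // onto dummy spans as the removed common.IncrementCounter helpers did
    stats.Record(ctx, exampleMeasure.M(1))
}
```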
@@ -3,7 +3,6 @@ package agent
import (
    "context"
    "io"
    "net/http"
    "strings"
    "sync"
    "time"
@@ -16,9 +15,11 @@ import (
    "github.com/fnproject/fn/api/models"
    "github.com/fnproject/fn/fnext"
    "github.com/go-openapi/strfmt"
    "github.com/opentracing/opentracing-go"
    "github.com/prometheus/client_golang/prometheus/promhttp"
    "github.com/sirupsen/logrus"
    "go.opencensus.io/stats"
    "go.opencensus.io/stats/view"
    "go.opencensus.io/tag"
    "go.opencensus.io/trace"
)

// TODO we should prob store async calls in db immediately since we're returning id (will 404 until post-execution)

@@ -85,8 +86,6 @@ type Agent interface {
    // Close is not safe to be called from multiple threads.
    io.Closer

    // Return the http.Handler used to handle Prometheus metric requests
    PromHandler() http.Handler
    AddCallListener(fnext.CallListener)

    // Enqueue is to use the agent's sweet sweet client bindings to remotely
@@ -109,9 +108,6 @@ type agent struct {
    wg sync.WaitGroup // TODO rename
    shutonce sync.Once
    shutdown chan struct{}

    // Prometheus HTTP handler
    promHandler http.Handler
}

func New(da DataAccess) Agent {
@@ -128,13 +124,12 @@ func New(da DataAccess) Agent {
    })

    a := &agent{
        cfg: *cfg,
        da: da,
        driver: driver,
        slotMgr: NewSlotQueueMgr(),
        resources: NewResourceTracker(),
        shutdown: make(chan struct{}),
        promHandler: promhttp.Handler(),
        cfg: *cfg,
        da: da,
        driver: driver,
        slotMgr: NewSlotQueueMgr(),
        resources: NewResourceTracker(),
        shutdown: make(chan struct{}),
    }

    // TODO assert that agent doesn't get started for API nodes up above ?
@@ -174,8 +169,8 @@ func (a *agent) Submit(callI Call) error {
    call.req = call.req.WithContext(ctx)
    defer cancel()

    ctx, finish := statSpans(ctx, call)
    defer finish()
    ctx, span := trace.StartSpan(ctx, "agent_submit")
    defer span.End()

    err := a.submit(ctx, call)
    return err
@@ -202,14 +197,15 @@ func (a *agent) endStateTrackers(ctx context.Context, call *call) {
}

func (a *agent) submit(ctx context.Context, call *call) error {
    StatsEnqueue(ctx)
    statsEnqueue(ctx)

    // TODO can we replace state trackers with metrics?
    a.startStateTrackers(ctx, call)
    defer a.endStateTrackers(ctx, call)

    slot, err := a.getSlot(ctx, call)
    if err != nil {
        a.handleStatsDequeue(ctx, call, err)
        handleStatsDequeue(ctx, err)
        return transformTimeout(err, true)
    }

@@ -217,20 +213,19 @@ func (a *agent) submit(ctx context.Context, call *call) error {

    err = call.Start(ctx)
    if err != nil {
        a.handleStatsDequeue(ctx, call, err)
        handleStatsDequeue(ctx, err)
        return transformTimeout(err, true)
    }

    // decrement queued count, increment running count
    StatsDequeueAndStart(ctx)
    statsDequeueAndStart(ctx)

    // pass this error (nil or otherwise) to end directly, to store status, etc
    err = slot.exec(ctx, call)
    a.handleStatsEnd(ctx, call, err)
    handleStatsEnd(ctx, err)

    // TODO: we need to allocate more time to store the call + logs in case the call timed out,
    // but this could put us over the timeout if the call did not reply yet (need better policy).
    ctx = opentracing.ContextWithSpan(context.Background(), opentracing.SpanFromContext(ctx))
    ctx = common.BackgroundContext(ctx)
    err = call.End(ctx, err)
    return transformTimeout(err, false)
}

@@ -247,54 +242,33 @@ func transformTimeout(e error, isRetriable bool) error {

// handleStatsDequeue handles stats for dequeuing for early exit (getSlot or Start)
// cases. Only timeouts can be a simple dequeue while other cases are actual errors.
func (a *agent) handleStatsDequeue(ctx context.Context, call *call, err error) {
func handleStatsDequeue(ctx context.Context, err error) {
    if err == context.DeadlineExceeded {
        StatsDequeue(ctx)
        StatsIncrementTooBusy(ctx)
        statsDequeue(ctx)
        statsTooBusy(ctx)
    } else {
        StatsDequeueAndFail(ctx)
        StatsIncrementErrors(ctx)
        statsDequeueAndFail(ctx)
        statsErrors(ctx)
    }
}

// handleStatsEnd handles stats for after a call is run, depending on error.
func (a *agent) handleStatsEnd(ctx context.Context, call *call, err error) {
func handleStatsEnd(ctx context.Context, err error) {
    if err == nil {
        // decrement running count, increment completed count
        StatsComplete(ctx)
        statsComplete(ctx)
    } else {
        // decrement running count, increment failed count
        StatsFailed(ctx)
        statsFailed(ctx)
        // increment the timeout or errors count, as appropriate
        if err == context.DeadlineExceeded {
            StatsIncrementTimedout(ctx)
            statsTimedout(ctx)
        } else {
            StatsIncrementErrors(ctx)
            statsErrors(ctx)
        }
    }
}

func statSpans(ctx context.Context, call *call) (_ context.Context, finish func()) {
    // agent_submit_global has no parent span because we don't want it to inherit fn_appname or fn_path
    spanGlobal := opentracing.StartSpan("agent_submit_global")

    // agent_submit_app has no parent span because we don't want it to inherit fn_path
    spanApp := opentracing.StartSpan("agent_submit_app")
    spanApp.SetBaggageItem("fn_appname", call.AppName)

    // agent_submit has a parent span in the usual way
    // it doesn't matter if it inherits fn_appname or fn_path (and we set them here in any case)
    span, ctx := opentracing.StartSpanFromContext(ctx, "agent_submit")
    span.SetBaggageItem("fn_appname", call.AppName)
    span.SetBaggageItem("fn_path", call.Path)

    return ctx, func() {
        spanGlobal.Finish()
        spanApp.Finish()
        span.Finish()
    }
}

// getSlot returns a Slot (or error) for the request to run. Depending on hot/cold
// request type, this may launch a new container or wait for other containers to become idle
// or it may wait for resources to become available to launch a new container.
@@ -303,8 +277,8 @@ func (a *agent) getSlot(ctx context.Context, call *call) (Slot, error) {
    ctx, cancel := context.WithDeadline(ctx, call.slotDeadline)
    defer cancel()

    span, ctx := opentracing.StartSpanFromContext(ctx, "agent_get_slot")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "agent_get_slot")
    defer span.End()

    if protocol.IsStreamable(protocol.Protocol(call.Format)) {
        // For hot requests, we use a long lived slot queue, which we use to manage hot containers
@@ -340,9 +314,9 @@ func (a *agent) hotLauncher(ctx context.Context, call *call) {

    // IMPORTANT: get a context that has a child span / logger but NO timeout
    // TODO this is a 'FollowsFrom'
    ctx = opentracing.ContextWithSpan(common.WithLogger(context.Background(), logger), opentracing.SpanFromContext(ctx))
    span, ctx := opentracing.StartSpanFromContext(ctx, "agent_hot_launcher")
    defer span.Finish()
    ctx = common.BackgroundContext(ctx)
    ctx, span := trace.StartSpan(ctx, "agent_hot_launcher")
    defer span.End()

    for {
        ctx, cancel := context.WithTimeout(ctx, timeout)
@@ -395,8 +369,8 @@ func (a *agent) checkLaunch(ctx context.Context, call *call) {

// waitHot pings and waits for a hot container from the slot queue
func (a *agent) waitHot(ctx context.Context, call *call) (Slot, error) {
    span, ctx := opentracing.StartSpanFromContext(ctx, "agent_wait_hot")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "agent_wait_hot")
    defer span.End()

    ctx, cancel := context.WithCancel(ctx)
    defer cancel() // shut down dequeuer if we grab a slot
@@ -442,8 +416,8 @@ func (a *agent) launchCold(ctx context.Context, call *call) (Slot, error) {
    isAsync := call.Type == models.TypeAsync
    ch := make(chan Slot)

    span, ctx := opentracing.StartSpanFromContext(ctx, "agent_launch_cold")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "agent_launch_cold")
    defer span.End()

    call.containerState.UpdateState(ctx, ContainerStateWait, call.slots)

@@ -479,8 +453,8 @@ func (s *coldSlot) Error() error {
}

func (s *coldSlot) exec(ctx context.Context, call *call) error {
    span, ctx := opentracing.StartSpanFromContext(ctx, "agent_cold_exec")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "agent_cold_exec")
    defer span.End()

    call.requestState.UpdateState(ctx, RequestStateExec, call.slots)
    call.containerState.UpdateState(ctx, ContainerStateBusy, call.slots)
@@ -507,7 +481,7 @@ func (s *coldSlot) Close(ctx context.Context) error {
        // call this from here so that in exec we don't have to eat container
        // removal latency
        // NOTE ensure container removal, no ctx timeout
        ctx = opentracing.ContextWithSpan(context.Background(), opentracing.SpanFromContext(ctx))
        ctx = common.BackgroundContext(ctx)
        s.cookie.Close(ctx)
    }
    if s.tok != nil {
@@ -535,8 +509,8 @@ func (s *hotSlot) Error() error {
}

func (s *hotSlot) exec(ctx context.Context, call *call) error {
    span, ctx := opentracing.StartSpanFromContext(ctx, "agent_hot_exec")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "agent_hot_exec")
    defer span.End()

    call.requestState.UpdateState(ctx, RequestStateExec, call.slots)

@@ -575,8 +549,8 @@ func (s *hotSlot) exec(ctx context.Context, call *call) error {
}

func (a *agent) prepCold(ctx context.Context, call *call, tok ResourceToken, ch chan Slot) {
    span, ctx := opentracing.StartSpanFromContext(ctx, "agent_prep_cold")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "agent_prep_cold")
    defer span.End()

    call.containerState.UpdateState(ctx, ContainerStateStart, call.slots)

@@ -622,9 +596,9 @@ func (a *agent) prepCold(ctx context.Context, call *call, tok ResourceToken, ch
func (a *agent) runHot(ctx context.Context, call *call, tok ResourceToken, state ContainerState) {
    // IMPORTANT: get a context that has a child span / logger but NO timeout
    // TODO this is a 'FollowsFrom'
    ctx = opentracing.ContextWithSpan(context.Background(), opentracing.SpanFromContext(ctx))
    span, ctx := opentracing.StartSpanFromContext(ctx, "agent_run_hot")
    defer span.Finish()
    ctx = common.BackgroundContext(ctx)
    ctx, span := trace.StartSpan(ctx, "agent_run_hot")
    defer span.End()
    defer tok.Close() // IMPORTANT: this MUST get called

    state.UpdateState(ctx, ContainerStateStart, call.slots)

@@ -852,17 +826,10 @@ func (c *container) CPUs() uint64 { return c.cpus }

// WriteStat publishes each metric in the specified Stats structure as a histogram metric
func (c *container) WriteStat(ctx context.Context, stat drivers.Stat) {

    // Convert each metric value from uint64 to float64
    // and, for backward compatibility reasons, prepend each metric name with "docker_stats_fn_"
    // (if we don't care about compatibility then we can remove that)
    var metrics = make(map[string]float64)
    for key, value := range stat.Metrics {
        metrics["docker_stats_fn_"+key] = float64(value)
        stats.Record(ctx, stats.FindMeasure("docker_stats_"+key).(*stats.Int64Measure).M(int64(value)))
    }

    common.PublishHistograms(ctx, metrics)

    c.statsMu.Lock()
    if c.stats != nil {
        *(c.stats) = append(*(c.stats), stat)
@@ -870,6 +837,45 @@ func (c *container) WriteStat(ctx context.Context, stat drivers.Stat) {
    c.statsMu.Unlock()
}

func init() {
    // TODO this is nasty figure out how to use opencensus to not have to declare these
    keys := []string{"net_rx", "net_tx", "mem_limit", "mem_usage", "disk_read", "disk_write", "cpu_user", "cpu_total", "cpu_kernel"}

    // TODO necessary?
    appKey, err := tag.NewKey("fn_appname")
    if err != nil {
        logrus.Fatal(err)
    }
    pathKey, err := tag.NewKey("fn_path")
    if err != nil {
        logrus.Fatal(err)
    }

    for _, key := range keys {
        units := "bytes"
        if strings.Contains(key, "cpu") {
            units = "cpu"
        }
        dockerStatsDist, err := stats.Int64("docker_stats_"+key, "docker container stats for "+key, units)
        if err != nil {
            logrus.Fatal(err)
        }
        v, err := view.New(
            "docker_stats_"+key,
            "docker container stats for "+key,
            []tag.Key{appKey, pathKey},
            dockerStatsDist,
            view.DistributionAggregation{},
        )
        if err != nil {
            logrus.Fatalf("cannot create view: %v", err)
        }
        if err := v.Subscribe(); err != nil {
            logrus.Fatal(err)
        }
    }
}
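WriteStat above resolves measures by name at record time instead of keeping package-level pointers around. A hedged sketch of that lookup pulled out on its own (illustrative helper name; `stats.FindMeasure` is the vendored API this diff uses):

```go
// recordDockerStat records one container stat against the measure that the
// init() above registered for this key; the type assertion panics if the key
// was never registered, which the fixed key list guards against
func recordDockerStat(ctx context.Context, key string, value uint64) {
    m := stats.FindMeasure("docker_stats_" + key).(*stats.Int64Measure)
    stats.Record(ctx, m.M(int64(value)))
}
```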

//func (c *container) DockerAuth() (docker.AuthConfiguration, error) {
// Implementing the docker.AuthConfiguration interface.
// TODO per call could implement this stored somewhere (vs. configured on host)

@@ -4,9 +4,11 @@ import (
    "context"
    "time"

    "github.com/fnproject/fn/api/common"
    "github.com/fnproject/fn/api/models"
    opentracing "github.com/opentracing/opentracing-go"
    "github.com/sirupsen/logrus"
    "go.opencensus.io/tag"
    "go.opencensus.io/trace"
)

func (a *agent) asyncDequeue() {
@@ -17,8 +19,8 @@ func (a *agent) asyncDequeue() {
    defer cancel()

    // parent span here so that we can see how many async calls are running
    span, ctx := opentracing.StartSpanFromContext(ctx, "agent_async_dequeue")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "agent_async_dequeue")
    defer span.End()

    for {
        select {
@@ -73,11 +75,29 @@ func (a *agent) asyncChew(ctx context.Context) <-chan *models.Call {
func (a *agent) asyncRun(ctx context.Context, model *models.Call) {
    // IMPORTANT: get a context that has a child span but NO timeout (Submit imposes timeout)
    // TODO this is a 'FollowsFrom'
    ctx = opentracing.ContextWithSpan(context.Background(), opentracing.SpanFromContext(ctx))
    ctx = common.BackgroundContext(ctx)

    // since async doesn't come in through the normal request path,
    // we've gotta add tags here for stats to come out properly.
    appKey, err := tag.NewKey("fn_appname")
    if err != nil {
        logrus.Fatal(err)
    }
    pathKey, err := tag.NewKey("fn_path")
    if err != nil {
        logrus.Fatal(err)
    }
    ctx, err = tag.New(ctx,
        tag.Insert(appKey, model.AppName),
        tag.Insert(pathKey, model.Path),
    )
    if err != nil {
        logrus.Fatal(err)
    }

    // additional enclosing context here since this isn't spawned from an http request
    span, ctx := opentracing.StartSpanFromContext(ctx, "agent_async_run")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "agent_async_run")
    defer span.End()

    call, err := a.GetCall(
        FromModel(model),

@@ -9,12 +9,13 @@ import (
    "strings"
    "time"

    "go.opencensus.io/trace"

    "github.com/fnproject/fn/api/agent/drivers"
    "github.com/fnproject/fn/api/common"
    "github.com/fnproject/fn/api/id"
    "github.com/fnproject/fn/api/models"
    "github.com/go-openapi/strfmt"
    "github.com/opentracing/opentracing-go"
    "github.com/sirupsen/logrus"
)

@@ -249,8 +250,8 @@ type call struct {
func (c *call) Model() *models.Call { return c.Call }

func (c *call) Start(ctx context.Context) error {
    span, ctx := opentracing.StartSpanFromContext(ctx, "agent_call_start")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "agent_call_start")
    defer span.End()

    // Check context timeouts, errors
    if ctx.Err() != nil {
@@ -290,8 +291,8 @@ func (c *call) Start(ctx context.Context) error {
}

func (c *call) End(ctx context.Context, errIn error) error {
    span, ctx := opentracing.StartSpanFromContext(ctx, "agent_call_end")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "agent_call_end")
    defer span.End()

    c.CompletedAt = strfmt.DateTime(time.Now())

@@ -12,13 +12,15 @@ import (
    "strings"
    "time"

    "go.opencensus.io/stats"
    "go.opencensus.io/trace"

    "github.com/coreos/go-semver/semver"
    "github.com/fnproject/fn/api/agent/drivers"
    "github.com/fnproject/fn/api/common"
    "github.com/fnproject/fn/api/models"
    "github.com/fsouza/go-dockerclient"
    "github.com/go-openapi/strfmt"
    "github.com/opentracing/opentracing-go"
    "github.com/sirupsen/logrus"
)

@@ -77,7 +79,6 @@ func NewDocker(conf drivers.Config) *DockerDriver {
}

func checkDockerVersion(driver *DockerDriver, expected string) error {

    info, err := driver.docker.Info(context.Background())
    if err != nil {
        return err
@@ -269,9 +270,9 @@ func (drv *DockerDriver) ensureImage(ctx context.Context, task drivers.Container

    if task, ok := task.(Auther); ok {
        var err error
        span, _ := opentracing.StartSpanFromContext(ctx, "docker_auth")
        _, span := trace.StartSpan(ctx, "docker_auth")
        config, err = task.DockerAuth()
        span.Finish()
        span.End()
        if err != nil {
            return err
        }
@@ -396,8 +397,8 @@ func (w *waitResult) Wait(ctx context.Context) (drivers.RunResult, error) {

// Repeatedly collect stats from the specified docker container until the stopSignal is closed or the context is cancelled
func (drv *DockerDriver) collectStats(ctx context.Context, stopSignal <-chan struct{}, container string, task drivers.ContainerTask) {
    span, ctx := opentracing.StartSpanFromContext(ctx, "docker_collect_stats")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "docker_collect_stats")
    defer span.End()

    log := common.Logger(ctx)

@@ -576,7 +577,7 @@ func (w *waitResult) wait(ctx context.Context) (status string, err error) {
    case 0:
        return drivers.StatusSuccess, nil
    case 137: // OOM
        // TODO put in stats opentracing.SpanFromContext(ctx).LogFields(log.String("docker", "oom"))
        stats.Record(ctx, dockerOOMMeasure.M(1))
        common.Logger(ctx).Error("docker oom")
        err := errors.New("container out of memory, you may want to raise route.memory for this route (default: 128MB)")
        return drivers.StatusKilled, models.NewAPIError(http.StatusBadGateway, err)

@@ -12,9 +12,11 @@ import (

    "github.com/fnproject/fn/api/common"
    "github.com/fsouza/go-dockerclient"
    "github.com/opentracing/opentracing-go"
    "github.com/opentracing/opentracing-go/log"
    "github.com/sirupsen/logrus"
    "go.opencensus.io/stats"
    "go.opencensus.io/stats/view"
    "go.opencensus.io/tag"
    "go.opencensus.io/trace"
)

const (
@@ -98,18 +100,123 @@ type dockerWrap struct {
    dockerNoTimeout *docker.Client
}

func init() {
    // TODO doing this at each call site seems not the intention of the library since measurements
    // need to be created and views registered. doing this up front seems painful but maybe there
    // are benefits?

    // TODO do we have to do this? the measurements will be tagged on the context, will they be propagated
    // or we have to white list them in the view for them to show up? test...
    var err error
    appKey, err := tag.NewKey("fn_appname")
    if err != nil {
        logrus.Fatal(err)
    }
    pathKey, err := tag.NewKey("fn_path")
    if err != nil {
        logrus.Fatal(err)
    }

    {
        dockerRetriesMeasure, err = stats.Int64("docker_api_retries", "docker api retries", "")
        if err != nil {
            logrus.Fatal(err)
        }
        v, err := view.New(
            "docker_api_retries",
            "number of times we've retried docker API upon failure",
            []tag.Key{appKey, pathKey},
            dockerRetriesMeasure,
            view.SumAggregation{},
        )
        if err != nil {
            logrus.Fatalf("cannot create view: %v", err)
        }
        if err := v.Subscribe(); err != nil {
            logrus.Fatal(err)
        }
    }

    {
        dockerTimeoutMeasure, err = stats.Int64("docker_api_timeout", "docker api timeouts", "")
        if err != nil {
            logrus.Fatal(err)
        }
        v, err := view.New(
            "docker_api_timeout_count",
            "number of times we've timed out calling docker API",
            []tag.Key{appKey, pathKey},
            dockerTimeoutMeasure,
            view.CountAggregation{},
        )
        if err != nil {
            logrus.Fatalf("cannot create view: %v", err)
        }
        if err := v.Subscribe(); err != nil {
            logrus.Fatal(err)
        }
    }

    {
        dockerErrorMeasure, err = stats.Int64("docker_api_error", "docker api errors", "")
        if err != nil {
            logrus.Fatal(err)
        }
        v, err := view.New(
            "docker_api_error_count",
            "number of unrecoverable errors from docker API",
            []tag.Key{appKey, pathKey},
            dockerErrorMeasure,
            view.CountAggregation{},
        )
        if err != nil {
            logrus.Fatalf("cannot create view: %v", err)
        }
        if err := v.Subscribe(); err != nil {
            logrus.Fatal(err)
        }
    }

    {
        dockerOOMMeasure, err = stats.Int64("docker_oom", "docker oom", "")
        if err != nil {
            logrus.Fatal(err)
        }
        v, err := view.New(
            "docker_oom_count",
            "number of docker container oom",
            []tag.Key{appKey, pathKey},
            dockerOOMMeasure,
            view.CountAggregation{},
        )
        if err != nil {
            logrus.Fatalf("cannot create view: %v", err)
        }
        if err := v.Subscribe(); err != nil {
            logrus.Fatal(err)
        }
    }
}

var (
    // TODO it's either this or stats.FindMeasure("string").M() -- this is safer but painful
    dockerRetriesMeasure *stats.Int64Measure
    dockerTimeoutMeasure *stats.Int64Measure
    dockerErrorMeasure *stats.Int64Measure
    dockerOOMMeasure *stats.Int64Measure
)

func (d *dockerWrap) retry(ctx context.Context, logger logrus.FieldLogger, f func() error) error {
    var i int
    var err error
    span := opentracing.SpanFromContext(ctx)
    defer func() { span.LogFields(log.Int("docker_call_retries", i)) }()
    defer func() { stats.Record(ctx, dockerRetriesMeasure.M(int64(i))) }()

    var b common.Backoff
    // 10 retries w/o change to backoff is ~13s if ops take ~0 time
    for ; i < 10; i++ {
        select {
        case <-ctx.Done():
            span.LogFields(log.String("task", "fail.docker"))
            stats.Record(ctx, dockerTimeoutMeasure.M(1))
            logger.WithError(ctx.Err()).Warnf("docker call timed out")
            return ctx.Err()
        default:
@@ -119,11 +226,10 @@ func (d *dockerWrap) retry(ctx context.Context, logger logrus.FieldLogger, f fun
        if common.IsTemporary(err) || isDocker50x(err) {
            logger.WithError(err).Warn("docker temporary error, retrying")
            b.Sleep(ctx)
            span.LogFields(log.String("task", "tmperror.docker"))
            continue
        }
        if err != nil {
            span.LogFields(log.String("task", "error.docker"))
            stats.Record(ctx, dockerErrorMeasure.M(1))
        }
        return err
    }
@@ -176,22 +282,17 @@ func filterNoSuchContainer(ctx context.Context, err error) error {
}

func (d *dockerWrap) Info(ctx context.Context) (info *docker.DockerInfo, err error) {
    span, ctx := opentracing.StartSpanFromContext(ctx, "docker_server_version")
    defer span.Finish()

    logger := common.Logger(ctx).WithField("docker_cmd", "DockerInfo")
    ctx, cancel := context.WithTimeout(ctx, retryTimeout)
    defer cancel()
    err = d.retry(ctx, logger, func() error {
        info, err = d.docker.Info()
        return err
    })
    return info, err
    // NOTE: we're not very responsible and prometheus wasn't loved as a child, this
    // threads through directly down to the docker call, skipping retries, so that we
    // don't have to add tags / tracing / logger to the bare context handed to the one
    // place this is called in initialization that has no context to report consistent
    // stats like everything else in here. tl;dr this works, just don't use it for anything else.
    return d.docker.Info()
}

func (d *dockerWrap) AttachToContainerNonBlocking(ctx context.Context, opts docker.AttachToContainerOptions) (w docker.CloseWaiter, err error) {
    span, ctx := opentracing.StartSpanFromContext(ctx, "docker_attach_container")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "docker_attach_container")
    defer span.End()

    logger := common.Logger(ctx).WithField("docker_cmd", "AttachContainer")
    ctx, cancel := context.WithTimeout(ctx, retryTimeout)
@@ -208,8 +309,8 @@ func (d *dockerWrap) AttachToContainerNonBlocking(ctx context.Context, opts dock
}

func (d *dockerWrap) WaitContainerWithContext(id string, ctx context.Context) (code int, err error) {
    span, ctx := opentracing.StartSpanFromContext(ctx, "docker_wait_container")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "docker_wait_container")
    defer span.End()

    logger := common.Logger(ctx).WithField("docker_cmd", "WaitContainer")
    err = d.retry(ctx, logger, func() error {
@@ -220,8 +321,8 @@ func (d *dockerWrap) WaitContainerWithContext(id string, ctx context.Context) (c
}

func (d *dockerWrap) StartContainerWithContext(id string, hostConfig *docker.HostConfig, ctx context.Context) (err error) {
    span, ctx := opentracing.StartSpanFromContext(ctx, "docker_start_container")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "docker_start_container")
    defer span.End()

    logger := common.Logger(ctx).WithField("docker_cmd", "StartContainer")
    err = d.retry(ctx, logger, func() error {
@@ -236,8 +337,8 @@ func (d *dockerWrap) StartContainerWithContext(id string, hostConfig *docker.Hos
}

func (d *dockerWrap) CreateContainer(opts docker.CreateContainerOptions) (c *docker.Container, err error) {
    span, ctx := opentracing.StartSpanFromContext(opts.Context, "docker_create_container")
    defer span.Finish()
    ctx, span := trace.StartSpan(opts.Context, "docker_create_container")
    defer span.End()

    logger := common.Logger(ctx).WithField("docker_cmd", "CreateContainer")
    err = d.retry(ctx, logger, func() error {
@@ -248,8 +349,8 @@ func (d *dockerWrap) CreateContainer(opts docker.CreateContainerOptions) (c *doc
}

func (d *dockerWrap) PullImage(opts docker.PullImageOptions, auth docker.AuthConfiguration) (err error) {
    span, ctx := opentracing.StartSpanFromContext(opts.Context, "docker_pull_image")
    defer span.Finish()
    ctx, span := trace.StartSpan(opts.Context, "docker_pull_image")
    defer span.End()

    logger := common.Logger(ctx).WithField("docker_cmd", "PullImage")
    err = d.retry(ctx, logger, func() error {
@@ -262,9 +363,9 @@ func (d *dockerWrap) PullImage(opts docker.PullImageOptions, auth docker.AuthCon
func (d *dockerWrap) RemoveContainer(opts docker.RemoveContainerOptions) (err error) {
    // extract the span, but do not keep the context, since the enclosing context
    // may be timed out, and we still want to remove the container. TODO in caller? who cares?
    span, _ := opentracing.StartSpanFromContext(opts.Context, "docker_remove_container")
    defer span.Finish()
    ctx := opentracing.ContextWithSpan(context.Background(), span)
    ctx := common.BackgroundContext(opts.Context)
    ctx, span := trace.StartSpan(ctx, "docker_remove_container")
    defer span.End()

    ctx, cancel := context.WithTimeout(ctx, retryTimeout)
    defer cancel()
@@ -278,8 +379,8 @@ func (d *dockerWrap) RemoveContainer(opts docker.RemoveContainerOptions) (err er
}

func (d *dockerWrap) PauseContainer(id string, ctx context.Context) (err error) {
    span, _ := opentracing.StartSpanFromContext(ctx, "docker_pause_container")
    defer span.Finish()
    _, span := trace.StartSpan(ctx, "docker_pause_container")
    defer span.End()
    ctx, cancel := context.WithTimeout(ctx, pauseTimeout)
    defer cancel()

@@ -292,8 +393,8 @@ func (d *dockerWrap) UnpauseContainer(id string, ctx context.Context) (err error
}

func (d *dockerWrap) UnpauseContainer(id string, ctx context.Context) (err error) {
    span, _ := opentracing.StartSpanFromContext(ctx, "docker_unpause_container")
    defer span.Finish()
    _, span := trace.StartSpan(ctx, "docker_unpause_container")
    defer span.End()
    ctx, cancel := context.WithTimeout(ctx, pauseTimeout)
    defer cancel()

@@ -306,8 +407,8 @@ func (d *dockerWrap) UnpauseContainer(id string, ctx context.Context) (err error
}

func (d *dockerWrap) InspectImage(ctx context.Context, name string) (i *docker.Image, err error) {
    span, ctx := opentracing.StartSpanFromContext(ctx, "docker_inspect_image")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "docker_inspect_image")
    defer span.End()
    ctx, cancel := context.WithTimeout(ctx, retryTimeout)
    defer cancel()

@@ -320,8 +421,8 @@ func (d *dockerWrap) InspectImage(ctx context.Context, name string) (i *docker.I
}

func (d *dockerWrap) InspectContainerWithContext(container string, ctx context.Context) (c *docker.Container, err error) {
    span, ctx := opentracing.StartSpanFromContext(ctx, "docker_inspect_container")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "docker_inspect_container")
    defer span.End()
    ctx, cancel := context.WithTimeout(ctx, retryTimeout)
    defer cancel()

@@ -17,7 +17,8 @@ import (
    "github.com/fnproject/fn/api/agent"
    "github.com/fnproject/fn/api/common"
    "github.com/fnproject/fn/api/models"
    opentracing "github.com/opentracing/opentracing-go"
    "go.opencensus.io/plugin/ochttp/propagation/b3"
    "go.opencensus.io/trace"
)

// client implements agent.DataAccess
@@ -66,16 +67,16 @@ func NewClient(u string) (agent.DataAccess, error) {
}

func (cl *client) Enqueue(ctx context.Context, c *models.Call) error {
    span, ctx := opentracing.StartSpanFromContext(ctx, "hybrid_client_enqueue")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "hybrid_client_enqueue")
    defer span.End()

    err := cl.do(ctx, c, nil, "PUT", "runner", "async")
    return err
}

func (cl *client) Dequeue(ctx context.Context) (*models.Call, error) {
    span, ctx := opentracing.StartSpanFromContext(ctx, "hybrid_client_dequeue")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "hybrid_client_dequeue")
    defer span.End()

    var c struct {
        C []*models.Call `json:"calls"`
@@ -88,16 +89,16 @@ func (cl *client) Dequeue(ctx context.Context) (*models.Call, error) {
}

func (cl *client) Start(ctx context.Context, c *models.Call) error {
    span, ctx := opentracing.StartSpanFromContext(ctx, "hybrid_client_start")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "hybrid_client_start")
    defer span.End()

    err := cl.do(ctx, c, nil, "POST", "runner", "start")
    return err
}

func (cl *client) Finish(ctx context.Context, c *models.Call, r io.Reader, async bool) error {
    span, ctx := opentracing.StartSpanFromContext(ctx, "hybrid_client_end")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "hybrid_client_end")
    defer span.End()

    var b bytes.Buffer // TODO pool / we should multipart this?
    _, err := io.Copy(&b, r)
@@ -118,8 +119,8 @@ func (cl *client) Finish(ctx context.Context, c *models.Call, r io.Reader, async
}

func (cl *client) GetApp(ctx context.Context, appName string) (*models.App, error) {
    span, ctx := opentracing.StartSpanFromContext(ctx, "hybrid_client_get_app")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "hybrid_client_get_app")
    defer span.End()

    var a struct {
        A models.App `json:"app"`
@@ -129,8 +130,8 @@ func (cl *client) GetApp(ctx context.Context, appName string) (*models.App, erro
}

func (cl *client) GetRoute(ctx context.Context, appName, route string) (*models.Route, error) {
    span, ctx := opentracing.StartSpanFromContext(ctx, "hybrid_client_get_route")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "hybrid_client_get_route")
    defer span.End()

    // TODO trim prefix is pretty odd here eh?
    var r struct {
@@ -181,8 +182,8 @@ func (cl *client) do(ctx context.Context, request, result interface{}, method st
}

func (cl *client) once(ctx context.Context, request, result interface{}, method string, url ...string) error {
    span, ctx := opentracing.StartSpanFromContext(ctx, "hybrid_client_http_do")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "hybrid_client_http_do")
    defer span.End()

    var b bytes.Buffer // TODO pool
    if request != nil {
@@ -196,12 +197,9 @@ func (cl *client) once(ctx context.Context, request, result interface{}, method
    if err != nil {
        return err
    }
    req = req.WithContext(ctx)
    // shove the span headers in so that the server will continue this span
    opentracing.GlobalTracer().Inject(
        span.Context(),
        opentracing.HTTPHeaders,
        opentracing.HTTPHeadersCarrier(req.Header))
    var xxx b3.HTTPFormat
    xxx.SpanContextToRequest(span.SpanContext(), req)

    resp, err := cl.http.Do(req)
    if err != nil {
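The B3 swap above writes the current span's context into outbound headers in place of the old `opentracing.GlobalTracer().Inject` call. For reference, a hedged sketch of the receiving side (not part of this diff; it assumes the vendored b3 package also exposes `SpanContextFromRequest`, as released versions do):

```go
// recover the caller's span context from incoming B3 headers so the server's
// span can join the same trace
func spanContextFromRequest(req *http.Request) (trace.SpanContext, bool) {
    var prop b3.HTTPFormat
    return prop.SpanContextFromRequest(req)
}
```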

@@ -1,9 +0,0 @@
package agent

import (
    "net/http"
)

func (a *agent) PromHandler() http.Handler {
    return a.promHandler
}

@@ -7,8 +7,9 @@ import (
    "io"
    "net/http"

    "go.opencensus.io/trace"

    "github.com/fnproject/fn/api/models"
    opentracing "github.com/opentracing/opentracing-go"
)

// HTTPProtocol converts stdin/stdout streams into HTTP/1.1 compliant
@@ -23,8 +24,8 @@ type HTTPProtocol struct {
func (p *HTTPProtocol) IsStreamable() bool { return true }

func (h *HTTPProtocol) Dispatch(ctx context.Context, ci CallInfo, w io.Writer) error {
    span, ctx := opentracing.StartSpanFromContext(ctx, "dispatch_http")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "dispatch_http")
    defer span.End()

    req := ci.Request()

@@ -36,23 +37,23 @@ func (h *HTTPProtocol) Dispatch(ctx context.Context, ci CallInfo, w io.Writer) e
    req.Header.Set("FN_REQUEST_URL", ci.RequestURL())
    req.Header.Set("FN_CALL_ID", ci.CallID())

    span, _ = opentracing.StartSpanFromContext(ctx, "dispatch_http_write_request")
    _, span = trace.StartSpan(ctx, "dispatch_http_write_request")
    // req.Write handles if the user does not specify content length
    err := req.Write(h.in)
    span.Finish()
    span.End()
    if err != nil {
        return err
    }

    span, _ = opentracing.StartSpanFromContext(ctx, "dispatch_http_read_response")
    _, span = trace.StartSpan(ctx, "dispatch_http_read_response")
    resp, err := http.ReadResponse(bufio.NewReader(h.out), ci.Request())
    span.Finish()
    span.End()
    if err != nil {
        return models.NewAPIError(http.StatusBadGateway, fmt.Errorf("invalid http response from function err: %v", err))
    }

    span, _ = opentracing.StartSpanFromContext(ctx, "dispatch_http_write_response")
    defer span.Finish()
    _, span = trace.StartSpan(ctx, "dispatch_http_write_response")
    defer span.End()

    rw, ok := w.(http.ResponseWriter)
    if !ok {

@@ -9,8 +9,9 @@ import (
    "net/http"
    "sync"

    "go.opencensus.io/trace"

    "github.com/fnproject/fn/api/models"
    opentracing "github.com/opentracing/opentracing-go"
)

var (
@@ -87,26 +88,26 @@ func (h *JSONProtocol) writeJSONToContainer(ci CallInfo) error {
}

func (h *JSONProtocol) Dispatch(ctx context.Context, ci CallInfo, w io.Writer) error {
    span, ctx := opentracing.StartSpanFromContext(ctx, "dispatch_json")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "dispatch_json")
    defer span.End()

    span, _ = opentracing.StartSpanFromContext(ctx, "dispatch_json_write_request")
    _, span = trace.StartSpan(ctx, "dispatch_json_write_request")
    err := h.writeJSONToContainer(ci)
    span.Finish()
    span.End()
    if err != nil {
        return err
    }

    span, _ = opentracing.StartSpanFromContext(ctx, "dispatch_json_read_response")
    _, span = trace.StartSpan(ctx, "dispatch_json_read_response")
    var jout jsonOut
    err = json.NewDecoder(h.out).Decode(&jout)
    span.Finish()
    span.End()
    if err != nil {
        return models.NewAPIError(http.StatusBadGateway, fmt.Errorf("invalid json response from function err: %v", err))
    }

    span, _ = opentracing.StartSpanFromContext(ctx, "dispatch_json_write_response")
    defer span.Finish()
    _, span = trace.StartSpan(ctx, "dispatch_json_write_response")
    defer span.End()

    rw, ok := w.(http.ResponseWriter)
    if !ok {

@@ -13,7 +13,8 @@ import (
    "strings"
    "sync"

    opentracing "github.com/opentracing/opentracing-go"
    "go.opencensus.io/trace"

    "github.com/sirupsen/logrus"
)

@@ -166,9 +167,9 @@ func (a *resourceTracker) GetResourceToken(ctx context.Context, memory uint64, c
        c.L.Unlock()
    }()

    span, ctx := opentracing.StartSpanFromContext(ctx, "agent_get_resource_token")
    ctx, span := trace.StartSpan(ctx, "agent_get_resource_token")
    go func() {
        defer span.Finish()
        defer span.End()
        defer cancel()
        c.L.Lock()

@@ -254,9 +255,9 @@ func (a *resourceTracker) WaitAsyncResource(ctx context.Context) chan struct{} {
        c.L.Unlock()
    }()

    span, ctx := opentracing.StartSpanFromContext(ctx, "agent_wait_async_resource")
    ctx, span := trace.StartSpan(ctx, "agent_wait_async_resource")
    go func() {
        defer span.Finish()
        defer span.End()
        defer cancel()
        c.L.Lock()
        isWaiting = true

@@ -5,7 +5,10 @@ import (
    "sync"
    "time"

    "github.com/fnproject/fn/api/common"
    "github.com/sirupsen/logrus"
    "go.opencensus.io/stats"
    "go.opencensus.io/stats/view"
    "go.opencensus.io/tag"
)

type RequestStateType int
@@ -137,16 +140,76 @@ func (c *containerState) UpdateState(ctx context.Context, newState ContainerStat
    // update old state stats
    gaugeKey := containerGaugeKeys[oldState]
    if gaugeKey != "" {
        common.DecrementGauge(ctx, gaugeKey)
        stats.Record(ctx, stats.FindMeasure(gaugeKey).(*stats.Int64Measure).M(-1))
    }

    timeKey := containerTimeKeys[oldState]
    if timeKey != "" {
        common.PublishElapsedTimeHistogram(ctx, timeKey, before, now)
        stats.Record(ctx, stats.FindMeasure(timeKey).(*stats.Int64Measure).M(int64(now.Sub(before).Round(time.Millisecond))))
    }

    // update new state stats
    gaugeKey = containerGaugeKeys[newState]
    if gaugeKey != "" {
        common.IncrementGauge(ctx, gaugeKey)
        stats.Record(ctx, stats.FindMeasure(gaugeKey).(*stats.Int64Measure).M(1))
    }
}

func init() {
    // TODO(reed): do we have to do this? the measurements will be tagged on the context, will they be propagated
    // or we have to white list them in the view for them to show up? test...
    appKey, err := tag.NewKey("fn_appname")
    if err != nil {
        logrus.Fatal(err)
    }
    pathKey, err := tag.NewKey("fn_path")
    if err != nil {
        logrus.Fatal(err)
    }

    for _, key := range containerGaugeKeys {
        if key == "" { // leave nil intentionally, let it panic
            continue
        }
        measure, err := stats.Int64(key, "containers in state "+key, "")
        if err != nil {
            logrus.Fatal(err)
        }
        v, err := view.New(
            key,
            "containers in state "+key,
            []tag.Key{appKey, pathKey},
            measure,
            view.CountAggregation{},
        )
        if err != nil {
            logrus.Fatalf("cannot create view: %v", err)
        }
        if err := v.Subscribe(); err != nil {
            logrus.Fatal(err)
        }
    }

    for _, key := range containerTimeKeys {
        if key == "" {
            continue
        }
        measure, err := stats.Int64(key, "time spent in container state "+key, "ms")
        if err != nil {
            logrus.Fatal(err)
        }
        v, err := view.New(
            key,
            "time spent in container state "+key,
            []tag.Key{appKey, pathKey},
            measure,
            view.DistributionAggregation{},
        )
        if err != nil {
            logrus.Fatalf("cannot create view: %v", err)
        }
        if err := v.Subscribe(); err != nil {
            logrus.Fatal(err)
        }
    }
}

@@ -2,54 +2,63 @@ package agent

import (
    "context"
    "github.com/fnproject/fn/api/common"

    "github.com/sirupsen/logrus"
    "go.opencensus.io/stats"
    "go.opencensus.io/stats/view"
    "go.opencensus.io/tag"
)

func StatsEnqueue(ctx context.Context) {
    common.IncrementGauge(ctx, queuedMetricName)
    common.IncrementCounter(ctx, callsMetricName)
// TODO add some sugar:
// * hot containers active
// * memory used / available

func statsEnqueue(ctx context.Context) {
    stats.Record(ctx, queuedMeasure.M(1))
    stats.Record(ctx, callsMeasure.M(1))
}

// Call when a function has been queued but cannot be started because of an error
func StatsDequeue(ctx context.Context) {
    common.DecrementGauge(ctx, queuedMetricName)
func statsDequeue(ctx context.Context) {
    stats.Record(ctx, queuedMeasure.M(-1))
}

func StatsDequeueAndStart(ctx context.Context) {
    common.DecrementGauge(ctx, queuedMetricName)
    common.IncrementGauge(ctx, runningMetricName)
func statsDequeueAndStart(ctx context.Context) {
    stats.Record(ctx, queuedMeasure.M(-1))
    stats.Record(ctx, runningMeasure.M(1))
}

func StatsComplete(ctx context.Context) {
    common.DecrementGauge(ctx, runningMetricName)
    common.IncrementCounter(ctx, completedMetricName)
func statsComplete(ctx context.Context) {
    stats.Record(ctx, runningMeasure.M(-1))
    stats.Record(ctx, completedMeasure.M(1))
}

func StatsFailed(ctx context.Context) {
    common.DecrementGauge(ctx, runningMetricName)
    common.IncrementCounter(ctx, failedMetricName)
func statsFailed(ctx context.Context) {
    stats.Record(ctx, runningMeasure.M(-1))
    stats.Record(ctx, failedMeasure.M(1))
}

func StatsDequeueAndFail(ctx context.Context) {
    common.DecrementGauge(ctx, queuedMetricName)
    common.IncrementCounter(ctx, failedMetricName)
func statsDequeueAndFail(ctx context.Context) {
    stats.Record(ctx, queuedMeasure.M(-1))
    stats.Record(ctx, failedMeasure.M(1))
}

func StatsIncrementTimedout(ctx context.Context) {
    common.IncrementCounter(ctx, timedoutMetricName)
func statsTimedout(ctx context.Context) {
    stats.Record(ctx, timedoutMeasure.M(1))
}

func StatsIncrementErrors(ctx context.Context) {
    common.IncrementCounter(ctx, errorsMetricName)
func statsErrors(ctx context.Context) {
    stats.Record(ctx, errorsMeasure.M(1))
}

func StatsIncrementTooBusy(ctx context.Context) {
    common.IncrementCounter(ctx, serverBusyMetricName)
func statsTooBusy(ctx context.Context) {
    stats.Record(ctx, serverBusyMeasure.M(1))
}

const (
    // TODO we should probably prefix these with calls_ ?
    queuedMetricName = "queued"
    callsMetricName = "calls"
    callsMetricName = "calls" // TODO this is a dupe of sum {complete,failed} ?
    runningMetricName = "running"
    completedMetricName = "completed"
    failedMetricName = "failed"
@@ -57,3 +66,192 @@ const (
    errorsMetricName = "errors"
    serverBusyMetricName = "server_busy"
)

var (
    queuedMeasure *stats.Int64Measure
    callsMeasure *stats.Int64Measure // TODO this is a dupe of sum {complete,failed} ?
    runningMeasure *stats.Int64Measure
    completedMeasure *stats.Int64Measure
    failedMeasure *stats.Int64Measure
    timedoutMeasure *stats.Int64Measure
    errorsMeasure *stats.Int64Measure
    serverBusyMeasure *stats.Int64Measure
)

func init() {
    // TODO(reed): doing this at each call site seems not the intention of the library since measurements
    // need to be created and views registered. doing this up front seems painful but maybe there
    // are benefits?

    // TODO(reed): do we have to do this? the measurements will be tagged on the context, will they be propagated
    // or we have to white list them in the view for them to show up? test...
    var err error
    appKey, err := tag.NewKey("fn_appname")
    if err != nil {
        logrus.Fatal(err)
    }
    pathKey, err := tag.NewKey("fn_path")
    if err != nil {
        logrus.Fatal(err)
    }

    {
        queuedMeasure, err = stats.Int64(queuedMetricName, "calls currently queued against agent", "")
        if err != nil {
            logrus.Fatal(err)
        }
        v, err := view.New(
            queuedMetricName,
            "calls currently queued to agent",
            []tag.Key{appKey, pathKey},
            queuedMeasure,
            view.SumAggregation{},
        )
        if err != nil {
            logrus.Fatalf("cannot create view: %v", err)
        }
        if err := v.Subscribe(); err != nil {
            logrus.Fatal(err)
        }
    }

    {
        callsMeasure, err = stats.Int64(callsMetricName, "calls created in agent", "")
        if err != nil {
            logrus.Fatal(err)
        }
        v, err := view.New(
            callsMetricName,
            "calls created in agent",
            []tag.Key{appKey, pathKey},
            callsMeasure,
            view.SumAggregation{},
        )
        if err != nil {
            logrus.Fatalf("cannot create view: %v", err)
        }
        if err := v.Subscribe(); err != nil {
            logrus.Fatal(err)
        }
    }

    {
        runningMeasure, err = stats.Int64(runningMetricName, "calls currently running in agent", "")
        if err != nil {
            logrus.Fatal(err)
        }
        v, err := view.New(
            runningMetricName,
            "calls currently running in agent",
            []tag.Key{appKey, pathKey},
            runningMeasure,
            view.SumAggregation{},
        )
        if err != nil {
            logrus.Fatalf("cannot create view: %v", err)
        }
        if err := v.Subscribe(); err != nil {
            logrus.Fatal(err)
        }
    }

    {
        completedMeasure, err = stats.Int64(completedMetricName, "calls completed in agent", "")
        if err != nil {
            logrus.Fatal(err)
        }
        v, err := view.New(
            completedMetricName,
            "calls completed in agent",
            []tag.Key{appKey, pathKey},
            completedMeasure,
            view.SumAggregation{},
        )
        if err != nil {
            logrus.Fatalf("cannot create view: %v", err)
        }
        if err := v.Subscribe(); err != nil {
            logrus.Fatal(err)
        }
    }

    {
        failedMeasure, err = stats.Int64(failedMetricName, "calls failed in agent", "")
        if err != nil {
            logrus.Fatal(err)
        }
        v, err := view.New(
            failedMetricName,
            "calls failed in agent",
            []tag.Key{appKey, pathKey},
            failedMeasure,
            view.SumAggregation{},
        )
        if err != nil {
            logrus.Fatalf("cannot create view: %v", err)
        }
        if err := v.Subscribe(); err != nil {
            logrus.Fatal(err)
        }
    }

    {
        timedoutMeasure, err = stats.Int64(timedoutMetricName, "calls timed out in agent", "")
        if err != nil {
            logrus.Fatal(err)
        }
        v, err := view.New(
            timedoutMetricName,
            "calls timed out in agent",
            []tag.Key{appKey, pathKey},
            timedoutMeasure,
            view.SumAggregation{},
        )
        if err != nil {
            logrus.Fatalf("cannot create view: %v", err)
        }
        if err := v.Subscribe(); err != nil {
            logrus.Fatal(err)
        }
    }

    {
        errorsMeasure, err = stats.Int64(errorsMetricName, "calls errored in agent", "")
        if err != nil {
            logrus.Fatal(err)
        }
        v, err := view.New(
            errorsMetricName,
            "calls errored in agent",
            []tag.Key{appKey, pathKey},
            errorsMeasure,
            view.SumAggregation{},
        )
        if err != nil {
            logrus.Fatalf("cannot create view: %v", err)
        }
        if err := v.Subscribe(); err != nil {
            logrus.Fatal(err)
        }
    }

    {
        serverBusyMeasure, err = stats.Int64(serverBusyMetricName, "calls where server was too busy in agent", "")
        if err != nil {
            logrus.Fatal(err)
        }
        v, err := view.New(
            serverBusyMetricName,
            "calls where server was too busy in agent",
            []tag.Key{appKey, pathKey},
            serverBusyMeasure,
            view.SumAggregation{},
        )
        if err != nil {
            logrus.Fatalf("cannot create view: %v", err)
        }
        if err := v.Subscribe(); err != nil {
            logrus.Fatal(err)
        }
    }
}

@@ -4,6 +4,8 @@ import (
    "context"

    "github.com/sirupsen/logrus"
    "go.opencensus.io/tag"
    "go.opencensus.io/trace"
)

type contextKey string
@@ -22,11 +24,35 @@ func Logger(ctx context.Context) logrus.FieldLogger {
    return l
}

// Attempt at simplifying this whole logger in the context thing
// Could even make this take a generic map, then the logger that gets returned could be used just like the stdlib too, since it's compatible
// LoggerWithFields returns a child context of the provided parent that
// contains a logger with additional fields from the parent's logger, it
// returns the new child logger, as well.
func LoggerWithFields(ctx context.Context, fields logrus.Fields) (context.Context, logrus.FieldLogger) {
    l := Logger(ctx)
    l = l.WithFields(fields)
    ctx = WithLogger(ctx, l)
    return ctx, l
}

// BackgroundContext returns a context that is specifically not a child of the
// provided parent context wrt any cancellation or deadline of the parent,
// returning a context that contains all values only. At present, this is a
// best effort as there is not a great way to extract all values, known values:
// * logger
// * span
// * tags
// (TODO(reed): we could have our own context.Context implementer that stores
// all values from WithValue in a bucket we could extract more easily?)
func BackgroundContext(ctx context.Context) context.Context {
    logger := Logger(ctx)
    span := trace.FromContext(ctx)
    tagMap := tag.FromContext(ctx)

    // fresh context
    ctx = context.Background()

    ctx = tag.NewContext(ctx, tagMap)
    ctx = trace.WithSpan(ctx, span)
    ctx = WithLogger(ctx, logger)
    return ctx
}
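A short usage sketch for `BackgroundContext` (illustrative, mirroring the `call.End` and `RemoveContainer` changes earlier in this diff; it assumes the surrounding package and a `time` import): detach slow cleanup from a request-scoped deadline while keeping the logger, span, and tags, then give it a fresh budget.

```go
// cleanup runs after a call finishes; the request ctx may already be past its
// deadline, so keep its values (logger / span / tags) but drop its cancellation
func cleanup(ctx context.Context) {
    ctx = BackgroundContext(ctx)
    ctx, cancel := context.WithTimeout(ctx, 30*time.Second) // arbitrary fresh budget
    defer cancel()
    // ... store the call, remove the container, etc., using ctx ...
}
```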

@@ -1,140 +0,0 @@
package common

import (
    "context"
    "time"

    "github.com/opentracing/opentracing-go"
    "github.com/opentracing/opentracing-go/log"
)

// IncrementGauge increments the specified gauge metric.
// It does this by logging an appropriate field value to a tracing span.
func IncrementGauge(ctx context.Context, metric string) {
    // The field name we use is the specified metric name prepended with FieldnamePrefixGauge to designate that it is a Prometheus gauge metric.
    // The collector will replace that prefix with "fn_" and use the result as the Prometheus metric name.
    fieldname := FieldnamePrefixGauge + metric

    // Spans are not processed by the collector until the span ends, so to prevent any delay
    // in processing the stats when the current span is long-lived we create a new span for every call.
    // We suffix the span name with SpannameSuffixDummy to denote that it is used only to hold a metric and isn't itself of any interest.
    span, ctx := opentracing.StartSpanFromContext(ctx, fieldname+SpannameSuffixDummy)
    defer span.Finish()

    // gauge metrics are actually float64; here we log that it should be increased by 1
    span.LogFields(log.Float64(fieldname, 1.))
}

// DecrementGauge decrements the specified gauge metric.
// It does this by logging an appropriate field value to a tracing span.
func DecrementGauge(ctx context.Context, metric string) {
    // The field name we use is the specified metric name prepended with FieldnamePrefixGauge to designate that it is a Prometheus gauge metric.
    // The collector will replace that prefix with "fn_" and use the result as the Prometheus metric name.
    fieldname := FieldnamePrefixGauge + metric

    // Spans are not processed by the collector until the span ends, so to prevent any delay
    // in processing the stats when the current span is long-lived we create a new span for every call.
    // We suffix the span name with SpannameSuffixDummy to denote that it is used only to hold a metric and isn't itself of any interest.
    span, ctx := opentracing.StartSpanFromContext(ctx, fieldname+SpannameSuffixDummy)
    defer span.Finish()

    // gauge metrics are actually float64; here we log that it should be decreased by 1
    span.LogFields(log.Float64(fieldname, -1.))
}

// IncrementCounter increments the specified counter metric.
// It does this by logging an appropriate field value to a tracing span.
func IncrementCounter(ctx context.Context, metric string) {
    // The field name we use is the specified metric name prepended with FieldnamePrefixCounter to designate that it is a Prometheus counter metric.
    // The collector will replace that prefix with "fn_" and use the result as the Prometheus metric name.
    fieldname := FieldnamePrefixCounter + metric

    // Spans are not processed by the collector until the span ends, so to prevent any delay
    // in processing the stats when the current span is long-lived we create a new span for every call.
    // We suffix the span name with SpannameSuffixDummy to denote that it is used only to hold a metric and isn't itself of any interest.
    span, ctx := opentracing.StartSpanFromContext(ctx, fieldname+SpannameSuffixDummy)
    defer span.Finish()

    // counter metrics are actually float64; here we log that it should be increased by 1
    span.LogFields(log.Float64(fieldname, 1.))
}

// If required, create a scalar version of PublishHistograms that publishes a single histogram metric

// PublishHistograms publishes the specified histogram metrics.
// It does this by logging appropriate field values to a tracing span.
// Use this when the current tracing span is long-lived and you want the metrics to be visible before it ends.
func PublishHistograms(ctx context.Context, metrics map[string]float64) {

    // Spans are not processed by the collector until the span ends, so to prevent any delay
    // in processing the stats when the current span is long-lived we create a new span for every call.
    // We suffix the span name with SpannameSuffixDummy to denote that it is used only to hold a metric and isn't itself of any interest.
    span, ctx := opentracing.StartSpanFromContext(ctx, "histogram_metrics"+SpannameSuffixDummy)
    defer span.Finish()

    for key, value := range metrics {
        // The field name we use is the metric name prepended with FieldnamePrefixHistogram to designate that it is a Prometheus histogram metric.
        // The collector will replace that prefix with "fn_" and use the result as the Prometheus metric name.
        fieldname := FieldnamePrefixHistogram + key
        span.LogFields(log.Float64(fieldname, value))
    }
}

// PublishHistogram publishes the specified histogram metric.
// It does this by logging an appropriate field value to a tracing span.
// Use this when the current tracing span is long-lived and you want the metric to be visible before it ends.
func PublishHistogram(ctx context.Context, key string, value float64) {

    // Spans are not processed by the collector until the span ends, so to prevent any delay
    // in processing the stats when the current span is long-lived we create a new span for every call.
    // We suffix the span name with SpannameSuffixDummy to denote that it is used only to hold a metric and isn't itself of any interest.
    span, ctx := opentracing.StartSpanFromContext(ctx, "histogram_metrics"+SpannameSuffixDummy)
    defer span.Finish()

    // The field name we use is the metric name prepended with FieldnamePrefixHistogram to designate that it is a Prometheus histogram metric.
    // The collector will replace that prefix with "fn_" and use the result as the Prometheus metric name.
    fieldname := FieldnamePrefixHistogram + key
    span.LogFields(log.Float64(fieldname, value))
}

// PublishHistogramToSpan publishes the specified histogram metric.
// It does this by logging an appropriate field value to the specified tracing span.
// Use this when you don't need to create a new tracing span.
func PublishHistogramToSpan(span opentracing.Span, key string, value float64) {

    // The field name we use is the metric name prepended with FieldnamePrefixHistogram to designate that it is a Prometheus histogram metric.
    // The collector will replace that prefix with "fn_" and use the result as the Prometheus metric name.
    fieldname := FieldnamePrefixHistogram + key
    span.LogFields(log.Float64(fieldname, value))
}

// PublishElapsedTimeHistogram publishes the elapsed time between start and end as the specified histogram metric.
// It does this by logging an appropriate field value to a tracing span.
// Use this when the current tracing span is long-lived and you want the metric to be visible before it ends.
func PublishElapsedTimeHistogram(ctx context.Context, key string, start, end time.Time) {
    elapsed := float64(end.Sub(start).Seconds())
    PublishHistogram(ctx, key, elapsed)
}

const (

    // FnPrefix is a constant for "fn_", used as a prefix for span names, field names, Prometheus metric names and Prometheus label names
    FnPrefix = "fn_"

    // FieldnamePrefixHistogram is prefixed to the name of a logged field
    // to denote that it corresponds to a histogram metric
    FieldnamePrefixHistogram = FnPrefix + "histogram_"

    // FieldnamePrefixCounter is prefixed to the name of a logged field
    // to denote that it corresponds to a counter metric
    FieldnamePrefixCounter = FnPrefix + "counter_"

    // FieldnamePrefixGauge is prefixed to the name of a logged field
    // to denote that it corresponds to a gauge metric
    FieldnamePrefixGauge = FnPrefix + "gauge_"

    // SpannameSuffixDummy is suffixed to the name of a tracing span
    // to denote that it has been created solely for the purpose of carrying metric values,
    // is not itself of any interest, and should not be converted to a Prometheus duration metric
    SpannameSuffixDummy = "_dummy"
)
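
For reference, before this commit callers used the removed helpers roughly like this (a sketch; `handleCall` and the metric names are stand-ins):

```go
package example

import (
    "context"
    "time"

    "github.com/fnproject/fn/api/common"
)

func handleCall(ctx context.Context) { /* hypothetical work */ }

func recordCall(ctx context.Context) {
    start := time.Now()
    common.IncrementGauge(ctx, "running") // collector sees fn_gauge_running += 1
    handleCall(ctx)
    common.DecrementGauge(ctx, "running") // collector sees fn_gauge_running -= 1
    common.PublishElapsedTimeHistogram(ctx, "call_duration", start, time.Now())
}
```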

@@ -4,9 +4,10 @@ import (
    "context"
    "io"

    "go.opencensus.io/trace"

    "github.com/fnproject/fn/api/models"
    "github.com/jmoiron/sqlx"
    "github.com/opentracing/opentracing-go"
)

func MetricDS(ds models.Datastore) models.Datastore {
@@ -18,98 +19,98 @@ type metricds struct {
}

func (m *metricds) GetApp(ctx context.Context, appName string) (*models.App, error) {
    span, ctx := opentracing.StartSpanFromContext(ctx, "ds_get_app")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "ds_get_app")
    defer span.End()
    return m.ds.GetApp(ctx, appName)
}

func (m *metricds) GetApps(ctx context.Context, filter *models.AppFilter) ([]*models.App, error) {
    span, ctx := opentracing.StartSpanFromContext(ctx, "ds_get_apps")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "ds_get_apps")
    defer span.End()
    return m.ds.GetApps(ctx, filter)
}

func (m *metricds) InsertApp(ctx context.Context, app *models.App) (*models.App, error) {
    span, ctx := opentracing.StartSpanFromContext(ctx, "ds_insert_app")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "ds_insert_app")
    defer span.End()
    return m.ds.InsertApp(ctx, app)
}

func (m *metricds) UpdateApp(ctx context.Context, app *models.App) (*models.App, error) {
    span, ctx := opentracing.StartSpanFromContext(ctx, "ds_update_app")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "ds_update_app")
    defer span.End()
    return m.ds.UpdateApp(ctx, app)
}

func (m *metricds) RemoveApp(ctx context.Context, appName string) error {
    span, ctx := opentracing.StartSpanFromContext(ctx, "ds_remove_app")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "ds_remove_app")
    defer span.End()
    return m.ds.RemoveApp(ctx, appName)
}

func (m *metricds) GetRoute(ctx context.Context, appName, routePath string) (*models.Route, error) {
    span, ctx := opentracing.StartSpanFromContext(ctx, "ds_get_route")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "ds_get_route")
    defer span.End()
    return m.ds.GetRoute(ctx, appName, routePath)
}

func (m *metricds) GetRoutesByApp(ctx context.Context, appName string, filter *models.RouteFilter) (routes []*models.Route, err error) {
    span, ctx := opentracing.StartSpanFromContext(ctx, "ds_get_routes_by_app")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "ds_get_routes_by_app")
    defer span.End()
    return m.ds.GetRoutesByApp(ctx, appName, filter)
}

func (m *metricds) InsertRoute(ctx context.Context, route *models.Route) (*models.Route, error) {
    span, ctx := opentracing.StartSpanFromContext(ctx, "ds_insert_route")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "ds_insert_route")
    defer span.End()
    return m.ds.InsertRoute(ctx, route)
}

func (m *metricds) UpdateRoute(ctx context.Context, route *models.Route) (*models.Route, error) {
    span, ctx := opentracing.StartSpanFromContext(ctx, "ds_update_route")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "ds_update_route")
    defer span.End()
    return m.ds.UpdateRoute(ctx, route)
}

func (m *metricds) RemoveRoute(ctx context.Context, appName, routePath string) error {
    span, ctx := opentracing.StartSpanFromContext(ctx, "ds_remove_route")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "ds_remove_route")
    defer span.End()
    return m.ds.RemoveRoute(ctx, appName, routePath)
}

func (m *metricds) InsertCall(ctx context.Context, call *models.Call) error {
    span, ctx := opentracing.StartSpanFromContext(ctx, "ds_insert_call")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "ds_insert_call")
    defer span.End()
    return m.ds.InsertCall(ctx, call)
}

func (m *metricds) UpdateCall(ctx context.Context, from *models.Call, to *models.Call) error {
    span, ctx := opentracing.StartSpanFromContext(ctx, "ds_update_call")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "ds_update_call")
    defer span.End()
    return m.ds.UpdateCall(ctx, from, to)
}

func (m *metricds) GetCall(ctx context.Context, appName, callID string) (*models.Call, error) {
    span, ctx := opentracing.StartSpanFromContext(ctx, "ds_get_call")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "ds_get_call")
    defer span.End()
    return m.ds.GetCall(ctx, appName, callID)
}

func (m *metricds) GetCalls(ctx context.Context, filter *models.CallFilter) ([]*models.Call, error) {
    span, ctx := opentracing.StartSpanFromContext(ctx, "ds_get_calls")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "ds_get_calls")
    defer span.End()
    return m.ds.GetCalls(ctx, filter)
}

func (m *metricds) InsertLog(ctx context.Context, appName, callID string, callLog io.Reader) error {
    span, ctx := opentracing.StartSpanFromContext(ctx, "ds_insert_log")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "ds_insert_log")
    defer span.End()
    return m.ds.InsertLog(ctx, appName, callID, callLog)
}

func (m *metricds) GetLog(ctx context.Context, appName, callID string) (io.Reader, error) {
    span, ctx := opentracing.StartSpanFromContext(ctx, "ds_get_log")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "ds_get_log")
    defer span.End()
    return m.ds.GetLog(ctx, appName, callID)
}
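
A sketch of the decorator in use (the package is presumably datastoreutil per the test imports below; `raw` stands in for any concrete `models.Datastore`):

```go
package example

import (
    "context"

    "github.com/fnproject/fn/api/datastore/internal/datastoreutil"
    "github.com/fnproject/fn/api/models"
)

func getApp(ctx context.Context, raw models.Datastore) (*models.App, error) {
    // every call on the wrapped datastore now runs inside a ds_* span
    ds := datastoreutil.MetricDS(raw)
    return ds.GetApp(ctx, "myapp")
}
```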

@@ -6,6 +6,7 @@ import (
    "testing"

    "context"

    "github.com/fnproject/fn/api/datastore/internal/datastoretest"
    "github.com/fnproject/fn/api/datastore/internal/datastoreutil"
    "github.com/fnproject/fn/api/models"

@@ -5,6 +5,7 @@ import (
    "net/url"

    "context"

    "github.com/fnproject/fn/api/common"
    "github.com/fnproject/fn/api/datastore/sql"
    "github.com/fnproject/fn/api/logs/s3"

@@ -6,6 +6,7 @@ import (
    "testing"

    "context"

    "github.com/fnproject/fn/api/datastore/sql"
    logTesting "github.com/fnproject/fn/api/logs/testing"
)

@@ -17,10 +17,12 @@ import (
    "github.com/aws/aws-sdk-go/aws/session"
    "github.com/aws/aws-sdk-go/service/s3"
    "github.com/aws/aws-sdk-go/service/s3/s3manager"
    "github.com/fnproject/fn/api/common"
    "github.com/fnproject/fn/api/models"
    "github.com/opentracing/opentracing-go"
    "github.com/sirupsen/logrus"
    "go.opencensus.io/stats"
    "go.opencensus.io/stats/view"
    "go.opencensus.io/tag"
    "go.opencensus.io/trace"
)

// TODO we should encrypt these, user will have to supply a key though (or all
@@ -124,8 +126,8 @@ func path(appName, callID string) string {
}

func (s *store) InsertLog(ctx context.Context, appName, callID string, callLog io.Reader) error {
    span, ctx := opentracing.StartSpanFromContext(ctx, "s3_insert_log")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "s3_insert_log")
    defer span.End()

    // wrap original reader in a decorator to keep track of read bytes without buffering
    cr := &countingReader{r: callLog}
@@ -144,13 +146,13 @@ func (s *store) InsertLog(ctx context.Context, appName, callID string, callLog io.Reader) error {
        return fmt.Errorf("failed to write log, %v", err)
    }

    common.PublishHistogramToSpan(span, "s3_log_upload_size", float64(cr.count))
    stats.Record(ctx, uploadSizeMeasure.M(int64(cr.count)))
    return nil
}

func (s *store) GetLog(ctx context.Context, appName, callID string) (io.Reader, error) {
    span, ctx := opentracing.StartSpanFromContext(ctx, "s3_get_log")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "s3_get_log")
    defer span.End()

    objectName := path(appName, callID)
    logrus.WithFields(logrus.Fields{"bucketName": s.bucket, "key": objectName}).Debug("Downloading log")
@@ -169,6 +171,65 @@ func (s *store) GetLog(ctx context.Context, appName, callID string) (io.Reader, error) {
        return nil, fmt.Errorf("failed to read log, %v", err)
    }

    common.PublishHistogramToSpan(span, "s3_log_download_size", float64(size))
    stats.Record(ctx, downloadSizeMeasure.M(size))
    return bytes.NewReader(target.Bytes()), nil
}

var (
    uploadSizeMeasure   *stats.Int64Measure
    downloadSizeMeasure *stats.Int64Measure
)

func init() {
    // TODO(reed): do we have to do this? the measurements will be tagged on the context; will they be propagated,
    // or do we have to whitelist them in the view for them to show up? test...
    var err error
    appKey, err := tag.NewKey("fn_appname")
    if err != nil {
        logrus.Fatal(err)
    }
    pathKey, err := tag.NewKey("fn_path")
    if err != nil {
        logrus.Fatal(err)
    }

    {
        uploadSizeMeasure, err = stats.Int64("s3_log_upload_size", "uploaded log size", "byte")
        if err != nil {
            logrus.Fatal(err)
        }
        v, err := view.New(
            "s3_log_upload_size",
            "uploaded log size",
            []tag.Key{appKey, pathKey},
            uploadSizeMeasure,
            view.DistributionAggregation{},
        )
        if err != nil {
            logrus.Fatalf("cannot create view: %v", err)
        }
        if err := v.Subscribe(); err != nil {
            logrus.Fatal(err)
        }
    }

    {
        downloadSizeMeasure, err = stats.Int64("s3_log_download_size", "downloaded log size", "byte")
        if err != nil {
            logrus.Fatal(err)
        }
        v, err := view.New(
            "s3_log_download_size",
            "downloaded log size",
            []tag.Key{appKey, pathKey},
            downloadSizeMeasure,
            view.DistributionAggregation{},
        )
        if err != nil {
            logrus.Fatalf("cannot create view: %v", err)
        }
        if err := v.Subscribe(); err != nil {
            logrus.Fatal(err)
        }
    }
}
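
The measure/view/record pattern used above, in isolation (a sketch against the opencensus API as vendored at this commit, where `stats.Int64` still returns an error and views are enabled via `Subscribe`; the metric name is illustrative):

```go
package example

import (
    "context"

    "github.com/sirupsen/logrus"
    "go.opencensus.io/stats"
    "go.opencensus.io/stats/view"
)

var sizeMeasure *stats.Int64Measure

func init() {
    var err error
    // 1. define the measure once
    sizeMeasure, err = stats.Int64("example_size", "example size", "byte")
    if err != nil {
        logrus.Fatal(err)
    }
    // 2. a view aggregates recorded values; without a subscribed view,
    //    recordings are dropped
    v, err := view.New("example_size", "example size", nil, sizeMeasure, view.DistributionAggregation{})
    if err != nil {
        logrus.Fatal(err)
    }
    if err := v.Subscribe(); err != nil {
        logrus.Fatal(err)
    }
}

// 3. record on the hot path; tags on ctx may label the data point
func record(ctx context.Context, n int64) {
    stats.Record(ctx, sizeMeasure.M(n))
}
```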

@@ -5,8 +5,9 @@ import (
    "fmt"
    "net/url"

    "go.opencensus.io/trace"

    "github.com/fnproject/fn/api/models"
    "github.com/opentracing/opentracing-go"
    "github.com/sirupsen/logrus"
)

@@ -43,19 +44,19 @@ type metricMQ struct {
}

func (m *metricMQ) Push(ctx context.Context, t *models.Call) (*models.Call, error) {
    span, ctx := opentracing.StartSpanFromContext(ctx, "mq_push")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "mq_push")
    defer span.End()
    return m.mq.Push(ctx, t)
}

func (m *metricMQ) Reserve(ctx context.Context) (*models.Call, error) {
    span, ctx := opentracing.StartSpanFromContext(ctx, "mq_reserve")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "mq_reserve")
    defer span.End()
    return m.mq.Reserve(ctx)
}

func (m *metricMQ) Delete(ctx context.Context, t *models.Call) error {
    span, ctx := opentracing.StartSpanFromContext(ctx, "mq_delete")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "mq_delete")
    defer span.End()
    return m.mq.Delete(ctx, t)
}

@@ -6,10 +6,11 @@ import (
    "net/http"

    "errors"
    "strings"

    "github.com/fnproject/fn/api"
    "github.com/fnproject/fn/api/models"
    "github.com/gin-gonic/gin"
    "strings"
)

// note: for backward compatibility, will go away later

@@ -1,60 +0,0 @@
package server

import (
    "github.com/opentracing/opentracing-go"
    "strings"
)

// FnTracer is a custom Tracer which wraps another tracer.
// Its main purpose is to wrap the underlying Span in a FnSpan,
// which adds some extra behaviour required for sending tracing spans to prometheus.
type FnTracer struct {
    opentracing.Tracer
}

// NewFnTracer returns a new FnTracer which wraps the specified Tracer
func NewFnTracer(t opentracing.Tracer) opentracing.Tracer {
    return &FnTracer{t}
}

// FnTracer implements opentracing.Tracer.
// Override StartSpan to wrap the returned Span in a FnSpan.
func (fnt FnTracer) StartSpan(operationName string, opts ...opentracing.StartSpanOption) opentracing.Span {
    return NewFnSpan(fnt.Tracer.StartSpan(operationName, opts...))
}

// FnSpan is a custom Span that wraps another span,
// adding some extra behaviour required for sending tracing spans to prometheus.
type FnSpan struct {
    opentracing.Span
}

// NewFnSpan returns a new FnSpan which wraps the specified Span
func NewFnSpan(s opentracing.Span) opentracing.Span {
    return &FnSpan{s}
}

// FnSpan implements opentracing.Span
func (fns FnSpan) Finish() {
    fns.copyBaggageItemsToTags()
    fns.Span.Finish()
}

// FnSpan implements opentracing.Span
func (fns FnSpan) FinishWithOptions(opts opentracing.FinishOptions) {
    fns.copyBaggageItemsToTags()
    fns.Span.FinishWithOptions(opts)
}

func (fns FnSpan) copyBaggageItemsToTags() {
    // copy baggage items (which are inherited from the parent) with keys starting with "fn" to tags;
    // the PrometheusCollector will send these to Prometheus.
    // We need to do this because the collector can't access baggage items, but it can access tags,
    // whereas here we can access the parent's baggage items, but not its tags.
    fns.Context().ForeachBaggageItem(func(k, v string) bool {
        if strings.HasPrefix(k, "fn") {
            fns.SetTag(k, v)
        }
        return true
    })
}
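
To illustrate the baggage-to-tag hop the removed code performed (standard opentracing API; span names and values are illustrative):

```go
package example

import opentracing "github.com/opentracing/opentracing-go"

func example() {
    parent := opentracing.StartSpan("serve_http")
    parent.SetBaggageItem("fn_appname", "myapp") // baggage propagates to children

    child := opentracing.StartSpan("ds_get_app", opentracing.ChildOf(parent.Context()))
    // when the global tracer was an FnTracer, child was an FnSpan, so Finish()
    // copied the inherited fn* baggage items onto tags the collector could read
    child.Finish()
    parent.Finish()
}
```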

@@ -13,9 +13,9 @@ import (
    "github.com/fnproject/fn/fnext"
    "github.com/gin-contrib/cors"
    "github.com/gin-gonic/gin"
    opentracing "github.com/opentracing/opentracing-go"
    "github.com/opentracing/opentracing-go/ext"
    "github.com/sirupsen/logrus"
    "go.opencensus.io/tag"
    "go.opencensus.io/trace"
)

func optionalCorsWrap(r *gin.Engine) {
@@ -37,20 +37,28 @@ func optionalCorsWrap(r *gin.Engine) {

// we should use http grr
func traceWrap(c *gin.Context) {
    // try to grab a span from the request if made from another service, ignore err if not
    wireContext, _ := opentracing.GlobalTracer().Extract(
        opentracing.HTTPHeaders,
        opentracing.HTTPHeadersCarrier(c.Request.Header))
    appKey, err := tag.NewKey("fn_appname")
    if err != nil {
        logrus.Fatal(err)
    }
    pathKey, err := tag.NewKey("fn_path")
    if err != nil {
        logrus.Fatal(err)
    }
    ctx, err := tag.New(c.Request.Context(),
        tag.Insert(appKey, c.Param(api.CApp)),
        tag.Insert(pathKey, c.Param(api.CRoute)),
    )
    if err != nil {
        logrus.Fatal(err)
    }

    // Create the span referring to the RPC client if available.
    // If wireContext == nil, a root span will be created.
    // TODO we should add more tags?
    serverSpan := opentracing.StartSpan("serve_http", ext.RPCServerOption(wireContext), opentracing.Tag{Key: "path", Value: c.Request.URL.Path})
    serverSpan.SetBaggageItem("fn_appname", c.Param(api.CApp))
    serverSpan.SetBaggageItem("fn_path", c.Param(api.CRoute))
    defer serverSpan.Finish()
    // TODO inspect opencensus more and see if we need to define a header ourselves
    // to trigger per-request spans (we will want this), we can set sampler here per request.

    ctx, serverSpan := trace.StartSpan(ctx, "serve_http")
    defer serverSpan.End()

    ctx := opentracing.ContextWithSpan(c.Request.Context(), serverSpan)
    c.Request = c.Request.WithContext(ctx)
    c.Next()
}
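
traceWrap is ordinary gin middleware; a sketch of where it would be mounted (the `newEngine` helper is hypothetical, the actual registration elsewhere in this file may differ):

```go
package server // sketch: alongside the code above

import "github.com/gin-gonic/gin"

// newEngine is hypothetical; it shows where traceWrap would be mounted.
func newEngine() *gin.Engine {
    engine := gin.New()
    engine.Use(traceWrap) // each request gets fn_appname/fn_path tags and a serve_http span
    return engine
}
```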

@@ -1,327 +0,0 @@
package server

import (
    "github.com/fnproject/fn/api/common"
    "github.com/openzipkin/zipkin-go-opentracing"
    "github.com/openzipkin/zipkin-go-opentracing/thrift/gen-go/zipkincore"
    "github.com/prometheus/client_golang/prometheus"
    "strconv"
    "strings"
    "sync"
    "time"
)

// PrometheusCollector is a custom Collector
// which sends Zipkin traces to Prometheus
type PrometheusCollector struct {
    lock sync.Mutex

    // Each span name is published as a separate Histogram metric,
    // using metric names of the form fn_span_<span-name>_duration_seconds

    // In this map, the key is the name of a Prometheus metric,
    // and the corresponding value is the HistogramVec used to report the duration of spans with this name to Prometheus
    histogramVecMap map[string]*prometheus.HistogramVec

    // In this map, the key is the name of a Prometheus metric,
    // and the corresponding value is the CounterVec used to report counter values logged on spans with this name to Prometheus
    counterVecMap map[string]*prometheus.CounterVec

    // In this map, the key is the name of a Prometheus metric,
    // and the corresponding value is the GaugeVec used to report gauge values logged on spans with this name to Prometheus
    gaugeVecMap map[string]*prometheus.GaugeVec

    // In this map, the key is the name of a Prometheus metric,
    // and the corresponding value is an array containing the label keys that were specified when the metric was first created
    registeredLabelKeysMap map[string][]string
}

// NewPrometheusCollector returns a new PrometheusCollector
func NewPrometheusCollector() (zipkintracer.Collector, error) {
    pc := &PrometheusCollector{
        histogramVecMap:        make(map[string]*prometheus.HistogramVec),
        counterVecMap:          make(map[string]*prometheus.CounterVec),
        gaugeVecMap:            make(map[string]*prometheus.GaugeVec),
        registeredLabelKeysMap: make(map[string][]string),
    }
    return pc, nil
}

// PrometheusCollector implements Collector.
func (pc *PrometheusCollector) Collect(span *zipkincore.Span) error {

    spanName := span.GetName()

    // extract any label values from the span
    labelKeysFromSpan, labelValuesFromSpan := getLabels(span)

    // report the duration of this span as a histogram
    // (unless the span name ends with SpannameSuffixDummy to denote it as being purely the carrier of a metric value and so of no interest in itself)
    if !strings.HasSuffix(spanName, common.SpannameSuffixDummy) {

        // get the HistogramVec for this span name
        histogramVec, labelValuesToUse := pc.getHistogramVec(
            ("fn_span_" + spanName + "_duration_seconds"), ("Span " + spanName + " duration, by span name"), labelKeysFromSpan, labelValuesFromSpan)

        // now report the span duration value
        histogramVec.With(labelValuesToUse).Observe((time.Duration(span.GetDuration()) * time.Microsecond).Seconds())
    }

    // now extract any logged histogram metric values from the span
    for key, value := range getLoggedHistogramMetrics(span) {

        // get the HistogramVec for this metric
        thisMetricHistogramVec, labelValuesToUse := pc.getHistogramVec(
            key, ("Metric " + key), labelKeysFromSpan, labelValuesFromSpan)

        // now report the metric value
        thisMetricHistogramVec.With(labelValuesToUse).Observe(value)
    }

    // now extract any logged counter metric values from the span
    for key, value := range getLoggedCounterMetrics(span) {

        // get the CounterVec for this metric
        thisMetricCounterVec, labelValuesToUse := pc.getCounterVec(
            key, ("Metric " + key), labelKeysFromSpan, labelValuesFromSpan)

        // now report the metric value
        thisMetricCounterVec.With(labelValuesToUse).Add(value)
    }

    // now extract any logged gauge metric values from the span
    for key, value := range getLoggedGaugeMetrics(span) {

        // get the GaugeVec for this metric
        thisMetricGaugeVec, labelValuesToUse := pc.getGaugeVec(
            key, ("Metric " + key), labelKeysFromSpan, labelValuesFromSpan)

        // now report the metric value
        thisMetricGaugeVec.With(labelValuesToUse).Add(value)
    }

    return nil
}

// Return (and create, if necessary) a HistogramVec for the specified Prometheus metric
func (pc *PrometheusCollector) getHistogramVec(
    metricName string, metricHelp string, labelKeysFromSpan []string, labelValuesFromSpan map[string]string) (
    *prometheus.HistogramVec, map[string]string) {

    var labelValuesToUse map[string]string

    pc.lock.Lock()
    defer pc.lock.Unlock()

    histogramVec, found := pc.histogramVecMap[metricName]
    if !found {
        // create a new HistogramVec
        histogramVec = prometheus.NewHistogramVec(
            prometheus.HistogramOpts{
                Name: metricName,
                Help: metricHelp,
            },
            labelKeysFromSpan,
        )
        pc.histogramVecMap[metricName] = histogramVec
        pc.registeredLabelKeysMap[metricName] = labelKeysFromSpan
        prometheus.MustRegister(histogramVec)
        labelValuesToUse = labelValuesFromSpan
    } else {
        // found an existing HistogramVec.
        // We need to be careful here, since we must supply the same label keys as when we first created the metric,
        // otherwise we will get an "inconsistent label cardinality" panic;
        // that's why we saved the original label keys in the registeredLabelKeysMap map,
        // so we can use that to construct a map of label key/value pairs to set on the metric
        labelValuesToUse = make(map[string]string)
        for _, thisRegisteredLabelKey := range pc.registeredLabelKeysMap[metricName] {
            if value, found := labelValuesFromSpan[thisRegisteredLabelKey]; found {
                labelValuesToUse[thisRegisteredLabelKey] = value
            } else {
                labelValuesToUse[thisRegisteredLabelKey] = ""
            }
        }
    }
    return histogramVec, labelValuesToUse
}

// Return (and create, if necessary) a CounterVec for the specified Prometheus metric
func (pc *PrometheusCollector) getCounterVec(
    metricName string, metricHelp string, labelKeysFromSpan []string, labelValuesFromSpan map[string]string) (
    *prometheus.CounterVec, map[string]string) {

    var labelValuesToUse map[string]string

    pc.lock.Lock()
    defer pc.lock.Unlock()

    counterVec, found := pc.counterVecMap[metricName]
    if !found {
        // create a new CounterVec
        counterVec = prometheus.NewCounterVec(
            prometheus.CounterOpts{
                Name: metricName,
                Help: metricHelp,
            },
            labelKeysFromSpan,
        )
        pc.counterVecMap[metricName] = counterVec
        pc.registeredLabelKeysMap[metricName] = labelKeysFromSpan
        prometheus.MustRegister(counterVec)
        labelValuesToUse = labelValuesFromSpan
    } else {
        // found an existing CounterVec.
        // We need to be careful here, since we must supply the same label keys as when we first created the metric,
        // otherwise we will get an "inconsistent label cardinality" panic;
        // that's why we saved the original label keys in the registeredLabelKeysMap map,
        // so we can use that to construct a map of label key/value pairs to set on the metric
        labelValuesToUse = make(map[string]string)
        for _, thisRegisteredLabelKey := range pc.registeredLabelKeysMap[metricName] {
            if value, found := labelValuesFromSpan[thisRegisteredLabelKey]; found {
                labelValuesToUse[thisRegisteredLabelKey] = value
            } else {
                labelValuesToUse[thisRegisteredLabelKey] = ""
            }
        }
    }
    return counterVec, labelValuesToUse
}

// Return (and create, if necessary) a GaugeVec for the specified Prometheus metric
func (pc *PrometheusCollector) getGaugeVec(
    metricName string, metricHelp string, labelKeysFromSpan []string, labelValuesFromSpan map[string]string) (
    *prometheus.GaugeVec, map[string]string) {

    var labelValuesToUse map[string]string

    pc.lock.Lock()
    defer pc.lock.Unlock()

    gaugeVec, found := pc.gaugeVecMap[metricName]
    if !found {
        // create a new GaugeVec
        gaugeVec = prometheus.NewGaugeVec(
            prometheus.GaugeOpts{
                Name: metricName,
                Help: metricHelp,
            },
            labelKeysFromSpan,
        )
        pc.gaugeVecMap[metricName] = gaugeVec
        pc.registeredLabelKeysMap[metricName] = labelKeysFromSpan
        prometheus.MustRegister(gaugeVec)
        labelValuesToUse = labelValuesFromSpan
    } else {
        // found an existing GaugeVec.
        // We need to be careful here, since we must supply the same label keys as when we first created the metric,
        // otherwise we will get an "inconsistent label cardinality" panic;
        // that's why we saved the original label keys in the registeredLabelKeysMap map,
        // so we can use that to construct a map of label key/value pairs to set on the metric
        labelValuesToUse = make(map[string]string)
        for _, thisRegisteredLabelKey := range pc.registeredLabelKeysMap[metricName] {
            if value, found := labelValuesFromSpan[thisRegisteredLabelKey]; found {
                labelValuesToUse[thisRegisteredLabelKey] = value
            } else {
                labelValuesToUse[thisRegisteredLabelKey] = ""
            }
        }
    }
    return gaugeVec, labelValuesToUse
}

// extract from the specified span the key/value pairs that we want to add as labels to the Prometheus metric for this span;
// returns an array of keys, and a map of key-value pairs
func getLabels(span *zipkincore.Span) ([]string, map[string]string) {

    var keys []string
    labelMap := make(map[string]string)

    // extract any tags whose key starts with "fn" from the span
    binaryAnnotations := span.GetBinaryAnnotations()
    for _, thisBinaryAnnotation := range binaryAnnotations {
        key := thisBinaryAnnotation.GetKey()
        if thisBinaryAnnotation.GetAnnotationType() == zipkincore.AnnotationType_STRING && strings.HasPrefix(key, "fn") {
            keys = append(keys, key)
            value := string(thisBinaryAnnotation.GetValue()[:])
            labelMap[key] = value
        }
    }

    return keys, labelMap
}

// extract from the span the logged histogram metric values.
// These are the ones whose names start with FieldnamePrefixHistogram,
// and whose values we assume are float64
func getLoggedHistogramMetrics(span *zipkincore.Span) map[string]float64 {

    keyValueMap := make(map[string]float64)

    // extract any annotations whose Value starts with FieldnamePrefixHistogram
    annotations := span.GetAnnotations()
    for _, thisAnnotation := range annotations {
        if strings.HasPrefix(thisAnnotation.GetValue(), common.FieldnamePrefixHistogram) {
            keyvalue := strings.Split(thisAnnotation.GetValue(), "=")
            if len(keyvalue) == 2 {
                if value, err := strconv.ParseFloat(keyvalue[1], 64); err == nil {
                    key := strings.TrimSpace(keyvalue[0])
                    key = common.FnPrefix + key[len(common.FieldnamePrefixHistogram):] // strip off the fieldname prefix, then prepend "fn_"
                    keyValueMap[key] = value
                }
            }
        }
    }
    return keyValueMap
}

// extract from the span the logged counter metric values.
// These are the ones whose names start with FieldnamePrefixCounter,
// and whose values we assume are float64
func getLoggedCounterMetrics(span *zipkincore.Span) map[string]float64 {

    keyValueMap := make(map[string]float64)

    // extract any annotations whose Value starts with FieldnamePrefixCounter
    annotations := span.GetAnnotations()
    for _, thisAnnotation := range annotations {
        if strings.HasPrefix(thisAnnotation.GetValue(), common.FieldnamePrefixCounter) {
            keyvalue := strings.Split(thisAnnotation.GetValue(), "=")
            if len(keyvalue) == 2 {
                if value, err := strconv.ParseFloat(keyvalue[1], 64); err == nil {
                    key := strings.TrimSpace(keyvalue[0])
                    key = common.FnPrefix + key[len(common.FieldnamePrefixCounter):] // strip off the fieldname prefix, then prepend "fn_"
                    keyValueMap[key] = value
                }
            }
        }
    }
    return keyValueMap
}

// extract from the span the logged gauge metric values.
// These are the ones whose names start with FieldnamePrefixGauge,
// and whose values we assume are float64
func getLoggedGaugeMetrics(span *zipkincore.Span) map[string]float64 {

    keyValueMap := make(map[string]float64)

    // extract any annotations whose Value starts with FieldnamePrefixGauge
    annotations := span.GetAnnotations()
    for _, thisAnnotation := range annotations {
        if strings.HasPrefix(thisAnnotation.GetValue(), common.FieldnamePrefixGauge) {
            keyvalue := strings.Split(thisAnnotation.GetValue(), "=")
            if len(keyvalue) == 2 {
                if value, err := strconv.ParseFloat(keyvalue[1], 64); err == nil {
                    key := strings.TrimSpace(keyvalue[0])
                    key = common.FnPrefix + key[len(common.FieldnamePrefixGauge):] // strip off the fieldname prefix, then prepend "fn_"
                    keyValueMap[key] = value
                }
            }
        }
    }
    return keyValueMap
}

// PrometheusCollector implements Collector.
func (*PrometheusCollector) Close() error { return nil }
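
The label bookkeeping above exists because client_golang fixes a Vec's label keys at creation time; a minimal sketch of the failure mode it guards against (metric and label names are illustrative):

```go
package example

import "github.com/prometheus/client_golang/prometheus"

func example() {
    hv := prometheus.NewHistogramVec(
        prometheus.HistogramOpts{Name: "fn_span_x_duration_seconds", Help: "x duration"},
        []string{"fn_appname", "fn_path"}, // label keys are fixed here, permanently
    )
    prometheus.MustRegister(hv)

    // ok: same keys as at creation
    hv.With(prometheus.Labels{"fn_appname": "a", "fn_path": "/p"}).Observe(0.1)

    // panics with "inconsistent label cardinality" (one key missing);
    // registeredLabelKeysMap above backfills missing keys with "" instead:
    // hv.With(prometheus.Labels{"fn_appname": "a"}).Observe(0.1)
}
```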

@@ -1,9 +0,0 @@
package server

import (
    "github.com/gin-gonic/gin"
)

func (s *Server) handlePrometheusMetrics(c *gin.Context) {
    s.agent.PromHandler().ServeHTTP(c.Writer, c.Request)
}

@@ -24,9 +24,13 @@ import (
    "github.com/fnproject/fn/api/version"
    "github.com/fnproject/fn/fnext"
    "github.com/gin-gonic/gin"
    opentracing "github.com/opentracing/opentracing-go"
    zipkintracer "github.com/openzipkin/zipkin-go-opentracing"
    zipkinhttp "github.com/openzipkin/zipkin-go/reporter/http"
    "github.com/sirupsen/logrus"
    "go.opencensus.io/exporter/prometheus"
    "go.opencensus.io/exporter/zipkin"
    "go.opencensus.io/plugin/ochttp"
    "go.opencensus.io/stats/view"
    "go.opencensus.io/trace"
)

const (
@@ -79,6 +83,7 @@ type Server struct {
    appListeners    *appListeners
    rootMiddlewares []fnext.Middleware
    apiMiddlewares  []fnext.Middleware
    promExporter    *prometheus.Exporter
}

func nodeTypeFromString(value string) ServerNodeType {
@@ -226,8 +231,8 @@ func WithAgent(agent agent.Agent) ServerOption {

// New creates a new Functions server with the opts given. For convenience, users may
// prefer to use NewFromEnv but New is more flexible if needed.
func New(ctx context.Context, opts ...ServerOption) *Server {
    span, ctx := opentracing.StartSpanFromContext(ctx, "server_init")
    defer span.Finish()
    ctx, span := trace.StartSpan(ctx, "server_init")
    defer span.End()

    log := common.Logger(ctx)
    s := &Server{
@@ -290,50 +295,35 @@ func New(ctx context.Context, opts ...ServerOption) *Server {
func WithTracer(zipkinURL string) ServerOption {
    return func(ctx context.Context, s *Server) error {
        var (
            debugMode       = false
            serviceName     = "fnserver"
            serviceHostPort = "localhost:8080" // meh
            // TODO add server identifier to this crap
            //debugMode = false
            //serviceName = "fnserver"
            //serviceHostPort = "localhost:8080" // meh
            zipkinHTTPEndpoint = zipkinURL
            // ex: "http://zipkin:9411/api/v1/spans"
            // ex: "http://zipkin:9411/api/v2/spans"
        )

        var collector zipkintracer.Collector

        // custom Zipkin collector to send tracing spans to Prometheus
        promCollector, promErr := NewPrometheusCollector()
        if promErr != nil {
            logrus.WithError(promErr).Fatalln("couldn't start Prometheus trace collector")
        }

        logger := zipkintracer.LoggerFunc(func(i ...interface{}) error { logrus.Error(i...); return nil })

        if zipkinHTTPEndpoint != "" {
            // Custom PrometheusCollector and Zipkin HTTPCollector
            httpCollector, zipErr := zipkintracer.NewHTTPCollector(zipkinHTTPEndpoint,
                zipkintracer.HTTPLogger(logger), zipkintracer.HTTPMaxBacklog(1000),
            )
            if zipErr != nil {
                logrus.WithError(zipErr).Fatalln("couldn't start Zipkin trace collector")
            }
            collector = zipkintracer.MultiCollector{httpCollector, promCollector}
        } else {
            // Custom PrometheusCollector only
            collector = promCollector
            reporter := zipkinhttp.NewReporter(zipkinURL, zipkinhttp.MaxBacklog(10000))
            exporter := zipkin.NewExporter(reporter, nil)
            trace.RegisterExporter(exporter)
            logrus.WithFields(logrus.Fields{"url": zipkinHTTPEndpoint}).Info("exporting spans to zipkin")

            // TODO don't do this. testing parity.
            trace.SetDefaultSampler(trace.AlwaysSample())
        }

        ziptracer, err := zipkintracer.NewTracer(zipkintracer.NewRecorder(collector, debugMode, serviceHostPort, serviceName),
            zipkintracer.ClientServerSameSpan(true),
            zipkintracer.TraceID128Bit(true),
        )
        // TODO we can keep this on *Server and unregister it in Close()... can finagle later. same for tracer
        exporter, err := prometheus.NewExporter(prometheus.Options{
            Namespace: "fn",
            OnError:   func(err error) { logrus.WithError(err).Error("opencensus prometheus exporter err") },
        })
        if err != nil {
            logrus.WithError(err).Fatalln("couldn't start tracer")
            logrus.Fatal(err)
        }
        s.promExporter = exporter
        view.RegisterExporter(exporter)

        // wrap the Zipkin tracer in a FnTracer which will also send spans to Prometheus
        fntracer := NewFnTracer(ziptracer)

        opentracing.SetGlobalTracer(fntracer)
        logrus.WithFields(logrus.Fields{"url": zipkinHTTPEndpoint}).Info("started tracer")
        return nil
    }
}
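
A usage sketch of the reworked option (the zipkin URL is illustrative; per the new code above, passing "" skips the zipkin exporter but still registers the opencensus Prometheus exporter):

```go
package example

import (
    "context"

    "github.com/fnproject/fn/api/server"
)

func build(ctx context.Context) *server.Server {
    return server.New(ctx,
        server.WithTracer("http://zipkin:9411/api/v2/spans"),
    )
}
```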

@@ -404,7 +394,8 @@ func (s *Server) startGears(ctx context.Context, cancel context.CancelFunc) {

    server := http.Server{
        Addr:    listen,
        Handler: s.Router,
        Handler: &ochttp.Handler{Handler: s.Router},

        // TODO we should set read/write timeouts
    }
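
ochttp.Handler is opencensus's net/http middleware; conceptually it takes over the incoming-trace extraction the old opentracing traceWrap did by hand. A standalone sketch (standard library plus the vendored ochttp):

```go
package example

import (
    "net/http"

    "go.opencensus.io/plugin/ochttp"
)

func serve(h http.Handler) error {
    // starts a server span per request and propagates any incoming
    // trace context into r.Context() for downstream spans
    return http.ListenAndServe(":8080", &ochttp.Handler{Handler: h})
}
```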

@@ -438,8 +429,11 @@ func (s *Server) bindHandlers(ctx context.Context) {

    engine.GET("/", handlePing)
    engine.GET("/version", handleVersion)
    // TODO: move the following under v1
    engine.GET("/metrics", s.handlePrometheusMetrics)

    // TODO: move under v1 ?
    if s.promExporter != nil {
        engine.GET("/metrics", gin.WrapH(s.promExporter))
    }

    profilerSetup(engine, "/debug")
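
Once mounted, the exporter serves the Prometheus text format over plain HTTP; a scrape sketch (the address assumes the default localhost:8080 listen config mentioned above):

```go
package example

import (
    "io"
    "net/http"
    "os"
)

func scrape() error {
    resp, err := http.Get("http://localhost:8080/metrics")
    if err != nil {
        return err
    }
    defer resp.Body.Close()
    _, err = io.Copy(os.Stdout, resp.Body) // fn_* metrics in Prometheus text format
    return err
}
```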