add per call stats field as histogram (#528)

* add per call stats field as histogram

this will add a histogram of up to 240 data points of call data, produced
every second, stored at the end of a call invocation in the db. the same
metrics are also still shipped to prometheus (prometheus has the
not-potentially-reduced version). for the API reference, see the updates to
the swagger spec, this is just added onto the get call endpoint.

this does not add any extra db calls and the field for stats in call is a json
blob, which is easily modified to add / omit future fields. this is just
tacked on to the call we're making to InsertCall, and expect this to add very
little overhead; we are bounding the set to be relatively small, planning to
clean out the db of calls periodically, functions will generally be short, and
the same code used at a previous firm did not cause a notable db size increase
with production workload that is worse, wrt histogram size (I checked). the
code changes are really small aside from changing to strfmt.DateTime,
adding a migration and implementing sql.Valuer; needed to slightly modify the
swap function so that we can safely read `call.Stats` field to upload at end.

with the full histogram in hand, we can compute max/min/average/median/growth
rate/bernoulli distributions/whatever very easily in a UI or tooling. in
particular, this data is easily chartable [for a UI], which is beneficial.

* adds swagger spec of api update to calls endpoint
* adds migration for call.stats field
* adds call.stats field to sql queries
* change swapping of hot logger to exec, so we know that call.Stats is no
longer being modified after `exec` [in call.End]
* throws out docker stats between function invocations in hot functions (no
call to store them on, we could change this later for debug; they're in prom)
* tested in tests and API

closes #19

* add format of ints to swag
This commit is contained in:
Reed Allman
2017-11-27 08:52:53 -06:00
committed by GitHub
parent d8c41aac9d
commit c9198b8525
12 changed files with 202 additions and 32 deletions

View File

@@ -428,8 +428,9 @@ func (s *hotSlot) exec(ctx context.Context, call *call) error {
// link the container id and id in the logs [for us!]
common.Logger(ctx).WithField("container_id", s.container.id).Info("starting call")
// swap in the new stderr logger
s.container.swap(call.stderr)
// swap in the new stderr logger & stat accumulator
oldStderr := s.container.swap(call.stderr, &call.Stats)
defer s.container.swap(oldStderr, nil) // once we're done, swap out in this scope to prevent races
errApp := make(chan error, 1)
go func() {
@@ -442,8 +443,7 @@ func (s *hotSlot) exec(ctx context.Context, call *call) error {
select {
case err := <-s.errC: // error from container
return err
case err := <-errApp:
// would be great to be able to decipher what error is returning from here so we can show better messages
case err := <-errApp: // from dispatch
return err
case <-ctx.Done(): // call timeout
return ctx.Err()
@@ -488,6 +488,7 @@ func (a *agent) prepCold(ctx context.Context, slots chan<- slot, call *call, tok
stdin: call.req.Body,
stdout: call.w,
stderr: call.stderr,
stats: &call.Stats,
}
// pull & create container before we return a slot, so as to be friendly
@@ -605,7 +606,6 @@ func (a *agent) runHot(ctxArg context.Context, slots chan<- slot, call *call, to
// wait for this call to finish
// NOTE do NOT select with shutdown / other channels. slot handles this.
<-done
container.swap(stderr) // log between tasks
}
}()
@@ -634,14 +634,25 @@ type container struct {
stdin io.Reader
stdout io.Writer
stderr io.Writer
// lock protects the swap and any fields that need to be swapped
sync.Mutex
stats *drivers.Stats
}
func (c *container) swap(stderr io.Writer) {
func (c *container) swap(stderr io.Writer, cs *drivers.Stats) (old io.Writer) {
c.Lock()
defer c.Unlock()
// TODO meh, maybe shouldn't bury this
old = c.stderr
gw, ok := c.stderr.(*ghostWriter)
if ok {
gw.swap(stderr)
old = gw.swap(stderr)
}
c.stats = cs
return old
}
func (c *container) Id() string { return c.id }
@@ -665,6 +676,12 @@ func (c *container) WriteStat(ctx context.Context, stat drivers.Stat) {
for key, value := range stat.Metrics {
span.LogFields(log.Uint64("fn_"+key, value))
}
c.Lock()
defer c.Unlock()
if c.stats != nil {
*(c.stats) = append(*(c.stats), stat)
}
}
//func (c *container) DockerAuth() (docker.AuthConfiguration, error) {
@@ -679,10 +696,12 @@ type ghostWriter struct {
inner io.Writer
}
func (g *ghostWriter) swap(w io.Writer) {
func (g *ghostWriter) swap(w io.Writer) (old io.Writer) {
g.Lock()
old = g.inner
g.inner = w
g.Unlock()
return old
}
func (g *ghostWriter) Write(b []byte) (int, error) {

View File

@@ -9,6 +9,7 @@ import (
"strings"
"time"
"github.com/fnproject/fn/api/agent/drivers"
"github.com/fnproject/fn/api/common"
"github.com/fnproject/fn/api/id"
"github.com/fnproject/fn/api/models"
@@ -347,6 +348,9 @@ func (c *call) End(ctx context.Context, errIn error, t callTrigger) error {
// XXX (reed): delete MQ message, eventually
}
// ensure stats histogram is reasonably bounded
c.Call.Stats = drivers.Decimate(240, c.Call.Stats)
// this means that we could potentially store an error / timeout status for a
// call that ran successfully [by a user's perspective]
// TODO: this should be update, really

View File

@@ -16,6 +16,7 @@ import (
"github.com/fnproject/fn/api/common"
"github.com/fnproject/fn/api/models"
"github.com/fsouza/go-dockerclient"
"github.com/go-openapi/strfmt"
"github.com/opentracing/opentracing-go"
"github.com/sirupsen/logrus"
)
@@ -366,10 +367,9 @@ func (drv *DockerDriver) collectStats(ctx context.Context, stopSignal <-chan str
return
}
stats := cherryPick(ds)
if !stats.Timestamp.IsZero() {
if !time.Time(stats.Timestamp).IsZero() {
task.WriteStat(ctx, stats)
}
}
}
}
@@ -405,7 +405,7 @@ func cherryPick(ds *docker.Stats) drivers.Stat {
}
return drivers.Stat{
Timestamp: ds.Read,
Timestamp: strfmt.DateTime(ds.Read),
Metrics: map[string]uint64{
// source: https://godoc.org/github.com/fsouza/go-dockerclient#Stats
// ex (for future expansion): {"read":"2016-08-03T18:08:05Z","pids_stats":{},"network":{},"networks":{"eth0":{"rx_bytes":508,"tx_packets":6,"rx_packets":6,"tx_bytes":508}},"memory_stats":{"stats":{"cache":16384,"pgpgout":281,"rss":8826880,"pgpgin":2440,"total_rss":8826880,"hierarchical_memory_limit":536870912,"total_pgfault":3809,"active_anon":8843264,"total_active_anon":8843264,"total_pgpgout":281,"total_cache":16384,"pgfault":3809,"total_pgpgin":2440},"max_usage":8953856,"usage":8953856,"limit":536870912},"blkio_stats":{"io_service_bytes_recursive":[{"major":202,"op":"Read"},{"major":202,"op":"Write"},{"major":202,"op":"Sync"},{"major":202,"op":"Async"},{"major":202,"op":"Total"}],"io_serviced_recursive":[{"major":202,"op":"Read"},{"major":202,"op":"Write"},{"major":202,"op":"Sync"},{"major":202,"op":"Async"},{"major":202,"op":"Total"}]},"cpu_stats":{"cpu_usage":{"percpu_usage":[47641874],"usage_in_usermode":30000000,"total_usage":47641874},"system_cpu_usage":8880800500000000,"throttling_data":{}},"precpu_stats":{"cpu_usage":{"percpu_usage":[44946186],"usage_in_usermode":30000000,"total_usage":44946186},"system_cpu_usage":8880799510000000,"throttling_data":{}}}

View File

@@ -3,10 +3,16 @@
package drivers
import (
"bytes"
"context"
"database/sql/driver"
"encoding/json"
"fmt"
"io"
"strings"
"time"
"github.com/go-openapi/strfmt"
)
// A DriverCookie identifies a unique request to run a task.
@@ -109,8 +115,50 @@ type ContainerTask interface {
// Stat is a bucket of stats from a driver at a point in time for a certain task.
type Stat struct {
Timestamp time.Time
Metrics map[string]uint64
Timestamp strfmt.DateTime `json:"timestamp"`
Metrics map[string]uint64 `json:"metrics"`
}
// Stats is a list of Stat, notably implements sql.Valuer
type Stats []Stat
// implements sql.Valuer, returning a string
func (s Stats) Value() (driver.Value, error) {
if len(s) < 1 {
return driver.Value(string("")), nil
}
var b bytes.Buffer
err := json.NewEncoder(&b).Encode(s)
// return a string type
return driver.Value(b.String()), err
}
// implements sql.Scanner
func (s *Stats) Scan(value interface{}) error {
if value == nil {
*s = nil
return nil
}
bv, err := driver.String.ConvertValue(value)
if err == nil {
var b []byte
switch x := bv.(type) {
case []byte:
b = x
case string:
b = []byte(x)
}
if len(b) > 0 {
return json.Unmarshal(b, s)
}
*s = nil
return nil
}
// otherwise, return an error
return fmt.Errorf("stats invalid db format: %T %T value, err: %v", value, bv, err)
}
// TODO: ensure some type is applied to these statuses.
@@ -149,15 +197,15 @@ func average(samples []Stat) (Stat, bool) {
s := Stat{
Metrics: samples[0].Metrics, // Recycle Metrics map from first sample
}
t := samples[0].Timestamp.UnixNano() / int64(l)
t := time.Time(samples[0].Timestamp).UnixNano() / int64(l)
for _, sample := range samples[1:] {
t += sample.Timestamp.UnixNano() / int64(l)
t += time.Time(sample.Timestamp).UnixNano() / int64(l)
for k, v := range sample.Metrics {
s.Metrics[k] += v
}
}
s.Timestamp = time.Unix(0, t)
s.Timestamp = strfmt.DateTime(time.Unix(0, t))
for k, v := range s.Metrics {
s.Metrics[k] = v / uint64(l)
}
@@ -183,8 +231,8 @@ func Decimate(maxSamples int, stats []Stat) []Stat {
return nil
}
start := stats[0].Timestamp
window := stats[len(stats)-1].Timestamp.Sub(start) / time.Duration(maxSamples)
start := time.Time(stats[0].Timestamp)
window := time.Time(stats[len(stats)-1].Timestamp).Sub(start) / time.Duration(maxSamples)
nextEntry, current := 0, start // nextEntry is the index tracking next Stats record location
for x := 0; x < len(stats); {
@@ -192,7 +240,7 @@ func Decimate(maxSamples int, stats []Stat) []Stat {
var samples []Stat
for offset := 0; x+offset < len(stats); offset++ { // Iterate through samples until out of window
if !isLastEntry && stats[x+offset].Timestamp.After(current.Add(window)) {
if !isLastEntry && time.Time(stats[x+offset].Timestamp).After(current.Add(window)) {
break
}
samples = stats[x : x+offset+1]

View File

@@ -3,6 +3,8 @@ package drivers
import (
"testing"
"time"
"github.com/go-openapi/strfmt"
)
func TestAverage(t *testing.T) {
@@ -10,7 +12,7 @@ func TestAverage(t *testing.T) {
stats := make([]Stat, 10)
for i := 0; i < len(stats); i++ {
stats[i] = Stat{
Timestamp: start.Add(time.Duration(i) * time.Minute),
Timestamp: strfmt.DateTime(start.Add(time.Duration(i) * time.Minute)),
Metrics: map[string]uint64{"x": uint64(i)},
}
}
@@ -26,7 +28,7 @@ func TestAverage(t *testing.T) {
}
expectedT := time.Unix(1470873870, 0)
if res.Timestamp != expectedT {
if time.Time(res.Timestamp) != expectedT {
t.Error("Actual average didn't match expected", "actual", res.Timestamp, "expected", expectedT)
}
}
@@ -36,10 +38,9 @@ func TestDecimate(t *testing.T) {
stats := make([]Stat, 480)
for i := range stats {
stats[i] = Stat{
Timestamp: start.Add(time.Duration(i) * time.Second),
Timestamp: strfmt.DateTime(start.Add(time.Duration(i) * time.Second)),
Metrics: map[string]uint64{"x": uint64(i)},
}
// t.Log(stats[i])
}
stats = Decimate(240, stats)
@@ -54,7 +55,7 @@ func TestDecimate(t *testing.T) {
stats = make([]Stat, 700)
for i := range stats {
stats[i] = Stat{
Timestamp: start.Add(time.Duration(i) * time.Second),
Timestamp: strfmt.DateTime(start.Add(time.Duration(i) * time.Second)),
Metrics: map[string]uint64{"x": uint64(i)},
}
}
@@ -66,7 +67,7 @@ func TestDecimate(t *testing.T) {
stats = make([]Stat, 300)
for i := range stats {
stats[i] = Stat{
Timestamp: start.Add(time.Duration(i) * time.Second),
Timestamp: strfmt.DateTime(start.Add(time.Duration(i) * time.Second)),
Metrics: map[string]uint64{"x": uint64(i)},
}
}
@@ -82,7 +83,7 @@ func TestDecimate(t *testing.T) {
start = start.Add(20 * time.Minute)
}
stats[i] = Stat{
Timestamp: start.Add(time.Duration(i) * time.Second),
Timestamp: strfmt.DateTime(start.Add(time.Duration(i) * time.Second)),
Metrics: map[string]uint64{"x": uint64(i)},
}
}