adds wait-time-based scaling across nodes

this works by having the functions server kick back an XXX-FXLB-WAIT header
on every response, carrying the wait time for that function to start. the lb
then tracks, per node+function, an ewma over roughly the last 10 requests'
wait times (to reduce jitter). now that we don't have max concurrency it's
actually pretty challenging to get the wait time to tick up. i expect in the
near future we will be throttling functions on a given node in order to
induce this, but that is for another day as that code needs a lot of
reworking. i tested this by introducing some arbitrary throttling (not
checked in) and load spreads over nodes correctly (see images). we will also
need to play with the thresholds we want to use: if you have a func with a
50ms run time, then basically 10 of those will rev up another node (this was
before removing max_c, with max_c=1). in any event, this wires in the basic
plumbing.
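
for reference, a minimal runnable sketch of the smoothing (DECAY and ewma
mirror the ewma() added in the lb diff below; the sample wait times are made
up):

    package main

    import (
        "fmt"
        "time"
    )

    // DECAY of 0.1 gives a time constant of ~10 samples, i.e. the estimate
    // roughly tracks the last 10 wait times and smooths per-request jitter.
    const DECAY = 0.1

    func ewma(old, new int64) int64 {
        return int64(float64(new)*DECAY + float64(old)*(1-DECAY))
    }

    func main() {
        var est int64
        samples := []time.Duration{
            100 * time.Millisecond, 100 * time.Millisecond, 100 * time.Millisecond,
            2 * time.Second, // a docker pull / cold start spike
            100 * time.Millisecond,
        }
        for _, s := range samples {
            est = ewma(est, int64(s))
        }
        // the spike nudges the estimate up without dominating it
        fmt.Println(time.Duration(est))
    }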

* make docs great again. renamed lb dir to fnlb
* added wait time to dashboard
* wired in a ready channel to await the first pull, so hot images count it in
the wait time (should be otherwise useful); see the sketch below
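
roughly how the ready channel and wait time fit together, as a standalone
sketch reusing names from the diffs below (the sleep stands in for a docker
pull; this is not the real runner):

    package main

    import (
        "fmt"
        "time"
    )

    // Config mirrors the two fields added to task.Config below.
    type Config struct {
        ReceivedTime time.Time
        Ready        chan struct{} // closed once, just before first execution
    }

    // run uses the close-once select from the runner diff: receiving from a
    // closed channel succeeds immediately, so the default branch only fires
    // (and closes Ready) on the first pass.
    func run(cfg *Config) time.Time {
        select {
        case <-cfg.Ready:
        default:
            close(cfg.Ready)
        }
        return time.Now() // becomes the task's StartTime
    }

    func main() {
        cfg := &Config{ReceivedTime: time.Now(), Ready: make(chan struct{})}
        time.Sleep(50 * time.Millisecond) // stand-in for docker pull / queueing
        start := run(cfg)
        // this duration is what the server writes into the XXX-FXLB-WAIT header
        fmt.Println(start.Sub(cfg.ReceivedTime))
    }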

future:
TODO rework lb code api to be pluggable + wire in data store
TODO toss out the first data point containing the pull so we don't jump onto
another node immediately (maybe this is actually a good thing?)
Reed Allman
2017-05-22 14:03:03 -07:00
parent c23d893da6
commit 75c5e83936
15 changed files with 285 additions and 116 deletions

View File

@@ -47,6 +47,7 @@ func getCfg(t *models.Task) *task.Config {
     ID:      t.ID,
     AppName: t.AppName,
     Env:     t.EnvVars,
+    Ready:   make(chan struct{}),
 }
 if t.Timeout == nil || *t.Timeout <= 0 {
     cfg.Timeout = DefaultTimeout

View File

@@ -60,7 +60,8 @@ type Auther interface {
 type runResult struct {
     error
-    StatusValue string
+    status string
+    start  time.Time
 }

 func (r *runResult) Error() string {
@@ -70,8 +71,9 @@ func (r *runResult) Error() string {
     return r.error.Error()
 }

-func (r *runResult) Status() string { return r.StatusValue }
+func (r *runResult) Status() string { return r.status }
 func (r *runResult) UserVisible() bool { return common.IsUserVisibleError(r.error) }
+func (r *runResult) StartTime() time.Time { return r.start }

 type DockerDriver struct {
     conf drivers.Config
@@ -409,6 +411,8 @@ func (drv *DockerDriver) run(ctx context.Context, container string, task drivers
         return nil, err
     }

+    start := time.Now()
+
     err = drv.startTask(ctx, container)
     if err != nil {
         return nil, err
@@ -429,8 +433,9 @@ func (drv *DockerDriver) run(ctx context.Context, container string, task drivers
     status, err := drv.status(ctx, container)
     return &runResult{
-        StatusValue: status,
-        error:       err,
+        start:  start,
+        status: status,
+        error:  err,
     }, nil
 }

View File

@@ -52,6 +52,11 @@ type RunResult interface {
     // Status should return the current status of the task.
     // Only valid options are {"error", "success", "timeout", "killed", "cancelled"}.
     Status() string
+
+    // StartTime returns the time just before beginning execution of a task,
+    // for example including the time to pull a container image and doing any
+    // other setup. This should not include a container's execution duration.
+    StartTime() time.Time
 }

 // The ContainerTask interface guides task execution across a wide variety of

View File

@@ -3,6 +3,7 @@ package mock
 import (
     "context"
     "fmt"
+    "time"

     "gitlab-odx.oracle.com/odx/functions/api/runner/drivers"
 )
@@ -31,16 +32,17 @@ func (c *cookie) Run(ctx context.Context) (drivers.RunResult, error) {
         return nil, fmt.Errorf("Mocker error! Bad.")
     }
     return &runResult{
-        error:       nil,
-        StatusValue: "success",
+        error:  nil,
+        status: "success",
+        start:  time.Now(),
     }, nil
 }

 type runResult struct {
     error
-    StatusValue string
+    status string
+    start  time.Time
 }

-func (runResult *runResult) Status() string {
-    return runResult.StatusValue
-}
+func (r *runResult) Status() string       { return r.status }
+func (r *runResult) StartTime() time.Time { return r.start }

View File

@@ -211,6 +211,12 @@ func (r *Runner) run(ctx context.Context, cfg *task.Config) (drivers.RunResult,
     }
     defer cookie.Close()

+    select {
+    case <-cfg.Ready:
+    default:
+        close(cfg.Ready)
+    }
+
     metricStart := time.Now()

     result, err := cookie.Run(ctx)

View File

@@ -37,6 +37,7 @@ func TestRunnerHello(t *testing.T) {
     ID:      fmt.Sprintf("hello-%d-%d", i, time.Now().Unix()),
     Image:   test.route.Image,
     Timeout: 10 * time.Second,
+    Ready:   make(chan struct{}),
     Stdin:   strings.NewReader(test.payload),
     Stdout:  &stdout,
     Stderr:  &stderr,
@@ -90,6 +91,7 @@ func TestRunnerError(t *testing.T) {
     ID:      fmt.Sprintf("err-%d-%d", i, time.Now().Unix()),
     Image:   test.route.Image,
     Timeout: 10 * time.Second,
+    Ready:   make(chan struct{}),
     Stdin:   strings.NewReader(test.payload),
     Stdout:  &stdout,
     Stderr:  &stderr,

View File

@@ -9,15 +9,18 @@ import (
 )

 type Config struct {
     ID           string
     Path         string
     Image        string
     Timeout      time.Duration
     IdleTimeout  time.Duration
     AppName      string
     Memory       uint64
     Env          map[string]string
     Format       string
+    ReceivedTime time.Time
+    // Ready is used to await the first pull
+    Ready chan struct{}

     Stdin  io.Reader
     Stdout io.Writer

View File

@@ -94,6 +94,7 @@ func (rnr *Runner) RunTask(ctx context.Context, cfg *task.Config) (drivers.RunRe
     } else {
         tasks <- treq
     }
+
     resp := <-treq.Response
     return resp.Result, resp.Err
 }
@@ -256,6 +257,14 @@ func (hc *htfn) serve(ctx context.Context) {
     go func() {
         for {
+            select {
+            case <-lctx.Done():
+            case <-cfg.Ready:
+                // on first execution, wait before starting idle timeout / stopping wait time clock,
+                // since docker pull / container create need to happen.
+                // XXX (reed): should we still obey the task timeout? docker image could be 8GB...
+            }
+
             select {
             case <-lctx.Done():
                 return
@@ -263,6 +272,7 @@ func (hc *htfn) serve(ctx context.Context) {
                 logger.Info("Canceling inactive hot function")
                 cancel()
             case t := <-hc.tasks:
+                start := time.Now()
                 err := hc.proto.Dispatch(lctx, t)
                 status := "success"
                 if err != nil {
@@ -272,8 +282,8 @@ func (hc *htfn) serve(ctx context.Context) {
                 hc.once()
                 t.Response <- task.Response{
-                    &runResult{StatusValue: status, error: err},
-                    err,
+                    Result: &runResult{start: start, status: status, error: err},
+                    Err:    err,
                 }
             }
         }
@@ -304,7 +314,8 @@ func runTaskReq(rnr *Runner, t task.Request) {
 type runResult struct {
     error
-    StatusValue string
+    status string
+    start  time.Time
 }

 func (r *runResult) Error() string {
@@ -314,4 +325,5 @@ func (r *runResult) Error() string {
     return r.error.Error()
 }

-func (r *runResult) Status() string { return r.StatusValue }
+func (r *runResult) Status() string { return r.status }
+func (r *runResult) StartTime() time.Time { return r.start }

View File

@@ -193,17 +193,19 @@ func (s *Server) serve(ctx context.Context, c *gin.Context, appName string, foun
     }

     cfg := &task.Config{
         AppName:      appName,
         Path:         found.Path,
         Env:          envVars,
         Format:       found.Format,
         ID:           reqID,
         Image:        found.Image,
         Memory:       found.Memory,
         Stdin:        payload,
         Stdout:       &stdout,
         Timeout:      time.Duration(found.Timeout) * time.Second,
         IdleTimeout:  time.Duration(found.IdleTimeout) * time.Second,
+        ReceivedTime: time.Now(),
+        Ready:        make(chan struct{}),
     }

     // ensure valid values
@@ -244,6 +246,11 @@ func (s *Server) serve(ctx context.Context, c *gin.Context, appName string, foun
     default:
         result, err := s.Runner.RunTrackedTask(newTask, ctx, cfg, s.Datastore)
+        if result != nil {
+            waitTime := result.StartTime().Sub(cfg.ReceivedTime)
+            c.Header("XXX-FXLB-WAIT", waitTime.String())
+        }
+
         if err != nil {
             c.JSON(http.StatusInternalServerError, runnerResponse{
                 RequestID: cfg.ID,
@@ -254,6 +261,7 @@ func (s *Server) serve(ctx context.Context, c *gin.Context, appName string, foun
             log.WithError(err).Error("Failed to run task")
             break
         }
+
         for k, v := range found.Headers {
             c.Header(k, v[0])
         }

View File

@@ -25,11 +25,36 @@ type consistentHash struct {
 func newCH() *consistentHash {
     return &consistentHash{
-        rng:  rand.New(rand.NewSource(time.Now().Unix())),
+        rng:  rand.New(&lockedSource{src: rand.NewSource(time.Now().Unix()).(rand.Source64)}),
         load: make(map[string]*int64),
     }
 }

+type lockedSource struct {
+    lk  sync.Mutex
+    src rand.Source64
+}
+
+func (r *lockedSource) Int63() (n int64) {
+    r.lk.Lock()
+    n = r.src.Int63()
+    r.lk.Unlock()
+    return n
+}
+
+func (r *lockedSource) Uint64() (n uint64) {
+    r.lk.Lock()
+    n = r.src.Uint64()
+    r.lk.Unlock()
+    return n
+}
+
+func (r *lockedSource) Seed(seed int64) {
+    r.lk.Lock()
+    r.src.Seed(seed)
+    r.lk.Unlock()
+}
+
 func (ch *consistentHash) add(newb string) {
     ch.Lock()
     defer ch.Unlock()
@@ -84,12 +109,22 @@ func jumpConsistentHash(key uint64, num_buckets int32) int32 {
     return int32(b)
 }

+// tracks last 10 samples (very fast)
+const DECAY = 0.1
+
+func ewma(old, new int64) int64 {
+    // TODO could 'warm' it up and drop first few samples since we'll have docker pulls / hot starts
+    return int64((float64(new) * DECAY) + (float64(old) * (1 - DECAY)))
+}
+
 func (ch *consistentHash) setLoad(key string, load int64) {
     ch.loadMu.RLock()
     l, ok := ch.load[key]
     ch.loadMu.RUnlock()
     if ok {
-        atomic.StoreInt64(l, load)
+        // this is a lossy ewma w/ or w/o CAS but if things are moving fast we have plenty of sample
+        prev := atomic.LoadInt64(l)
+        atomic.StoreInt64(l, ewma(prev, load))
     } else {
         ch.loadMu.Lock()
         if _, ok := ch.load[key]; !ok {
@@ -117,50 +152,62 @@ func (ch *consistentHash) besti(key string, i int) (string, error) {
     }

     f := func(n string) string {
-        var load int64
+        var load time.Duration
         ch.loadMu.RLock()
         loadPtr := ch.load[loadKey(n, key)]
         ch.loadMu.RUnlock()
         if loadPtr != nil {
-            load = atomic.LoadInt64(loadPtr)
+            load = time.Duration(atomic.LoadInt64(loadPtr))
         }

+        const (
+            lowerLat = 500 * time.Millisecond
+            upperLat = 2 * time.Second
+        )
+
-        // TODO flesh out these values. should be wait times.
+        // TODO flesh out these values.
         // if we send < 50% of traffic off to other nodes when loaded
         // then as function scales nodes will get flooded, need to be careful.
         //
         // back off loaded node/function combos slightly to spread load
         // TODO do we need a kind of ref counter as well so as to send functions
         // to a different node while there's an outstanding call to another?
-        if load < 70 {
+        if load < lowerLat {
             return n
-        } else if load > 90 {
-            if ch.rng.Intn(100) < 60 {
+        } else if load > upperLat {
+            // really loaded
+            // XXX (reed): 10% could be problematic, should sliding scale prob with log(x) ?
+            if ch.rng.Intn(100) < 10 {
                 return n
             }
-        } else if load > 70 {
-            if ch.rng.Float64() < 80 {
+        } else {
+            // 10 < x < 40, as load approaches upperLat, x decreases [linearly]
+            x := translate(int64(load), int64(lowerLat), int64(upperLat), 10, 40)
+            if ch.rng.Intn(100) < x {
                 return n
             }
         }

+        // otherwise loop until we find a sufficiently unloaded node or a lucky coin flip
+        // return invalid node to try next node
         return ""
     }

-    for _, n := range ch.nodes[i:] {
-        node := f(n)
+    for ; ; i++ {
+        // theoretically this could take infinite time, but practically improbable...
+        node := f(ch.nodes[i])
         if node != "" {
             return node, nil
+        } else if i == len(ch.nodes)-1 {
+            i = -1 // reset i to 0
         }
     }

-    // try the other half of the ring
-    for _, n := range ch.nodes[:i] {
-        node := f(n)
-        if node != "" {
-            return node, nil
-        }
-    }
-
-    return "", ErrNoNodes
+    panic("strange things are afoot at the circle k")
 }
+
+func translate(val, inFrom, inTo, outFrom, outTo int64) int {
+    outRange := outTo - outFrom
+    inRange := inTo - inFrom
+    inVal := val - inFrom
+    // we want the number to be lower as intensity increases
+    return int(float64(outTo) - (float64(inVal)/float64(inRange))*float64(outRange))
+}
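
in words: under 500ms of estimated wait the hashed node is always kept, over
2s it is kept only 10% of the time, and in between the keep probability
slides linearly from 40% down to 10%. a quick sanity check of translate() at
those points (same function as in the hunk above, wrapped in a main for
illustration; lower/upper are the lowerLat/upperLat constants):

    package main

    import (
        "fmt"
        "time"
    )

    // translate linearly maps val in [inFrom, inTo] onto [outTo, outFrom],
    // inverted so the result falls as the wait time rises.
    func translate(val, inFrom, inTo, outFrom, outTo int64) int {
        outRange := outTo - outFrom
        inRange := inTo - inFrom
        inVal := val - inFrom
        return int(float64(outTo) - (float64(inVal)/float64(inRange))*float64(outRange))
    }

    func main() {
        lower := int64(500 * time.Millisecond)
        upper := int64(2 * time.Second)
        for _, w := range []time.Duration{
            500 * time.Millisecond,  // boundary: 40%
            1250 * time.Millisecond, // midpoint: 25%
            2 * time.Second,         // boundary: 10%
        } {
            fmt.Printf("wait %v -> keep node with %d%% probability\n",
                w, translate(int64(w), lower, upper, 10, 40))
        }
    }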

View File

@@ -3,7 +3,7 @@ var example = 'dynamic-update',
     theme = 'default';

 var chart; // global
-var seriesMapper = {};
+var seriesMapper = {}; // map by node+func name to chart[i] index

 Highcharts.setOptions({
     global: {
@@ -15,12 +15,8 @@ function requestData() {
     $.ajax({
         url: '/1/lb/stats',
         success: function(point) {
-            console.log(point)
             var jason = JSON.parse(point);

-            //var series = chart.series[0],
-            //shift = series.data.length > 20;
-
             if (!jason["stats"] || jason["stats"].length == 0) {
                 // XXX (reed): using server timestamps for real data this can drift easily
                 // XXX (reed): uh how to insert empty data point w/o node data? enum all series names?
@@ -29,39 +25,40 @@ function requestData() {
                 //name: 'Random data',
                 //data: []
                 //}]
+                setTimeout(requestData, 1000);
                 return
             }

             for (var i = 0; i < jason["stats"].length; i++) {
                 stat = jason["stats"][i];
                 var node = stat["node"];
+                var func = stat["func"];
+                var key = node + func

-                console.log("before", seriesMapper[node])
-                if (seriesMapper[node] == null) {
-                    console.log("yodawg")
-                    chart.addSeries({name: node, data: []})
-                    seriesMapper[node] = chart.series.length - 1
-                    chart.redraw();
+                if (seriesMapper[key] == null) {
+                    chart.addSeries({name: key, data: []});
+                    waitChart.addSeries({name: key, data: []});
+                    seriesMapper[key] = chart.series.length - 1;
                 }

-                console.log("done", seriesMapper[node])
-                series = chart.series[seriesMapper[node]]
-                //series = chart.series[0]
+                series = chart.series[seriesMapper[key]];
+                waitSeries = waitChart.series[seriesMapper[key]];

                 // XXX (reed): hack
                 shift = series.data.length > 20 && i == jason["stats"].length + 1;

                 timestamp = Date.parse(stat["timestamp"]);
-                console.log(series.data.length, timestamp, stat["tp"])
-                series.addPoint([timestamp, stat["tp"]], true, shift);
-                //series.addPoint({
-                //name: node,
-                //data: {x: timestamp, y: stat["tp"]}
-                //}, true, shift);
+                console.log(series.data.length, timestamp, stat["tp"], stat["wait"]);
+                series.addPoint([timestamp, stat["tp"]], false, shift);
+                waitSeries.addPoint([timestamp, stat["wait"]], false, shift);
+            }
+
+            if (jason["stats"].length > 0) {
+                chart.redraw();
+                waitChart.redraw();
             }

             // call it again after one second
-            // XXX (reed): this won't work cuz if the endpoint fails then we won't ask for more datas
+            // XXX (reed): this won't work perfectly cuz if the endpoint fails then we won't ask for more datas
             setTimeout(requestData, 1000);
         },
         cache: false
@@ -71,7 +68,7 @@ function requestData() {
 $(document).ready(function() {
     chart = new Highcharts.Chart({
         chart: {
-            renderTo: 'container',
+            renderTo: 'throughput_chart',
             events: {
                 load: requestData
             }
@@ -107,7 +104,52 @@ $(document).ready(function() {
             minPadding: 0.2,
             maxPadding: 0.2,
             title: {
-                text: 'Value',
+                text: 'throughput (/s)',
+                margin: 80
+            }
+        },
+        series: []
+    });
+
+    waitChart = new Highcharts.Chart({
+        chart: {
+            renderTo: 'wait_chart',
+            events: {
+                load: requestData
+            }
+        },
+        rangeSelector: {
+            buttons: [{
+                count: 1,
+                type: 'minute',
+                text: '1M'
+            }, {
+                count: 5,
+                type: 'minute',
+                text: '5M'
+            }, {
+                type: 'all',
+                text: 'All'
+            }],
+            //inputEnabled: false,
+            selected: 0
+        },
+        title: {
+            text: 'lb data'
+        },
+        exporting: {
+            enabled: false
+        },
+        xAxis: {
+            type: 'datetime',
+            tickPixelInterval: 150,
+            maxZoom: 20 * 1000
+        },
+        yAxis: {
+            minPadding: 0.2,
+            maxPadding: 0.2,
+            title: {
+                text: 'wait time (seconds)',
                 margin: 80
             }
         },

View File

@@ -1,29 +1,27 @@
 package main

 import (
+    "context"
     "encoding/json"
     "flag"
     "fmt"
     "io/ioutil"
     "net/http"
+    "os"
+    "os/signal"
     "strings"
+    "sync"
+    "syscall"
     "time"

     "github.com/Sirupsen/logrus"
 )

-// TODO: consistent hashing is nice to get a cheap way to place nodes but it
-// doesn't account well for certain functions that may be 'hotter' than others.
-// we should very likely keep a load ordered list and distribute based on that.
-// if we can get some kind of feedback from the f(x) nodes, we can use that.
-// maybe it's good enough to just ch(x) + 1 if ch(x) is marked as "hot"?
-
 // TODO the load balancers all need to have the same list of nodes. gossip?
 // also gossip would handle failure detection instead of elb style. or it can
 // be pluggable and then we can read from where bmc is storing them and use that
 // or some OSS alternative

-// TODO when adding nodes we should health check them once before adding them
 // TODO when node goes offline should try to redirect request instead of 5xxing
 // TODO we could add some kind of pre-warming call to the functions server where
@@ -39,7 +37,7 @@ func main() {
     fnodes := flag.String("nodes", "", "comma separated list of IronFunction nodes")

     var conf config
-    flag.IntVar(&conf.Port, "port", 8081, "port to run on")
+    flag.StringVar(&conf.Listen, "listen", ":8081", "port to run on")
     flag.IntVar(&conf.HealthcheckInterval, "hc-interval", 3, "how often to check f(x) nodes, in seconds")
     flag.StringVar(&conf.HealthcheckEndpoint, "hc-path", "/version", "endpoint to determine node health")
     flag.IntVar(&conf.HealthcheckUnhealthy, "hc-unhealthy", 2, "threshold of failed checks to declare node unhealthy")
@@ -50,12 +48,34 @@ func main() {
     ch := newProxy(conf)

-    // XXX (reed): safe shutdown
-    fmt.Println(http.ListenAndServe(":8081", ch))
+    err := serve(conf.Listen, ch)
+    if err != nil {
+        logrus.WithError(err).Error("server error")
+    }
+}
+
+func serve(addr string, handler http.Handler) error {
+    server := &http.Server{Addr: addr, Handler: handler}
+
+    var wg sync.WaitGroup
+    wg.Add(1)
+    defer wg.Wait()
+
+    ch := make(chan os.Signal, 1)
+    signal.Notify(ch, syscall.SIGQUIT, syscall.SIGINT)
+    go func() {
+        defer wg.Done()
+        for sig := range ch {
+            logrus.WithFields(logrus.Fields{"signal": sig}).Info("received signal")
+            server.Shutdown(context.Background()) // safe shutdown
+            return
+        }
+    }()
+
+    return server.ListenAndServe()
 }

 type config struct {
-    Port   int      `json:"port"`
+    Listen string   `json:"port"`
     Nodes  []string `json:"nodes"`
     HealthcheckInterval int    `json:"healthcheck_interval"`
     HealthcheckEndpoint string `json:"healthcheck_endpoint"`
@@ -108,31 +128,38 @@ func (ch *chProxy) statsGet(w http.ResponseWriter, r *http.Request) {
         Timestamp  time.Time `json:"timestamp"`
         Throughput int       `json:"tp"`
         Node       string    `json:"node"`
+        Func       string    `json:"func"`
+        Wait       float64   `json:"wait"` // seconds
     }
     var sts []st

+    // roll up and calculate throughput per second. idk why i hate myself
     aggs := make(map[string][]*stat)
     for _, s := range stats {
-        // roll up and calculate throughput per second. idk why i hate myself
-        if t := aggs[s.node]; len(t) > 0 && t[0].timestamp.Before(s.timestamp.Add(-1*time.Second)) {
+        key := s.node + "/" + s.fx
+        if t := aggs[key]; len(t) > 0 && t[0].timestamp.Before(s.timestamp.Add(-1*time.Second)) {
             sts = append(sts, st{
                 Timestamp:  t[0].timestamp,
                 Throughput: len(t),
-                Node:       s.node,
+                Node:       t[0].node,
+                Func:       t[0].fx,
+                Wait:       avgWait(t),
             })
-            aggs[s.node] = append(aggs[s.node][:0], s)
+            aggs[key] = append(aggs[key][:0], s)
         } else {
-            aggs[s.node] = append(aggs[s.node], s)
+            aggs[key] = append(aggs[key], s)
        }
     }

-    for node, t := range aggs {
+    // leftovers
+    for _, t := range aggs {
         sts = append(sts, st{
             Timestamp:  t[0].timestamp,
             Throughput: len(t),
-            Node:       node,
+            Node:       t[0].node,
+            Func:       t[0].fx,
+            Wait:       avgWait(t),
         })
     }
@@ -143,6 +170,14 @@ func (ch *chProxy) statsGet(w http.ResponseWriter, r *http.Request) {
     })
 }

+func avgWait(stats []*stat) float64 {
+    var sum time.Duration
+    for _, s := range stats {
+        sum += s.wait
+    }
+    return (sum / time.Duration(len(stats))).Seconds()
+}
+
 func (ch *chProxy) addNode(w http.ResponseWriter, r *http.Request) {
     var bod struct {
         Node string `json:"node"`
@@ -220,7 +255,6 @@ var dashStr = `<!DOCTYPE html>
 <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
 <title>lb dash</title>

-<!-- 1. Add these JavaScript inclusions in the head of your page -->
 <script type="text/javascript" src="https://code.jquery.com/jquery-1.10.1.js"></script>
 <script type="text/javascript" src="https://code.highcharts.com/stock/highstock.js"></script>
 <script type="text/javascript" src="https://code.highcharts.com/stock/modules/exporting.js"></script>
@@ -228,12 +262,11 @@ var dashStr = `<!DOCTYPE html>
 %s
 </script>

-<!-- 2. Add the JavaScript to initialize the chart on document ready -->
 </head>
 <body>

-<!-- 3. Add the container -->
-<div id="container" style="height: 400px; min-width: 310px"></div>
+<div id="throughput_chart" style="height: 400px; min-width: 310px"></div>
+<div id="wait_chart" style="height: 400px; min-width: 310px"></div>

 </body>
 </html>

View File

@@ -8,7 +8,6 @@ import (
     "net"
     "net/http"
     "net/http/httputil"
-    "strconv"
     "sync"
     "time"
@@ -36,12 +35,13 @@ type chProxy struct {
     transport http.RoundTripper
 }

-// TODO should prob track per f(x) per node
 type stat struct {
     timestamp time.Time
     latency   time.Duration
     node      string
     code      int
+    fx        string
+    wait      time.Duration
 }

 func (ch *chProxy) addStat(s *stat) {
@@ -60,7 +60,6 @@ func (ch *chProxy) getStats() []*stat {
     ch.stats = ch.stats[:0]
     ch.statsMu.Unlock()

-    // XXX (reed): down sample to per second
     return stats
 }
@@ -95,6 +94,7 @@ func newProxy(conf config) *chProxy {
     director := func(req *http.Request) {
         target, err := ch.ch.get(req.URL.Path)
         if err != nil {
+            logrus.WithError(err).WithFields(logrus.Fields{"url": req.URL.Path}).Error("getting index failed")
             target = "error"
         }
@@ -109,7 +109,6 @@ func newProxy(conf config) *chProxy {
     }

     for _, n := range conf.Nodes {
-        // XXX (reed): need to health check these
         ch.ch.add(n)
     }
     go ch.healthcheck()
@@ -123,6 +122,7 @@ type bufferPool struct {
 func newBufferPool() httputil.BufferPool {
     return &bufferPool{
         bufs: &sync.Pool{
+            // 32KB is what the proxy would've used without recycling them
            New: func() interface{} { return make([]byte, 32*1024) },
         },
     }
@@ -132,9 +132,11 @@ func (b *bufferPool) Get() []byte { return b.bufs.Get().([]byte) }
 func (b *bufferPool) Put(x []byte) { b.bufs.Put(x) }

 func (ch *chProxy) RoundTrip(req *http.Request) (*http.Response, error) {
-    if req.URL.Host == "error" {
-        io.Copy(ioutil.Discard, req.Body)
-        req.Body.Close()
+    if req != nil && req.URL.Host == "error" {
+        if req.Body != nil {
+            io.Copy(ioutil.Discard, req.Body)
+            req.Body.Close()
+        }
         // XXX (reed): if we let the proxy code write the response it will be body-less. ok?
         return nil, ErrNoNodes
     }
@@ -148,9 +150,10 @@ func (ch *chProxy) RoundTrip(req *http.Request) (*http.Response, error) {
 }

 func (ch *chProxy) intercept(req *http.Request, resp *http.Response, latency time.Duration) {
-    // XXX (reed): give f(x) nodes ability to send back wait time in response
-    // XXX (reed): we should prob clear this from user response
-    load, _ := strconv.Atoi(resp.Header.Get("XXX-FXLB-WAIT"))
+    load, _ := time.ParseDuration(resp.Header.Get("XXX-FXLB-WAIT"))
+    // XXX (reed): we should prob clear this from user response?
+    // resp.Header.Del("XXX-FXLB-WAIT") // don't show this to user

     // XXX (reed): need to validate these prob
     ch.ch.setLoad(loadKey(req.URL.Host, req.URL.Path), int64(load))
@@ -159,7 +162,8 @@ func (ch *chProxy) intercept(req *http.Request, resp *http.Response, latency tim
         latency: latency,
         node:    req.URL.Host,
         code:    resp.StatusCode,
-        // XXX (reed): function
+        fx:      req.URL.Path,
+        wait:    load,
     })
 }
@@ -167,7 +171,6 @@ func (ch *chProxy) healthcheck() {
     for range time.Tick(ch.hcInterval) {
         nodes := ch.ch.list()
         nodes = append(nodes, ch.dead()...)
-        // XXX (reed): need to figure out elegant adding / removing better
         for _, n := range nodes {
             go ch.ping(n)
         }