Mirror of https://github.com/fnproject/fn.git (synced 2022-10-28 21:29:17 +03:00)
slot tracking improvements (#562)
* fn: remove 100 msec sleep for hot containers
* moved slot management to its own file
* slots are now implemented with LIFO semantics; this is important since we do not want to round-robin hot containers, and idle hot containers should time out properly
* each slot queue now stores a few basic stats, such as the average time a call spent in a given state, the number of running/launching containers, and the number of waiting calls in those states
* the first metrics in these basic stats are discarded to avoid initial docker pull/start spikes
* the agent now records/updates slot queue state and how much time a call stayed in that state
* waitHotSlot() replaces the previous wait-100-msec logic: it sends a msg to the hot slot goroutine launchHot() and waits for a slot
* launchHot() is now a goroutine for tracking containers in hot slots; it determines whether a new container is needed based on slot queue stats
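Roughly, the new flow works like this (a simplified, self-contained sketch; token and lifoQueue below are illustrative stand-ins for the slotToken/slotQueue types added in this commit, not the actual fn API): each hot container queues its free slot on a LIFO stack, a producer goroutine hands the most recently queued slot to waiting requests over an unbuffered channel, and request-vs-idle-timeout ownership is settled with an atomic compare-and-swap, so a slot is either executed or ejected, never both.

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
	"time"
)

type token struct {
	id      int
	trigger chan struct{} // closed once the slot is claimed (by a request or by ejection)
	busy    uint32        // 0 = free, 1 = claimed; settled via atomic CAS
}

// acquire mirrors the consumer side of slotToken.acquireSlot: whoever wins the
// CAS owns the slot and signals the container goroutine via trigger.
func (t *token) acquire() bool {
	if !atomic.CompareAndSwapUint32(&t.busy, 0, 1) {
		return false
	}
	close(t.trigger)
	return true
}

type lifoQueue struct {
	mu     sync.Mutex
	cond   *sync.Cond
	stack  []*token
	output chan *token
}

func newLifoQueue() *lifoQueue {
	q := &lifoQueue{output: make(chan *token)}
	q.cond = sync.NewCond(&q.mu)
	// producer goroutine: always hand out the most recently pushed token (LIFO),
	// so idle containers at the bottom of the stack can age out and be reclaimed.
	go func() {
		for {
			q.mu.Lock()
			for len(q.stack) == 0 {
				q.cond.Wait()
			}
			t := q.stack[len(q.stack)-1]
			q.stack = q.stack[:len(q.stack)-1]
			q.mu.Unlock()
			q.output <- t
		}
	}()
	return q
}

func (q *lifoQueue) push(t *token) {
	q.mu.Lock()
	q.stack = append(q.stack, t)
	q.mu.Unlock()
	q.cond.Signal()
}

func main() {
	q := newLifoQueue()
	// a "hot container" queues its free slot, then waits for it to be claimed or to idle out
	t := &token{id: 1, trigger: make(chan struct{})}
	q.push(t)
	go func() {
		select {
		case <-t.trigger:
			fmt.Println("container: slot claimed by a request")
		case <-time.After(time.Second):
			fmt.Println("container: idle timeout, shutting down")
		}
	}()
	// a request dequeues a token and tries to take ownership of it
	got := <-q.output
	if got.acquire() {
		fmt.Println("request: running on slot", got.id)
	}
	time.Sleep(100 * time.Millisecond)
}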
@@ -2,12 +2,9 @@ package agent
import (
"context"
"crypto/sha1"
"errors"
"fmt"
"io"
"net/http"
"sort"
"sync"
"time"

@@ -115,9 +112,7 @@ type agent struct {

driver drivers.Driver

hMu sync.RWMutex // protects hot
hot map[string]chan slot

slotMgr *slotQueueMgr
// track usage
resources ResourceTracker

@@ -138,7 +133,7 @@ func New(da DataAccess) Agent {
a := &agent{
da: da,
driver: driver,
hot: make(map[string]chan slot),
slotMgr: NewSlotQueueMgr(),
resources: NewResourceTracker(),
shutdown: make(chan struct{}),
promHandler: promhttp.Handler(),
@@ -260,147 +255,138 @@ func (a *agent) Submit(callI Call) error {
return transformTimeout(err, false)
}

// getSlot must ensure that if it receives a slot, it will be returned, otherwise
// a container will be locked up forever waiting for slot to free.
func (a *agent) getSlot(ctx context.Context, call *call) (slot, error) {
// getSlot returns a Slot (or error) for the request to run. Depending on hot/cold
// request type, this may launch a new container or wait for other containers to become idle
// or it may wait for resources to become available to launch a new container.
func (a *agent) getSlot(ctx context.Context, call *call) (Slot, error) {
span, ctx := opentracing.StartSpanFromContext(ctx, "agent_get_slot")
defer span.Finish()

if protocol.IsStreamable(protocol.Protocol(call.Format)) {
return a.hotSlot(ctx, call)
isHot := protocol.IsStreamable(protocol.Protocol(call.Format))
if isHot {
// For hot requests, we use a long lived slot queue, which we use to manage hot containers
call.slots = a.slotMgr.getHotSlotQueue(call)
start := time.Now()

call.slots.enterState(SlotQueueWaiter)
s, err := a.launchHot(ctx, call)
call.slots.exitStateWithLatency(SlotQueueWaiter, uint64(time.Now().Sub(start).Seconds()*1000))

return s, err
}

// make new channel and launch 1 for cold
ch := make(chan slot)
return a.launchOrSlot(ctx, ch, call)
return a.launchCold(ctx, call)
}

// launchOrSlot will launch a container that will send slots on the provided channel when it
// is free if no slots are available on that channel first. the returned slot may or may not
// be from the launched container. if there is an error launching a new container (if necessary),
// then that will be returned rather than a slot, if no slot is free first.
func (a *agent) launchOrSlot(ctx context.Context, slots chan slot, call *call) (slot, error) {
var errCh <-chan error
// launchHot checks with slot queue to see if a new container needs to be launched and waits
// for available slots in the queue for hot request execution.
func (a *agent) launchHot(ctx context.Context, call *call) (Slot, error) {

// check if any slot immediately without trying to get a ram token
select {
case s := <-slots:
return s, nil
case <-ctx.Done():
return nil, ctx.Err()
default:
}

// IMPORTANT: This means, if this request was submitted indirectly through fnlb or
// other proxy, we will continue classifying it as 'async' which is good as async
// regardless of origin should use the async resources.
isAsync := call.Type == models.TypeAsync

// add context cancel here to prevent ramToken/launch race, w/o this ramToken /
// launch won't know whether we are no longer receiving or not yet receiving.
ctx, launchCancel := context.WithCancel(ctx)
defer launchCancel()
launchLoop:
for {
// Check/evaluate if we need to launch a new hot container
doLaunch, stats := call.slots.isNewContainerNeeded()
common.Logger(ctx).WithField("stats", stats).Debug("checking hot container launch ", doLaunch)

if doLaunch {
ctxToken, tokenCancel := context.WithCancel(context.Background())

// wait on token/slot/timeout whichever comes first
select {
case tok, isOpen := <-a.resources.GetResourceToken(ctxToken, call.Memory, isAsync):
tokenCancel()
if !isOpen {
return nil, models.ErrCallTimeoutServerBusy
}
go a.runHot(ctx, call, tok)
case s, ok := <-call.slots.getDequeueChan():
tokenCancel()
if !ok {
return nil, errors.New("slot shut down while waiting for hot slot")
}
if s.acquireSlot() {
if s.slot.Error() != nil {
s.slot.Close()
return nil, s.slot.Error()
}
return s.slot, nil
}

// we failed to take ownership of the token (eg. container idle timeout)
// try launching again
continue launchLoop
case <-ctx.Done():
tokenCancel()
return nil, ctx.Err()
}
}

// After launching (if it was necessary) a container, now wait for slot/timeout
// or periodically reevaluate the launchHot() logic from beginning.
select {
case s, ok := <-call.slots.getDequeueChan():
if !ok {
return nil, errors.New("slot shut down while waiting for hot slot")
}
if s.acquireSlot() {
if s.slot.Error() != nil {
s.slot.Close()
return nil, s.slot.Error()
}
return s.slot, nil
}

// we failed to take ownership of the token (eg. container idle timeout)
// try launching again
case <-ctx.Done():
return nil, ctx.Err()
case <-time.After(time.Duration(200) * time.Millisecond):
// reevaluate
}
}
}

// launchCold waits for necessary resources to launch a new container, then
// returns the slot for that new container to run the request on.
func (a *agent) launchCold(ctx context.Context, call *call) (Slot, error) {

isAsync := call.Type == models.TypeAsync
ch := make(chan Slot)

// if nothing free, wait for ram token or a slot
select {
case s := <-slots:
return s, nil
case tok, isOpen := <-a.resources.GetResourceToken(ctx, call.Memory, isAsync):
if !isOpen {
return nil, models.ErrCallTimeoutServerBusy
}
errCh = a.launch(ctx, slots, call, tok) // TODO mangle
go a.prepCold(ctx, call, tok, ch)
case <-ctx.Done():
return nil, ctx.Err()
}

// wait for launch err or a slot to open up (possibly from launch)
// wait for launch err or a slot to open up
select {
case err := <-errCh:
// if we get a launch err, try to return to user (e.g. image not found)
return nil, err
case slot := <-slots:
return slot, nil
case <-ctx.Done():
return nil, ctx.Err()
}
}

func (a *agent) hotSlot(ctx context.Context, call *call) (slot, error) {
slots := a.slots(hotKey(call))

// TODO if we track avg run time we could know how long to wait or
// if we need to launch instead of waiting.

// if we can get a slot in a reasonable amount of time, use it
select {
case s := <-slots:
return s, nil
case <-time.After(100 * time.Millisecond): // XXX(reed): precise^
// TODO this means the first launched container if none are running eats
// this. yes it sucks but there are a lot of other fish to fry, opening a
// policy discussion...
}

// then wait for a slot or try to launch...
return a.launchOrSlot(ctx, slots, call)
}

// TODO this should be a LIFO stack of channels, perhaps. a queue (channel)
// will always send the least recently used, not ideal.
func (a *agent) slots(key string) chan slot {
a.hMu.RLock()
slots, ok := a.hot[key]
a.hMu.RUnlock()
if !ok {
a.hMu.Lock()
slots, ok = a.hot[key]
if !ok {
slots = make(chan slot) // should not be buffered
a.hot[key] = slots
case s := <-ch:
if s.Error() != nil {
s.Close()
return nil, s.Error()
}
a.hMu.Unlock()
return s, nil
case <-ctx.Done():
return nil, ctx.Err()
}
return slots
}

func hotKey(call *call) string {
// return a sha1 hash of a (hopefully) unique string of all the config
// values, to make map lookups quicker [than the giant unique string]

hash := sha1.New()
fmt.Fprint(hash, call.AppName, "\x00")
fmt.Fprint(hash, call.Path, "\x00")
fmt.Fprint(hash, call.Image, "\x00")
fmt.Fprint(hash, call.Timeout, "\x00")
fmt.Fprint(hash, call.IdleTimeout, "\x00")
fmt.Fprint(hash, call.Memory, "\x00")
fmt.Fprint(hash, call.Format, "\x00")

// we have to sort these before printing, yay. TODO do better
keys := make([]string, 0, len(call.BaseEnv))
for k := range call.BaseEnv {
keys = append(keys, k)
}

sort.Strings(keys)
for _, k := range keys {
fmt.Fprint(hash, k, "\x00", call.BaseEnv[k], "\x00")
}

var buf [sha1.Size]byte
return string(hash.Sum(buf[:0]))
}

type slot interface {
exec(ctx context.Context, call *call) error
io.Closer
}

// implements Slot
type coldSlot struct {
cookie drivers.Cookie
tok ResourceToken
err error
}

func (s *coldSlot) Error() error {
return s.err
}

func (s *coldSlot) exec(ctx context.Context, call *call) error {
@@ -430,7 +416,9 @@ func (s *coldSlot) Close() error {
// removal latency
s.cookie.Close(context.Background()) // ensure container removal, separate ctx
}
s.tok.Close()
if s.tok != nil {
s.tok.Close()
}
return nil
}

@@ -440,9 +428,17 @@ type hotSlot struct {
proto protocol.ContainerIO
errC <-chan error // container error
container *container // TODO mask this
err error
}

func (s *hotSlot) Close() error { close(s.done); return nil }
func (s *hotSlot) Close() error {
close(s.done)
return nil
}

func (s *hotSlot) Error() error {
return s.err
}

func (s *hotSlot) exec(ctx context.Context, call *call) error {
span, ctx := opentracing.StartSpanFromContext(ctx, "agent_hot_exec")
@@ -451,6 +447,11 @@ func (s *hotSlot) exec(ctx context.Context, call *call) error {
// link the container id and id in the logs [for us!]
common.Logger(ctx).WithField("container_id", s.container.id).Info("starting call")

start := time.Now()
defer func() {
call.slots.recordLatency(SlotQueueRunner, uint64(time.Now().Sub(start).Seconds()*1000))
}()

// swap in the new stderr logger & stat accumulator
oldStderr := s.container.swap(call.stderr, &call.Stats)
defer s.container.swap(oldStderr, nil) // once we're done, swap out in this scope to prevent races
@@ -475,33 +476,7 @@ func (s *hotSlot) exec(ctx context.Context, call *call) error {
// TODO we REALLY need to wait for dispatch to return before conceding our slot
}

// this will work for hot & cold (woo)
// if launch encounters a non-nil error it will send it on the returned channel,
// this can be useful if an image doesn't exist, e.g.
func (a *agent) launch(ctx context.Context, slots chan<- slot, call *call, tok ResourceToken) <-chan error {
ch := make(chan error, 1)

if !protocol.IsStreamable(protocol.Protocol(call.Format)) {
// TODO no
go func() {
err := a.prepCold(ctx, slots, call, tok)
if err != nil {
ch <- err
}
}()
return ch
}

go func() {
err := a.runHot(ctx, slots, call, tok)
if err != nil {
ch <- err
}
}()
return ch
}

func (a *agent) prepCold(ctx context.Context, slots chan<- slot, call *call, tok ResourceToken) error {
func (a *agent) prepCold(ctx context.Context, call *call, tok ResourceToken, ch chan Slot) {
container := &container{
id: id.New().String(), // XXX we could just let docker generate ids...
image: call.Image,
@@ -517,32 +492,21 @@ func (a *agent) prepCold(ctx context.Context, slots chan<- slot, call *call, tok
// pull & create container before we return a slot, so as to be friendly
// about timing out if this takes a while...
cookie, err := a.driver.Prepare(ctx, container)
if err != nil {
tok.Close()
return err
}

slot := &coldSlot{cookie, tok}
slot := &coldSlot{cookie, tok, err}
select {
case slots <- slot:
case ch <- slot:
case <-ctx.Done():
slot.Close() // if we can't send this slot, need to take care of it ourselves
slot.Close()
}
return nil
}

func (a *agent) runHot(ctxArg context.Context, slots chan<- slot, call *call, tok ResourceToken) error {
func (a *agent) runHot(ctxArg context.Context, call *call, tok ResourceToken) {
// We must be careful to only use ctxArg for logs/spans

// create a span from ctxArg but ignore the new Context
// instead we will create a new Context below and explicitly set its span
span, _ := opentracing.StartSpanFromContext(ctxArg, "docker_run_hot")
defer span.Finish()

if tok == nil {
// TODO we should panic, probably ;)
return errors.New("no token provided, not giving you a slot")
}
defer tok.Close()

// TODO we have to make sure we flush these pipes or we will deadlock
@@ -562,6 +526,9 @@ func (a *agent) runHot(ctxArg context.Context, slots chan<- slot, call *call, to
// add the span we created above to the new Context
ctx = opentracing.ContextWithSpan(ctx, span)

start := time.Now()
call.slots.enterState(SlotQueueStarter)

cid := id.New().String()

// set up the stderr for the first one to capture any logs before the slot is
@@ -585,16 +552,23 @@ func (a *agent) runHot(ctxArg context.Context, slots chan<- slot, call *call, to

cookie, err := a.driver.Prepare(ctx, container)
if err != nil {
return err
call.slots.exitStateWithLatency(SlotQueueStarter, uint64(time.Now().Sub(start).Seconds()*1000))
call.slots.queueSlot(&hotSlot{done: make(chan struct{}), err: err})
return
}
defer cookie.Close(context.Background()) // ensure container removal, separate ctx

waiter, err := cookie.Run(ctx)
if err != nil {
return err
call.slots.exitStateWithLatency(SlotQueueStarter, uint64(time.Now().Sub(start).Seconds()*1000))
call.slots.queueSlot(&hotSlot{done: make(chan struct{}), err: err})
return
}

// container is running
call.slots.enterState(SlotQueueRunner)
call.slots.exitStateWithLatency(SlotQueueStarter, uint64(time.Now().Sub(start).Seconds()*1000))
defer call.slots.exitState(SlotQueueRunner)

// buffered, in case someone has slot when waiter returns but isn't yet listening
errC := make(chan error, 1)
@@ -611,20 +585,31 @@ func (a *agent) runHot(ctxArg context.Context, slots chan<- slot, call *call, to
}

done := make(chan struct{})
slot := &hotSlot{done, proto, errC, container}
s := call.slots.queueSlot(&hotSlot{done, proto, errC, container, nil})

select {
case slots <- slot:
case <-s.trigger:
case <-time.After(time.Duration(call.IdleTimeout) * time.Second):
logger.Info("Canceling inactive hot function")
shutdownContainer()
return
if call.slots.ejectSlot(s) {
logger.Info("Canceling inactive hot function")
shutdownContainer()
return
}
case <-ctx.Done(): // container shutdown
return
if call.slots.ejectSlot(s) {
return
}
case <-a.shutdown: // server shutdown
shutdownContainer()
return
if call.slots.ejectSlot(s) {
shutdownContainer()
return
}
}
// IMPORTANT: if we fail to eject the slot, it means that a consumer
// just dequeued this and acquired the slot. In other words, we were
// late in ejectSlots(), so we have to execute this request in this
// iteration. Beginning of for-loop will re-check ctx/shutdown case
// and terminate after this request is done.

// wait for this call to finish
// NOTE do NOT select with shutdown / other channels. slot handles this.
@@ -640,7 +625,6 @@ func (a *agent) runHot(ctxArg context.Context, slots chan<- slot, call *call, to
}

logger.WithError(err).Info("hot function terminated")
return err
}

// container implements drivers.ContainerTask container is the execution of a

@@ -300,6 +300,7 @@ type call struct {
req *http.Request
stderr io.ReadWriteCloser
ct callTrigger
slots *slotQueue
}

func (c *call) Model() *models.Call { return c.Call }
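The reworked hot path above boils down to a wait loop: check whether a new container is needed, block on the dequeue channel, give up when the caller's context ends, and wake every 200 ms to re-evaluate the launch decision. A minimal standalone sketch of that loop, using simplified stand-ins (slotCh, needContainer, launch) rather than the real slotQueue and ResourceTracker:

package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// waitForSlot is a simplified stand-in for the launchHot wait loop above:
// it re-runs the "do we need another container?" check, then waits on the
// dequeue channel, the caller's context, or a 200 ms re-evaluation tick.
func waitForSlot(ctx context.Context, slotCh <-chan int, needContainer func() bool, launch func()) (int, error) {
	for {
		if needContainer() {
			launch()
		}
		select {
		case s, ok := <-slotCh:
			if !ok {
				return 0, errors.New("slot queue shut down")
			}
			return s, nil
		case <-ctx.Done():
			return 0, ctx.Err()
		case <-time.After(200 * time.Millisecond):
			// fall through and re-evaluate the launch decision
		}
	}
}

func main() {
	slotCh := make(chan int, 1)
	launched := false
	launch := func() {
		launched = true
		// pretend a hot container came up and published a free slot
		go func() { time.Sleep(50 * time.Millisecond); slotCh <- 42 }()
	}
	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()
	s, err := waitForSlot(ctx, slotCh, func() bool { return !launched }, launch)
	fmt.Println(s, err) // 42 <nil>
}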
api/agent/slots.go (new file, 353 lines)
@@ -0,0 +1,353 @@
package agent

import (
"context"
"crypto/sha1"
"fmt"
"sort"
"sync"
"sync/atomic"
)

//
// slotQueueMgr keeps track of hot container slotQueues where each slotQueue
// provides for multiple consumers/producers. slotQueue also stores
// a few basic stats in slotStats.
//

type Slot interface {
exec(ctx context.Context, call *call) error
Close() error
Error() error
}

// slotQueueMgr manages hot container slotQueues
type slotQueueMgr struct {
hMu sync.RWMutex // protects hot
hot map[string]*slotQueue
}

type SlotQueueMetricType int

const (
SlotQueueRunner SlotQueueMetricType = iota
SlotQueueStarter
SlotQueueWaiter
SlotQueueLast
)

// counters per state and moving avg of time spent in each state
type slotQueueStats struct {
states [SlotQueueLast]uint64
latencyCount [SlotQueueLast]uint64
latencies [SlotQueueLast]uint64
}

type slotToken struct {
slot Slot
trigger chan struct{}
id uint64
isBusy uint32
}

// LIFO queue that exposes input/output channels along
// with runner/waiter tracking for agent
type slotQueue struct {
key string
cond *sync.Cond
slots []*slotToken
nextId uint64
output chan *slotToken
isClosed bool
statsLock sync.Mutex // protects stats below
stats slotQueueStats
}

func NewSlotQueueMgr() *slotQueueMgr {
obj := &slotQueueMgr{
hot: make(map[string]*slotQueue),
}
return obj
}

func NewSlotQueue(key string) *slotQueue {
obj := &slotQueue{
key: key,
cond: sync.NewCond(new(sync.Mutex)),
slots: make([]*slotToken, 0),
output: make(chan *slotToken),
}

// producer go routine to pick LIFO slots and
// push them into output channel
go func() {
for {
obj.cond.L.Lock()
for len(obj.slots) <= 0 && !obj.isClosed {
obj.cond.Wait()
}

// cleanup and exit
if obj.isClosed {

purge := obj.slots
obj.slots = obj.slots[:0]
obj.cond.L.Unlock()

close(obj.output)

for _, val := range purge {
if val.acquireSlot() {
val.slot.Close()
}
}

return
}

// pop
item := obj.slots[len(obj.slots)-1]
obj.slots = obj.slots[:len(obj.slots)-1]
obj.cond.L.Unlock()

// block
obj.output <- item
}
}()

return obj
}

func (a *slotToken) acquireSlot() bool {
// let's get the lock
if !atomic.CompareAndSwapUint32(&a.isBusy, 0, 1) {
return false
}

// now we have the lock, push the trigger
close(a.trigger)
return true
}

func (a *slotQueue) ejectSlot(s *slotToken) bool {
// let's get the lock
if !atomic.CompareAndSwapUint32(&s.isBusy, 0, 1) {
return false
}

isFound := false

a.cond.L.Lock()
for idx, val := range a.slots {
if val.id == s.id {
a.slots[0], a.slots[idx] = a.slots[idx], a.slots[0]
isFound = true
break
}
}
if isFound {
a.slots = a.slots[1:]
}
a.cond.L.Unlock()

s.slot.Close()
// now we have the lock, push the trigger
close(s.trigger)
return true
}

func (a *slotQueue) destroySlotQueue() {
doSignal := false
a.cond.L.Lock()
if !a.isClosed {
a.isClosed = true
doSignal = true
}
a.cond.L.Unlock()
if doSignal {
a.cond.Signal()
}
}

func (a *slotQueue) getDequeueChan() chan *slotToken {
return a.output
}

func (a *slotQueue) queueSlot(slot Slot) *slotToken {

token := &slotToken{slot, make(chan struct{}), 0, 0}
isClosed := false

a.cond.L.Lock()
if !a.isClosed {
token.id = a.nextId
a.slots = append(a.slots, token)
a.nextId += 1
} else {
isClosed = true
}
a.cond.L.Unlock()

if !isClosed {
a.cond.Signal()
return token
}

return nil
}

func (a *slotQueue) getStats() slotQueueStats {
var out slotQueueStats
a.statsLock.Lock()
out = a.stats
a.statsLock.Unlock()
return out
}

func (a *slotQueue) isNewContainerNeeded() (bool, slotQueueStats) {

stats := a.getStats()

waiters := stats.states[SlotQueueWaiter]
if waiters == 0 {
return false, stats
}

// while a container is starting, do not start more than waiters
starters := stats.states[SlotQueueStarter]
if starters >= waiters {
return false, stats
}

// no executors? Start a container now.
executors := starters + stats.states[SlotQueueRunner]
if executors == 0 {
return true, stats
}

runLat := stats.latencies[SlotQueueRunner]
waitLat := stats.latencies[SlotQueueWaiter]
startLat := stats.latencies[SlotQueueStarter]

// no wait latency? No need to spin up new container
if waitLat == 0 {
return false, stats
}

// this determines the aggresiveness of the container launch.
if runLat/executors*2 < waitLat {
return true, stats
}
if runLat < waitLat {
return true, stats
}
if startLat < waitLat {
return true, stats
}

return false, stats
}

func (a *slotQueue) enterState(metricIdx SlotQueueMetricType) {
a.statsLock.Lock()
a.stats.states[metricIdx] += 1
a.statsLock.Unlock()
}

func (a *slotQueue) exitState(metricIdx SlotQueueMetricType) {
a.statsLock.Lock()
if a.stats.states[metricIdx] == 0 {
panic(fmt.Sprintf("BUG: metric tracking fault idx=%v", metricIdx))
}
a.stats.states[metricIdx] -= 1
a.statsLock.Unlock()
}

func (a *slotQueue) recordLatencyLocked(metricIdx SlotQueueMetricType, latency uint64) {
// exponentially weighted moving average with smoothing factor of 0.5
// 0.5 is a high value to age older observations fast while filtering
// some noise. For our purposes, newer observations are much more important
// than older, but we still would like to low pass some noise.
// first samples are ignored.
if a.stats.latencyCount[metricIdx] != 0 {
a.stats.latencies[metricIdx] = (a.stats.latencies[metricIdx]*5 + latency*5) / 10
}
a.stats.latencyCount[metricIdx] += 1
if a.stats.latencyCount[metricIdx] == 0 {
a.stats.latencyCount[metricIdx] += 1
}
}

func (a *slotQueue) recordLatency(metricIdx SlotQueueMetricType, latency uint64) {
a.statsLock.Lock()
a.recordLatencyLocked(metricIdx, latency)
a.statsLock.Unlock()
}

func (a *slotQueue) exitStateWithLatency(metricIdx SlotQueueMetricType, latency uint64) {
a.statsLock.Lock()
if a.stats.states[metricIdx] == 0 {
panic(fmt.Sprintf("BUG: metric tracking fault idx=%v", metricIdx))
}
a.stats.states[metricIdx] -= 1
a.recordLatencyLocked(metricIdx, latency)
a.statsLock.Unlock()
}

// getSlot must ensure that if it receives a slot, it will be returned, otherwise
// a container will be locked up forever waiting for slot to free.
func (a *slotQueueMgr) getHotSlotQueue(call *call) *slotQueue {

key := getSlotQueueKey(call)

a.hMu.RLock()
slots, ok := a.hot[key]
a.hMu.RUnlock()
if !ok {
a.hMu.Lock()
slots, ok = a.hot[key]
if !ok {
slots = NewSlotQueue(key)
a.hot[key] = slots
}
a.hMu.Unlock()
}
return slots
}

// currently unused. But at some point, we need to age/delete old
// slotQueues.
func (a *slotQueueMgr) destroySlotQueue(slots *slotQueue) {
slots.destroySlotQueue()
a.hMu.Lock()
delete(a.hot, slots.key)
a.hMu.Unlock()
}

func getSlotQueueKey(call *call) string {
// return a sha1 hash of a (hopefully) unique string of all the config
// values, to make map lookups quicker [than the giant unique string]

hash := sha1.New()
fmt.Fprint(hash, call.AppName, "\x00")
fmt.Fprint(hash, call.Path, "\x00")
fmt.Fprint(hash, call.Image, "\x00")
fmt.Fprint(hash, call.Timeout, "\x00")
fmt.Fprint(hash, call.IdleTimeout, "\x00")
fmt.Fprint(hash, call.Memory, "\x00")
fmt.Fprint(hash, call.Format, "\x00")

// we have to sort these before printing, yay. TODO do better
keys := make([]string, 0, len(call.BaseEnv))
for k := range call.BaseEnv {
keys = append(keys, k)
}

sort.Strings(keys)
for _, k := range keys {
fmt.Fprint(hash, k, "\x00", call.BaseEnv[k], "\x00")
}

var buf [sha1.Size]byte
return string(hash.Sum(buf[:0]))
}
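One detail worth spelling out from recordLatencyLocked above: with a smoothing factor of 0.5, the update (old*5 + latency*5) / 10 is simply the mean of the previous estimate and the new sample, and the very first sample per metric is dropped so an initial docker pull/start spike does not seed the average. A small standalone sketch of that behavior (the ewma helper below is illustrative, not part of the commit):

package main

import "fmt"

// ewma applies the same 0.5-smoothing update as recordLatencyLocked:
// estimate = (estimate*5 + sample*5) / 10, i.e. the mean of the old
// estimate and the new sample, with the first observation discarded.
func ewma(samples []uint64) uint64 {
	var est, count uint64
	for _, s := range samples {
		if count != 0 {
			est = (est*5 + s*5) / 10
		}
		count++
	}
	return est
}

func main() {
	// the initial 9000 ms pull/start spike is skipped entirely; prints 82
	fmt.Println(ewma([]uint64{9000, 100, 120, 80}))
}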