mirror of
https://github.com/fnproject/fn.git
synced 2022-10-28 21:29:17 +03:00
* fn: remove 100 msec sleep for hot containers *) moved slot management to its own file *) slots are now implemented with LIFO semantics, this is important since we do not want to round robin hot containers. Idle hot containers should timeout properly. *) each slot queue now stores a few basic stats such as avg time a call spent in a given state and number of running/launching containers, number of waiting calls in those states. *) first metrics in these basic stats are discarded to avoid initial docker pull/start spikes. *) agent now records/updates slot queue state and how much time a call stayed in that state. *) waitHotSlot() replaces the previous wait 100 msec logic where it sends a msg to hot slot go routine launchHot() and waits for a slot *) launchHot() is now a go routine for tracking containers in hot slots, it determines if a new containers is needed based on slot queue stats.
354 lines
7.5 KiB
Go
354 lines
7.5 KiB
Go
package agent
|
|
|
|
import (
|
|
"context"
|
|
"crypto/sha1"
|
|
"fmt"
|
|
"sort"
|
|
"sync"
|
|
"sync/atomic"
|
|
)
|
|
|
|
//
|
|
// slotQueueMgr keeps track of hot container slotQueues where each slotQueue
|
|
// provides for multiple consumers/producers. slotQueue also stores
|
|
// a few basic stats in slotStats.
|
|
//
|
|
|
|
type Slot interface {
|
|
exec(ctx context.Context, call *call) error
|
|
Close() error
|
|
Error() error
|
|
}
|
|
|
|
// slotQueueMgr manages hot container slotQueues
|
|
type slotQueueMgr struct {
|
|
hMu sync.RWMutex // protects hot
|
|
hot map[string]*slotQueue
|
|
}
|
|
|
|
type SlotQueueMetricType int
|
|
|
|
const (
|
|
SlotQueueRunner SlotQueueMetricType = iota
|
|
SlotQueueStarter
|
|
SlotQueueWaiter
|
|
SlotQueueLast
|
|
)
|
|
|
|
// counters per state and moving avg of time spent in each state
|
|
type slotQueueStats struct {
|
|
states [SlotQueueLast]uint64
|
|
latencyCount [SlotQueueLast]uint64
|
|
latencies [SlotQueueLast]uint64
|
|
}
|
|
|
|
type slotToken struct {
|
|
slot Slot
|
|
trigger chan struct{}
|
|
id uint64
|
|
isBusy uint32
|
|
}
|
|
|
|
// LIFO queue that exposes input/output channels along
|
|
// with runner/waiter tracking for agent
|
|
type slotQueue struct {
|
|
key string
|
|
cond *sync.Cond
|
|
slots []*slotToken
|
|
nextId uint64
|
|
output chan *slotToken
|
|
isClosed bool
|
|
statsLock sync.Mutex // protects stats below
|
|
stats slotQueueStats
|
|
}
|
|
|
|
func NewSlotQueueMgr() *slotQueueMgr {
|
|
obj := &slotQueueMgr{
|
|
hot: make(map[string]*slotQueue),
|
|
}
|
|
return obj
|
|
}
|
|
|
|
func NewSlotQueue(key string) *slotQueue {
|
|
obj := &slotQueue{
|
|
key: key,
|
|
cond: sync.NewCond(new(sync.Mutex)),
|
|
slots: make([]*slotToken, 0),
|
|
output: make(chan *slotToken),
|
|
}
|
|
|
|
// producer go routine to pick LIFO slots and
|
|
// push them into output channel
|
|
go func() {
|
|
for {
|
|
obj.cond.L.Lock()
|
|
for len(obj.slots) <= 0 && !obj.isClosed {
|
|
obj.cond.Wait()
|
|
}
|
|
|
|
// cleanup and exit
|
|
if obj.isClosed {
|
|
|
|
purge := obj.slots
|
|
obj.slots = obj.slots[:0]
|
|
obj.cond.L.Unlock()
|
|
|
|
close(obj.output)
|
|
|
|
for _, val := range purge {
|
|
if val.acquireSlot() {
|
|
val.slot.Close()
|
|
}
|
|
}
|
|
|
|
return
|
|
}
|
|
|
|
// pop
|
|
item := obj.slots[len(obj.slots)-1]
|
|
obj.slots = obj.slots[:len(obj.slots)-1]
|
|
obj.cond.L.Unlock()
|
|
|
|
// block
|
|
obj.output <- item
|
|
}
|
|
}()
|
|
|
|
return obj
|
|
}
|
|
|
|
func (a *slotToken) acquireSlot() bool {
|
|
// let's get the lock
|
|
if !atomic.CompareAndSwapUint32(&a.isBusy, 0, 1) {
|
|
return false
|
|
}
|
|
|
|
// now we have the lock, push the trigger
|
|
close(a.trigger)
|
|
return true
|
|
}
|
|
|
|
func (a *slotQueue) ejectSlot(s *slotToken) bool {
|
|
// let's get the lock
|
|
if !atomic.CompareAndSwapUint32(&s.isBusy, 0, 1) {
|
|
return false
|
|
}
|
|
|
|
isFound := false
|
|
|
|
a.cond.L.Lock()
|
|
for idx, val := range a.slots {
|
|
if val.id == s.id {
|
|
a.slots[0], a.slots[idx] = a.slots[idx], a.slots[0]
|
|
isFound = true
|
|
break
|
|
}
|
|
}
|
|
if isFound {
|
|
a.slots = a.slots[1:]
|
|
}
|
|
a.cond.L.Unlock()
|
|
|
|
s.slot.Close()
|
|
// now we have the lock, push the trigger
|
|
close(s.trigger)
|
|
return true
|
|
}
|
|
|
|
func (a *slotQueue) destroySlotQueue() {
|
|
doSignal := false
|
|
a.cond.L.Lock()
|
|
if !a.isClosed {
|
|
a.isClosed = true
|
|
doSignal = true
|
|
}
|
|
a.cond.L.Unlock()
|
|
if doSignal {
|
|
a.cond.Signal()
|
|
}
|
|
}
|
|
|
|
func (a *slotQueue) getDequeueChan() chan *slotToken {
|
|
return a.output
|
|
}
|
|
|
|
func (a *slotQueue) queueSlot(slot Slot) *slotToken {
|
|
|
|
token := &slotToken{slot, make(chan struct{}), 0, 0}
|
|
isClosed := false
|
|
|
|
a.cond.L.Lock()
|
|
if !a.isClosed {
|
|
token.id = a.nextId
|
|
a.slots = append(a.slots, token)
|
|
a.nextId += 1
|
|
} else {
|
|
isClosed = true
|
|
}
|
|
a.cond.L.Unlock()
|
|
|
|
if !isClosed {
|
|
a.cond.Signal()
|
|
return token
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (a *slotQueue) getStats() slotQueueStats {
|
|
var out slotQueueStats
|
|
a.statsLock.Lock()
|
|
out = a.stats
|
|
a.statsLock.Unlock()
|
|
return out
|
|
}
|
|
|
|
func (a *slotQueue) isNewContainerNeeded() (bool, slotQueueStats) {
|
|
|
|
stats := a.getStats()
|
|
|
|
waiters := stats.states[SlotQueueWaiter]
|
|
if waiters == 0 {
|
|
return false, stats
|
|
}
|
|
|
|
// while a container is starting, do not start more than waiters
|
|
starters := stats.states[SlotQueueStarter]
|
|
if starters >= waiters {
|
|
return false, stats
|
|
}
|
|
|
|
// no executors? Start a container now.
|
|
executors := starters + stats.states[SlotQueueRunner]
|
|
if executors == 0 {
|
|
return true, stats
|
|
}
|
|
|
|
runLat := stats.latencies[SlotQueueRunner]
|
|
waitLat := stats.latencies[SlotQueueWaiter]
|
|
startLat := stats.latencies[SlotQueueStarter]
|
|
|
|
// no wait latency? No need to spin up new container
|
|
if waitLat == 0 {
|
|
return false, stats
|
|
}
|
|
|
|
// this determines the aggresiveness of the container launch.
|
|
if runLat/executors*2 < waitLat {
|
|
return true, stats
|
|
}
|
|
if runLat < waitLat {
|
|
return true, stats
|
|
}
|
|
if startLat < waitLat {
|
|
return true, stats
|
|
}
|
|
|
|
return false, stats
|
|
}
|
|
|
|
func (a *slotQueue) enterState(metricIdx SlotQueueMetricType) {
|
|
a.statsLock.Lock()
|
|
a.stats.states[metricIdx] += 1
|
|
a.statsLock.Unlock()
|
|
}
|
|
|
|
func (a *slotQueue) exitState(metricIdx SlotQueueMetricType) {
|
|
a.statsLock.Lock()
|
|
if a.stats.states[metricIdx] == 0 {
|
|
panic(fmt.Sprintf("BUG: metric tracking fault idx=%v", metricIdx))
|
|
}
|
|
a.stats.states[metricIdx] -= 1
|
|
a.statsLock.Unlock()
|
|
}
|
|
|
|
func (a *slotQueue) recordLatencyLocked(metricIdx SlotQueueMetricType, latency uint64) {
|
|
// exponentially weighted moving average with smoothing factor of 0.5
|
|
// 0.5 is a high value to age older observations fast while filtering
|
|
// some noise. For our purposes, newer observations are much more important
|
|
// than older, but we still would like to low pass some noise.
|
|
// first samples are ignored.
|
|
if a.stats.latencyCount[metricIdx] != 0 {
|
|
a.stats.latencies[metricIdx] = (a.stats.latencies[metricIdx]*5 + latency*5) / 10
|
|
}
|
|
a.stats.latencyCount[metricIdx] += 1
|
|
if a.stats.latencyCount[metricIdx] == 0 {
|
|
a.stats.latencyCount[metricIdx] += 1
|
|
}
|
|
}
|
|
|
|
func (a *slotQueue) recordLatency(metricIdx SlotQueueMetricType, latency uint64) {
|
|
a.statsLock.Lock()
|
|
a.recordLatencyLocked(metricIdx, latency)
|
|
a.statsLock.Unlock()
|
|
}
|
|
|
|
func (a *slotQueue) exitStateWithLatency(metricIdx SlotQueueMetricType, latency uint64) {
|
|
a.statsLock.Lock()
|
|
if a.stats.states[metricIdx] == 0 {
|
|
panic(fmt.Sprintf("BUG: metric tracking fault idx=%v", metricIdx))
|
|
}
|
|
a.stats.states[metricIdx] -= 1
|
|
a.recordLatencyLocked(metricIdx, latency)
|
|
a.statsLock.Unlock()
|
|
}
|
|
|
|
// getSlot must ensure that if it receives a slot, it will be returned, otherwise
|
|
// a container will be locked up forever waiting for slot to free.
|
|
func (a *slotQueueMgr) getHotSlotQueue(call *call) *slotQueue {
|
|
|
|
key := getSlotQueueKey(call)
|
|
|
|
a.hMu.RLock()
|
|
slots, ok := a.hot[key]
|
|
a.hMu.RUnlock()
|
|
if !ok {
|
|
a.hMu.Lock()
|
|
slots, ok = a.hot[key]
|
|
if !ok {
|
|
slots = NewSlotQueue(key)
|
|
a.hot[key] = slots
|
|
}
|
|
a.hMu.Unlock()
|
|
}
|
|
return slots
|
|
}
|
|
|
|
// currently unused. But at some point, we need to age/delete old
|
|
// slotQueues.
|
|
func (a *slotQueueMgr) destroySlotQueue(slots *slotQueue) {
|
|
slots.destroySlotQueue()
|
|
a.hMu.Lock()
|
|
delete(a.hot, slots.key)
|
|
a.hMu.Unlock()
|
|
}
|
|
|
|
func getSlotQueueKey(call *call) string {
|
|
// return a sha1 hash of a (hopefully) unique string of all the config
|
|
// values, to make map lookups quicker [than the giant unique string]
|
|
|
|
hash := sha1.New()
|
|
fmt.Fprint(hash, call.AppName, "\x00")
|
|
fmt.Fprint(hash, call.Path, "\x00")
|
|
fmt.Fprint(hash, call.Image, "\x00")
|
|
fmt.Fprint(hash, call.Timeout, "\x00")
|
|
fmt.Fprint(hash, call.IdleTimeout, "\x00")
|
|
fmt.Fprint(hash, call.Memory, "\x00")
|
|
fmt.Fprint(hash, call.Format, "\x00")
|
|
|
|
// we have to sort these before printing, yay. TODO do better
|
|
keys := make([]string, 0, len(call.BaseEnv))
|
|
for k := range call.BaseEnv {
|
|
keys = append(keys, k)
|
|
}
|
|
|
|
sort.Strings(keys)
|
|
for _, k := range keys {
|
|
fmt.Fprint(hash, k, "\x00", call.BaseEnv[k], "\x00")
|
|
}
|
|
|
|
var buf [sha1.Size]byte
|
|
return string(hash.Sum(buf[:0]))
|
|
}
|