mirror of
https://github.com/fnproject/fn.git
synced 2022-10-28 21:29:17 +03:00
fn: make call.End() to blocking to reduce complexity (#1208)
agent/lb-agent/runner roles execute call.End() in the background in some cases to reduce latency. With this change, we simplify this and switch to non-background execution of call.End(). This fixes hard to detect issues such as non-deterministic calculation of call.CompletedAt or incomplete Call.Stats in runners. Downstream projects if impacted by the now blocking call.End() latency should take steps to handle this according to their requirements.
This commit is contained in:
committed by
Reed Allman
parent
54ba179078
commit
586d5c4735
@@ -7,7 +7,6 @@ import (
|
||||
"io"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/fnproject/fn/api/agent/drivers"
|
||||
@@ -103,7 +102,6 @@ type agent struct {
|
||||
// used to track running calls / safe shutdown
|
||||
shutWg *common.WaitGroup
|
||||
shutonce sync.Once
|
||||
callEndCount int64
|
||||
disableAsyncDequeue bool
|
||||
|
||||
callOverrider CallOverrider
|
||||
@@ -293,68 +291,28 @@ func (a *agent) submit(ctx context.Context, call *call) error {
|
||||
statsDequeueAndStart(ctx)
|
||||
|
||||
// We are about to execute the function, set container Exec Deadline (call.Timeout)
|
||||
ctx, cancel := context.WithTimeout(ctx, time.Duration(call.Timeout)*time.Second)
|
||||
slotCtx, cancel := context.WithTimeout(ctx, time.Duration(call.Timeout)*time.Second)
|
||||
defer cancel()
|
||||
|
||||
// Pass this error (nil or otherwise) to end directly, to store status, etc.
|
||||
err = slot.exec(ctx, call)
|
||||
err = slot.exec(slotCtx, call)
|
||||
return a.handleCallEnd(ctx, call, slot, err, true)
|
||||
}
|
||||
|
||||
func (a *agent) scheduleCallEnd(fn func()) {
|
||||
atomic.AddInt64(&a.callEndCount, 1)
|
||||
go func() {
|
||||
fn()
|
||||
atomic.AddInt64(&a.callEndCount, -1)
|
||||
a.shutWg.DoneSession()
|
||||
}()
|
||||
}
|
||||
func (a *agent) handleCallEnd(ctx context.Context, call *call, slot Slot, err error, isStarted bool) error {
|
||||
|
||||
func (a *agent) finalizeCallEnd(ctx context.Context, err error, isRetriable, isScheduled bool) error {
|
||||
// if scheduled in background, let scheduleCallEnd() handle
|
||||
// the shutWg group, otherwise decrement here.
|
||||
if !isScheduled {
|
||||
a.shutWg.DoneSession()
|
||||
}
|
||||
handleStatsEnd(ctx, err)
|
||||
return transformTimeout(err, isRetriable)
|
||||
}
|
||||
|
||||
func (a *agent) handleCallEnd(ctx context.Context, call *call, slot Slot, err error, isCommitted bool) error {
|
||||
|
||||
// For hot-containers, slot close is a simple channel close... No need
|
||||
// to handle it async. Execute it here ASAP
|
||||
if slot != nil && protocol.IsStreamable(protocol.Protocol(call.Format)) {
|
||||
slot.Close(ctx)
|
||||
slot = nil
|
||||
}
|
||||
|
||||
// This means call was routed (executed), in order to reduce latency here
|
||||
// we perform most of these tasks in go-routine asynchronously.
|
||||
if isCommitted {
|
||||
a.scheduleCallEnd(func() {
|
||||
ctx = common.BackgroundContext(ctx)
|
||||
if slot != nil {
|
||||
slot.Close(ctx) // (no timeout)
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(ctx, a.cfg.CallEndTimeout)
|
||||
call.End(ctx, err)
|
||||
cancel()
|
||||
})
|
||||
return a.finalizeCallEnd(ctx, err, false, true)
|
||||
}
|
||||
|
||||
// The call did not succeed. And it is retriable. We close the slot
|
||||
// ASAP in the background if we haven't already done so (cold-container case),
|
||||
// in order to keep latency down.
|
||||
if slot != nil {
|
||||
a.scheduleCallEnd(func() {
|
||||
slot.Close(common.BackgroundContext(ctx)) // (no timeout)
|
||||
})
|
||||
return a.finalizeCallEnd(ctx, err, true, true)
|
||||
slot.Close(common.BackgroundContext(ctx))
|
||||
}
|
||||
|
||||
return a.finalizeCallEnd(ctx, err, true, false)
|
||||
// This means call was routed (executed)
|
||||
if isStarted {
|
||||
call.End(ctx, err)
|
||||
}
|
||||
|
||||
handleStatsEnd(ctx, err)
|
||||
a.shutWg.DoneSession()
|
||||
return transformTimeout(err, !isStarted)
|
||||
}
|
||||
|
||||
func transformTimeout(e error, isRetriable bool) error {
|
||||
@@ -417,11 +375,6 @@ func (a *agent) getSlot(ctx context.Context, call *call) (Slot, error) {
|
||||
ctx, span := trace.StartSpan(ctx, "agent_get_slot")
|
||||
defer span.End()
|
||||
|
||||
// first check any excess case of call.End() stacking.
|
||||
if atomic.LoadInt64(&a.callEndCount) >= int64(a.cfg.MaxCallEndStacking) {
|
||||
return nil, context.DeadlineExceeded
|
||||
}
|
||||
|
||||
if protocol.IsStreamable(protocol.Protocol(call.Format)) {
|
||||
// For hot requests, we use a long lived slot queue, which we use to manage hot containers
|
||||
var isNew bool
|
||||
|
||||
Reference in New Issue
Block a user