fn: perform call.End() after request is processed (#918)

* fn: perform call.End() after request is processed

call.End() performs several tasks in sequence: insert call, insert log,
(todo) remove MQ entry, the fireAfterCall callback, etc. These currently
add to the request latency, since the return from agent.Submit() is
blocked on them. We also haven't been able to apply any timeouts to these
operations, since they run during request processing and it is hard to
come up with a sensible timeout policy there. In addition, the error cases
(a failed call or log insert) are not propagated to the caller.

With this change, call.End() handling becomes asynchronous: these tasks
are performed after the request is done. This improves latency, and we no
longer block the call on these operations. The change also frees up the
agent slot token more quickly, so we are no longer tied to hiccups in
call.End().
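
The shape of the change is a fire-and-forget goroutine that is still tracked
by the agent's WaitGroup so shutdown can drain it (the real implementation is
handleCallEnd in the diff below). A minimal standalone sketch of that pattern,
with illustrative names (endCall, finalize) that are not part of the fn API:

// Sketch only: detaching end-of-call work from the request path.
package main

import (
	"context"
	"sync"
	"time"
)

type agent struct {
	wg sync.WaitGroup // tracks in-flight background work for safe shutdown
}

// endCall returns immediately; the expensive bookkeeping runs in the background.
func (a *agent) endCall(finalize func(context.Context)) {
	a.wg.Add(1)
	go func() {
		defer a.wg.Done()
		// Detach from the request context so request cancellation does not
		// abort the bookkeeping, but still bound it with its own timeout.
		ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
		defer cancel()
		finalize(ctx)
	}()
}

func main() {
	a := &agent{}
	a.endCall(func(ctx context.Context) { /* insert call, insert log, callbacks... */ })
	a.wg.Wait() // shutdown path: wait for outstanding end-of-call work
}

The request path only pays for launching the goroutine; the insert/log/callback
work proceeds on a detached, bounded context.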

A timeout policy is also added for this background work; it can be
adjusted with an env variable (default: 10 minutes).
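
The variable name is not spelled out here; as a hedged sketch, a duration knob
with a 10 minute default could be read like this (FN_CALL_END_TIMEOUT is an
assumed name for illustration, and the resulting value would feed the
cfg.CallEndTimeout field used in the diff below):

// Sketch of an env-driven timeout with a 10 minute default.
package main

import (
	"fmt"
	"os"
	"time"
)

func callEndTimeout() time.Duration {
	d := 10 * time.Minute // default
	// FN_CALL_END_TIMEOUT is a hypothetical variable name for this sketch.
	if v := os.Getenv("FN_CALL_END_TIMEOUT"); v != "" {
		if parsed, err := time.ParseDuration(v); err == nil {
			d = parsed
		}
	}
	return d
}

func main() {
	fmt.Println("call.End() timeout:", callEndTimeout())
}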

This accentuates the fact that the call record, the log, and fireAfterCall
are not necessarily completed when the request is done. There is therefore
a window in which the call has finished but the call/log/fireAfterCall
updates have not yet propagated. This was already the case, especially for
error scenarios.

There is a slight risk of accumulating call.End() operations if the
log/call/callback systems hiccup.

* fn: address risk of overstacking of call.End() calls.
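
The guard for this lives in getSlot: it compares an atomic counter of in-flight
call.End() goroutines against a.cfg.MaxCallEndStacking and refuses to hand out
a slot (returning context.DeadlineExceeded) once the cap is hit, as shown in
the diff below. A simplified, self-contained sketch of that back-pressure idea
(the cap value and names here are illustrative):

// Sketch only: back-pressure on queued call.End() work via an atomic counter.
package main

import (
	"errors"
	"fmt"
	"sync/atomic"
)

const maxCallEndStacking = 8192 // illustrative cap, not the fn default

var callEndCount int64 // number of call.End() goroutines still in flight

var errTooBusy = errors.New("too many pending call.End() operations")

// admit rejects new work while too many end-of-call operations are pending.
func admit() error {
	if atomic.LoadInt64(&callEndCount) >= maxCallEndStacking {
		return errTooBusy
	}
	return nil
}

func main() {
	atomic.AddInt64(&callEndCount, 1) // a call.End() starts...
	fmt.Println(admit())              // ...new work is still admitted (count < cap)
	atomic.AddInt64(&callEndCount, -1)
}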
Tolga Ceylan committed 2018-04-05 14:42:12 -07:00 (committed by GitHub)
parent 82bf532fa7
commit 81954bcf53
3 changed files with 59 additions and 23 deletions


@@ -5,6 +5,7 @@ import (
 	"io"
 	"strings"
 	"sync"
+	"sync/atomic"
 	"time"
 
 	"github.com/fnproject/fn/api/agent/drivers"
@@ -111,9 +112,10 @@ type agent struct {
 	resources ResourceTracker
 
 	// used to track running calls / safe shutdown
-	wg       sync.WaitGroup // TODO rename
-	shutonce sync.Once
-	shutdown chan struct{}
+	wg           sync.WaitGroup // TODO rename
+	shutonce     sync.Once
+	shutdown     chan struct{}
+	callEndCount int64
 }
 
 // New creates an Agent that executes functions locally as Docker containers.
@@ -250,14 +252,23 @@ func (a *agent) submit(ctx context.Context, call *call) error {
 	// pass this error (nil or otherwise) to end directly, to store status, etc
 	err = slot.exec(ctx, call)
 	handleStatsEnd(ctx, err)
-	// TODO: we need to allocate more time to store the call + logs in case the call timed out,
-	// but this could put us over the timeout if the call did not reply yet (need better policy).
-	ctx = common.BackgroundContext(ctx)
-	err = call.End(ctx, err)
-
+	a.handleCallEnd(ctx, call, err)
 	return transformTimeout(err, false)
 }
 
+func (a *agent) handleCallEnd(ctx context.Context, call *call, err error) {
+	a.wg.Add(1)
+	atomic.AddInt64(&a.callEndCount, 1)
+	go func() {
+		ctx = common.BackgroundContext(ctx)
+		ctx, cancel := context.WithTimeout(ctx, a.cfg.CallEndTimeout)
+		call.End(ctx, err)
+		cancel()
+		atomic.AddInt64(&a.callEndCount, -1)
+		a.wg.Done()
+	}()
+}
+
 func transformTimeout(e error, isRetriable bool) error {
 	if e == context.DeadlineExceeded {
 		if isRetriable {
@@ -308,6 +319,11 @@ func (a *agent) getSlot(ctx context.Context, call *call) (Slot, error) {
 	ctx, span := trace.StartSpan(ctx, "agent_get_slot")
 	defer span.End()
 
+	// first check any excess case of call.End() stacking.
+	if atomic.LoadInt64(&a.callEndCount) >= int64(a.cfg.MaxCallEndStacking) {
+		return nil, context.DeadlineExceeded
+	}
+
 	if protocol.IsStreamable(protocol.Protocol(call.Format)) {
 		// For hot requests, we use a long lived slot queue, which we use to manage hot containers
 		var isNew bool