mirror of
https://github.com/fnproject/fn.git
synced 2022-10-28 21:29:17 +03:00
* add spans to async
* clean up / add spans to agent
* there were a few methods which had multiple contexts which existed in the same scope (this doesn't end well, usually), flattened those out.
* loop bound context cancels now rely on defer (also was brittle)
* runHot had a lot of ctx shuffling, flattened that.
* added some additional spans in certain paths for added granularity
* linked up the hot launcher / run hot / wait hot to _a_ root span, the first 2 are follows from spans, but at least we can see the source of these and also can see containers launched over a hot launcher's lifetime

I left TODO around the FollowsFrom because OpenCensus doesn't, at least at the moment, appear to have any idea of FollowsFrom and it was an extra OpenTracing method (we have to get the span out, start a new span with the option, then add it to the context... some shuffling required). anyway, was on the fence about adding at least.

* resource waiters need to manage their own goroutine lifecycle
* if we get an impossible memory request, bail instead of infinite loop
* handle timeout slippery case
* still sucks, but hotLauncher doesn't leak anything. even the time.After timer goroutines
* simplify GetResourceToken

GetCall can guard against the impossible to allocate resource tasks entering the system by erroring instead of doling them out. this makes GetResourceToken logic more straightforward for callers, who now simply have the contract that they won't ever get a token if they let tasks into the agent that can't run (but GetCall guards this, and there's a test for it). sorry, I was going to make this only do that, but when I went to fix up the tests, my last patch went haywire so I fixed that too.

this also at least tries to simplify the hotLaunch loop, which will now no longer leak time.After timers (which were long, and with signaller, they were many -- I got a stack trace :) -- this breaks out the bottom half of the logic to check to see if we need to launch into its own function, and handles the cleaning duties only in the caller instead of in 2 different select statements. played with this a bit, no doubt further cleaning could be done, but this _seems_ better.

* fix vet
* add units to exported method contract docs
* oops
338 lines
9.2 KiB
Go
338 lines
9.2 KiB
Go
package agent
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/fnproject/fn/api/agent/drivers"
|
|
"github.com/fnproject/fn/api/common"
|
|
"github.com/fnproject/fn/api/id"
|
|
"github.com/fnproject/fn/api/models"
|
|
"github.com/go-openapi/strfmt"
|
|
"github.com/opentracing/opentracing-go"
|
|
"github.com/sirupsen/logrus"
|
|
)
|
|
|
|
type Call interface {
|
|
// Model will return the underlying models.Call configuration for this call.
|
|
// TODO we could respond to async correctly from agent but layering, this
|
|
// is only because the front end has different responses based on call type.
|
|
// try to discourage use elsewhere until this gets pushed down more...
|
|
Model() *models.Call
|
|
|
|
// Start will be called before this call is executed, it may be used to
|
|
// guarantee mutual exclusion, check docker permissions, update timestamps,
|
|
// etc.
|
|
// TODO Start and End can likely be unexported as they are only used in the agent,
|
|
// and on a type which is constructed in a specific agent. meh.
|
|
Start(ctx context.Context) error
|
|
|
|
// End will be called immediately after attempting a call execution,
|
|
// regardless of whether the execution failed or not. An error will be passed
|
|
// to End, which if nil indicates a successful execution. Any error returned
|
|
// from End will be returned as the error from Submit.
|
|
End(ctx context.Context, err error) error
|
|
}
|
|
|
|
// TODO build w/o closures... lazy
|
|
type CallOpt func(a *agent, c *call) error
|
|
|
|
type (
	// Param is a single key/value pair of strings.
	Param struct {
		Key   string
		Value string
	}

	// Params is a list of Param values.
	Params []Param
)
|
|
|
|
func FromRequest(appName, path string, req *http.Request) CallOpt {
|
|
return func(a *agent, c *call) error {
|
|
app, err := a.da.GetApp(req.Context(), appName)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
route, err := a.da.GetRoute(req.Context(), appName, path)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if route.Format == "" {
|
|
route.Format = models.FormatDefault
|
|
}
|
|
|
|
id := id.New().String()
|
|
|
|
// TODO this relies on ordering of opts, but tests make sure it works, probably re-plumb/destroy headers
|
|
// TODO async should probably supply an http.ResponseWriter that records the logs, to attach response headers to
|
|
if rw, ok := c.w.(http.ResponseWriter); ok {
|
|
rw.Header().Add("FN_CALL_ID", id)
|
|
for k, vs := range route.Headers {
|
|
for _, v := range vs {
|
|
// pre-write in these headers to response
|
|
rw.Header().Add(k, v)
|
|
}
|
|
}
|
|
}
|
|
|
|
// add our per call headers in here
|
|
req.Header.Set("FN_METHOD", req.Method)
|
|
req.Header.Set("FN_REQUEST_URL", reqURL(req))
|
|
req.Header.Set("FN_CALL_ID", id)
|
|
|
|
// this ensures that there is an image, path, timeouts, memory, etc are valid.
|
|
// NOTE: this means assign any changes above into route's fields
|
|
err = route.Validate()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
c.Call = &models.Call{
|
|
ID: id,
|
|
AppName: appName,
|
|
Path: route.Path,
|
|
Image: route.Image,
|
|
// Delay: 0,
|
|
Type: route.Type,
|
|
Format: route.Format,
|
|
// Payload: TODO,
|
|
Priority: new(int32), // TODO this is crucial, apparently
|
|
Timeout: route.Timeout,
|
|
IdleTimeout: route.IdleTimeout,
|
|
Memory: route.Memory,
|
|
CPUs: route.CPUs,
|
|
Config: buildConfig(app, route),
|
|
Headers: req.Header,
|
|
CreatedAt: strfmt.DateTime(time.Now()),
|
|
URL: reqURL(req),
|
|
Method: req.Method,
|
|
}
|
|
|
|
c.req = req
|
|
return nil
|
|
}
|
|
}
|
|
|
|
func buildConfig(app *models.App, route *models.Route) models.Config {
|
|
conf := make(models.Config, 8+len(app.Config)+len(route.Config))
|
|
for k, v := range app.Config {
|
|
conf[k] = v
|
|
}
|
|
for k, v := range route.Config {
|
|
conf[k] = v
|
|
}
|
|
|
|
conf["FN_FORMAT"] = route.Format
|
|
conf["FN_APP_NAME"] = app.Name
|
|
conf["FN_PATH"] = route.Path
|
|
// TODO: might be a good idea to pass in: "FN_BASE_PATH" = fmt.Sprintf("/r/%s", appName) || "/" if using DNS entries per app
|
|
conf["FN_MEMORY"] = fmt.Sprintf("%d", route.Memory)
|
|
conf["FN_TYPE"] = route.Type
|
|
|
|
CPUs := route.CPUs.String()
|
|
if CPUs != "" {
|
|
conf["FN_CPUS"] = CPUs
|
|
}
|
|
return conf
|
|
}
|
|
|
|
// reqURL returns the request's URL as a string, first filling in any missing
// scheme and host directly on req.URL: an empty scheme becomes "https" when
// req.TLS is set and "http" otherwise, and an empty host is taken from
// req.Host. The request is modified in place.
func reqURL(req *http.Request) string {
	u := req.URL // pointer alias: the writes below land on the request itself

	if u.Scheme == "" {
		u.Scheme = "https"
		if req.TLS == nil {
			u.Scheme = "http"
		}
	}

	if u.Host == "" {
		u.Host = req.Host
	}

	return u.String()
}
|
|
|
|
// TODO this currently relies on FromRequest having happened before to create the model
|
|
// here, to be a fully qualified model. We probably should double check but having a way
|
|
// to bypass will likely be what's used anyway unless forced.
|
|
func FromModel(mCall *models.Call) CallOpt {
|
|
return func(a *agent, c *call) error {
|
|
c.Call = mCall
|
|
|
|
req, err := http.NewRequest(c.Method, c.URL, strings.NewReader(c.Payload))
|
|
if err != nil {
|
|
return err
|
|
}
|
|
req.Header = c.Headers
|
|
|
|
c.req = req
|
|
// TODO anything else really?
|
|
return nil
|
|
}
|
|
}
|
|
|
|
// TODO this should be required
|
|
func WithWriter(w io.Writer) CallOpt {
|
|
return func(a *agent, c *call) error {
|
|
c.w = w
|
|
return nil
|
|
}
|
|
}
|
|
|
|
func WithContext(ctx context.Context) CallOpt {
|
|
return func(a *agent, c *call) error {
|
|
c.req = c.req.WithContext(ctx)
|
|
return nil
|
|
}
|
|
}
|
|
|
|
// GetCall builds a Call that can be used to submit jobs to the agent.
|
|
//
|
|
// TODO where to put this? async and sync both call this
|
|
func (a *agent) GetCall(opts ...CallOpt) (Call, error) {
|
|
var c call
|
|
|
|
for _, o := range opts {
|
|
err := o(a, &c)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
|
|
// TODO typed errors to test
|
|
if c.req == nil || c.Call == nil {
|
|
return nil, errors.New("no model or request provided for call")
|
|
}
|
|
|
|
if !a.resources.IsResourcePossible(c.Memory, uint64(c.CPUs), c.Type == models.TypeAsync) {
|
|
// if we're not going to be able to run this call on this machine, bail here.
|
|
return nil, models.ErrCallTimeoutServerBusy
|
|
}
|
|
|
|
c.da = a.da
|
|
c.ct = a
|
|
|
|
ctx, _ := common.LoggerWithFields(c.req.Context(),
|
|
logrus.Fields{"id": c.ID, "app": c.AppName, "route": c.Path})
|
|
c.req = c.req.WithContext(ctx)
|
|
|
|
// setup stderr logger separate (don't inherit ctx vars)
|
|
logger := logrus.WithFields(logrus.Fields{"user_log": true, "app_name": c.AppName, "path": c.Path, "image": c.Image, "call_id": c.ID})
|
|
c.stderr = setupLogger(logger)
|
|
if c.w == nil {
|
|
// send STDOUT to logs if no writer given (async...)
|
|
// TODO we could/should probably make this explicit to GetCall, ala 'WithLogger', but it's dupe code (who cares?)
|
|
c.w = c.stderr
|
|
}
|
|
|
|
now := time.Now()
|
|
slotDeadline := now.Add(time.Duration(c.Call.Timeout) * time.Second / 2)
|
|
execDeadline := now.Add(time.Duration(c.Call.Timeout) * time.Second)
|
|
|
|
c.slotDeadline = slotDeadline
|
|
c.execDeadline = execDeadline
|
|
|
|
execDeadlineStr := strfmt.DateTime(execDeadline).String()
|
|
|
|
// these 2 headers buckets are the same but for posterity!
|
|
if c.Headers == nil {
|
|
c.Headers = make(http.Header)
|
|
c.req.Header = c.Headers
|
|
}
|
|
c.Headers.Set("FN_DEADLINE", execDeadlineStr)
|
|
c.req.Header.Set("FN_DEADLINE", execDeadlineStr)
|
|
|
|
return &c, nil
|
|
}
|
|
|
|
type call struct {
|
|
*models.Call
|
|
|
|
da DataAccess
|
|
w io.Writer
|
|
req *http.Request
|
|
stderr io.ReadWriteCloser
|
|
ct callTrigger
|
|
slots *slotQueue
|
|
slotDeadline time.Time
|
|
execDeadline time.Time
|
|
}
|
|
|
|
// Model returns the models.Call embedded in this call.
func (c *call) Model() *models.Call {
	return c.Call
}
|
|
|
|
func (c *call) Start(ctx context.Context) error {
|
|
span, ctx := opentracing.StartSpanFromContext(ctx, "agent_call_start")
|
|
defer span.Finish()
|
|
|
|
// Check context timeouts, errors
|
|
if ctx.Err() != nil {
|
|
return ctx.Err()
|
|
}
|
|
|
|
c.StartedAt = strfmt.DateTime(time.Now())
|
|
c.Status = "running"
|
|
|
|
if rw, ok := c.w.(http.ResponseWriter); ok { // TODO need to figure out better way to wire response headers in
|
|
rw.Header().Set("XXX-FXLB-WAIT", time.Time(c.StartedAt).Sub(time.Time(c.CreatedAt)).String())
|
|
}
|
|
|
|
if c.Type == models.TypeAsync {
|
|
// XXX (reed): make sure MQ reservation is lengthy. to skirt MQ semantics,
|
|
// we could add a new message to MQ w/ delay of call.Timeout and delete the
|
|
// old one (in that order), after marking the call as running in the db
|
|
// (see below)
|
|
|
|
// XXX (reed): should we store the updated started_at + status? we could
|
|
// use this so that if we pick up a call from mq and find its status is
|
|
// running to avoid running the call twice and potentially mark it as
|
|
// errored (built in long running task detector, so to speak...)
|
|
|
|
err := c.da.Start(ctx, c.Model())
|
|
if err != nil {
|
|
return err // let another thread try this
|
|
}
|
|
}
|
|
|
|
err := c.ct.fireBeforeCall(ctx, c.Model())
|
|
if err != nil {
|
|
return fmt.Errorf("BeforeCall: %v", err)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (c *call) End(ctx context.Context, errIn error) error {
|
|
span, ctx := opentracing.StartSpanFromContext(ctx, "agent_call_end")
|
|
defer span.Finish()
|
|
|
|
c.CompletedAt = strfmt.DateTime(time.Now())
|
|
|
|
switch errIn {
|
|
case nil:
|
|
c.Status = "success"
|
|
case context.DeadlineExceeded:
|
|
c.Status = "timeout"
|
|
default:
|
|
c.Status = "error"
|
|
c.Error = errIn.Error()
|
|
}
|
|
|
|
// ensure stats histogram is reasonably bounded
|
|
c.Call.Stats = drivers.Decimate(240, c.Call.Stats)
|
|
|
|
if err := c.da.Finish(ctx, c.Model(), c.stderr, c.Type == models.TypeAsync); err != nil {
|
|
common.Logger(ctx).WithError(err).Error("error finalizing call on datastore/mq")
|
|
// note: Not returning err here since the job could have already finished successfully.
|
|
}
|
|
|
|
// NOTE call this after InsertLog or the buffer will get reset
|
|
c.stderr.Close()
|
|
|
|
if err := c.ct.fireAfterCall(ctx, c.Model()); err != nil {
|
|
return fmt.Errorf("AfterCall: %v", err)
|
|
}
|
|
|
|
return errIn // original error, important for use in sync call returns
|
|
}
|