hybrid mergy (#581)

* so it begins

* add clarification to /dequeue, change the response to a list to future-proof it

* Specify that runner endpoints are also under /v1

* Add a flag to choose operation mode (node type).

This is specified using the `FN_NODE_TYPE` environment variable. The
default is the existing behaviour, where the server supports all
operations (full API plus asynchronous and synchronous runners).

The additional modes are:
* API - the full API is available, but no functions are executed by the
  node. Async calls are placed into a message queue, and synchronous
  calls are not supported (invoking them results in an API error).
* Runner - only the invocation/route API is present. Asynchronous and
  synchronous invocation requests are supported, but asynchronous
  requests are placed onto the message queue, so might be handled by
  another runner.

* Add agent type and checks on Submit

* Sketch of a factored out data access abstraction for api/runner agents

* Fix tests, adding node/agent types to constructors

* Add tests for full, API, and runner server modes.

* Added atomic UpdateCall to datastore

* adds in server side endpoints

* Made ServerNodeType public because tests use it

* Made ServerNodeType public because tests use it

* fix test build

* add hybrid runner client

A pretty simple Go API client that covers the surface area needed for hybrid,
returning structs from models that the agent can use directly. Not exactly
sure where to put this, so it lives in `/clients/hybrid` for now, but maybe we
should make `/api/runner/client` or something and move it there. Next steps:
get integration tests set up against the real endpoints, then wrap this up
in the DataAccessLayer stuff.

* gracefully handles errors from fn
* handles backoff & retry on 500s
* will add to existing spans for debuggo action

* minor fixes

* meh
This commit is contained in:
Reed Allman
2017-12-11 10:43:19 -08:00
committed by GitHub
parent 1df4b46c56
commit 2ebc9c7480
26 changed files with 1157 additions and 94 deletions

View File

@@ -115,12 +115,18 @@ type Agent interface {
AddCallListener(fnext.CallListener)
}
// AgentNodeType selects the operation mode of an agent. It is passed to New
// and stored on the agent, where Submit consults it.
type AgentNodeType int32
const (
// AgentTypeFull is the zero value. New starts the async dequeue loop for
// agents of this type and Submit does not reject calls based on it.
AgentTypeFull AgentNodeType = iota
// AgentTypeAPI marks an agent that does not execute calls: New does not
// start the async dequeue loop for it, and Submit returns an error.
AgentTypeAPI
// AgentTypeRunner is handled like AgentTypeFull by New and Submit: the
// async dequeue loop is started and calls may be submitted.
AgentTypeRunner
)
type agent struct {
// TODO maybe these should be on GetCall? idk. was getting bloated.
mq models.MessageQueue
ds models.Datastore
ls models.LogStore
da DataAccess
callListeners []fnext.CallListener
tp AgentNodeType
driver drivers.Driver
@@ -140,14 +146,13 @@ type agent struct {
promHandler http.Handler
}
func New(ds models.Datastore, ls models.LogStore, mq models.MessageQueue) Agent {
func New(ds models.Datastore, ls models.LogStore, mq models.MessageQueue, tp AgentNodeType) Agent {
// TODO: Create drivers.New(runnerConfig)
driver := docker.NewDocker(drivers.Config{})
a := &agent{
ds: ds,
ls: ls,
mq: mq,
tp: tp,
da: NewDirectDataAccess(ds, ls, mq),
driver: driver,
hot: make(map[string]chan slot),
resources: NewResourceTracker(),
@@ -155,7 +160,12 @@ func New(ds models.Datastore, ls models.LogStore, mq models.MessageQueue) Agent
promHandler: promhttp.Handler(),
}
go a.asyncDequeue() // safe shutdown can nanny this fine
switch tp {
case AgentTypeAPI:
// Don't start dequeuing
default:
go a.asyncDequeue() // safe shutdown can nanny this fine
}
return a
}
@@ -181,6 +191,10 @@ func transformTimeout(e error, isRetriable bool) error {
}
func (a *agent) Submit(callI Call) error {
if a.tp == AgentTypeAPI {
return errors.New("API agent cannot execute calls")
}
a.wg.Add(1)
defer a.wg.Done()

View File

@@ -49,7 +49,7 @@ func TestCallConfigurationRequest(t *testing.T) {
}, nil,
)
a := New(ds, ds, new(mqs.Mock))
a := New(ds, ds, new(mqs.Mock), AgentTypeFull)
defer a.Close()
w := httptest.NewRecorder()
@@ -247,7 +247,7 @@ func TestCallConfigurationModel(t *testing.T) {
// FromModel doesn't need a datastore, for now...
ds := datastore.NewMockInit(nil, nil, nil)
a := New(ds, ds, new(mqs.Mock))
a := New(ds, ds, new(mqs.Mock), AgentTypeFull)
defer a.Close()
callI, err := a.GetCall(FromModel(cm))
@@ -353,7 +353,7 @@ func TestSubmitError(t *testing.T) {
// FromModel doesn't need a datastore, for now...
ds := datastore.NewMockInit(nil, nil, nil)
a := New(ds, ds, new(mqs.Mock))
a := New(ds, ds, new(mqs.Mock), AgentTypeFull)
defer a.Close()
callI, err := a.GetCall(FromModel(cm))

View File

@@ -23,7 +23,7 @@ func (a *agent) asyncDequeue() {
}
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) // TODO ???
model, err := a.mq.Reserve(ctx)
model, err := a.da.Dequeue(ctx)
cancel()
if err != nil || model == nil {
if err != nil {

View File

@@ -50,12 +50,12 @@ type Params []Param
func FromRequest(appName, path string, req *http.Request, params Params) CallOpt {
return func(a *agent, c *call) error {
app, err := a.ds.GetApp(req.Context(), appName)
app, err := a.da.GetApp(req.Context(), appName)
if err != nil {
return err
}
route, err := a.ds.GetRoute(req.Context(), appName, path)
route, err := a.da.GetRoute(req.Context(), appName, path)
if err != nil {
return err
}
@@ -248,9 +248,7 @@ func (a *agent) GetCall(opts ...CallOpt) (Call, error) {
return nil, errors.New("no model or request provided for call")
}
c.ds = a.ds
c.ls = a.ls
c.mq = a.mq
c.da = a.da
c.ct = a
ctx, _ := common.LoggerWithFields(c.req.Context(),
@@ -272,9 +270,7 @@ func (a *agent) GetCall(opts ...CallOpt) (Call, error) {
type call struct {
*models.Call
ds models.Datastore
ls models.LogStore
mq models.MessageQueue
da DataAccess
w io.Writer
req *http.Request
stderr io.ReadWriteCloser
@@ -316,7 +312,7 @@ func (c *call) Start(ctx context.Context) error {
// running to avoid running the call twice and potentially mark it as
// errored (built in long running task detector, so to speak...)
err := c.mq.Delete(ctx, c.Call)
err := c.da.Start(ctx, c.Model())
if err != nil {
return err // let another thread try this
}
@@ -346,29 +342,14 @@ func (c *call) End(ctx context.Context, errIn error) error {
c.Error = errIn.Error()
}
if c.Type == models.TypeAsync {
// XXX (reed): delete MQ message, eventually
}
// ensure stats histogram is reasonably bounded
c.Call.Stats = drivers.Decimate(240, c.Call.Stats)
// this means that we could potentially store an error / timeout status for a
// call that ran successfully [by a user's perspective]
// TODO: this should be update, really
if err := c.ds.InsertCall(ctx, c.Call); err != nil {
common.Logger(ctx).WithError(err).Error("error inserting call into datastore")
if err := c.da.Finish(ctx, c.Model(), c.stderr, c.Type == models.TypeAsync); err != nil {
common.Logger(ctx).WithError(err).Error("error finalizing call on datastore/mq")
// note: Not returning err here since the job could have already finished successfully.
}
if err := c.ls.InsertLog(ctx, c.AppName, c.ID, c.stderr); err != nil {
common.Logger(ctx).WithError(err).Error("error uploading log")
// note: Not returning err here since the job could have already finished successfully.
}
// NOTE call this after InsertLog or the buffer will get reset
c.stderr.Close()
if err := c.ct.fireAfterCall(ctx, c.Model()); err != nil {
return fmt.Errorf("AfterCall: %v", err)
}

102
api/agent/data_access.go Normal file
View File

@@ -0,0 +1,102 @@
package agent
import (
"context"
"github.com/fnproject/fn/api/common"
"github.com/fnproject/fn/api/models"
"io"
)
// DataAccess abstracts the datastore and message queue operations done by the
// agent, so that API nodes and runner nodes can work with the same interface
// but actually operate on the data in different ways (by direct access or by
// mediation through an API node).
type DataAccess interface {
// GetApp abstracts querying the datastore for the app named appName.
GetApp(ctx context.Context, appName string) (*models.App, error)
// GetRoute abstracts querying the datastore for the route at routePath
// within the app named appName.
GetRoute(ctx context.Context, appName string, routePath string) (*models.Route, error)
// Enqueue will add a Call to the queue (ultimately forwards to mq.Push).
Enqueue(ctx context.Context, mCall *models.Call) (*models.Call, error)
// Dequeue will query the queue for the next available Call that can be run
// by this Agent, and reserve it (ultimately forwards to mq.Reserve).
Dequeue(ctx context.Context) (*models.Call, error)
// Start will attempt to start the provided Call within an appropriate
// context. The direct implementation currently removes the call's message
// from the queue here. A non-nil error is returned by the agent's call
// Start so that another thread can try the call.
Start(ctx context.Context, mCall *models.Call) error
// Finish will notify the system that the Call has been processed, and
// fulfill the reservation in the queue if the call came from a queue.
// stderr carries the call's log output; the direct implementation uploads
// it to the log store and then closes it. async is set by the agent to
// whether the call's type is models.TypeAsync.
Finish(ctx context.Context, mCall *models.Call, stderr io.ReadWriteCloser, async bool) error
}
// directDataAccess is the DataAccess implementation that operates on the
// datastore, log store and message queue directly, with no intermediary.
type directDataAccess struct {
	ds models.Datastore
	ls models.LogStore
	mq models.MessageQueue
}

// NewDirectDataAccess returns a DataAccess that forwards every operation
// straight to the given datastore, log store and message queue.
func NewDirectDataAccess(ds models.Datastore, ls models.LogStore, mq models.MessageQueue) DataAccess {
	return &directDataAccess{
		ds: ds,
		ls: ls,
		mq: mq,
	}
}
// GetApp looks up the app named appName by delegating to the underlying
// datastore and returns its result unchanged.
func (da *directDataAccess) GetApp(ctx context.Context, appName string) (*models.App, error) {
	app, err := da.ds.GetApp(ctx, appName)
	return app, err
}
// GetRoute looks up the route at routePath within the app named appName by
// delegating to the underlying datastore and returns its result unchanged.
func (da *directDataAccess) GetRoute(ctx context.Context, appName string, routePath string) (*models.Route, error) {
	route, err := da.ds.GetRoute(ctx, appName, routePath)
	return route, err
}
// Enqueue pushes mCall onto the message queue and returns whatever the
// queue's Push returns.
func (da *directDataAccess) Enqueue(ctx context.Context, mCall *models.Call) (*models.Call, error) {
	// TODO: Insert a call in the datastore with the 'queued' state
	queued, err := da.mq.Push(ctx, mCall)
	return queued, err
}
// Dequeue reserves the next call from the message queue and returns the
// result of the queue's Reserve unchanged.
func (da *directDataAccess) Dequeue(ctx context.Context) (*models.Call, error) {
	queue := da.mq
	return queue.Reserve(ctx)
}
// Start marks mCall as started. For now that only means deleting its message
// from the message queue; the error from that delete, if any, is returned.
func (da *directDataAccess) Start(ctx context.Context, mCall *models.Call) error {
	// TODO Access datastore and try a Compare-And-Swap to set the call to
	// 'running'. If it fails, delete the message from the MQ and return an
	// error. If it is successful, don't do anything - the message will be
	// removed when the call Finish'es.

	// The queued/running/finished mechanics do not exist yet, so the message
	// is removed from the queue at this point instead.
	if err := da.mq.Delete(ctx, mCall); err != nil {
		return err
	}
	return nil
}
// Finish records the outcome of a processed call: it inserts mCall into the
// datastore, uploads the contents of stderr to the log store under the call's
// app name and ID, and then closes stderr.
//
// Every step is best-effort. Failures are logged and do not stop the
// remaining steps, since the job could have already finished successfully.
// Finish therefore currently always returns nil.
//
// async is not used yet: once the queued/running/finished mechanics exist,
// the MQ message of an async call should be deleted here.
func (da *directDataAccess) Finish(ctx context.Context, mCall *models.Call, stderr io.ReadWriteCloser, async bool) error {
	logger := common.Logger(ctx)

	// this means that we could potentially store an error / timeout status for a
	// call that ran successfully [by a user's perspective]
	// TODO: this should be an update, really
	if err := da.ds.InsertCall(ctx, mCall); err != nil {
		logger.WithError(err).Error("error inserting call into datastore")
		// note: Not returning err here since the job could have already finished successfully.
	}

	if err := da.ls.InsertLog(ctx, mCall.AppName, mCall.ID, stderr); err != nil {
		logger.WithError(err).Error("error uploading log")
		// note: Not returning err here since the job could have already finished successfully.
	}

	// NOTE call this after InsertLog or the buffer will get reset
	if err := stderr.Close(); err != nil {
		// Surface the failure instead of dropping it; like the steps above
		// it must not fail the call.
		logger.WithError(err).Error("error closing call stderr")
	}

	// XXX (reed): delete MQ message for async calls, eventually
	// YYY (hhexo): yes, once we have the queued/running/finished mechanics:
	//   if async { return da.mq.Delete(ctx, mCall) }
	return nil
}

View File

@@ -113,7 +113,7 @@ func (d *dockerWrap) retry(ctx context.Context, f func() error) error {
err := filter(ctx, f())
if common.IsTemporary(err) || isDocker50x(err) {
logger.WithError(err).Warn("docker temporary error, retrying")
b.Sleep()
b.Sleep(ctx)
span.LogFields(log.String("task", "tmperror.docker"))
continue
}