hybrid mergy (#581)

* so it begins
* add clarification to /dequeue, change response to list to future proof
* Specify that runner endpoints are also under /v1
* Add a flag to choose operation mode (node type). This is specified using the `FN_NODE_TYPE` environment variable. The default is the existing behaviour, where the server supports all operations (full API plus asynchronous and synchronous runners). The additional modes are:
  * API - the full API is available, but no functions are executed by the node. Async calls are placed into a message queue, and synchronous calls are not supported (invoking them results in an API error).
  * Runner - only the invocation/route API is present. Asynchronous and synchronous invocation requests are supported, but asynchronous requests are placed onto the message queue, so might be handled by another runner.
* Add agent type and checks on Submit
* Sketch of a factored out data access abstraction for api/runner agents
* Fix tests, adding node/agent types to constructors
* Add tests for full, API, and runner server modes.
* Added atomic UpdateCall to datastore
* adds in server side endpoints
* Made ServerNodeType public because tests use it
* Made ServerNodeType public because tests use it
* fix test build
* add hybrid runner client
  pretty simple go api client that covers surface area needed for hybrid, returning structs from models that the agent can use directly. not exactly sure where to put this, so put it in `/clients/hybrid` but maybe we should make `/api/runner/client` or something and shove it in there. want to get integration tests set up and use the real endpoints next and then wrap this up in the DataAccessLayer stuff.
* gracefully handles errors from fn
* handles backoff & retry on 500s
* will add to existing spans for debuggo action
* minor fixes
* meh
2022-10-28 21:29:17 +03:00 · 2017-12-11 10:43:19 -08:00
parent 1df4b46c56
commit 2ebc9c7480
26 changed files with 1157 additions and 94 deletions
--- a/api/agent/agent.go
+++ b/api/agent/agent.go
@@ -115,12 +115,18 @@ type Agent interface {
 	AddCallListener(fnext.CallListener)
 }

+type AgentNodeType int32 // operating mode of an agent; handed to New by the caller
+
+const (
+	AgentTypeFull   AgentNodeType = iota // zero value: New starts asyncDequeue, Submit executes calls
+	AgentTypeAPI                         // never executes calls: no asyncDequeue, Submit returns an error
+	AgentTypeRunner                      // handled like AgentTypeFull by New and Submit
+)
+
 type agent struct {
-	// TODO maybe these should be on GetCall? idk. was getting bloated.
-	mq            models.MessageQueue
-	ds            models.Datastore
-	ls            models.LogStore
+	da            DataAccess
 	callListeners []fnext.CallListener
+	tp            AgentNodeType

 	driver drivers.Driver

@@ -140,14 +146,13 @@ type agent struct {
 	promHandler http.Handler
 }

-func New(ds models.Datastore, ls models.LogStore, mq models.MessageQueue) Agent {
+func New(ds models.Datastore, ls models.LogStore, mq models.MessageQueue, tp AgentNodeType) Agent {
 	// TODO: Create drivers.New(runnerConfig)
 	driver := docker.NewDocker(drivers.Config{})

 	a := &agent{
-		ds:          ds,
-		ls:          ls,
-		mq:          mq,
+		tp:          tp,
+		da:          NewDirectDataAccess(ds, ls, mq),
 		driver:      driver,
 		hot:         make(map[string]chan slot),
 		resources:   NewResourceTracker(),
@@ -155,7 +160,12 @@ func New(ds models.Datastore, ls models.LogStore, mq models.MessageQueue) Agent
 		promHandler: promhttp.Handler(),
 	}

-	go a.asyncDequeue() // safe shutdown can nanny this fine
+	switch tp {
+	case AgentTypeAPI:
+		// Don't start dequeuing
+	default:
+		go a.asyncDequeue() // safe shutdown can nanny this fine
+	}

 	return a
 }
@@ -181,6 +191,10 @@ func transformTimeout(e error, isRetriable bool) error {
 }

 func (a *agent) Submit(callI Call) error {
+	if a.tp == AgentTypeAPI {
+		return errors.New("API agent cannot execute calls")
+	}
+
 	a.wg.Add(1)
 	defer a.wg.Done()

--- a/api/agent/agent_test.go
+++ b/api/agent/agent_test.go
@@ -49,7 +49,7 @@ func TestCallConfigurationRequest(t *testing.T) {
 		}, nil,
 	)

-	a := New(ds, ds, new(mqs.Mock))
+	a := New(ds, ds, new(mqs.Mock), AgentTypeFull)
 	defer a.Close()

 	w := httptest.NewRecorder()
@@ -247,7 +247,7 @@ func TestCallConfigurationModel(t *testing.T) {
 	// FromModel doesn't need a datastore, for now...
 	ds := datastore.NewMockInit(nil, nil, nil)

-	a := New(ds, ds, new(mqs.Mock))
+	a := New(ds, ds, new(mqs.Mock), AgentTypeFull)
 	defer a.Close()

 	callI, err := a.GetCall(FromModel(cm))
@@ -353,7 +353,7 @@ func TestSubmitError(t *testing.T) {
 	// FromModel doesn't need a datastore, for now...
 	ds := datastore.NewMockInit(nil, nil, nil)

-	a := New(ds, ds, new(mqs.Mock))
+	a := New(ds, ds, new(mqs.Mock), AgentTypeFull)
 	defer a.Close()

 	callI, err := a.GetCall(FromModel(cm))
--- a/api/agent/async.go
+++ b/api/agent/async.go
@@ -23,7 +23,7 @@ func (a *agent) asyncDequeue() {
 		}

 		ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) // TODO ???
-		model, err := a.mq.Reserve(ctx)
+		model, err := a.da.Dequeue(ctx)
 		cancel()
 		if err != nil || model == nil {
 			if err != nil {
--- a/api/agent/call.go
+++ b/api/agent/call.go
@@ -50,12 +50,12 @@ type Params []Param

 func FromRequest(appName, path string, req *http.Request, params Params) CallOpt {
 	return func(a *agent, c *call) error {
-		app, err := a.ds.GetApp(req.Context(), appName)
+		app, err := a.da.GetApp(req.Context(), appName)
 		if err != nil {
 			return err
 		}

-		route, err := a.ds.GetRoute(req.Context(), appName, path)
+		route, err := a.da.GetRoute(req.Context(), appName, path)
 		if err != nil {
 			return err
 		}
@@ -248,9 +248,7 @@ func (a *agent) GetCall(opts ...CallOpt) (Call, error) {
 		return nil, errors.New("no model or request provided for call")
 	}

-	c.ds = a.ds
-	c.ls = a.ls
-	c.mq = a.mq
+	c.da = a.da
 	c.ct = a

 	ctx, _ := common.LoggerWithFields(c.req.Context(),
@@ -272,9 +270,7 @@ func (a *agent) GetCall(opts ...CallOpt) (Call, error) {
 type call struct {
 	*models.Call

-	ds     models.Datastore
-	ls     models.LogStore
-	mq     models.MessageQueue
+	da     DataAccess
 	w      io.Writer
 	req    *http.Request
 	stderr io.ReadWriteCloser
@@ -316,7 +312,7 @@ func (c *call) Start(ctx context.Context) error {
 		// running to avoid running the call twice and potentially mark it as
 		// errored (built in long running task detector, so to speak...)

-		err := c.mq.Delete(ctx, c.Call)
+		err := c.da.Start(ctx, c.Model())
 		if err != nil {
 			return err // let another thread try this
 		}
@@ -346,29 +342,14 @@ func (c *call) End(ctx context.Context, errIn error) error {
 		c.Error = errIn.Error()
 	}

-	if c.Type == models.TypeAsync {
-		// XXX (reed): delete MQ message, eventually
-	}
-
 	// ensure stats histogram is reasonably bounded
 	c.Call.Stats = drivers.Decimate(240, c.Call.Stats)

-	// this means that we could potentially store an error / timeout status for a
-	// call that ran successfully [by a user's perspective]
-	// TODO: this should be update, really
-	if err := c.ds.InsertCall(ctx, c.Call); err != nil {
-		common.Logger(ctx).WithError(err).Error("error inserting call into datastore")
+	if err := c.da.Finish(ctx, c.Model(), c.stderr, c.Type == models.TypeAsync); err != nil {
+		common.Logger(ctx).WithError(err).Error("error finalizing call on datastore/mq")
 		// note: Not returning err here since the job could have already finished successfully.
 	}

-	if err := c.ls.InsertLog(ctx, c.AppName, c.ID, c.stderr); err != nil {
-		common.Logger(ctx).WithError(err).Error("error uploading log")
-		// note: Not returning err here since the job could have already finished successfully.
-	}
-
-	// NOTE call this after InsertLog or the buffer will get reset
-	c.stderr.Close()
-
 	if err := c.ct.fireAfterCall(ctx, c.Model()); err != nil {
 		return fmt.Errorf("AfterCall: %v", err)
 	}
--- a/api/agent/data_access.go
+++ b/api/agent/data_access.go
@@ -0,0 +1,102 @@
+package agent
+
+import (
+	"context"
+	"github.com/fnproject/fn/api/common"
+	"github.com/fnproject/fn/api/models"
+	"io"
+)
+
+// DataAccess abstracts the datastore and message queue operations done by the
+// agent, so that API nodes and runner nodes can work with the same interface
+// but actually operate on the data in different ways (by direct access or by
+// mediation through an API node).
+type DataAccess interface {
+	// GetApp returns the app named appName, as known to the datastore.
+	GetApp(ctx context.Context, appName string) (*models.App, error)
+
+	// GetRoute returns the route at routePath within the app named appName.
+	GetRoute(ctx context.Context, appName string, routePath string) (*models.Route, error)
+
+	// Enqueue adds mCall to the queue (ultimately forwards to mq.Push).
+	Enqueue(ctx context.Context, mCall *models.Call) (*models.Call, error)
+
+	// Dequeue reserves the next Call this Agent can run (ultimately forwards to
+	// mq.Reserve); the caller must handle a nil Call as well as an error.
+	Dequeue(ctx context.Context) (*models.Call, error)
+
+	// Start is invoked right before mCall is run; when it returns an error the
+	// call is not started by this Agent.
+	Start(ctx context.Context, mCall *models.Call) error
+
+	// Finish records the processed mCall together with its stderr log; async
+	// tells whether the call came from the queue, whose reservation it fulfils.
+	Finish(ctx context.Context, mCall *models.Call, stderr io.ReadWriteCloser, async bool) error
+}
+
+type directDataAccess struct { // DataAccess that calls the stores below directly
+	mq models.MessageQueue // used by Enqueue (Push), Dequeue (Reserve) and Start (Delete)
+	ds models.Datastore    // used by GetApp, GetRoute and Finish (InsertCall)
+	ls models.LogStore     // used by Finish (InsertLog)
+}
+
+// NewDirectDataAccess returns a DataAccess that uses ds, ls and mq directly.
+func NewDirectDataAccess(ds models.Datastore, ls models.LogStore, mq models.MessageQueue) DataAccess {
+	return &directDataAccess{
+		ds: ds,
+		ls: ls,
+		mq: mq,
+	}
+}
+
+func (da *directDataAccess) GetApp(ctx context.Context, appName string) (*models.App, error) {
+	return da.ds.GetApp(ctx, appName) // plain pass-through to the datastore
+}
+
+func (da *directDataAccess) GetRoute(ctx context.Context, appName string, routePath string) (*models.Route, error) {
+	return da.ds.GetRoute(ctx, appName, routePath) // plain pass-through to the datastore
+}
+
+func (da *directDataAccess) Enqueue(ctx context.Context, mCall *models.Call) (*models.Call, error) {
+	// TODO: also insert the call in the datastore with the 'queued' state
+	return da.mq.Push(ctx, mCall) // for now only the message queue learns about the call
+}
+
+func (da *directDataAccess) Dequeue(ctx context.Context) (*models.Call, error) {
+	return da.mq.Reserve(ctx) // the reserved message is deleted later, in Start
+}
+
+func (da *directDataAccess) Start(ctx context.Context, mCall *models.Call) error {
+	// TODO Access datastore and try a Compare-And-Swap to set the call to
+	// 'running'. If it fails, delete the message from the MQ and return an
+	// error. If it is successful, don't do anything - the message will be
+	// removed when the call finishes (see Finish).
+
+	// Until the queued/running/finished mechanics exist, the message is
+	// deleted here instead; the error from mq.Delete is returned unchanged.
+	return da.mq.Delete(ctx, mCall)
+}
+
+// Finish stores the finished call in the datastore, uploads its stderr log
+// and closes stderr. Failures are only logged (the job may already have
+// finished successfully), so the returned error is currently always nil.
+func (da *directDataAccess) Finish(ctx context.Context, mCall *models.Call, stderr io.ReadWriteCloser, async bool) error {
+	// NOTE: a call that succeeded from the user's perspective can still end up
+	// stored with an error / timeout status. TODO: this should be an update.
+	if err := da.ds.InsertCall(ctx, mCall); err != nil {
+		common.Logger(ctx).WithError(err).Error("error inserting call into datastore")
+	}
+
+	if err := da.ls.InsertLog(ctx, mCall.AppName, mCall.ID, stderr); err != nil {
+		common.Logger(ctx).WithError(err).Error("error uploading log")
+	}
+	// Close only after InsertLog, or the buffer will get reset.
+	if err := stderr.Close(); err != nil {
+		common.Logger(ctx).WithError(err).Error("error closing call log")
+	}
+
+	// XXX (reed): delete the MQ message for async calls, eventually
+	// YYY (hhexo): yes, once we have the queued/running/finished mechanics:
+	// return da.mq.Delete(ctx, mCall)
+	return nil
+}
--- a/api/agent/drivers/docker/docker_client.go
+++ b/api/agent/drivers/docker/docker_client.go
@@ -113,7 +113,7 @@ func (d *dockerWrap) retry(ctx context.Context, f func() error) error {
 		err := filter(ctx, f())
 		if common.IsTemporary(err) || isDocker50x(err) {
 			logger.WithError(err).Warn("docker temporary error, retrying")
-			b.Sleep()
+			b.Sleep(ctx)
 			span.LogFields(log.String("task", "tmperror.docker"))
 			continue
 		}