Hybrid plumby (#585)

* fix configuration of agent and server to be future-proof and plumb in the hybrid client agent

* fixes up the tests, turns off /r/ on API nodes

* fix up defaults for runner nodes

* shove the runner async push code down into agent land to use the client

* plumb up async-age

* return full call from async dequeue endpoint; since we're storing a whole
call in the MQ we don't need to worry about caching of app/route [for now]
* fast, safe shutdown of the dequeue looper in the runner / tidying of the agent (see the sketch after this list)
* nice errors for path-not-found against /r/, /v1/, or any other path
* removed some stale TODOs in agent
* MQ backends are only loudmouths in debug mode now
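
The "fast, safe shutdown" bullet refers to the pattern in async.go below: rather than blocking the main loop directly on Dequeue, the dequeue call runs in its own goroutine and hands its result over a channel, so the loop can select between that channel and the shutdown signal. A minimal standalone sketch of the pattern, with illustrative names (not the actual agent code):

package main

import (
	"context"
	"fmt"
	"time"
)

// dequeueOnce runs a (possibly slow) dequeue in its own goroutine and
// returns a channel yielding at most one result; the channel is closed
// if the queue was empty or the dequeue errored.
func dequeueOnce(ctx context.Context, dequeue func(context.Context) (string, error)) <-chan string {
	ch := make(chan string, 1)
	go func() {
		ctx, cancel := context.WithTimeout(ctx, 60*time.Second)
		defer cancel()
		msg, err := dequeue(ctx)
		if err != nil {
			close(ch) // nothing to hand over
			return
		}
		ch <- msg
	}()
	return ch
}

func loop(shutdown <-chan struct{}, dequeue func(context.Context) (string, error)) {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel() // hangs up any in-flight dequeue when we return
	for {
		select {
		case <-shutdown:
			return // never stuck waiting on the queue itself
		case msg, ok := <-dequeueOnce(ctx, dequeue):
			if ok {
				fmt.Println("got:", msg)
			}
		}
	}
}

func main() {
	shutdown := make(chan struct{})
	time.AfterFunc(3*time.Second, func() { close(shutdown) })
	loop(shutdown, func(ctx context.Context) (string, error) {
		time.Sleep(500 * time.Millisecond) // pretend to poll an MQ
		return "call-id", nil
	})
}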

* update tests

* Add caching to hybrid client

* Fix HTTP error handling in hybrid client.

The type switch was on the value rather than a pointer.
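
This is an easy bug class to hit in Go: a type switch case on the value type httpErr never matches when the function actually returns *httpErr, so the status-code handling silently falls through to the default branch. A stripped-down illustration (not the client code itself):

package main

import (
	"errors"
	"fmt"
)

type httpErr struct {
	code int
	error
}

func fetch() error {
	// errors from the client are created as pointers
	return &httpErr{code: 404, error: errors.New("not found")}
}

func main() {
	err := fetch()

	// Broken: matches the value type, but fetch returns *httpErr,
	// so this case never fires and we fall through to default.
	switch e := err.(type) {
	case httpErr:
		fmt.Println("status:", e.code)
	default:
		fmt.Println("unknown error:", e) // this prints
	}

	// Fixed: match the pointer type that is actually returned.
	switch e := err.(type) {
	case *httpErr:
		fmt.Println("status:", e.code) // this prints
	default:
		fmt.Println("unknown error:", e)
	}
}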

* Gofmt.

* Better caching with a nice caching wrapper

* Remove datastore cache which is now unused

* Don't need to manually wrap interface methods
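
The "no manual wrapping" bullet works because CachedDataAccess embeds the DataAccess interface: any method it doesn't redefine (Enqueue, Dequeue, Start, Finish) is promoted from the wrapped value automatically, and only GetApp/GetRoute get explicit cached overrides. The general shape, reduced to a toy interface with illustrative names:

package main

import "fmt"

type Store interface {
	Get(key string) string
	Put(key, val string)
}

type mapStore map[string]string

func (m mapStore) Get(key string) string { return m[key] }
func (m mapStore) Put(key, val string)   { m[key] = val }

// loggingStore overrides only Get; Put is promoted from the
// embedded Store, so no hand-written forwarding method is needed.
type loggingStore struct {
	Store
}

func (l loggingStore) Get(key string) string {
	v := l.Store.Get(key)
	fmt.Printf("get %q -> %q\n", key, v)
	return v
}

func main() {
	var s Store = loggingStore{Store: mapStore{}}
	s.Put("a", "1") // promoted method, forwards to mapStore
	s.Get("a")      // wrapped method
}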

* Go fmt
Reed Allman, 2017-12-12 15:54:55 -08:00 (committed by GitHub)
parent 05ce2e3868
commit bb92547b95
18 changed files with 433 additions and 375 deletions

View File

@@ -24,7 +24,6 @@ import (
"github.com/sirupsen/logrus"
)
// TODO make sure some errors that user should see (like image doesn't exist) bubble up
// TODO we should prob store async calls in db immediately since we're returning id (will 404 until post-execution)
// TODO async calls need to add route.Headers as well
// TODO need to shut off reads/writes in dispatch to the pipes when call times out so that
@@ -32,15 +31,8 @@ import (
// TODO add spans back around container launching for hot (follows from?) + other more granular spans
// TODO handle timeouts / no response in sync & async (sync is json+503 atm, not 504, async is empty log+status)
// see also: server/runner.go wrapping the response writer there, but need to handle async too (push down?)
// TODO herd launch prevention part deux
// TODO storing logs / call can push call over the timeout
// TODO all Datastore methods need to take unit of tenancy (app or route) at least (e.g. not just call id)
// TODO discuss concrete policy for hot launch or timeout / timeout vs time left
// TODO it may be nice to have an interchange type for Dispatch that can have
// all the info we need to build e.g. http req, grpc req, json, etc. so that
// we can easily do e.g. http->grpc, grpc->http, http->json. ofc grpc<->http is
// weird for proto specifics like e.g. proto version, method, headers, et al.
// discuss.
// TODO if we don't cap the number of any one container we could get into a situation
// where the machine is full but all the containers are idle up to the idle timeout. meh.
// TODO async is still broken, but way less so. we need to modify mq semantics
@@ -49,9 +41,7 @@ import (
// dies). need coordination w/ db.
// TODO if a cold call times out but container is created but hasn't replied, could
// end up that the client doesn't get a reply until long after the timeout (b/c of container removal, async it?)
// TODO the call api should fill in all the fields
// TODO the log api should be plaintext (or at least offer it)
// TODO we should probably differentiate ran-but-timeout vs timeout-before-run
// TODO between calls, logs and stderr can contain output/ids from previous call. need elegant solution. grossness.
// TODO if async would store requests (or interchange format) it would be slick, but
// if we're going to store full calls in db maybe we should only queue pointers to ids?
@@ -113,20 +103,15 @@ type Agent interface {
// Return the http.Handler used to handle Prometheus metric requests
PromHandler() http.Handler
AddCallListener(fnext.CallListener)
+// Enqueue is to use the agent's sweet sweet client bindings to remotely
+// queue async tasks and should be removed from Agent interface ASAP.
+Enqueue(context.Context, *models.Call) error
}
-type AgentNodeType int32
-const (
-AgentTypeFull AgentNodeType = iota
-AgentTypeAPI
-AgentTypeRunner
-)
type agent struct {
da DataAccess
callListeners []fnext.CallListener
-tp AgentNodeType
driver drivers.Driver
@@ -146,13 +131,12 @@ type agent struct {
promHandler http.Handler
}
-func New(ds models.Datastore, ls models.LogStore, mq models.MessageQueue, tp AgentNodeType) Agent {
+func New(da DataAccess) Agent {
// TODO: Create drivers.New(runnerConfig)
driver := docker.NewDocker(drivers.Config{})
a := &agent{
-tp: tp,
-da: NewDirectDataAccess(ds, ls, mq),
+da: da,
driver: driver,
hot: make(map[string]chan slot),
resources: NewResourceTracker(),
@@ -160,16 +144,17 @@ func New(ds models.Datastore, ls models.LogStore, mq models.MessageQueue, tp Age
promHandler: promhttp.Handler(),
}
-switch tp {
-case AgentTypeAPI:
-// Don't start dequeuing
-default:
-go a.asyncDequeue() // safe shutdown can nanny this fine
-}
+// TODO assert that agent doesn't get started for API nodes up above ?
+go a.asyncDequeue() // safe shutdown can nanny this fine
return a
}
+// TODO shuffle this around somewhere else (maybe)
+func (a *agent) Enqueue(ctx context.Context, call *models.Call) error {
+return a.da.Enqueue(ctx, call)
+}
func (a *agent) Close() error {
select {
case <-a.shutdown:
@@ -191,10 +176,6 @@ func transformTimeout(e error, isRetriable bool) error {
}
func (a *agent) Submit(callI Call) error {
-if a.tp == AgentTypeAPI {
-return errors.New("API agent cannot execute calls")
-}
a.wg.Add(1)
defer a.wg.Done()

View File

@@ -49,7 +49,7 @@ func TestCallConfigurationRequest(t *testing.T) {
}, nil,
)
-a := New(ds, ds, new(mqs.Mock), AgentTypeFull)
+a := New(NewCachedDataAccess(NewDirectDataAccess(ds, ds, new(mqs.Mock))))
defer a.Close()
w := httptest.NewRecorder()
@@ -247,7 +247,7 @@ func TestCallConfigurationModel(t *testing.T) {
// FromModel doesn't need a datastore, for now...
ds := datastore.NewMockInit(nil, nil, nil)
-a := New(ds, ds, new(mqs.Mock), AgentTypeFull)
+a := New(NewCachedDataAccess(NewDirectDataAccess(ds, ds, new(mqs.Mock))))
defer a.Close()
callI, err := a.GetCall(FromModel(cm))
@@ -353,7 +353,7 @@ func TestSubmitError(t *testing.T) {
// FromModel doesn't need a datastore, for now...
ds := datastore.NewMockInit(nil, nil, nil)
-a := New(ds, ds, new(mqs.Mock), AgentTypeFull)
+a := New(NewCachedDataAccess(NewDirectDataAccess(ds, ds, new(mqs.Mock))))
defer a.Close()
callI, err := a.GetCall(FromModel(cm))

View File

@@ -4,6 +4,7 @@ import (
"context"
"time"
"github.com/fnproject/fn/api/models"
"github.com/sirupsen/logrus"
)
@@ -11,6 +12,10 @@ func (a *agent) asyncDequeue() {
a.wg.Add(1)
defer a.wg.Done() // we can treat this thread like one big task and get safe shutdown for free
+// this is just so we can hang up the dequeue request if we get shut down
+ctx, cancel := context.WithCancel(context.Background())
+defer cancel()
for {
select {
case <-a.shutdown:
@@ -22,42 +27,64 @@ func (a *agent) asyncDequeue() {
// out of RAM so..
}
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) // TODO ???
model, err := a.da.Dequeue(ctx)
cancel()
if err != nil || model == nil {
if err != nil {
logrus.WithError(err).Error("error fetching queued calls")
// we think we can get a cookie now, so go get a cookie
select {
case <-a.shutdown:
return
case model, ok := <-a.asyncChew(ctx):
if ok {
a.wg.Add(1) // need to add 1 in this thread to ensure safe shutdown
go func(model *models.Call) {
a.asyncRun(model)
a.wg.Done() // can shed it after this is done, Submit will add 1 too but it's fine
}(model)
}
time.Sleep(1 * time.Second) // backoff a little
continue
}
// TODO output / logger should be here too...
a.wg.Add(1) // need to add 1 in this thread to ensure safe shutdown
go func() {
defer a.wg.Done() // can shed it after this is done, Submit will add 1 too but it's fine
call, err := a.GetCall(FromModel(model))
if err != nil {
logrus.WithError(err).Error("error getting async call")
return
}
// TODO if the task is cold and doesn't require reading STDIN, it could
// run but we may not listen for output since the task timed out. these
// are at least once semantics, which is really preferable to at most
// once, so let's do it for now
err = a.Submit(call)
if err != nil {
// NOTE: these could be errors / timeouts from the call that we're
// logging here (i.e. not our fault), but it's likely better to log
// these than suppress them so...
id := call.Model().ID
logrus.WithFields(logrus.Fields{"id": id}).WithError(err).Error("error running async call")
}
}()
}
}
func (a *agent) asyncChew(ctx context.Context) <-chan *models.Call {
ch := make(chan *models.Call, 1)
go func() {
ctx, cancel := context.WithTimeout(ctx, 60*time.Second)
defer cancel()
call, err := a.da.Dequeue(ctx)
if call != nil {
ch <- call
} else { // call is nil / error
if err != nil && err != context.DeadlineExceeded {
logrus.WithError(err).Error("error fetching queued calls")
}
// queue may be empty / unavailable
time.Sleep(1 * time.Second) // backoff a little before sending no cookie message
close(ch)
}
}()
return ch
}
func (a *agent) asyncRun(model *models.Call) {
// TODO output / logger should be here too...
call, err := a.GetCall(FromModel(model))
if err != nil {
logrus.WithError(err).Error("error getting async call")
return
}
// TODO if the task is cold and doesn't require reading STDIN, it could
// run but we may not listen for output since the task timed out. these
// are at least once semantics, which is really preferable to at most
// once, so let's do it for now
err = a.Submit(call)
if err != nil {
// NOTE: these could be errors / timeouts from the call that we're
// logging here (i.e. not our fault), but it's likely better to log
// these than suppress them so...
id := call.Model().ID
logrus.WithFields(logrus.Fields{"id": id}).WithError(err).Error("error running async call")
}
}

View File

@@ -350,6 +350,9 @@ func (c *call) End(ctx context.Context, errIn error) error {
// note: Not returning err here since the job could have already finished successfully.
}
+// NOTE call this after InsertLog or the buffer will get reset
+c.stderr.Close()
if err := c.ct.fireAfterCall(ctx, c.Model()); err != nil {
return fmt.Errorf("AfterCall: %v", err)
}

View File

@@ -2,9 +2,13 @@ package agent
import (
"context"
"github.com/fnproject/fn/api/common"
"github.com/fnproject/fn/api/models"
"io"
"time"
"github.com/fnproject/fn/api/common"
"github.com/fnproject/fn/api/common/singleflight"
"github.com/fnproject/fn/api/models"
"github.com/patrickmn/go-cache"
)
// DataAccess abstracts the datastore and message queue operations done by the
@@ -19,7 +23,7 @@ type DataAccess interface {
GetRoute(ctx context.Context, appName string, routePath string) (*models.Route, error)
// Enqueue will add a Call to the queue (ultimately forwards to mq.Push).
-Enqueue(ctx context.Context, mCall *models.Call) (*models.Call, error)
+Enqueue(ctx context.Context, mCall *models.Call) error
// Dequeue will query the queue for the next available Call that can be run
// by this Agent, and reserve it (ultimately forwards to mq.Reserve).
@@ -31,7 +35,71 @@ type DataAccess interface {
// Finish will notify the system that the Call has been processed, and
// fulfill the reservation in the queue if the call came from a queue.
-Finish(ctx context.Context, mCall *models.Call, stderr io.ReadWriteCloser, async bool) error
+Finish(ctx context.Context, mCall *models.Call, stderr io.Reader, async bool) error
}
// CachedDataAccess wraps a DataAccess and caches the results of GetApp and GetRoute.
type CachedDataAccess struct {
DataAccess
cache *cache.Cache
singleflight singleflight.SingleFlight
}
func NewCachedDataAccess(da DataAccess) DataAccess {
cda := &CachedDataAccess{
DataAccess: da,
cache: cache.New(5*time.Second, 1*time.Minute),
}
return cda
}
func routeCacheKey(appname, path string) string {
return "r:" + appname + "\x00" + path
}
func appCacheKey(appname string) string {
return "a:" + appname
}
func (da *CachedDataAccess) GetApp(ctx context.Context, appName string) (*models.App, error) {
key := appCacheKey(appName)
app, ok := da.cache.Get(key)
if ok {
return app.(*models.App), nil
}
resp, err := da.singleflight.Do(key,
func() (interface{}, error) {
return da.DataAccess.GetApp(ctx, appName)
})
if err != nil {
return nil, err
}
app = resp.(*models.App)
da.cache.Set(key, app, cache.DefaultExpiration)
return app.(*models.App), nil
}
func (da *CachedDataAccess) GetRoute(ctx context.Context, appName string, routePath string) (*models.Route, error) {
key := routeCacheKey(appName, routePath)
r, ok := da.cache.Get(key)
if ok {
return r.(*models.Route), nil
}
resp, err := da.singleflight.Do(key,
func() (interface{}, error) {
return da.DataAccess.GetRoute(ctx, appName, routePath)
})
if err != nil {
return nil, err
}
r = resp.(*models.Route)
da.cache.Set(key, r, cache.DefaultExpiration)
return r.(*models.Route), nil
}
type directDataAccess struct {
@@ -57,8 +125,9 @@ func (da *directDataAccess) GetRoute(ctx context.Context, appName string, routeP
return da.ds.GetRoute(ctx, appName, routePath)
}
-func (da *directDataAccess) Enqueue(ctx context.Context, mCall *models.Call) (*models.Call, error) {
-return da.mq.Push(ctx, mCall)
+func (da *directDataAccess) Enqueue(ctx context.Context, mCall *models.Call) error {
+_, err := da.mq.Push(ctx, mCall)
+return err
// TODO: Insert a call in the datastore with the 'queued' state
}
@@ -77,7 +146,7 @@ func (da *directDataAccess) Start(ctx context.Context, mCall *models.Call) error
return da.mq.Delete(ctx, mCall)
}
-func (da *directDataAccess) Finish(ctx context.Context, mCall *models.Call, stderr io.ReadWriteCloser, async bool) error {
+func (da *directDataAccess) Finish(ctx context.Context, mCall *models.Call, stderr io.Reader, async bool) error {
// this means that we could potentially store an error / timeout status for a
// call that ran successfully [by a user's perspective]
// TODO: this should be update, really
@@ -90,8 +159,6 @@ func (da *directDataAccess) Finish(ctx context.Context, mCall *models.Call, stde
common.Logger(ctx).WithError(err).Error("error uploading log")
// note: Not returning err here since the job could have already finished successfully.
}
-// NOTE call this after InsertLog or the buffer will get reset
-stderr.Close()
if async {
// XXX (reed): delete MQ message, eventually

api/agent/hybrid/client.go (new file, 241 lines)
View File

@@ -0,0 +1,241 @@
package hybrid
import (
"bytes"
"context"
"crypto/tls"
"encoding/json"
"errors"
"io"
"io/ioutil"
"net"
"net/http"
"net/url"
"strings"
"time"
"github.com/fnproject/fn/api/agent"
"github.com/fnproject/fn/api/common"
"github.com/fnproject/fn/api/models"
opentracing "github.com/opentracing/opentracing-go"
)
// client implements agent.DataAccess
type client struct {
base string
http *http.Client
}
func NewClient(u string) (agent.DataAccess, error) {
uri, err := url.Parse(u)
if err != nil {
return nil, err
}
if uri.Host == "" {
return nil, errors.New("no host specified for client")
}
if uri.Scheme == "" {
uri.Scheme = "http"
}
host := uri.Scheme + "://" + uri.Host + "/v1/"
httpClient := &http.Client{
Timeout: 60 * time.Second,
Transport: &http.Transport{
Proxy: http.ProxyFromEnvironment,
Dial: (&net.Dialer{
Timeout: 30 * time.Second,
KeepAlive: 30 * time.Second,
}).Dial,
MaxIdleConns: 512,
MaxIdleConnsPerHost: 128,
IdleConnTimeout: 90 * time.Second,
TLSHandshakeTimeout: 10 * time.Second,
TLSClientConfig: &tls.Config{
ClientSessionCache: tls.NewLRUClientSessionCache(8096),
},
ExpectContinueTimeout: 1 * time.Second,
},
}
return &client{
base: host,
http: httpClient,
}, nil
}
func (cl *client) Enqueue(ctx context.Context, c *models.Call) error {
span, ctx := opentracing.StartSpanFromContext(ctx, "hybrid_client_enqueue")
defer span.Finish()
err := cl.do(ctx, c, nil, "PUT", "runner", "async")
return err
}
func (cl *client) Dequeue(ctx context.Context) (*models.Call, error) {
span, ctx := opentracing.StartSpanFromContext(ctx, "hybrid_client_dequeue")
defer span.Finish()
var c struct {
C []*models.Call `json:"calls"`
}
err := cl.do(ctx, nil, &c, "GET", "runner", "async")
if len(c.C) > 0 {
return c.C[0], nil
}
return nil, err
}
func (cl *client) Start(ctx context.Context, c *models.Call) error {
span, ctx := opentracing.StartSpanFromContext(ctx, "hybrid_client_start")
defer span.Finish()
err := cl.do(ctx, c, nil, "POST", "runner", "start")
return err
}
func (cl *client) Finish(ctx context.Context, c *models.Call, r io.Reader, async bool) error {
span, ctx := opentracing.StartSpanFromContext(ctx, "hybrid_client_end")
defer span.Finish()
var b bytes.Buffer // TODO pool / we should multipart this?
_, err := io.Copy(&b, r)
if err != nil {
return err
}
bod := struct {
C *models.Call `json:"call"`
L string `json:"log"`
}{
C: c,
L: b.String(),
}
// TODO add async bit to query params or body
err = cl.do(ctx, bod, nil, "POST", "runner", "finish")
return err
}
func (cl *client) GetApp(ctx context.Context, appName string) (*models.App, error) {
span, ctx := opentracing.StartSpanFromContext(ctx, "hybrid_client_get_app")
defer span.Finish()
var a struct {
A models.App `json:"app"`
}
err := cl.do(ctx, nil, &a, "GET", "apps", appName)
return &a.A, err
}
func (cl *client) GetRoute(ctx context.Context, appName, route string) (*models.Route, error) {
span, ctx := opentracing.StartSpanFromContext(ctx, "hybrid_client_get_route")
defer span.Finish()
// TODO trim prefix is pretty odd here eh?
var r struct {
R models.Route `json:"route"`
}
err := cl.do(ctx, nil, &r, "GET", "apps", appName, "routes", strings.TrimPrefix(route, "/"))
return &r.R, err
}
type httpErr struct {
code int
error
}
func (cl *client) do(ctx context.Context, request, result interface{}, method string, url ...string) error {
// TODO determine policy (should we count to infinity?)
var b common.Backoff
var err error
for i := 0; i < 5; i++ {
select {
case <-ctx.Done():
return ctx.Err()
default:
}
// TODO this isn't re-using buffers very efficiently, but retries should be rare...
err = cl.once(ctx, request, result, method, url...)
switch err := err.(type) {
case nil:
return err
case *httpErr:
if err.code < 500 {
return err
}
// retry 500s...
default:
// this error wasn't from us [most likely], probably a conn refused/timeout, just retry it out
}
common.Logger(ctx).WithError(err).Error("error from API server, retrying")
b.Sleep(ctx)
}
// return last error
return err
}
func (cl *client) once(ctx context.Context, request, result interface{}, method string, url ...string) error {
span, ctx := opentracing.StartSpanFromContext(ctx, "hybrid_client_http_do")
defer span.Finish()
var b bytes.Buffer // TODO pool
if request != nil {
err := json.NewEncoder(&b).Encode(request)
if err != nil {
return err
}
}
req, err := http.NewRequest(method, cl.url(url...), &b)
if err != nil {
return err
}
req = req.WithContext(ctx)
// shove the span headers in so that the server will continue this span
opentracing.GlobalTracer().Inject(
span.Context(),
opentracing.HTTPHeaders,
opentracing.HTTPHeadersCarrier(req.Header))
resp, err := cl.http.Do(req)
if err != nil {
return err
}
defer func() { io.Copy(ioutil.Discard, resp.Body); resp.Body.Close() }()
if resp.StatusCode >= 300 {
// one of our errors
var msg struct {
Err *struct {
Msg string `json:"message"`
} `json:"error"`
}
// copy into a buffer in case it wasn't from us
var b bytes.Buffer
io.Copy(&b, resp.Body)
json.Unmarshal(b.Bytes(), &msg)
if msg.Err != nil {
return &httpErr{code: resp.StatusCode, error: errors.New(msg.Err.Msg)}
}
return &httpErr{code: resp.StatusCode, error: errors.New(b.String())}
}
if result != nil {
err := json.NewDecoder(resp.Body).Decode(&result)
if err != nil {
return err
}
}
return nil
}
func (cl *client) url(args ...string) string {
return cl.base + strings.Join(args, "/")
}