fn: locked mutex while blocked on I/O considered harmful (#935)

* fn: holding a mutex while waiting on I/O considered harmful

*) Removed the cases where a mutex was held while waiting on
I/O; these included possible disk I/O and network I/O. (The
replacement pattern is sketched below.)
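
To illustrate the pattern, a minimal sketch under assumed types;
`pool` and `member` here are hypothetical stand-ins, not the actual
fn types. The lock is taken only to detach shared state, and the
blocking Close calls run with the lock released:

package main

import (
    "fmt"
    "sync"
    "time"
)

type member struct{ addr string }

// Close simulates blocking network I/O.
func (m *member) Close() error {
    time.Sleep(100 * time.Millisecond)
    return nil
}

type pool struct {
    mtx     sync.Mutex
    closed  bool
    members []*member
}

// detach snapshots and clears the member list under the lock;
// no I/O happens while the mutex is held.
func (p *pool) detach() []*member {
    p.mtx.Lock()
    defer p.mtx.Unlock()
    if p.closed {
        return nil
    }
    p.closed = true
    detached := p.members
    p.members = nil
    return detached
}

// Shutdown closes the detached members outside the lock, so other
// goroutines are never blocked behind the I/O.
func (p *pool) Shutdown() error {
    var retErr error
    for _, m := range p.detach() {
        if err := m.Close(); err != nil && retErr == nil {
            retErr = err // keep the first error only
        }
    }
    return retErr
}

func main() {
    p := &pool{members: []*member{{addr: "127.0.0.1:9190"}}}
    fmt.Println(p.Shutdown()) // <nil>; a second Shutdown is a no-op
}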

*) Changed the error/context semantics of Close/Shutdown, since
the context timeout and the comments were misleading. Close now
always waits for pending gRPC sessions to complete. The context
here was used merely to 'wait up to x secs to report an error',
and that error was only logged anyway. Instead, the runner can
log the error itself, and the context can still be passed around,
perhaps for future OpenCensus instrumentation. (A sketch of the
revised contract follows.)
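
A minimal sketch of the revised Close contract; `runner` and `conn`
are illustrative stand-ins, not fn's gRPCRunner. There is no
goroutine racing ctx.Done(): Close drains in-flight work and returns
the real close error for the caller to log:

package main

import (
    "errors"
    "fmt"
    "sync"
)

type conn interface{ Close() error }

type fakeConn struct{}

func (fakeConn) Close() error { return errors.New("close failed") }

type runner struct {
    wg   sync.WaitGroup // tracks in-flight requests
    conn conn
}

// Close always waits for in-flight work to finish, then closes the
// connection and returns its error directly; the caller decides how
// (or whether) to log it.
func (r *runner) Close() error {
    r.wg.Wait()
    return r.conn.Close()
}

func main() {
    r := &runner{conn: fakeConn{}}
    fmt.Println(r.Close()) // prints "close failed"
}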
Tolga Ceylan
2018-04-13 11:23:29 -07:00
committed by GitHub
parent 8a35c9876a
commit c0ee3ce736
8 changed files with 224 additions and 103 deletions

@@ -23,7 +23,6 @@ const (
     noCapacityWaitInterval = 1 * time.Second
     // amount of time to wait to place a request on a runner
     placementTimeout = 15 * time.Second
-    runnerPoolShutdownTimeout = 5 * time.Second
 )

 type lbAgent struct {

@@ -118,9 +117,7 @@ func (a *lbAgent) Close() error {
     ch := a.shutWg.CloseGroupNB()

     // finally shutdown the runner pool
-    ctx, cancel := context.WithTimeout(context.Background(), runnerPoolShutdownTimeout)
-    defer cancel()
-    err := a.rp.Shutdown(ctx)
+    err := a.rp.Shutdown(context.Background())
     if err != nil {
         logrus.WithError(err).Warn("Runner pool shutdown error")
     }

@@ -105,7 +105,7 @@ func (r *mockRunner) TryExec(ctx context.Context, call pool.RunnerCall) (bool, e
     return true, nil
 }

-func (r *mockRunner) Close(ctx context.Context) error {
+func (r *mockRunner) Close(context.Context) error {
     go func() {
         r.wg.Wait()
     }()

@@ -42,22 +42,9 @@ func SecureGRPCRunnerFactory(addr, runnerCertCN string, pki *pool.PKIData) (pool
     }, nil
 }

-// Close waits until the context is closed for all inflight requests
-// to complete prior to terminating the underlying grpc connection
-func (r *gRPCRunner) Close(ctx context.Context) error {
-    err := make(chan error, 1)
-    go func() {
-        defer close(err)
-        r.shutWg.CloseGroup()
-        err <- r.conn.Close()
-    }()
-
-    select {
-    case e := <-err:
-        return e
-    case <-ctx.Done():
-        return ctx.Err() // context timed out while waiting
-    }
+func (r *gRPCRunner) Close(context.Context) error {
+    r.shutWg.CloseGroup()
+    return r.conn.Close()
 }

 func runnerConnection(address, runnerCertCN string, pki *pool.PKIData) (*grpc.ClientConn, pb.RunnerProtocolClient, error) {

@@ -2,15 +2,16 @@ package agent

 import (
     "context"
+    "errors"
     "sync"
-    "time"

     pool "github.com/fnproject/fn/api/runnerpool"

     "github.com/sirupsen/logrus"
 )

-const (
-    staticPoolShutdownTimeout = 5 * time.Second
+var (
+    ErrorPoolClosed       = errors.New("Runner pool closed")
+    ErrorPoolRunnerExists = errors.New("Runner already exists")
 )

 // manages a single set of runners ignoring lb groups

@@ -20,6 +21,7 @@ type staticRunnerPool struct {
     runnerCN string
     rMtx     *sync.RWMutex
     runners  []pool.Runner
+    isClosed bool
 }

 func DefaultStaticRunnerPool(runnerAddresses []string) pool.RunnerPool {

@@ -32,7 +34,7 @@ func NewStaticRunnerPool(runnerAddresses []string, pki *pool.PKIData, runnerCN s
     for _, addr := range runnerAddresses {
         r, err := runnerFactory(addr, runnerCN, pki)
         if err != nil {
-            logrus.WithField("runner_addr", addr).Warn("Invalid runner")
+            logrus.WithError(err).WithField("runner_addr", addr).Warn("Invalid runner")
             continue
         }
         logrus.WithField("runner_addr", addr).Debug("Adding runner to pool")

@@ -47,87 +49,112 @@ func NewStaticRunnerPool(runnerAddresses []string, pki *pool.PKIData, runnerCN s
     }
 }

-func (rp *staticRunnerPool) Runners(call pool.RunnerCall) ([]pool.Runner, error) {
-    rp.rMtx.RLock()
-    defer rp.rMtx.RUnlock()
-
-    r := make([]pool.Runner, len(rp.runners))
-    copy(r, rp.runners)
-    return r, nil
-}
-
-func (rp *staticRunnerPool) AddRunner(address string) error {
+func (rp *staticRunnerPool) shutdown() []pool.Runner {
     rp.rMtx.Lock()
     defer rp.rMtx.Unlock()

-    r, err := rp.generator(address, rp.runnerCN, rp.pki)
-    if err != nil {
-        logrus.WithField("runner_addr", address).Warn("Failed to add runner")
-        return err
-    }
-    rp.runners = append(rp.runners, r)
-    return nil
-}
-
-func (rp *staticRunnerPool) RemoveRunner(address string) {
+    if rp.isClosed {
+        return nil
+    }
+    rp.isClosed = true
+
+    toRemove := rp.runners[:]
+    rp.runners = nil
+    return toRemove
+}
+
+func (rp *staticRunnerPool) addRunner(runner pool.Runner) error {
     rp.rMtx.Lock()
     defer rp.rMtx.Unlock()

-    ctx, cancel := context.WithTimeout(context.Background(), staticPoolShutdownTimeout)
-    defer cancel()
+    if rp.isClosed {
+        return ErrorPoolClosed
+    }
+
+    for _, r := range rp.runners {
+        if r.Address() == runner.Address() {
+            return ErrorPoolRunnerExists
+        }
+    }
+
+    rp.runners = append(rp.runners, runner)
+    return nil
+}
+
+func (rp *staticRunnerPool) removeRunner(address string) pool.Runner {
+    rp.rMtx.Lock()
+    defer rp.rMtx.Unlock()

     for i, r := range rp.runners {
         if r.Address() == address {
-            err := r.Close(ctx)
-            if err != nil {
-                logrus.WithError(err).WithField("runner_addr", r.Address()).Error("Failed to close runner")
-            }
-            // delete runner from list
             rp.runners = append(rp.runners[:i], rp.runners[i+1:]...)
-            return
+            return r
         }
     }
+    return nil
 }

-// Shutdown blocks waiting for all runners to close, or until ctx is done
-func (rp *staticRunnerPool) Shutdown(ctx context.Context) (e error) {
-    rp.rMtx.Lock()
-    defer rp.rMtx.Unlock()
-
-    ctx, cancel := context.WithTimeout(context.Background(), staticPoolShutdownTimeout)
-    defer cancel()
-
-    errors := make(chan error, len(rp.runners))
-    var wg sync.WaitGroup
-    for _, r := range rp.runners {
-        wg.Add(1)
-        go func(runner pool.Runner) {
-            defer wg.Done()
-            err := runner.Close(ctx)
-            if err != nil {
-                logrus.WithError(err).WithField("runner_addr", runner.Address()).Error("Failed to close runner")
-                errors <- err
-            }
-        }(r)
-    }
-
-    done := make(chan interface{})
-    go func() {
-        defer close(done)
-        wg.Wait()
-    }()
-
-    select {
-    case <-done:
-        close(errors)
-        for e := range errors {
-            // return the first error
-            if e != nil {
-                return e
-            }
-        }
-        return nil
-    case <-ctx.Done():
-        return ctx.Err() // context timed out while waiting
+func (rp *staticRunnerPool) getRunners() ([]pool.Runner, error) {
+    rp.rMtx.RLock()
+    defer rp.rMtx.RUnlock()
+
+    if rp.isClosed {
+        return nil, ErrorPoolClosed
+    }
+
+    r := make([]pool.Runner, len(rp.runners))
+    copy(r, rp.runners)
+    return r, nil
+}
+
+func (rp *staticRunnerPool) Runners(call pool.RunnerCall) ([]pool.Runner, error) {
+    return rp.getRunners()
+}
+
+func (rp *staticRunnerPool) AddRunner(address string) error {
+    r, err := rp.generator(address, rp.runnerCN, rp.pki)
+    if err != nil {
+        logrus.WithError(err).WithField("runner_addr", address).Warn("Failed to add runner")
+        return err
+    }
+
+    err = rp.addRunner(r)
+    if err != nil {
+        err2 := r.Close(context.Background())
+        if err2 != nil {
+            logrus.WithError(err2).WithField("runner_addr", address).Warn("Error closing runner on AddRunner failure")
+        }
+    }
+    return err
+}
+
+func (rp *staticRunnerPool) RemoveRunner(address string) {
+    toRemove := rp.removeRunner(address)
+    if toRemove == nil {
+        return
+    }
+
+    err := toRemove.Close(context.Background())
+    if err != nil {
+        logrus.WithError(err).WithField("runner_addr", toRemove.Address()).Error("Error closing runner")
+    }
+}
+
+func (rp *staticRunnerPool) Shutdown(ctx context.Context) error {
+    toRemove := rp.shutdown()
+
+    var retErr error
+    for _, r := range toRemove {
+        err := r.Close(ctx)
+        if err != nil {
+            logrus.WithError(err).WithField("runner_addr", r.Address()).Error("Error closing runner")
+            // grab the first error only for now.
+            if retErr == nil {
+                retErr = err
+            }
+        }
+    }
+    return retErr
 }

@@ -2,6 +2,7 @@ package agent

 import (
     "context"
+    "errors"
     "testing"

     pool "github.com/fnproject/fn/api/runnerpool"

@@ -11,6 +12,10 @@ func setupStaticPool(runners []string) pool.RunnerPool {
     return NewStaticRunnerPool(runners, nil, "", mockRunnerFactory)
 }

+var (
+    ErrorGarbanzoBeans = errors.New("yes, that's right. Garbanzo beans...")
+)
+
 type mockStaticRunner struct {
     address string
 }

@@ -19,8 +24,8 @@ func (r *mockStaticRunner) TryExec(ctx context.Context, call pool.RunnerCall) (b
     return true, nil
 }

-func (r *mockStaticRunner) Close(ctx context.Context) error {
-    return nil
+func (r *mockStaticRunner) Close(context.Context) error {
+    return ErrorGarbanzoBeans
 }

 func (r *mockStaticRunner) Address() string {

@@ -44,10 +49,57 @@ func TestNewStaticPool(t *testing.T) {
     }
 }

+func TestEmptyPool(t *testing.T) {
+    np := setupStaticPool(nil).(*staticRunnerPool)
+
+    runners, err := np.Runners(nil)
+    if err != nil {
+        t.Fatalf("Failed to list runners %v", err)
+    }
+    if len(runners) != 0 {
+        t.Fatalf("Invalid number of runners %v", len(runners))
+    }
+
+    err = np.AddRunner("127.0.0.1:8082")
+    if err != nil {
+        t.Fatalf("Failed to add runner %v", err)
+    }
+
+    runners, err = np.Runners(nil)
+    if err != nil {
+        t.Fatalf("Failed to list runners %v", err)
+    }
+    if len(runners) != 1 {
+        t.Fatalf("Invalid number of runners %v", len(runners))
+    }
+
+    err = np.Shutdown(context.Background())
+    if err != ErrorGarbanzoBeans {
+        t.Fatalf("Expected garbanzo beans error from shutdown %v", err)
+    }
+
+    runners, err = np.Runners(nil)
+    if err == nil {
+        t.Fatalf("Should fail to list runners (shutdown)")
+    }
+    if len(runners) != 0 {
+        t.Fatalf("Invalid number of runners %v", len(runners))
+    }
+}
+
 func TestAddNodeToPool(t *testing.T) {
     addrs := []string{"127.0.0.1:8080", "127.0.0.1:8081"}
     np := setupStaticPool(addrs).(*staticRunnerPool)
-    np.AddRunner("127.0.0.1:8082")
+
+    err := np.AddRunner("127.0.0.1:8082")
+    if err != nil {
+        t.Fatalf("Add Should not fail %v", err)
+    }
+
+    err = np.AddRunner("127.0.0.1:8082")
+    if err != ErrorPoolRunnerExists {
+        t.Fatalf("Add Should fail since duplicate %v", err)
+    }

     runners, err := np.Runners(nil)
     if err != nil {

@@ -56,11 +108,25 @@ func TestAddNodeToPool(t *testing.T) {
     if len(runners) != 3 {
         t.Fatalf("Invalid number of runners %v", len(runners))
     }
+
+    err = np.Shutdown(context.Background())
+    if err != ErrorGarbanzoBeans {
+        t.Fatalf("Expected garbanzo beans error from shutdown %v", err)
+    }
+
+    runners, err = np.Runners(nil)
+    if err == nil {
+        t.Fatalf("Should fail to list runners (shutdown)")
+    }
+    if len(runners) != 0 {
+        t.Fatalf("Invalid number of runners %v", len(runners))
+    }
 }

 func TestRemoveNodeFromPool(t *testing.T) {
     addrs := []string{"127.0.0.1:8080", "127.0.0.1:8081"}
     np := setupStaticPool(addrs).(*staticRunnerPool)
+
     np.RemoveRunner("127.0.0.1:8081")

     runners, err := np.Runners(nil)

@@ -81,4 +147,42 @@ func TestRemoveNodeFromPool(t *testing.T) {
     if len(runners) != 1 {
         t.Fatalf("Invalid number of runners %v", len(runners))
     }
+
+    np.RemoveRunner("127.0.0.1:8080")
+
+    runners, err = np.Runners(nil)
+    if err != nil {
+        t.Fatalf("Failed to list runners %v", err)
+    }
+    if len(runners) != 0 {
+        t.Fatalf("Invalid number of runners %v", len(runners))
+    }
+
+    np.RemoveRunner("127.0.0.1:8080")
+
+    runners, err = np.Runners(nil)
+    if err != nil {
+        t.Fatalf("Failed to list runners %v", err)
+    }
+    if len(runners) != 0 {
+        t.Fatalf("Invalid number of runners %v", len(runners))
+    }
+
+    // Let's try a double shutdown
+    err = np.Shutdown(context.Background())
+    if err != nil {
+        t.Fatalf("Not expected error from shutdown I (empty pool) %v", err)
+    }
+    err = np.Shutdown(context.Background())
+    if err != nil {
+        t.Fatalf("Not expected error from shutdown II (empty pool) %v", err)
+    }
+
+    runners, err = np.Runners(nil)
+    if err == nil {
+        t.Fatalf("Should fail to list runners (shutdown)")
+    }
+    if len(runners) != 0 {
+        t.Fatalf("Invalid number of runners %v", len(runners))
+    }
 }

@@ -18,7 +18,7 @@ type Placer interface {

 // RunnerPool is the abstraction for getting an ordered list of runners to try for a call
 type RunnerPool interface {
     Runners(call RunnerCall) ([]Runner, error)
-    Shutdown(context.Context) error
+    Shutdown(ctx context.Context) error
 }

 // PKIData encapsulates TLS certificate data

@@ -14,7 +14,6 @@ import (
     "testing"
     "time"

-    "github.com/fnproject/fn/api/common"
     "github.com/fnproject/fn/api/server"
     "github.com/fnproject/fn_go/client"
     httptransport "github.com/go-openapi/runtime/client"

@@ -24,11 +23,7 @@ import (
 const lBytes = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

 func GetAPIURL() (string, *url.URL) {
-    apiURL := os.Getenv("FN_API_URL")
-    if apiURL == "" {
-        apiURL = "http://localhost:8080"
-    }
+    apiURL := getEnv("FN_API_URL", "http://localhost:8080")
     u, err := url.Parse(apiURL)
     if err != nil {
         log.Fatalf("Couldn't parse API URL: %s error: %s", apiURL, err)

@@ -79,12 +74,18 @@ func checkServer(ctx context.Context) error {
     return ctx.Err()
 }

+func getEnv(key, fallback string) string {
+    if value, ok := os.LookupEnv(key); ok {
+        return value
+    }
+    return fallback
+}
+
 func startServer() {
     getServer.Do(func() {
         ctx := context.Background()
-        common.SetLogLevel("fatal")

         timeString := time.Now().Format("2006_01_02_15_04_05")
         dbURL := os.Getenv(server.EnvDBURL)
         tmpDir := os.TempDir()

@@ -95,7 +96,12 @@ func startServer() {
         dbURL = fmt.Sprintf("sqlite3://%s", tmpDb)
     }

-    s = server.New(ctx, server.WithDBURL(dbURL), server.WithMQURL(mqURL), server.WithFullAgent())
+    s = server.New(ctx,
+        server.WithLogLevel(getEnv(server.EnvLogLevel, server.DefaultLogLevel)),
+        server.WithDBURL(dbURL),
+        server.WithMQURL(mqURL),
+        server.WithFullAgent(),
+    )

     go func() {
         s.Start(ctx)

@@ -114,7 +114,7 @@ func SetUpAPINode(ctx context.Context) (*server.Server, error) {
     opts := make([]server.ServerOption, 0)
     opts = append(opts, server.WithWebPort(8085))
     opts = append(opts, server.WithType(nodeType))
-    opts = append(opts, server.WithLogLevel(server.DefaultLogLevel))
+    opts = append(opts, server.WithLogLevel(getEnv(server.EnvLogLevel, server.DefaultLogLevel)))
     opts = append(opts, server.WithLogDest(server.DefaultLogDest, "API"))
     opts = append(opts, server.WithDBURL(getEnv(server.EnvDBURL, defaultDB)))
     opts = append(opts, server.WithMQURL(getEnv(server.EnvMQURL, defaultMQ)))

@@ -129,7 +129,7 @@ func SetUpLBNode(ctx context.Context) (*server.Server, error) {
     opts := make([]server.ServerOption, 0)
     opts = append(opts, server.WithWebPort(8081))
     opts = append(opts, server.WithType(nodeType))
-    opts = append(opts, server.WithLogLevel(server.DefaultLogLevel))
+    opts = append(opts, server.WithLogLevel(getEnv(server.EnvLogLevel, server.DefaultLogLevel)))
     opts = append(opts, server.WithLogDest(server.DefaultLogDest, "LB"))
     opts = append(opts, server.WithDBURL(""))
     opts = append(opts, server.WithMQURL(""))

@@ -194,7 +194,7 @@ func SetUpPureRunnerNode(ctx context.Context, nodeNum int) (*server.Server, erro
     opts = append(opts, server.WithWebPort(8082+nodeNum))
     opts = append(opts, server.WithGRPCPort(9190+nodeNum))
     opts = append(opts, server.WithType(nodeType))
-    opts = append(opts, server.WithLogLevel("debug"))
+    opts = append(opts, server.WithLogLevel(getEnv(server.EnvLogLevel, server.DefaultLogLevel)))
     opts = append(opts, server.WithLogDest(server.DefaultLogDest, "PURE-RUNNER"))
     opts = append(opts, server.WithDBURL(""))
     opts = append(opts, server.WithMQURL(""))