fn: non-blocking resource tracker and notification (#841)

* fn: non-blocking resource tracker and notification

For some types of errors, we may want to notify
the actual caller when the error is tied 1-1 to
that request. If hotLauncher is triggered with a
signaller, we now send back an error notification
channel. This channel is passed to checkLaunch so
that it can send synchronous responses to the
caller that initiated the hot container launch.
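
A minimal sketch of that back-channel idea follows;
hotLaunchSketch, tryStart, and notify are illustrative
names, not the agent's actual hotLauncher/checkLaunch
plumbing:

// sketch only: a request-tied error is sent back on a
// notification channel instead of being logged and dropped.
package main

import (
    "context"
    "errors"
    "fmt"
)

// stands in for the tracker's CapacityFull error
var errNoCapacity = errors.New("max capacity reached")

func hotLaunchSketch(ctx context.Context, notify chan<- error) {
    if err := tryStart(ctx); err != nil {
        select {
        case notify <- err: // tell the caller that triggered this launch
        case <-ctx.Done():
        }
    }
}

// pretend the hot container failed to start
func tryStart(ctx context.Context) error { return errNoCapacity }

func main() {
    ctx := context.Background()
    notify := make(chan error, 1) // back communication channel from the caller
    go hotLaunchSketch(ctx, notify)
    fmt.Println(<-notify) // caller receives the synchronous error response
}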

This is useful when running the agent in
quick-fail mode: instead of waiting for CPU/memory
to become available, we prefer to fail quickly so
as not to hold up the caller. To support this, a
non-blocking resource tracker option and functions
are now available.
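
The quick-fail path can then look roughly like the
sketch below; tryReserve is an illustrative helper,
assumed to sit in the same package as the tracker so
that ResourceTracker, ResourceToken, CapacityFull,
and "context" are already in scope:

// tryReserve asks for resources without blocking: at capacity it
// returns an error right away instead of waiting for CPU/memory.
func tryReserve(ctx context.Context, rt ResourceTracker, memMB, cpuQuota uint64) (ResourceToken, error) {
    // isAsync=false (sync request), isNB=true (non-blocking check)
    select {
    case tok := <-rt.GetResourceToken(ctx, memMB, cpuQuota, false, true):
        if err := tok.Error(); err != nil {
            tok.Close()
            return nil, err // e.g. CapacityFull: fail fast, don't hold up the caller
        }
        return tok, nil // caller is responsible for Close()
    case <-ctx.Done():
        return nil, ctx.Err()
    }
}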

* fn: test env var rename tweak

* fn: fixup merge

* fn: rebase test fix

* fn: merge fixup

* fn: test tweak down to 70MB for 128MB total

* fn: refactor token creation and use broadcast regardless

* fn: nb description

* fn: bugfix
Tolga Ceylan
2018-04-24 21:59:33 -07:00
committed by Reed Allman
parent 51197d4985
commit 54ba49be65
6 changed files with 359 additions and 92 deletions

@@ -23,6 +23,8 @@ const (
Mem1GB = 1024 * 1024 * 1024
)
var CapacityFull = errors.New("max capacity reached")
// A simple resource (memory, cpu, disk, etc.) tracker for scheduling.
// TODO: add cpu, disk, network IO for future
type ResourceTracker interface {
@@ -34,8 +36,10 @@ type ResourceTracker interface {
// the channel will never receive anything. If it is not possible to fulfill this resource, the channel
// will never receive anything (use IsResourcePossible). If a resource token is available for the provided
// resource parameters, it will otherwise be sent once on the returned channel. The channel is never closed.
// Memory is expected to be provided in MB units.
GetResourceToken(ctx context.Context, memory, cpuQuota uint64, isAsync bool) <-chan ResourceToken
// if isNB is set, resource check is done and error token is returned without blocking.
// if isAsync is set, resource allocation specific for async requests is considered. (eg. always allow
// a sync only reserve area) Memory is expected to be provided in MB units.
GetResourceToken(ctx context.Context, memory, cpuQuota uint64, isAsync, isNB bool) <-chan ResourceToken
// IsResourcePossible returns whether it's possible to fulfill the requested resources on this
// machine. It must be called before GetResourceToken or GetResourceToken may hang.
@@ -88,13 +92,19 @@ func NewResourceTracker(cfg *AgentConfig) ResourceTracker {
type ResourceToken interface {
// Close must be called by any thread that receives a token.
io.Closer
Error() error
}
type resourceToken struct {
once sync.Once
err error
decrement func()
}
func (t *resourceToken) Error() error {
return t.err
}
func (t *resourceToken) Close() error {
t.once.Do(func() {
t.decrement()
@@ -140,10 +150,93 @@ func (a *resourceTracker) GetResourceTokenWaiterCount() uint64 {
return waiters
}
func (a *resourceTracker) allocResourcesLocked(memory, cpuQuota uint64, isAsync bool) ResourceToken {
var asyncMem, syncMem uint64
var asyncCPU, syncCPU uint64
if isAsync {
// async uses async pool only
asyncMem = memory
asyncCPU = cpuQuota
} else {
// if sync fits async + sync pool
syncMem = minUint64(a.ramSyncTotal-a.ramSyncUsed, memory)
syncCPU = minUint64(a.cpuSyncTotal-a.cpuSyncUsed, cpuQuota)
asyncMem = memory - syncMem
asyncCPU = cpuQuota - syncCPU
}
a.ramAsyncUsed += asyncMem
a.ramSyncUsed += syncMem
a.cpuAsyncUsed += asyncCPU
a.cpuSyncUsed += syncCPU
return &resourceToken{decrement: func() {
a.cond.L.Lock()
a.ramAsyncUsed -= asyncMem
a.ramSyncUsed -= syncMem
a.cpuAsyncUsed -= asyncCPU
a.cpuSyncUsed -= syncCPU
a.cond.L.Unlock()
// WARNING: yes, we wake up everyone even async waiters when only sync pool has space, but
// the cost of this spurious wake up is unlikely to impact much performance. Simpler
// to use one cond variable for the time being.
a.cond.Broadcast()
}}
}
func (a *resourceTracker) getResourceTokenNB(memory uint64, cpuQuota uint64, isAsync bool) ResourceToken {
if !a.IsResourcePossible(memory, cpuQuota, isAsync) {
return &resourceToken{decrement: func() {}, err: CapacityFull}
}
memory = memory * Mem1MB
var t ResourceToken
a.cond.L.Lock()
if !a.isResourceAvailableLocked(memory, cpuQuota, isAsync) {
t = &resourceToken{decrement: func() {}, err: CapacityFull}
} else {
t = a.allocResourcesLocked(memory, cpuQuota, isAsync)
}
a.cond.L.Unlock()
return t
}
func (a *resourceTracker) getResourceTokenNBChan(ctx context.Context, memory uint64, cpuQuota uint64, isAsync bool) <-chan ResourceToken {
ctx, span := trace.StartSpan(ctx, "agent_get_resource_token_nbio_chan")
ch := make(chan ResourceToken)
go func() {
defer span.End()
t := a.getResourceTokenNB(memory, cpuQuota, isAsync)
select {
case ch <- t:
case <-ctx.Done():
// if we can't send b/c nobody is waiting anymore, need to decrement here
t.Close()
}
}()
return ch
}
// the received token should be passed directly to launch (unconditionally), launch
// will close this token (i.e. the receiver should not call Close)
func (a *resourceTracker) GetResourceToken(ctx context.Context, memory uint64, cpuQuota uint64, isAsync bool) <-chan ResourceToken {
func (a *resourceTracker) GetResourceToken(ctx context.Context, memory uint64, cpuQuota uint64, isAsync, isNB bool) <-chan ResourceToken {
if isNB {
return a.getResourceTokenNBChan(ctx, memory, cpuQuota, isAsync)
}
ch := make(chan ResourceToken)
if !a.IsResourcePossible(memory, cpuQuota, isAsync) {
// return the channel, but never send anything.
return ch
@@ -186,43 +279,9 @@ func (a *resourceTracker) GetResourceToken(ctx context.Context, memory uint64, c
return
}
var asyncMem, syncMem uint64
var asyncCPU, syncCPU uint64
if isAsync {
// async uses async pool only
asyncMem = memory
asyncCPU = cpuQuota
} else {
// if sync fits async + sync pool
syncMem = minUint64(a.ramSyncTotal-a.ramSyncUsed, memory)
syncCPU = minUint64(a.cpuSyncTotal-a.cpuSyncUsed, cpuQuota)
asyncMem = memory - syncMem
asyncCPU = cpuQuota - syncCPU
}
a.ramAsyncUsed += asyncMem
a.ramSyncUsed += syncMem
a.cpuAsyncUsed += asyncCPU
a.cpuSyncUsed += syncCPU
t := a.allocResourcesLocked(memory, cpuQuota, isAsync)
c.L.Unlock()
t := &resourceToken{decrement: func() {
c.L.Lock()
a.ramAsyncUsed -= asyncMem
a.ramSyncUsed -= syncMem
a.cpuAsyncUsed -= asyncCPU
a.cpuSyncUsed -= syncCPU
c.L.Unlock()
// WARNING: yes, we wake up everyone even async waiters when only sync pool has space, but
// the cost of this spurious wake up is unlikely to impact much performance. Simpler
// to use one cond variable for the time being.
c.Broadcast()
}}
select {
case ch <- t:
case <-ctx.Done():