fn: agent/lb/runner error handling adjustments (#1214)

1) Early call validation and return due to cpu/mem impossible to meet (eg. request cpu/mem larger than max-mem or max-cpu on server) now emits HTTP Bad Request (400) instead of 503. This case is most likely due to client/service configuration and/or validation issue. 2) 'failed' metric is now removed. 'failed' versus 'errors' were too confusing. 'errors' is now a catch all error case. 3) new 'canceled' counter for client side cancels. 4) 'server_busy' now covers more cases than it previously did.
2022-10-28 21:29:17 +03:00 · 2018-09-14 16:50:14 -07:00
parent 75bd0d3414
commit aa13a40168
11 changed files with 115 additions and 172 deletions
--- a/api/agent/agent.go
+++ b/api/agent/agent.go
@@ -244,15 +244,17 @@ func (a *agent) Close() error {
 }

 func (a *agent) Submit(callI Call) error {
+	call := callI.(*call)
+	ctx, span := trace.StartSpan(call.req.Context(), "agent_submit")
+	defer span.End()
+
+	statsCalls(ctx)
+
 	if !a.shutWg.AddSession(1) {
+		statsTooBusy(ctx)
 		return models.ErrCallTimeoutServerBusy
 	}
-
-	call := callI.(*call)
-
-	ctx := call.req.Context()
-	ctx, span := trace.StartSpan(ctx, "agent_submit")
-	defer span.End()
+	defer a.shutWg.DoneSession()

 	err := a.submit(ctx, call)
 	return err
@@ -294,7 +296,8 @@ func (a *agent) submit(ctx context.Context, call *call) error {
 		return a.handleCallEnd(ctx, call, slot, err, false)
 	}

-	statsDequeueAndStart(ctx)
+	statsDequeue(ctx)
+	statsStartRun(ctx)

 	// We are about to execute the function, set container Exec Deadline (call.Timeout)
 	slotCtx, cancel := context.WithTimeout(ctx, time.Duration(call.Timeout)*time.Second)
@@ -314,52 +317,26 @@ func (a *agent) handleCallEnd(ctx context.Context, call *call, slot Slot, err er
 	// This means call was routed (executed)
 	if isStarted {
 		call.End(ctx, err)
-	}
-
-	handleStatsEnd(ctx, err)
-	a.shutWg.DoneSession()
-	return transformTimeout(err, !isStarted)
-}
-
-func transformTimeout(e error, isRetriable bool) error {
-	if e == context.DeadlineExceeded {
-		if isRetriable {
+		statsStopRun(ctx)
+		if err == nil {
+			statsComplete(ctx)
+		}
+	} else {
+		if err == CapacityFull || err == context.DeadlineExceeded {
+			statsTooBusy(ctx)
 			return models.ErrCallTimeoutServerBusy
 		}
-		return models.ErrCallTimeout
-	} else if e == CapacityFull {
-		return models.ErrCallTimeoutServerBusy
 	}
-	return e
-}

-// handleStatsDequeue handles stats for dequeuing for early exit (getSlot or Start)
-// cases. Only timeouts can be a simple dequeue while other cases are actual errors.
-func handleStatsDequeue(ctx context.Context, err error) {
 	if err == context.DeadlineExceeded {
-		statsDequeue(ctx)
-		statsTooBusy(ctx)
-	} else {
-		statsDequeueAndFail(ctx)
+		statsTimedout(ctx)
+		return models.ErrCallTimeout
+	} else if err == context.Canceled {
+		statsCanceled(ctx)
+	} else if err != nil {
 		statsErrors(ctx)
 	}
-}
-
-// handleStatsEnd handles stats for after a call is ran, depending on error.
-func handleStatsEnd(ctx context.Context, err error) {
-	if err == nil {
-		// decrement running count, increment completed count
-		statsComplete(ctx)
-	} else {
-		// decrement running count, increment failed count
-		statsFailed(ctx)
-		// increment the timeout or errors count, as appropriate
-		if err == context.DeadlineExceeded {
-			statsTimedout(ctx)
-		} else {
-			statsErrors(ctx)
-		}
-	}
+	return err
 }

 // getSlot returns a Slot (or error) for the request to run. Depending on hot/cold