From 4af53025d8511fa2fb0d0f8e56399a7f33430068 Mon Sep 17 00:00:00 2001 From: Tolga Ceylan Date: Tue, 5 Jun 2018 14:41:13 -0700 Subject: [PATCH] fn: lb-agent: Initial TryCall result can be retriable. (#1035) Before this change, we assumed that data might end up in a container once we placed a TryCall(), and so we did not retry if the gRPC send failed. However, a send failure cannot result in data in a container, since pure-runner can schedule a call into a container only upon successful receipt of a TryCall. Here we trust gRPC: if the gRPC layer says it could not send a message, then the receiver did not receive it. --- api/agent/runner_client.go | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/api/agent/runner_client.go b/api/agent/runner_client.go index 6e54744f6..98213128c 100644 --- a/api/agent/runner_client.go +++ b/api/agent/runner_client.go @@ -123,17 +123,19 @@ func (r *gRPCRunner) TryExec(ctx context.Context, call pool.RunnerCall) (bool, e return false, err } - // After this point, we assume "COMMITTED" unless pure runner - // send explicit NACK err = runnerConnection.Send(&pb.ClientMsg{Body: &pb.ClientMsg_Try{Try: &pb.TryCall{ ModelsCallJson: string(modelJSON), SlotHashId: hex.EncodeToString([]byte(call.SlotHashId())), }}}) if err != nil { logrus.WithError(err).Error("Failed to send message to runner node") - return true, err + // Try on next runner + return false, err } + // After this point TryCall was sent, we assume "COMMITTED" unless pure runner + // send explicit NACK + recvDone := make(chan error, 1) go receiveFromRunner(runnerConnection, call, recvDone)