From 67e95e220189d3e95be16ef2cc67f7a9d92f6d05 Mon Sep 17 00:00:00 2001 From: Vincent Latombe Date: Thu, 24 Oct 2024 12:49:41 +0200 Subject: [PATCH 1/5] Refactor websocket connection logic We have observed cases where the websocket connection would fail temporarily with a handshake error that was not retried and causing the agent to fail. Also, in CloudBees CI HA, there are some cases when an agent is disconnected on purpose and we expect a reconnect attempt as soon as possible. The default 10 seconds delay appeared too long in that case. To reconcile this use case with the typical failure scenario, I have implemented exponential backoff for retries (immediate, 1 sec, 3 seconds, 7 seconds, 10 seconds). --- src/main/java/hudson/remoting/Engine.java | 136 +++++++++++++++++----- 1 file changed, 107 insertions(+), 29 deletions(-) diff --git a/src/main/java/hudson/remoting/Engine.java b/src/main/java/hudson/remoting/Engine.java index 78620b73c..aeac760d0 100644 --- a/src/main/java/hudson/remoting/Engine.java +++ b/src/main/java/hudson/remoting/Engine.java @@ -200,8 +200,6 @@ public Thread newThread(@NonNull final Runnable r) { private Duration noReconnectAfter; - private Instant firstAttempt; - /** * Determines whether the socket will have {@link Socket#setKeepAlive(boolean)} set or not. * @@ -785,31 +783,7 @@ public void closeRead() throws IOException { client.getProperties().put(ClientProperties.SSL_ENGINE_CONFIGURATOR, sslEngineConfigurator); } } - container.connectToServer( - new AgentEndpoint(), - ClientEndpointConfig.Builder.create() - .configurator(headerHandler) - .build(), - URI.create(wsUrl + "wsagents/")); - while (ch.get() == null) { - Thread.sleep(100); - } - this.protocolName = "WebSocket"; - events.status("Connected"); - ch.get().join(); - events.status("Terminated"); - if (noReconnect) { - return; - } - firstAttempt = Instant.now(); - events.onDisconnect(); - while (true) { - // TODO refactor various sleep statements into a common method - if (Util.shouldBailOut(firstAttempt, noReconnectAfter)) { - events.status("Bailing out after " + DurationFormatter.format(noReconnectAfter)); - return; - } - TimeUnit.SECONDS.sleep(10); + if (!succeedsWithRetries(() -> { // Unlike JnlpAgentEndpointResolver, we do not use $jenkins/tcpSlaveAgentListener/, as that will be // a 404 if the TCP port is disabled. URL ping = new URL(hudsonUrl, "login"); @@ -818,14 +792,39 @@ public void closeRead() throws IOException { int status = conn.getResponseCode(); conn.disconnect(); if (status == 200) { - break; + return true; } else { events.status(ping + " is not ready: " + status); } } catch (IOException x) { events.status(ping + " is not ready", x); } + return false; + })) { + return; + } + if (!succeedsWithRetries(() -> { + container.connectToServer( + new AgentEndpoint(), + ClientEndpointConfig.Builder.create() + .configurator(headerHandler) + .build(), + URI.create(wsUrl + "wsagents/")); + return true; + })) { + return; + } + while (ch.get() == null) { + Thread.sleep(100); + } + this.protocolName = "WebSocket"; + events.status("Connected"); + ch.get().join(); + events.status("Terminated"); + if (noReconnect) { + return; } + events.onDisconnect(); reconnect(); } } catch (Exception e) { @@ -833,6 +832,85 @@ public void closeRead() throws IOException { } } + /** + * Evaluates a condition with exponential backoff until it succeeds or the timeout is reached. + * @param condition the condition to attempt to succeed with exponential backoff + * @return true if the condition succeeded, false if the condition failed and the timeout was reached + * @throws InterruptedException if the thread was interrupted while waiting. + */ + private boolean succeedsWithRetries(SupplierThrowingException condition) throws InterruptedException { + var exponentialRetry = new ExponentialRetry(noReconnectAfter); + while (exponentialRetry != null) { + try { + if (condition.get()) { + return true; + } + } catch (Exception x) { + events.status("Failed to connect: " + x.getMessage()); + } + exponentialRetry = exponentialRetry.next(events); + } + return false; + } + + private static class ExponentialRetry { + final int factor; + final Instant beginning; + final Duration delay; + final Duration timeout; + final Duration incrementDelay; + final Duration maxDelay; + + ExponentialRetry(Duration timeout) { + this(Duration.ofSeconds(0), timeout, 2, Duration.ofSeconds(1), Duration.ofSeconds(10)); + } + + ExponentialRetry( + Duration initialDelay, Duration timeout, int factor, Duration incrementDelay, Duration maxDelay) { + this.beginning = Instant.now(); + this.delay = initialDelay; + this.timeout = timeout; + this.factor = factor; + this.incrementDelay = incrementDelay; + this.maxDelay = maxDelay; + } + + ExponentialRetry(ExponentialRetry previous) { + beginning = previous.beginning; + factor = previous.factor; + timeout = previous.timeout; + incrementDelay = previous.incrementDelay; + maxDelay = previous.maxDelay; + delay = min(maxDelay, previous.delay.multipliedBy(previous.factor).plus(incrementDelay)); + } + + private static Duration min(Duration a, Duration b) { + return a.compareTo(b) < 0 ? a : b; + } + + boolean timeoutExceeded() { + return Util.shouldBailOut(beginning, timeout); + } + + ExponentialRetry next(EngineListenerSplitter events) throws InterruptedException { + var next = new ExponentialRetry(this); + if (next.timeoutExceeded()) { + events.status("Bailing out after " + DurationFormatter.format(next.timeout)); + return null; + } else { + var delaySeconds = next.delay.toSeconds(); + events.status("Waiting " + delaySeconds + " seconds before retry"); + TimeUnit.SECONDS.sleep(delaySeconds); + } + return next; + } + } + + @FunctionalInterface + private interface SupplierThrowingException { + T get() throws Exception; + } + private void reconnect() { try { events.status("Performing onReconnect operation."); @@ -862,7 +940,7 @@ private void innerRun(IOHub hub, SSLContext context, ExecutorService service) { try { boolean first = true; - firstAttempt = Instant.now(); + var firstAttempt = Instant.now(); while (true) { if (first) { first = false; From 5953278173f9abf2098eb968e9e752dd17d52b2a Mon Sep 17 00:00:00 2001 From: Vincent Latombe Date: Thu, 24 Oct 2024 14:22:05 +0200 Subject: [PATCH 2/5] Spotbugs --- src/main/java/hudson/remoting/Engine.java | 43 +++++++++++++---------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/src/main/java/hudson/remoting/Engine.java b/src/main/java/hudson/remoting/Engine.java index aeac760d0..a3779afa7 100644 --- a/src/main/java/hudson/remoting/Engine.java +++ b/src/main/java/hudson/remoting/Engine.java @@ -591,7 +591,7 @@ public void run() { } @SuppressFBWarnings( - value = {"REC_CATCH_EXCEPTION", "URLCONNECTION_SSRF_FD"}, + value = {"REC_CATCH_EXCEPTION"}, justification = "checked exceptions were a mistake to begin with; connecting to Jenkins from agent") private void runWebSocket() { try { @@ -783,24 +783,7 @@ public void closeRead() throws IOException { client.getProperties().put(ClientProperties.SSL_ENGINE_CONFIGURATOR, sslEngineConfigurator); } } - if (!succeedsWithRetries(() -> { - // Unlike JnlpAgentEndpointResolver, we do not use $jenkins/tcpSlaveAgentListener/, as that will be - // a 404 if the TCP port is disabled. - URL ping = new URL(hudsonUrl, "login"); - try { - HttpURLConnection conn = (HttpURLConnection) ping.openConnection(); - int status = conn.getResponseCode(); - conn.disconnect(); - if (status == 200) { - return true; - } else { - events.status(ping + " is not ready: " + status); - } - } catch (IOException x) { - events.status(ping + " is not ready", x); - } - return false; - })) { + if (!succeedsWithRetries(this::pingSuccessful)) { return; } if (!succeedsWithRetries(() -> { @@ -853,6 +836,28 @@ private boolean succeedsWithRetries(SupplierThrowingException condition return false; } + @SuppressFBWarnings( + value = {"URLCONNECTION_SSRF_FD"}, + justification = "url is provided by the user, and we are trying to connect to it") + private Boolean pingSuccessful() throws MalformedURLException { + // Unlike JnlpAgentEndpointResolver, we do not use $jenkins/tcpSlaveAgentListener/, as that will be + // a 404 if the TCP port is disabled. + URL ping = new URL(hudsonUrl, "login"); + try { + HttpURLConnection conn = (HttpURLConnection) ping.openConnection(); + int status = conn.getResponseCode(); + conn.disconnect(); + if (status == 200) { + return true; + } else { + events.status(ping + " is not ready: " + status); + } + } catch (IOException x) { + events.status(ping + " is not ready", x); + } + return false; + } + private static class ExponentialRetry { final int factor; final Instant beginning; From 99dce14404932fba7100843db16fc5c64ccd7285 Mon Sep 17 00:00:00 2001 From: Vincent Latombe Date: Fri, 25 Oct 2024 11:22:14 +0200 Subject: [PATCH 3/5] Suppress custom functional interface --- src/main/java/hudson/remoting/Engine.java | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/main/java/hudson/remoting/Engine.java b/src/main/java/hudson/remoting/Engine.java index a3779afa7..fb6dd6908 100644 --- a/src/main/java/hudson/remoting/Engine.java +++ b/src/main/java/hudson/remoting/Engine.java @@ -821,11 +821,11 @@ public void closeRead() throws IOException { * @return true if the condition succeeded, false if the condition failed and the timeout was reached * @throws InterruptedException if the thread was interrupted while waiting. */ - private boolean succeedsWithRetries(SupplierThrowingException condition) throws InterruptedException { + private boolean succeedsWithRetries(java.util.concurrent.Callable condition) throws InterruptedException { var exponentialRetry = new ExponentialRetry(noReconnectAfter); while (exponentialRetry != null) { try { - if (condition.get()) { + if (condition.call()) { return true; } } catch (Exception x) { @@ -911,11 +911,6 @@ ExponentialRetry next(EngineListenerSplitter events) throws InterruptedException } } - @FunctionalInterface - private interface SupplierThrowingException { - T get() throws Exception; - } - private void reconnect() { try { events.status("Performing onReconnect operation."); From 86373ff1eec01064a96f035e413105a261ec910f Mon Sep 17 00:00:00 2001 From: Vincent Latombe Date: Fri, 25 Oct 2024 11:29:24 +0200 Subject: [PATCH 4/5] Format delay --- src/main/java/hudson/remoting/Engine.java | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/main/java/hudson/remoting/Engine.java b/src/main/java/hudson/remoting/Engine.java index fb6dd6908..eaea34ecf 100644 --- a/src/main/java/hudson/remoting/Engine.java +++ b/src/main/java/hudson/remoting/Engine.java @@ -903,9 +903,8 @@ ExponentialRetry next(EngineListenerSplitter events) throws InterruptedException events.status("Bailing out after " + DurationFormatter.format(next.timeout)); return null; } else { - var delaySeconds = next.delay.toSeconds(); - events.status("Waiting " + delaySeconds + " seconds before retry"); - TimeUnit.SECONDS.sleep(delaySeconds); + events.status("Waiting " + DurationFormatter.format(next.delay) + " before retry"); + TimeUnit.SECONDS.sleep(next.delay.toSeconds()); } return next; } From 5e79c6f03967a34ea793bfdcae01fedeefb70548 Mon Sep 17 00:00:00 2001 From: Vincent Latombe Date: Mon, 4 Nov 2024 17:04:12 +0100 Subject: [PATCH 5/5] Simplify Co-authored-by: Jesse Glick --- src/main/java/hudson/remoting/Engine.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/hudson/remoting/Engine.java b/src/main/java/hudson/remoting/Engine.java index eaea34ecf..2850b794b 100644 --- a/src/main/java/hudson/remoting/Engine.java +++ b/src/main/java/hudson/remoting/Engine.java @@ -904,7 +904,7 @@ ExponentialRetry next(EngineListenerSplitter events) throws InterruptedException return null; } else { events.status("Waiting " + DurationFormatter.format(next.delay) + " before retry"); - TimeUnit.SECONDS.sleep(next.delay.toSeconds()); + Thread.sleep(next.delay.toMillis()); } return next; }