Skip to content

Commit

Permalink
Remediate some issues to successfully pass bazel tests
Browse files Browse the repository at this point in the history
  • Loading branch information
chenj-hub committed Aug 13, 2024
1 parent 1d5eafa commit d7d6604
Show file tree
Hide file tree
Showing 5 changed files with 32 additions and 97 deletions.
25 changes: 23 additions & 2 deletions src/main/java/build/buildfarm/common/redis/RedisClient.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Consumer;
import redis.clients.jedis.UnifiedJedis;

import java.util.function.Supplier;
import java.util.logging.Level;
import lombok.extern.java.Log;
import redis.clients.jedis.exceptions.JedisClusterOperationException;
import redis.clients.jedis.exceptions.JedisConnectionException;
import redis.clients.jedis.exceptions.JedisDataException;
Expand Down Expand Up @@ -76,14 +80,31 @@ public JedisMisconfigurationException(final String message, final Throwable caus
}
}

private final UnifiedJedis jedis;
// We store the factory in case we want to re-create the jedis client.
private Supplier<UnifiedJedis> unifiedJedisFactory;

private UnifiedJedis jedis;

private boolean closed = false;

public RedisClient(UnifiedJedis jedis) {
this.jedis = jedis;
}

public RedisClient(
Supplier<UnifiedJedis> unifiedJedisFactory,
int reconnectClientAttempts,
int reconnectClientWaitDurationMs) {
try {
this.jedis = unifiedJedisFactory.get();
} catch (Exception e) {
log.log(Level.SEVERE, "Unable to establish redis client: " + e.toString());
}
this.unifiedJedisFactory = unifiedJedisFactory;
this.reconnectClientAttempts = reconnectClientAttempts;
this.reconnectClientWaitDurationMs = reconnectClientWaitDurationMs;
}

@Override
public synchronized void close() {
closed = true;
Expand Down Expand Up @@ -176,7 +197,7 @@ private <T> T callImpl(JedisContext<T> withJedis) throws IOException {
private void rebuildJedisCluser() {
try {
log.log(Level.SEVERE, "Rebuilding redis client");
jedis = jedisClusterFactory.get();
jedis = unifiedJedisFactory.get();
} catch (Exception e) {
redisClientRebuildErrorCounter.inc();
log.log(Level.SEVERE, "Failed to rebuild redis client");
Expand Down
35 changes: 4 additions & 31 deletions src/main/java/build/buildfarm/instance/shard/ServerInstance.java
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ public class ServerInstance extends NodeInstance {
private Cache<RequestMetadata, Boolean> recentCacheServedExecutions;

private final Random rand = new Random();
private final Writes writes;
private final Writes writes = new Writes(this::writeInstanceSupplier);
private final int maxCpu;
private final int maxRequeueAttempts;

Expand Down Expand Up @@ -382,7 +382,6 @@ public ServerInstance(
this.actionCacheFetchService = actionCacheFetchService;
backplane.setOnUnsubscribe(this::stop);

this.writes = new Writes(writeInstanceCacheLoader());
initializeCaches();

remoteInputStreamFactory =
Expand Down Expand Up @@ -1248,35 +1247,9 @@ public void onSuccess(List<String> workersList) {
protected abstract void onQueue(Deque<String> workers);
}

private CacheLoader<BlobWriteKey, Instance> writeInstanceCacheLoader() {
return new CacheLoader<BlobWriteKey, Instance>() {
@SuppressWarnings("NullableProblems")
@Override
public Instance load(BlobWriteKey key) {
String instance = null;
// Per the REAPI the identifier should end up as a unique UUID per a
// client level - adding bytes to further mitigate collisions and not
// store the entire BlobWriteKey.
String blobKey = key.getIdentifier() + "." + key.getDigest().getSizeBytes();
try {
instance = backplane.getWriteInstance(blobKey);
if (instance != null) {
return workerStub(instance);
}
} catch (IOException e) {
log.log(Level.WARNING, "error getting write instance for " + instance, e);
}

instance = getRandomWorker();
try {
backplane.setWriteInstance(blobKey, instance);
log.log(Level.INFO, "set write-instance: " + blobKey + " -> " + instance); // TODO: [jmarino]: remove
} catch (IOException e) {
log.log(Level.WARNING, "error getting write instance for " + instance, e);
}
return workerStub(instance);
}
};
private Instance writeInstanceSupplier() {
String worker = getRandomWorker();
return workerStub(worker);
}

String getRandomWorker() {
Expand Down
14 changes: 2 additions & 12 deletions src/main/java/build/buildfarm/worker/Pipeline.java
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@

package build.buildfarm.worker;

import com.google.common.util.concurrent.SettableFuture;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
Expand All @@ -25,7 +24,6 @@
@Log
public class Pipeline {
private final Map<PipelineStage, Thread> stageThreads;
private final PipelineStageThreadGroup stageThreadGroup;
private final Map<PipelineStage, Integer> stageClosePriorities;
private Thread joiningThread = null;
private boolean closing = false;
Expand All @@ -35,25 +33,17 @@ public class Pipeline {
public Pipeline() {
stageThreads = new HashMap<>();
stageClosePriorities = new HashMap<>();
stageThreadGroup = new PipelineStageThreadGroup();
}

public void add(PipelineStage stage, int closePriority) {
stageThreads.put(stage, new Thread(stageThreadGroup, stage, stage.name()));
stageThreads.put(stage, new Thread(stage));
if (closePriority < 0) {
throw new IllegalArgumentException("closePriority cannot be negative");
}
stageClosePriorities.put(stage, closePriority);
}

/**
* Start the pipeline.
*
* <p>You can provide callback which is invoked when any stage has an uncaught exception, for
* instance to shutdown the worker gracefully
*/
public void start(SettableFuture<Void> uncaughtExceptionFuture) {
stageThreadGroup.setUncaughtExceptionFuture(uncaughtExceptionFuture);
public void start() {
for (Thread stageThread : stageThreads.values()) {
stageThread.start();
}
Expand Down
51 changes: 1 addition & 50 deletions src/main/java/build/buildfarm/worker/shard/Worker.java
Original file line number Diff line number Diff line change
Expand Up @@ -97,11 +97,6 @@
import javax.annotation.Nullable;
import javax.naming.ConfigurationException;
import lombok.extern.java.Log;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.context.ApplicationContext;
import org.springframework.context.annotation.ComponentScan;

@Log
public final class Worker extends LoggingMain {
Expand Down Expand Up @@ -146,7 +141,6 @@ public final class Worker extends LoggingMain {
private LoadingCache<String, Instance> workerStubs;
private AtomicBoolean released = new AtomicBoolean(true);

@Autowired private ApplicationContext springContext;
/**
* The method will prepare the worker for graceful shutdown when the worker is ready. Note on
* using stderr here instead of log. By the time this is called in PreDestroy, the log is no
Expand Down Expand Up @@ -198,43 +192,6 @@ private Worker() {
super("BuildFarmShardWorker");
}

private void exitPostPipelineFailure() {
// Shutdown the worker if a pipeline fails. By means of the spring lifecycle
// hooks - e.g. the `PreDestroy` hook here - it will attempt to gracefully
// spin down the pipeline

// By calling these spring shutdown facilities; we're open to the risk that
// a subsystem may be hanging a criticial thread indeffinitly. Deadline the
// shutdown workflow to ensure we don't leave a zombie worker in this
// situation
ScheduledExecutorService shutdownDeadlineExecutor = newSingleThreadScheduledExecutor();

// This may be shorter than the action timeout; assume we have interrupted
// actions in a fatal uncaught exception.
int forceShutdownDeadline = 60;
ScheduledFuture<?> termFuture =
shutdownDeadlineExecutor.schedule(
new Runnable() {
public void run() {
log.log(
Level.SEVERE,
String.format(
"Force terminating due to shutdown deadline exceeded (%d seconds)",
forceShutdownDeadline));
System.exit(1);
}
},
forceShutdownDeadline,
SECONDS);

// Consider defining exit codes to better afford out of band instance
// recovery
int code = SpringApplication.exit(springContext, () -> 1);
termFuture.cancel(false);
shutdownDeadlineExecutor.shutdown();
System.exit(code);
}

private Operation stripOperation(Operation operation) {
return instance.stripOperation(operation);
}
Expand Down Expand Up @@ -678,13 +635,7 @@ public void start() throws ConfigurationException, InterruptedException, IOExcep
PrometheusPublisher.startHttpServer(configs.getPrometheusPort());
startFailsafeRegistration();

// Listen for pipeline unhandled exceptions
ExecutorService pipelineExceptionExecutor = newSingleThreadExecutor();
SettableFuture<Void> pipelineExceptionFuture = SettableFuture.create();
pipelineExceptionFuture.addListener(this::exitPostPipelineFailure, pipelineExceptionExecutor);

pipeline.start(pipelineExceptionFuture);

pipeline.start();
healthCheckMetric.labels("start").inc();
executionSlotsTotal.set(configs.getWorker().getExecuteStageWidth());
inputFetchSlotsTotal.set(configs.getWorker().getInputFetchStageWidth());
Expand Down
4 changes: 2 additions & 2 deletions src/test/java/build/buildfarm/worker/PipelineTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ public void stageThreadReturnCompletesJoin() throws InterruptedException {
public void run() {}
},
1);
pipeline.start(null);
pipeline.start();
pipeline.join();
}

Expand All @@ -73,7 +73,7 @@ public void run() {
}
},
1);
pipeline.start(null);
pipeline.start();
pipeline.join();
}

Expand Down

0 comments on commit d7d6604

Please sign in to comment.