Skip to content

Commit

Permalink
[fix][sf] fix bug with PyPredictor to remove worker, add specific fla… (
Browse files Browse the repository at this point in the history
  • Loading branch information
siddvenk committed Sep 3, 2024
1 parent 8a1ad37 commit ab70165
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -64,13 +64,13 @@ public PyPredictor(
@Override
@SuppressWarnings("unchecked")
public List<O> batchPredict(List<I> inputs) throws TranslateException {
+ if (process.isModelUnrecoverable()) {
+ throw new EngineException("Backend Python process is unrecoverable.");
+ }
if (!process.isReady()) {
// TODO: wait for restart
throw new TranslateException("Backend Python process is stopped.");
}
- if (process.isModelUnrecoverable()) {
- throw new EngineException("Backend Python process is unrecoverable.");
- }
Object first = inputs.get(0);
if (first instanceof Input) {
int size = inputs.size();
Expand Down
12 changes: 12 additions & 0 deletions serving/src/main/java/ai/djl/serving/models/ModelManager.java
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import ai.djl.serving.wlm.WorkerPool;
import ai.djl.serving.wlm.WorkerPoolConfig;
import ai.djl.serving.workflow.Workflow;
+ import ai.djl.util.Utils;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand Down Expand Up @@ -416,6 +417,17 @@ public CompletableFuture<Map<String, Object>> workerStatus() {
if (wlm.getWorkerPool(wpc).isFullyScaled()) {
data.put(modelName, new StatusResponse("Healthy"));
} else {
+ boolean sageMakerHealthCheckOverride =
+ Boolean.parseBoolean(
+ Utils.getEnvOrSystemProperty(
+ "SAGEMAKER_HEALTH_CHECK_OVERRIDE"));
+ if (sageMakerHealthCheckOverride) {
+ logger.info(
+ "SAGEMAKER_HEALTH_CHECK_OVERRIDE is"
+ + " enabled. Failing ping as"
+ + " requested");
+ hasFailure = true;
+ }
data.put(modelName, new StatusResponse("Unhealthy"));
}
break;
Expand Down
2 changes: 1 addition & 1 deletion wlm/src/main/java/ai/djl/serving/wlm/ModelInfo.java
Original file line number Diff line number Diff line change
Expand Up @@ -396,7 +396,7 @@ public Status getStatus() {
Boolean.parseBoolean(
Utils.getEnvOrSystemProperty("SERVING_HEALTH_CHECK_OVERRIDE"));
if (isHealthCheckOverrideEnabled) {
- logger.error(
+ logger.info(
"SERVING_HEALTH_CHECK_OVERRIDE is enabled. At least 1 model worker"
+ " has exhausted all retries. Not marking model as failed");
return Status.READY;
Expand Down

0 comments on commit ab70165

Please sign in to comment.