Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for detecting the number of available Neuron cores #1002

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ Most of those properties are designed for performance tuning. Adjusting those nu
* enable_envvars_config: Enable configuring MMS through environment variables. When this option is set to "true", all the static configurations of MMS can come through environment variables as well. default: false
* number_of_netty_threads: number of frontend netty threads, default: number of logical processors available to the JVM.
* netty_client_threads: number of backend netty threads, default: number of logical processors available to the JVM.
* default_workers_per_model: number of workers to create for each model that loaded at startup time, default: available GPUs in system or number of logical processors available to the JVM.
* default_workers_per_model: number of workers to create for each model that is loaded at startup time, default: available GPUs in system, available Neuron cores in system, or number of logical processors available to the JVM.
* job_queue_size: number of inference jobs that the frontend will queue before the backend can serve them, default 100. Useful in cases where certain requests take predictably longer to complete.
* async_logging: enable asynchronous logging for higher throughput, log output may be delayed if this is enabled, default: false.
* default_response_timeout: Timeout, in seconds, used for model's backend workers before they are deemed unresponsive and rebooted. default: 120 seconds.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,16 @@
*/
package com.amazonaws.ml.mms.util;

import com.google.gson.JsonParseException;
import com.google.gson.annotations.SerializedName;
import io.netty.handler.ssl.SslContext;
import io.netty.handler.ssl.SslContextBuilder;
import io.netty.handler.ssl.util.SelfSignedCertificate;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.lang.reflect.Field;
import java.net.InetAddress;
import java.net.UnknownHostException;
Expand Down Expand Up @@ -67,6 +70,7 @@ public final class ConfigManager {
private static final String MMS_NETTY_CLIENT_THREADS = "netty_client_threads";
private static final String MMS_JOB_QUEUE_SIZE = "job_queue_size";
private static final String MMS_NUMBER_OF_GPU = "number_of_gpu";
private static final String MMS_NUMBER_OF_NEURON_CORES = "number_of_neuron_cores";
private static final String MMS_ASYNC_LOGGING = "async_logging";
private static final String MMS_CORS_ALLOWED_ORIGIN = "cors_allowed_origin";
private static final String MMS_CORS_ALLOWED_METHODS = "cors_allowed_methods";
Expand Down Expand Up @@ -143,6 +147,13 @@ private ConfigManager(Arguments args) {
getAvailableGpu(),
getIntProperty(MMS_NUMBER_OF_GPU, Integer.MAX_VALUE))));

prop.setProperty(
MMS_NUMBER_OF_NEURON_CORES,
String.valueOf(
Integer.min(
getAvailableNeuronCores(),
getIntProperty(MMS_NUMBER_OF_NEURON_CORES, Integer.MAX_VALUE))));

String pythonExecutable = args.getPythonExecutable();
if (pythonExecutable != null) {
prop.setProperty("PYTHON_EXECUTABLE", pythonExecutable);
Expand Down Expand Up @@ -258,6 +269,10 @@ public int getNumberOfGpu() {
return getIntProperty(MMS_NUMBER_OF_GPU, 0);
}

/**
 * Returns the configured number of Neuron cores.
 *
 * @return value of the {@code number_of_neuron_cores} property, or 0 when unset
 */
public int getNumberOfNeuronCores() {
    final int neuronCores = getIntProperty(MMS_NUMBER_OF_NEURON_CORES, 0);
    return neuronCores;
}

/**
 * Returns the configured default service handler.
 *
 * @return value of the {@code default_service_handler} property, or {@code null} when unset
 */
public String getMmsDefaultServiceHandler() {
    final String handler = getProperty(MMS_DEFAULT_SERVICE_HANDLER, null);
    return handler;
}
Expand All @@ -283,6 +298,9 @@ public int getDefaultWorkers() {
if (workers == 0) {
workers = getNumberOfGpu();
}
if (workers == 0) {
workers = getNumberOfNeuronCores();
}
if (workers == 0) {
workers = Runtime.getRuntime().availableProcessors();
}
Expand Down Expand Up @@ -453,6 +471,8 @@ public String dumpConfigurations() {
+ System.getProperty("java.io.tmpdir")
+ "\nNumber of GPUs: "
+ getNumberOfGpu()
+ "\nNumber of Neuron Cores: "
+ getNumberOfNeuronCores()
+ "\nNumber of CPUs: "
+ runtime.availableProcessors()
+ "\nMax heap size: "
Expand Down Expand Up @@ -587,6 +607,26 @@ private static int getAvailableGpu() {
}
}

/**
 * Detects the number of Neuron cores available on the host by running the
 * {@code neuron-ls} CLI and summing the {@code nc_count} field of every
 * reported device.
 *
 * <p>Best-effort: returns 0 when the tool is missing, exits non-zero, or
 * produces output that cannot be parsed.
 *
 * @return total number of Neuron cores, or 0 if none could be detected
 */
private static int getAvailableNeuronCores() {
    try {
        Process process = new ProcessBuilder("neuron-ls", "--json-output").start();
        int neuronCores;
        // Drain stdout BEFORE waitFor(): waiting first can deadlock if the
        // child blocks on a full pipe buffer. try-with-resources closes the
        // stream (the original leaked the Reader).
        try (Reader reader =
                new InputStreamReader(process.getInputStream(), StandardCharsets.UTF_8)) {
            NeuronConfig[] results = JsonUtils.GSON.fromJson(reader, NeuronConfig[].class);
            // Gson returns null for empty input; treat that as "no devices".
            neuronCores =
                    results == null
                            ? 0
                            : Arrays.stream(results).mapToInt(c -> c.numNeuronCores).sum();
        }
        if (process.waitFor() != 0) {
            return 0; // neuron-ls ran but reported a failure
        }
        return neuronCores;
    } catch (IOException | JsonParseException e) {
        // Tool absent or emitted unexpected output; the original let a
        // JsonParseException escape and crash startup.
        return 0;
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt(); // preserve interrupt status
        return 0;
    }
}

/**
 * Minimal deserialization target for one device entry of the
 * {@code neuron-ls --json-output} array; only the core count is mapped.
 */
private static final class NeuronConfig {
    // Number of Neuron cores on this device ("nc_count" in the JSON output).
    @SerializedName("nc_count")
    int numNeuronCores;
}

public static final class Arguments {

private String mmsConfigFile;
Expand Down