From c23af39c76476df26003294e05569f04feb45b79 Mon Sep 17 00:00:00 2001 From: Samet Akcay Date: Mon, 19 Aug 2024 10:34:06 +0100 Subject: [PATCH 1/6] Set devices to 1 if multi-gpu is configured --- src/anomalib/engine/engine.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/anomalib/engine/engine.py b/src/anomalib/engine/engine.py index 485ab8e66e..a8bcecc461 100644 --- a/src/anomalib/engine/engine.py +++ b/src/anomalib/engine/engine.py @@ -324,8 +324,11 @@ def _setup_trainer(self, model: AnomalyModule) -> None: # Setup anomalib callbacks to be used with the trainer self._setup_anomalib_callbacks() - # Temporarily set devices to 1 to avoid issues with multiple processes - self._cache.args["devices"] = 1 + # TODO(ashwinvaidya17, djdameln, samet-akcay): Add Multi-GPU support to Anomalib + # https://github.com/openvinotoolkit/anomalib/issues/1449 + if len(self._cache.args["gpus"]) > 1: + logger.warning("Multi-GPU support is not available yet. Setting devices to 1.") + self._cache.args["devices"] = 1 # Instantiate the trainer if it is not already instantiated if self._trainer is None: From c16227c33e2d44b70403708b1026b6f50bce1d4f Mon Sep 17 00:00:00 2001 From: Samet Akcay Date: Mon, 19 Aug 2024 11:05:30 +0100 Subject: [PATCH 2/6] Fix typo: "gpus" -> "devices" --- src/anomalib/engine/engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anomalib/engine/engine.py b/src/anomalib/engine/engine.py index a8bcecc461..c0c263096a 100644 --- a/src/anomalib/engine/engine.py +++ b/src/anomalib/engine/engine.py @@ -326,7 +326,7 @@ def _setup_trainer(self, model: AnomalyModule) -> None: # TODO(ashwinvaidya17, djdameln, samet-akcay): Add Multi-GPU support to Anomalib # https://github.com/openvinotoolkit/anomalib/issues/1449 - if len(self._cache.args["gpus"]) > 1: + if len(self._cache.args["devices"]) > 1: logger.warning("Multi-GPU support is not available yet. 
Setting devices to 1.") self._cache.args["devices"] = 1 From b070a34bff1ee7dfe62527e18fc8a659375cac80 Mon Sep 17 00:00:00 2001 From: Samet Akcay Date: Mon, 19 Aug 2024 12:25:25 +0100 Subject: [PATCH 3/6] Address various multi-device cases --- src/anomalib/engine/engine.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/anomalib/engine/engine.py b/src/anomalib/engine/engine.py index c0c263096a..25ed5dab69 100644 --- a/src/anomalib/engine/engine.py +++ b/src/anomalib/engine/engine.py @@ -326,9 +326,10 @@ def _setup_trainer(self, model: AnomalyModule) -> None: # TODO(ashwinvaidya17, djdameln, samet-akcay): Add Multi-GPU support to Anomalib # https://github.com/openvinotoolkit/anomalib/issues/1449 - if len(self._cache.args["devices"]) > 1: - logger.warning("Multi-GPU support is not available yet. Setting devices to 1.") - self._cache.args["devices"] = 1 + devices = self._cache.args.get("devices") + if devices and str(devices).count(","): + logger.warning("Multi-GPU support is not available yet. 
Using the first specified GPU.") + self._cache.args["devices"] = str(devices).split(",")[0].strip() # Instantiate the trainer if it is not already instantiated if self._trainer is None: From 9e07da044b940f6b4f349144488862dc69ddba44 Mon Sep 17 00:00:00 2001 From: Samet Akcay Date: Tue, 20 Aug 2024 14:25:52 +0100 Subject: [PATCH 4/6] Cover all the edge cases for device --- src/anomalib/engine/engine.py | 53 +++++++++++++++++++++++++++++++---- 1 file changed, 48 insertions(+), 5 deletions(-) diff --git a/src/anomalib/engine/engine.py b/src/anomalib/engine/engine.py index 25ed5dab69..6a9e985a0b 100644 --- a/src/anomalib/engine/engine.py +++ b/src/anomalib/engine/engine.py @@ -315,6 +315,52 @@ def _setup_workspace( root_dir = Path(self._cache.args["default_root_dir"]) / model.name / dataset_name / category self._cache.args["default_root_dir"] = create_versioned_dir(root_dir) if versioned_dir else root_dir / "latest" + def _setup_device(self) -> None: + """TO BE DEPRECATED: Setup the device for the trainer. + + This method configures the device (GPU) to be used by the trainer. + It handles various input formats for device specification and + ensures compatibility with the current single-GPU limitation. + + Note: + This method is a temporary solution until multi-GPU support + is added to Anomalib. + + Todo: + Add Multi-GPU support to Anomalib. + https://github.com/openvinotoolkit/anomalib/issues/1449 + + Raises: + ValueError: If an invalid device specification is provided. + """ + devices = self._cache.args.get("devices") + if devices is not None: + if isinstance(devices, int) or (isinstance(devices, str) and devices.isdigit()): + # If devices is a single integer, treat it as a specific GPU ID + if int(devices) > 1: + logger.warning("Multi-GPU support is not available yet. 
Using only the first GPU.") + self._cache.args["devices"] = [int(devices)] + logger.info(f"Using GPU with ID: {devices}") + elif isinstance(devices, list): + # If devices is a list, use only the first GPU ID + if len(devices) > 0: + self._cache.args["devices"] = [devices[0]] + if len(devices) > 1: + logger.warning("Multi-GPU support is not available yet. Using only the first GPU.") + logger.info(f"Using GPU with ID: {devices[0]}") + else: + logger.warning("Empty list provided for 'devices'. Using default GPU selection.") + self._cache.args["devices"] = None + elif isinstance(devices, str) and "," in devices: + # If devices is a comma-separated string, use only the first GPU + first_gpu = int(devices.split(",")[0].strip()) + logger.warning(f"Multi-GPU support is not available yet. Using only the first GPU (ID: {first_gpu}).") + self._cache.args["devices"] = [first_gpu] + else: + # For any other input, use the default behavior + logger.warning("Unrecognized 'devices' format. Using default GPU selection.") + self._cache.args["devices"] = None + def _setup_trainer(self, model: AnomalyModule) -> None: """Instantiate the trainer based on the model parameters.""" # Check if the cache requires an update @@ -324,12 +370,9 @@ def _setup_trainer(self, model: AnomalyModule) -> None: # Setup anomalib callbacks to be used with the trainer self._setup_anomalib_callbacks() - # TODO(ashwinvaidya17, djdameln, samet-akcay): Add Multi-GPU support to Anomalib + # TODO (ashwinvaidya17, djdameln, samet-akcay): Remove this when multi-GPU support is added to Anomalib # https://github.com/openvinotoolkit/anomalib/issues/1449 - devices = self._cache.args.get("devices") - if devices and str(devices).count(","): - logger.warning("Multi-GPU support is not available yet. 
Using the first specified GPU.") - self._cache.args["devices"] = str(devices).split(",")[0].strip() + self._setup_device() # Instantiate the trainer if it is not already instantiated if self._trainer is None: From eb6e1db37836c92a05bc74d474ccd4a8b2aed04a Mon Sep 17 00:00:00 2001 From: Samet Akcay Date: Thu, 22 Aug 2024 06:48:04 +0100 Subject: [PATCH 5/6] Auto-assign the device if no device is specified --- src/anomalib/engine/engine.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/anomalib/engine/engine.py b/src/anomalib/engine/engine.py index 6a9e985a0b..3cf3dafe84 100644 --- a/src/anomalib/engine/engine.py +++ b/src/anomalib/engine/engine.py @@ -358,8 +358,8 @@ def _setup_device(self) -> None: self._cache.args["devices"] = [first_gpu] else: # For any other input, use the default behavior - logger.warning("Unrecognized 'devices' format. Using default GPU selection.") - self._cache.args["devices"] = None + logger.warning("Unrecognized 'devices' format. Using the 'auto' selection.") + self._cache.args["devices"] = "auto" def _setup_trainer(self, model: AnomalyModule) -> None: """Instantiate the trainer based on the model parameters.""" From 14b31cb0a7491326c5e3eb226c80cb51ef46b685 Mon Sep 17 00:00:00 2001 From: Samet Akcay Date: Thu, 22 Aug 2024 09:13:07 +0100 Subject: [PATCH 6/6] Add few other edge cases --- src/anomalib/engine/engine.py | 72 ++++++++++++++++++++++------------- 1 file changed, 46 insertions(+), 26 deletions(-) diff --git a/src/anomalib/engine/engine.py b/src/anomalib/engine/engine.py index 3cf3dafe84..64e5fb3a8b 100644 --- a/src/anomalib/engine/engine.py +++ b/src/anomalib/engine/engine.py @@ -6,7 +6,7 @@ import logging from collections.abc import Iterable from pathlib import Path -from typing import Any +from typing import Any, Literal import torch from lightning.pytorch.callbacks import Callback, RichModelSummary, RichProgressBar @@ -333,33 +333,53 @@ def _setup_device(self) -> None: Raises: ValueError: If an invalid 
device specification is provided. """ + accelerator = self._cache.args.get("accelerator") devices = self._cache.args.get("devices") - if devices is not None: - if isinstance(devices, int) or (isinstance(devices, str) and devices.isdigit()): - # If devices is a single integer, treat it as a specific GPU ID - if int(devices) > 1: - logger.warning("Multi-GPU support is not available yet. Using only the first GPU.") - self._cache.args["devices"] = [int(devices)] - logger.info(f"Using GPU with ID: {devices}") - elif isinstance(devices, list): - # If devices is a list, use only the first GPU ID - if len(devices) > 0: - self._cache.args["devices"] = [devices[0]] - if len(devices) > 1: - logger.warning("Multi-GPU support is not available yet. Using only the first GPU.") - logger.info(f"Using GPU with ID: {devices[0]}") - else: - logger.warning("Empty list provided for 'devices'. Using default GPU selection.") - self._cache.args["devices"] = None - elif isinstance(devices, str) and "," in devices: - # If devices is a comma-separated string, use only the first GPU - first_gpu = int(devices.split(",")[0].strip()) - logger.warning(f"Multi-GPU support is not available yet. Using only the first GPU (ID: {first_gpu}).") - self._cache.args["devices"] = [first_gpu] + + # Only proceed with GPU setup if the accelerator is set to "gpu" + if accelerator != "gpu": + return + + # Helper function to log warning and return single GPU specification + def use_single_gpu(message: str) -> Literal[1]: + """Log a warning and return a single GPU specification.""" + logger.warning(f"{message} Defaulting to a single GPU.") + return 1 # Let Lightning choose the GPU + + # Handle various input types + if devices is None: + self._cache.args["devices"] = 1 + logger.info("No specific GPU selected. 
Using Lightning's default selection.") + elif isinstance(devices, int): + if devices > 1 or devices <= 0: # 0 and any negative value (including -1) are handled like a multi-GPU request + message = f"Multiple GPUs requested ({devices}), but multi-GPU is not supported." + self._cache.args["devices"] = use_single_gpu(message) + else: # devices == 1 + self._cache.args["devices"] = 1 # Let Lightning choose the GPU + elif isinstance(devices, str): + if devices.lower() in ("-1", "auto", "0"): + self._cache.args["devices"] = use_single_gpu("All GPUs requested, but multi-GPU is not supported.") + elif "," in devices: + message = f"Multiple GPUs specified ({devices}), but multi-GPU is not supported." + self._cache.args["devices"] = use_single_gpu(message) else: - # For any other input, use the default behavior - logger.warning("Unrecognized 'devices' format. Using the 'auto' selection.") - self._cache.args["devices"] = "auto" + try: + gpu_id = int(devices) + self._cache.args["devices"] = ( + [gpu_id] if gpu_id > 0 else use_single_gpu(f"Invalid GPU specification: {devices}.") + ) + except ValueError: + self._cache.args["devices"] = use_single_gpu(f"Invalid GPU specification: {devices}.") + elif isinstance(devices, list): + if len(devices) > 1: + message = f"Multiple GPUs specified {devices}, but multi-GPU is not supported." + self._cache.args["devices"] = use_single_gpu(message) + elif len(devices) == 1: + self._cache.args["devices"] = devices # Keep the single GPU specified + else: # Empty list + self._cache.args["devices"] = use_single_gpu("Empty list provided for 'devices'.") + else: + self._cache.args["devices"] = use_single_gpu(f"Unrecognized 'devices' format: {devices}.") def _setup_trainer(self, model: AnomalyModule) -> None: """Instantiate the trainer based on the model parameters."""