diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 7d62f00720..4d3f4e11db 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -30,6 +30,8 @@ title: How to accelerate training - local: onnxruntime/usage_guides/gpu title: Accelerated inference on NVIDIA GPUs + - local: onnxruntime/usage_guides/amdgpu + title: Accelerated inference on AMD GPUs title: How-to guides isExpanded: false - sections: diff --git a/docs/source/onnxruntime/usage_guides/amdgpu.mdx b/docs/source/onnxruntime/usage_guides/amdgpu.mdx new file mode 100644 index 0000000000..1859637464 --- /dev/null +++ b/docs/source/onnxruntime/usage_guides/amdgpu.mdx @@ -0,0 +1,124 @@ +# Accelerated inference on AMD GPUs supported by ROCm + +By default, ONNX Runtime runs inference on CPU devices. However, it is possible to place supported operations on an AMD Instinct GPU, while leaving any unsupported ones on CPU. In most cases, this allows costly operations to be placed on the GPU and significantly accelerates inference. + +Our testing was done on AMD Instinct GPUs; for specific GPU compatibility, please refer to the official list of supported GPUs available [here](https://rocm.docs.amd.com/en/latest/release/gpu_os_support.html). + +This guide will show you how to run inference with the `ROCMExecutionProvider` execution provider that ONNX Runtime offers for AMD GPUs. + +## Installation +The following setup installs ONNX Runtime with the ROCm Execution Provider, built against ROCm 5.7. + +#### 1. ROCm Installation + +To install ROCm 5.7, please follow the [ROCm installation guide](https://rocm.docs.amd.com/en/latest/deploy/linux/index.html). + +#### 2. PyTorch Installation with ROCm Support +Optimum's ONNX Runtime integration relies on some functionalities of Transformers that require PyTorch. For now, we recommend using PyTorch compiled against ROCm 5.7, which can be installed following the [PyTorch installation guide](https://pytorch.org/get-started/locally/): + +```bash +pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm5.7 +``` + + +For Docker installation, the following base image is recommended: `rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1` + + +#### 3. ONNX Runtime installation with ROCm Execution Provider + +```bash +# Prerequisites +pip install -U pip +pip install cmake onnx +curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh + +# Install ONNX Runtime from source +git clone --recursive https://github.com/ROCmSoftwarePlatform/onnxruntime.git +cd onnxruntime +git checkout rocm5.7_internal_testing_eigen-3.4.zip_hash + +./build.sh --config Release --build_wheel --update --build --parallel --cmake_extra_defines ONNXRUNTIME_VERSION=$(cat ./VERSION_NUMBER) --use_rocm --rocm_home=/opt/rocm +pip install build/Linux/Release/dist/* +``` + + +To avoid conflicts between `onnxruntime` and `onnxruntime-rocm`, make sure the package `onnxruntime` is not installed by running `pip uninstall onnxruntime` prior to installing `onnxruntime-rocm`. + + +### Checking that the ROCm installation is successful + +Before going further, run the following sample code to check whether the installation was successful: + +```python +>>> from optimum.onnxruntime import ORTModelForSequenceClassification +>>> from transformers import AutoTokenizer + +>>> ort_model = ORTModelForSequenceClassification.from_pretrained( +... "philschmid/tiny-bert-sst2-distilled", +... export=True, +... provider="ROCMExecutionProvider", +... 
) + +>>> tokenizer = AutoTokenizer.from_pretrained("philschmid/tiny-bert-sst2-distilled") +>>> inputs = tokenizer("expectations were low, actual enjoyment was high", return_tensors="pt", padding=True) + +>>> outputs = ort_model(**inputs) +>>> assert ort_model.providers == ["ROCMExecutionProvider", "CPUExecutionProvider"] +``` + +If this code runs without error, congratulations, the installation is successful! If you encounter the following error or a similar one, + +``` +ValueError: Asked to use ROCMExecutionProvider as an ONNX Runtime execution provider, but the available execution providers are ['CPUExecutionProvider']. +``` + +then something is wrong with the ROCm or ONNX Runtime installation. + +### Use ROCm Execution Provider with ORT models + +For ORT models, usage is straightforward. Simply specify the `provider` argument in the `ORTModel.from_pretrained()` method. Here's an example: + +```python +>>> from optimum.onnxruntime import ORTModelForSequenceClassification + +>>> ort_model = ORTModelForSequenceClassification.from_pretrained( +... "distilbert-base-uncased-finetuned-sst-2-english", +... export=True, +... provider="ROCMExecutionProvider", +... ) +``` + +The model can then be used with the common 🤗 Transformers API for inference and evaluation, such as [pipelines](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/pipelines). +When using a Transformers pipeline, note that the `device` argument should be set so that pre- and post-processing run on GPU, as in the example below: + +```python +>>> from optimum.pipelines import pipeline +>>> from transformers import AutoTokenizer + +>>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english") + +>>> pipe = pipeline(task="text-classification", model=ort_model, tokenizer=tokenizer, device="cuda:0") +>>> result = pipe("Both the music and visual were astounding, not to mention the actors' performance.") +>>> print(result) # doctest: +IGNORE_RESULT +# printing: [{'label': 'POSITIVE', 'score': 0.9997727274894714}] +``` + +Additionally, you can pass the session option `log_severity_level = 0` (verbose) to check whether all nodes are indeed placed on the ROCm execution provider: + +```python +>>> import onnxruntime + +>>> session_options = onnxruntime.SessionOptions() +>>> session_options.log_severity_level = 0 + +>>> ort_model = ORTModelForSequenceClassification.from_pretrained( +... "distilbert-base-uncased-finetuned-sst-2-english", +... export=True, +... provider="ROCMExecutionProvider", +... session_options=session_options +... ) +``` + +### Observed time gains + +Coming soon! diff --git a/docs/source/onnxruntime/usage_guides/trainer.mdx b/docs/source/onnxruntime/usage_guides/trainer.mdx index 50c6b4d77a..0f48355101 100644 --- a/docs/source/onnxruntime/usage_guides/trainer.mdx +++ b/docs/source/onnxruntime/usage_guides/trainer.mdx @@ -56,6 +56,8 @@ To use `ORTTrainer` or `ORTSeq2SeqTrainer`, you need to install ONNX Runtime Tra To set up the environment, we __strongly recommend__ you install the dependencies with Docker to ensure that the versions are correct and well configured. You can find dockerfiles with various combinations [here](https://github.com/huggingface/optimum/tree/main/examples/onnxruntime/training/docker). 
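Whichever GPU setup you choose below, it can be useful to confirm that the installed ONNX Runtime build actually exposes the execution provider you expect before launching a training job. A minimal check, assuming `onnxruntime` is importable in the training environment:

```python
# List the execution providers exposed by the installed ONNX Runtime build.
# On an NVIDIA setup the list should contain "CUDAExecutionProvider";
# on an AMD/ROCm setup it should contain "ROCMExecutionProvider".
import onnxruntime as ort

print(ort.get_available_providers())
```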
+#### Setup for NVIDIA GPU + Here below we take the installation of `onnxruntime-training 1.14.0` as an example: * If you want to install `onnxruntime-training 1.14.0` via [Dockerfile](https://github.com/huggingface/optimum/blob/main/examples/onnxruntime/training/docker/Dockerfile-ort1.14.0-cu116): @@ -80,6 +82,32 @@ And run post-installation configuration: python -m torch_ort.configure ``` +#### Setup for AMD GPU + +Below we take the installation of the `onnxruntime-training` nightly build as an example: + +* If you want to install `onnxruntime-training` via [Dockerfile](https://github.com/huggingface/optimum/blob/main/examples/onnxruntime/training/docker/Dockerfile-ort-nightly-rocm57): + +```bash +docker build -f Dockerfile-ort-nightly-rocm57 -t ort/train:nightly . +``` + +* If you want to install the dependencies in a local Python environment instead, you can pip install them once you have [ROCm 5.7](https://rocmdocs.amd.com/en/latest/deploy/linux/quick_start.html) installed: + +```bash +pip install onnx ninja +pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm5.7 +pip install --pre onnxruntime-training -f https://download.onnxruntime.ai/onnxruntime_nightly_rocm57.html +pip install torch-ort +pip install --upgrade protobuf==3.20.2 +``` + +And run post-installation configuration: + +```bash +python -m torch_ort.configure +``` + ### Install Optimum You can install Optimum via pypi: diff --git a/examples/onnxruntime/training/docker/Dockerfile-ort-nightly-rocm57 b/examples/onnxruntime/training/docker/Dockerfile-ort-nightly-rocm57 new file mode 100644 index 0000000000..450bb7ca0d --- /dev/null +++ b/examples/onnxruntime/training/docker/Dockerfile-ort-nightly-rocm57 @@ -0,0 +1,43 @@ +# Use rocm image +FROM rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1 +CMD rocm-smi + +# Ignore interactive questions during `docker build` +ENV DEBIAN_FRONTEND noninteractive + +# Versions +# available options 3.10 +ARG PYTHON_VERSION=3.10 + +# Bash shell +RUN chsh -s /bin/bash +SHELL ["/bin/bash", "-c"] + +# Install and update tools to minimize security vulnerabilities +RUN apt-get update +RUN apt-get install -y software-properties-common wget apt-utils patchelf git libprotobuf-dev protobuf-compiler cmake \ + bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 mercurial subversion libopenmpi-dev ffmpeg && \ + apt-get clean +RUN apt-get autoremove -y + +ARG PYTHON_EXE=/opt/conda/envs/py_$PYTHON_VERSION/bin/python + +# (Optional) Install test dependencies +RUN $PYTHON_EXE -m pip install -U pip +RUN $PYTHON_EXE -m pip install git+https://github.com/huggingface/transformers +RUN $PYTHON_EXE -m pip install datasets accelerate evaluate coloredlogs absl-py rouge_score seqeval scipy sacrebleu nltk scikit-learn parameterized sentencepiece --no-cache-dir +RUN $PYTHON_EXE -m pip install deepspeed --no-cache-dir +RUN conda install -y mpi4py + +# ONNX and build tools +RUN $PYTHON_EXE -m pip install onnx ninja + +# ORT Module +RUN $PYTHON_EXE -m pip install --pre onnxruntime-training -f https://download.onnxruntime.ai/onnxruntime_nightly_rocm57.html +RUN $PYTHON_EXE -m pip install torch-ort +RUN $PYTHON_EXE -m pip install --upgrade protobuf==3.20.2 +RUN $PYTHON_EXE -m torch_ort.configure + +WORKDIR . 
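+# (Optional) The execution providers exposed by the installed wheel can be checked with, for example:
+# RUN $PYTHON_EXE -c "import onnxruntime; print(onnxruntime.get_available_providers())"
+# The printed list should include ROCMExecutionProvider.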
+ +CMD ["/bin/bash"] \ No newline at end of file diff --git a/optimum/onnxruntime/modeling_ort.py b/optimum/onnxruntime/modeling_ort.py index b58a37eb43..c618f8daad 100644 --- a/optimum/onnxruntime/modeling_ort.py +++ b/optimum/onnxruntime/modeling_ort.py @@ -308,16 +308,20 @@ def to(self, device: Union[torch.device, str, int]): if device.type == "cuda" and self.providers[0] == "TensorrtExecutionProvider": return self - if device.type == "cuda" and self._use_io_binding is False: + self.device = device + provider = get_provider_for_device(self.device) + validate_provider_availability(provider) # raise error if the provider is not available + + # IOBinding is only supported for CPU and CUDA Execution Providers. + if device.type == "cuda" and self._use_io_binding is False and provider == "CUDAExecutionProvider": self.use_io_binding = True logger.info( "use_io_binding was set to False, setting it to True because it can provide a huge speedup on GPUs. " "It is possible to disable this feature manually by setting the use_io_binding attribute back to False." ) - self.device = device - provider = get_provider_for_device(self.device) - validate_provider_availability(provider) # raise error if the provider is not available + if provider == "ROCMExecutionProvider": + self.use_io_binding = False self.model.set_providers([provider], provider_options=[provider_options]) self.providers = self.model.get_providers() diff --git a/optimum/onnxruntime/utils.py b/optimum/onnxruntime/utils.py index cdd4460fb0..1ec21c9802 100644 --- a/optimum/onnxruntime/utils.py +++ b/optimum/onnxruntime/utils.py @@ -63,7 +63,9 @@ def _is_gpu_available(): Checks if a gpu is available. """ available_providers = ort.get_available_providers() - if "CUDAExecutionProvider" in available_providers and torch.cuda.is_available(): + if ( + "CUDAExecutionProvider" in available_providers or "ROCMExecutionProvider" in available_providers + ) and torch.cuda.is_available(): return True else: return False @@ -184,7 +186,7 @@ def get_device_for_provider(provider: str, provider_options: Dict) -> torch.devi """ Gets the PyTorch device (CPU/CUDA) associated with an ONNX Runtime provider. """ - if provider in ["CUDAExecutionProvider", "TensorrtExecutionProvider"]: + if provider in ["CUDAExecutionProvider", "TensorrtExecutionProvider", "ROCMExecutionProvider"]: return torch.device(f"cuda:{provider_options['device_id']}") else: return torch.device("cpu") @@ -194,7 +196,12 @@ def get_provider_for_device(device: torch.device) -> str: """ Gets the ONNX Runtime provider associated with the PyTorch device (CPU/CUDA). 
""" - return "CUDAExecutionProvider" if device.type.lower() == "cuda" else "CPUExecutionProvider" + if device.type.lower() == "cuda": + if "ROCMExecutionProvider" in ort.get_available_providers(): + return "ROCMExecutionProvider" + else: + return "CUDAExecutionProvider" + return "CPUExecutionProvider" def parse_device(device: Union[torch.device, str, int]) -> Tuple[torch.device, Dict]: diff --git a/optimum/utils/testing_utils.py b/optimum/utils/testing_utils.py index b047b9cf28..9559d289bc 100644 --- a/optimum/utils/testing_utils.py +++ b/optimum/utils/testing_utils.py @@ -69,6 +69,17 @@ def require_torch_gpu(test_case): return unittest.skipUnless(torch_device == "cuda", "test requires CUDA")(test_case) +def require_ort_rocm(test_case): + """Decorator marking a test that requires ROCMExecutionProvider for ONNX Runtime.""" + import onnxruntime as ort + + providers = ort.get_available_providers() + + return unittest.skipUnless("ROCMExecutionProvider" == providers[0], "test requires ROCMExecutionProvider")( + test_case + ) + + def require_hf_token(test_case): """ Decorator marking a test that requires huggingface hub token. diff --git a/pyproject.toml b/pyproject.toml index b4a9cfc548..99a0f1c85f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,9 @@ known-first-party = ["optimum"] [tool.pytest.ini_options] markers = [ "gpu_test", + "cuda_ep_test", + "trt_ep_test", + "rocm_ep_test", "tensorflow_test", "timm_test", "run_in_series", diff --git a/tests/onnxruntime/docker/Dockerfile_onnxruntime_gpu b/tests/onnxruntime/docker/Dockerfile_onnxruntime_gpu index ebd1c538cd..9013697e04 100644 --- a/tests/onnxruntime/docker/Dockerfile_onnxruntime_gpu +++ b/tests/onnxruntime/docker/Dockerfile_onnxruntime_gpu @@ -23,4 +23,4 @@ COPY . /workspace/optimum RUN pip install /workspace/optimum[onnxruntime-gpu,tests] ENV TEST_LEVEL=1 -CMD pytest onnxruntime/test_*.py --durations=0 -s -vvvvv -m gpu_test +CMD pytest onnxruntime/test_*.py --durations=0 -s -vvvvv -m cuda_ep_test -m trt_ep_test diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index baa62f1670..0db60c289d 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -108,7 +108,7 @@ DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER, logging, ) -from optimum.utils.testing_utils import grid_parameters, require_hf_token +from optimum.utils.testing_utils import grid_parameters, require_hf_token, require_ort_rocm logger = logging.get_logger() @@ -233,13 +233,22 @@ def test_load_stable_diffusion_model_from_empty_cache(self): ) @require_torch_gpu - @pytest.mark.gpu_test + @pytest.mark.cuda_ep_test def test_load_model_cuda_provider(self): model = ORTModel.from_pretrained(self.ONNX_MODEL_ID, provider="CUDAExecutionProvider") self.assertListEqual(model.providers, ["CUDAExecutionProvider", "CPUExecutionProvider"]) self.assertListEqual(model.model.get_providers(), model.providers) self.assertEqual(model.device, torch.device("cuda:0")) + @require_torch_gpu + @require_ort_rocm + @pytest.mark.rocm_ep_test + def test_load_model_rocm_provider(self): + model = ORTModel.from_pretrained(self.ONNX_MODEL_ID, provider="ROCMExecutionProvider") + self.assertListEqual(model.providers, ["ROCMExecutionProvider", "CPUExecutionProvider"]) + self.assertListEqual(model.model.get_providers(), model.providers) + self.assertEqual(model.device, torch.device("cuda:0")) + def test_load_model_cpu_provider(self): model = ORTModel.from_pretrained(self.ONNX_MODEL_ID, provider="CPUExecutionProvider") 
self.assertListEqual(model.providers, ["CPUExecutionProvider"]) @@ -265,7 +274,7 @@ def test_load_seq2seq_model_without_past_from_hub(self): self.assertIsInstance(model.config, PretrainedConfig) @require_torch_gpu - @pytest.mark.gpu_test + @pytest.mark.cuda_ep_test def test_load_seq2seq_model_cuda_provider(self): model = ORTModelForSeq2SeqLM.from_pretrained(self.ONNX_SEQ2SEQ_MODEL_ID, provider="CUDAExecutionProvider") self.assertListEqual(model.providers, ["CUDAExecutionProvider", "CPUExecutionProvider"]) @@ -273,6 +282,16 @@ def test_load_seq2seq_model_cuda_provider(self): self.assertListEqual(model.decoder.session.get_providers(), model.providers) self.assertEqual(model.device, torch.device("cuda:0")) + @require_torch_gpu + @require_ort_rocm + @pytest.mark.rocm_ep_test + def test_load_seq2seq_model_rocm_provider(self): + model = ORTModelForSeq2SeqLM.from_pretrained(self.ONNX_SEQ2SEQ_MODEL_ID, provider="ROCMExecutionProvider") + self.assertListEqual(model.providers, ["ROCMExecutionProvider", "CPUExecutionProvider"]) + self.assertListEqual(model.encoder.session.get_providers(), model.providers) + self.assertListEqual(model.decoder.session.get_providers(), model.providers) + self.assertEqual(model.device, torch.device("cuda:0")) + def test_load_seq2seq_model_cpu_provider(self): model = ORTModelForSeq2SeqLM.from_pretrained(self.ONNX_SEQ2SEQ_MODEL_ID, provider="CPUExecutionProvider") self.assertListEqual(model.providers, ["CPUExecutionProvider"]) @@ -293,7 +312,7 @@ def test_load_stable_diffusion_model_from_hub(self): self.assertIsInstance(model.config, Dict) @require_torch_gpu - @pytest.mark.gpu_test + @pytest.mark.cuda_ep_test def test_load_stable_diffusion_model_cuda_provider(self): model = ORTStableDiffusionPipeline.from_pretrained( self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, provider="CUDAExecutionProvider" @@ -305,6 +324,20 @@ def test_load_stable_diffusion_model_cuda_provider(self): self.assertListEqual(model.vae_encoder.session.get_providers(), model.providers) self.assertEqual(model.device, torch.device("cuda:0")) + @require_torch_gpu + @require_ort_rocm + @pytest.mark.rocm_ep_test + def test_load_stable_diffusion_model_rocm_provider(self): + model = ORTStableDiffusionPipeline.from_pretrained( + self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, provider="ROCMExecutionProvider" + ) + self.assertListEqual(model.providers, ["ROCMExecutionProvider", "CPUExecutionProvider"]) + self.assertListEqual(model.unet.session.get_providers(), model.providers) + self.assertListEqual(model.text_encoder.session.get_providers(), model.providers) + self.assertListEqual(model.vae_decoder.session.get_providers(), model.providers) + self.assertListEqual(model.vae_encoder.session.get_providers(), model.providers) + self.assertEqual(model.device, torch.device("cuda:0")) + def test_load_stable_diffusion_model_cpu_provider(self): model = ORTStableDiffusionPipeline.from_pretrained( self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, provider="CPUExecutionProvider" @@ -399,7 +432,7 @@ def test_missing_execution_provider(self): """ @require_torch_gpu - @pytest.mark.gpu_test + @pytest.mark.cuda_ep_test def test_model_on_gpu(self): model = ORTModel.from_pretrained(self.ONNX_MODEL_ID) gpu = torch.device("cuda") @@ -407,15 +440,34 @@ def test_model_on_gpu(self): self.assertEqual(model.device, torch.device("cuda:0")) self.assertListEqual(model.providers, ["CUDAExecutionProvider", "CPUExecutionProvider"]) + @require_torch_gpu + @require_ort_rocm + @pytest.mark.rocm_ep_test + def test_model_on_rocm_ep(self): + model = 
ORTModel.from_pretrained(self.ONNX_MODEL_ID) + gpu = torch.device("cuda") + model.to(gpu) + self.assertEqual(model.device, torch.device("cuda:0")) + self.assertListEqual(model.providers, ["ROCMExecutionProvider", "CPUExecutionProvider"]) + # test string device input for to() @require_torch_gpu - @pytest.mark.gpu_test + @pytest.mark.cuda_ep_test def test_model_on_gpu_str(self): model = ORTModel.from_pretrained(self.ONNX_MODEL_ID) model.to("cuda") self.assertEqual(model.device, torch.device("cuda:0")) self.assertListEqual(model.providers, ["CUDAExecutionProvider", "CPUExecutionProvider"]) + @require_torch_gpu + @require_ort_rocm + @pytest.mark.rocm_ep_test + def test_model_on_rocm_ep_str(self): + model = ORTModel.from_pretrained(self.ONNX_MODEL_ID) + model.to("cuda") + self.assertEqual(model.device, torch.device("cuda:0")) + self.assertListEqual(model.providers, ["ROCMExecutionProvider", "CPUExecutionProvider"]) + def test_passing_session_options(self): options = onnxruntime.SessionOptions() options.intra_op_num_threads = 3 @@ -441,7 +493,8 @@ def test_passing_session_options_stable_diffusion(self): self.assertEqual(model.vae_encoder.session.get_session_options().intra_op_num_threads, 3) @require_torch_gpu - @pytest.mark.gpu_test + @pytest.mark.cuda_ep_test + @pytest.mark.trt_ep_test def test_passing_provider_options(self): model = ORTModel.from_pretrained(self.ONNX_MODEL_ID, provider="CUDAExecutionProvider") self.assertEqual(model.model.get_provider_options()["CUDAExecutionProvider"]["do_copy_in_default_stream"], "1") @@ -468,6 +521,20 @@ def test_passing_provider_options(self): model.model.get_provider_options()["TensorrtExecutionProvider"]["trt_engine_cache_enable"], "1" ) + @require_torch_gpu + @require_ort_rocm + @pytest.mark.rocm_ep_test + def test_passing_provider_options_rocm_provider(self): + model = ORTModel.from_pretrained(self.ONNX_MODEL_ID, provider="ROCMExecutionProvider") + self.assertEqual(model.model.get_provider_options()["ROCMExecutionProvider"]["do_copy_in_default_stream"], "1") + + model = ORTModel.from_pretrained( + self.ONNX_MODEL_ID, + provider="ROCMExecutionProvider", + provider_options={"do_copy_in_default_stream": 0}, + ) + self.assertEqual(model.model.get_provider_options()["ROCMExecutionProvider"]["do_copy_in_default_stream"], "0") + @unittest.skipIf(get_gpu_count() <= 1, "this test requires multi-gpu") def test_model_on_gpu_id(self): model = ORTModel.from_pretrained(self.ONNX_MODEL_ID) @@ -483,7 +550,8 @@ def test_model_on_gpu_id(self): self.assertEqual(model.model.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1") @require_torch_gpu - @pytest.mark.gpu_test + @pytest.mark.cuda_ep_test + @pytest.mark.trt_ep_test def test_passing_provider_options_seq2seq(self): model = ORTModelForSeq2SeqLM.from_pretrained(self.ONNX_SEQ2SEQ_MODEL_ID, provider="CUDAExecutionProvider") self.assertEqual( @@ -556,6 +624,43 @@ def test_passing_provider_options_seq2seq(self): "1", ) + @require_torch_gpu + @require_ort_rocm + @pytest.mark.rocm_ep_test + def test_passing_provider_options_seq2seq_rocm_provider(self): + model = ORTModelForSeq2SeqLM.from_pretrained(self.ONNX_SEQ2SEQ_MODEL_ID, provider="ROCMExecutionProvider") + self.assertEqual( + model.encoder.session.get_provider_options()["ROCMExecutionProvider"]["do_copy_in_default_stream"], "1" + ) + self.assertEqual( + model.decoder.session.get_provider_options()["ROCMExecutionProvider"]["do_copy_in_default_stream"], "1" + ) + self.assertEqual( + 
model.decoder_with_past.session.get_provider_options()["ROCMExecutionProvider"][ + "do_copy_in_default_stream" + ], + "1", + ) + + model = ORTModelForSeq2SeqLM.from_pretrained( + self.ONNX_SEQ2SEQ_MODEL_ID, + provider="ROCMExecutionProvider", + provider_options={"do_copy_in_default_stream": 0}, + use_cache=True, + ) + self.assertEqual( + model.encoder.session.get_provider_options()["ROCMExecutionProvider"]["do_copy_in_default_stream"], "0" + ) + self.assertEqual( + model.decoder.session.get_provider_options()["ROCMExecutionProvider"]["do_copy_in_default_stream"], "0" + ) + self.assertEqual( + model.decoder_with_past.session.get_provider_options()["ROCMExecutionProvider"][ + "do_copy_in_default_stream" + ], + "0", + ) + def test_seq2seq_model_on_cpu(self): model = ORTModelForSeq2SeqLM.from_pretrained(self.ONNX_SEQ2SEQ_MODEL_ID, use_cache=True) cpu = torch.device("cpu") @@ -584,7 +689,7 @@ def test_seq2seq_model_on_cpu_str(self): self.assertListEqual(model.providers, ["CPUExecutionProvider"]) @require_torch_gpu - @pytest.mark.gpu_test + @pytest.mark.cuda_ep_test def test_seq2seq_model_on_gpu(self): model = ORTModelForSeq2SeqLM.from_pretrained(self.ONNX_SEQ2SEQ_MODEL_ID, use_cache=True) gpu = torch.device("cuda") @@ -598,6 +703,22 @@ def test_seq2seq_model_on_gpu(self): self.assertEqual(model.decoder_with_past.session.get_providers()[0], "CUDAExecutionProvider") self.assertListEqual(model.providers, ["CUDAExecutionProvider", "CPUExecutionProvider"]) + @require_torch_gpu + @require_ort_rocm + @pytest.mark.rocm_ep_test + def test_seq2seq_model_on_rocm_ep(self): + model = ORTModelForSeq2SeqLM.from_pretrained(self.ONNX_SEQ2SEQ_MODEL_ID, use_cache=True) + gpu = torch.device("cuda") + model.to(gpu) + self.assertEqual(model.device, torch.device("cuda:0")) + self.assertEqual(model.encoder.device, torch.device("cuda:0")) + self.assertEqual(model.decoder.device, torch.device("cuda:0")) + self.assertEqual(model.decoder_with_past.device, torch.device("cuda:0")) + self.assertEqual(model.encoder.session.get_providers()[0], "ROCMExecutionProvider") + self.assertEqual(model.decoder.session.get_providers()[0], "ROCMExecutionProvider") + self.assertEqual(model.decoder_with_past.session.get_providers()[0], "ROCMExecutionProvider") + self.assertListEqual(model.providers, ["ROCMExecutionProvider", "CPUExecutionProvider"]) + @unittest.skipIf(get_gpu_count() <= 1, "this test requires multi-gpu") def test_seq2seq_model_on_gpu_id(self): model = ORTModelForSeq2SeqLM.from_pretrained(self.ONNX_SEQ2SEQ_MODEL_ID, use_cache=True) @@ -626,7 +747,7 @@ def test_seq2seq_model_on_gpu_id(self): # test string device input for to() @require_torch_gpu - @pytest.mark.gpu_test + @pytest.mark.cuda_ep_test def test_seq2seq_model_on_gpu_str(self): model = ORTModelForSeq2SeqLM.from_pretrained(self.ONNX_SEQ2SEQ_MODEL_ID, use_cache=True) model.to("cuda") @@ -640,7 +761,22 @@ def test_seq2seq_model_on_gpu_str(self): self.assertListEqual(model.providers, ["CUDAExecutionProvider", "CPUExecutionProvider"]) @require_torch_gpu - @pytest.mark.gpu_test + @require_ort_rocm + @pytest.mark.rocm_ep_test + def test_seq2seq_model_on_rocm_ep_str(self): + model = ORTModelForSeq2SeqLM.from_pretrained(self.ONNX_SEQ2SEQ_MODEL_ID, use_cache=True) + model.to("cuda") + self.assertEqual(model.device, torch.device("cuda:0")) + self.assertEqual(model.encoder.device, torch.device("cuda:0")) + self.assertEqual(model.decoder.device, torch.device("cuda:0")) + self.assertEqual(model.decoder_with_past.device, torch.device("cuda:0")) + 
self.assertEqual(model.encoder.session.get_providers()[0], "ROCMExecutionProvider") + self.assertEqual(model.decoder.session.get_providers()[0], "ROCMExecutionProvider") + self.assertEqual(model.decoder_with_past.session.get_providers()[0], "ROCMExecutionProvider") + self.assertListEqual(model.providers, ["ROCMExecutionProvider", "CPUExecutionProvider"]) + + @require_torch_gpu + @pytest.mark.cuda_ep_test def test_passing_provider_options_stable_diffusion(self): model = ORTStableDiffusionPipeline.from_pretrained( self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, provider="CUDAExecutionProvider" @@ -709,7 +845,7 @@ def test_stable_diffusion_model_on_cpu_str(self): self.assertListEqual(model.providers, ["CPUExecutionProvider"]) @require_torch_gpu - @pytest.mark.gpu_test + @pytest.mark.cuda_ep_test def test_stable_diffusion_model_on_gpu(self): model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) gpu = torch.device("cuda") @@ -725,6 +861,24 @@ def test_stable_diffusion_model_on_gpu(self): self.assertEqual(model.vae_encoder.session.get_providers()[0], "CUDAExecutionProvider") self.assertListEqual(model.providers, ["CUDAExecutionProvider", "CPUExecutionProvider"]) + @require_torch_gpu + @require_ort_rocm + @pytest.mark.rocm_ep_test + def test_stable_diffusion_model_on_rocm_ep(self): + model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) + gpu = torch.device("cuda") + model.to(gpu) + self.assertEqual(model.device, torch.device("cuda:0")) + self.assertEqual(model.unet.device, torch.device("cuda:0")) + self.assertEqual(model.text_encoder.device, torch.device("cuda:0")) + self.assertEqual(model.vae_decoder.device, torch.device("cuda:0")) + self.assertEqual(model.vae_encoder.device, torch.device("cuda:0")) + self.assertEqual(model.unet.session.get_providers()[0], "ROCMExecutionProvider") + self.assertEqual(model.text_encoder.session.get_providers()[0], "ROCMExecutionProvider") + self.assertEqual(model.vae_decoder.session.get_providers()[0], "ROCMExecutionProvider") + self.assertEqual(model.vae_encoder.session.get_providers()[0], "ROCMExecutionProvider") + self.assertListEqual(model.providers, ["ROCMExecutionProvider", "CPUExecutionProvider"]) + @unittest.skipIf(get_gpu_count() <= 1, "this test requires multi-gpu") def test_stable_diffusion_model_on_gpu_id(self): model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) @@ -750,7 +904,7 @@ def test_stable_diffusion_model_on_gpu_id(self): # test string device input for to() @require_torch_gpu - @pytest.mark.gpu_test + @pytest.mark.cuda_ep_test def test_stable_diffusion_model_on_gpu_str(self): model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) model.to("cuda") @@ -765,6 +919,23 @@ def test_stable_diffusion_model_on_gpu_str(self): self.assertEqual(model.vae_encoder.session.get_providers()[0], "CUDAExecutionProvider") self.assertListEqual(model.providers, ["CUDAExecutionProvider", "CPUExecutionProvider"]) + @require_torch_gpu + @require_ort_rocm + @pytest.mark.rocm_ep_test + def test_stable_diffusion_model_on_rocm_ep_str(self): + model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) + model.to("cuda") + self.assertEqual(model.device, torch.device("cuda:0")) + self.assertEqual(model.unet.device, torch.device("cuda:0")) + self.assertEqual(model.text_encoder.device, torch.device("cuda:0")) + self.assertEqual(model.vae_decoder.device, torch.device("cuda:0")) + 
self.assertEqual(model.vae_encoder.device, torch.device("cuda:0")) + self.assertEqual(model.unet.session.get_providers()[0], "ROCMExecutionProvider") + self.assertEqual(model.text_encoder.session.get_providers()[0], "ROCMExecutionProvider") + self.assertEqual(model.vae_decoder.session.get_providers()[0], "ROCMExecutionProvider") + self.assertEqual(model.vae_encoder.session.get_providers()[0], "ROCMExecutionProvider") + self.assertListEqual(model.providers, ["ROCMExecutionProvider", "CPUExecutionProvider"]) + @require_hf_token def test_load_model_from_hub_private(self): model = ORTModel.from_pretrained(self.ONNX_MODEL_ID, use_auth_token=os.environ.get("HF_AUTH_TOKEN", None)) @@ -1166,7 +1337,8 @@ def test_pipeline_model_is_none(self): ) ) @require_torch_gpu - @pytest.mark.gpu_test + @pytest.mark.cuda_ep_test + @pytest.mark.trt_ep_test def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): if provider == "TensorrtExecutionProvider" and model_arch != self.__class__.SUPPORTED_ARCHITECTURES[0]: self.skipTest("testing a single arch for TensorrtExecutionProvider") @@ -1189,9 +1361,35 @@ def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): gc.collect() + @parameterized.expand( + grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["ROCMExecutionProvider"]}) + ) + @require_torch_gpu + @require_ort_rocm + @pytest.mark.rocm_ep_test + def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str): + provider = "ROCMExecutionProvider" + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + model_id = MODEL_NAMES[model_arch] + onnx_model = ORTModelForQuestionAnswering.from_pretrained(self.onnx_model_dirs[model_arch], provider=provider) + tokenizer = get_preprocessor(model_id) + pipe = pipeline("question-answering", model=onnx_model, tokenizer=tokenizer, device=0) + question = "Whats my name?" + context = "My Name is Philipp and I live in Nuremberg." + outputs = pipe(question, context) + # check model device + self.assertEqual(pipe.model.device.type.lower(), "cuda") + # compare model output class + self.assertGreaterEqual(outputs["score"], 0.0) + self.assertTrue(isinstance(outputs["answer"], str)) + + gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_torch_gpu - @pytest.mark.gpu_test + @pytest.mark.cuda_ep_test def test_compare_to_io_binding(self, model_arch): model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) @@ -1317,7 +1515,7 @@ def test_pipeline_model_is_none(self): @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_torch_gpu - @pytest.mark.gpu_test + @pytest.mark.cuda_ep_test def test_pipeline_on_gpu(self, model_arch): model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) @@ -1339,7 +1537,30 @@ def test_pipeline_on_gpu(self, model_arch): @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_torch_gpu - @pytest.mark.gpu_test + @require_ort_rocm + @pytest.mark.rocm_ep_test + def test_pipeline_on_rocm_ep(self, model_arch): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + model_id = MODEL_NAMES[model_arch] + onnx_model = ORTModelForMaskedLM.from_pretrained(self.onnx_model_dirs[model_arch]) + tokenizer = get_preprocessor(model_id) + MASK_TOKEN = tokenizer.mask_token + pipe = pipeline("fill-mask", model=onnx_model, tokenizer=tokenizer, device=0) + text = f"The capital of France is {MASK_TOKEN}." 
+ outputs = pipe(text) + # check model device + self.assertEqual(pipe.model.device.type.lower(), "cuda") + # compare model output class + self.assertGreaterEqual(outputs[0]["score"], 0.0) + self.assertTrue(isinstance(outputs[0]["token_str"], str)) + + gc.collect() + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @require_torch_gpu + @pytest.mark.cuda_ep_test def test_compare_to_io_binding(self, model_arch): model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) @@ -1477,7 +1698,8 @@ def test_pipeline_model_is_none(self): ) ) @require_torch_gpu - @pytest.mark.gpu_test + @pytest.mark.cuda_ep_test + @pytest.mark.trt_ep_test def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): if provider == "TensorrtExecutionProvider" and model_arch != self.__class__.SUPPORTED_ARCHITECTURES[0]: self.skipTest("testing a single arch for TensorrtExecutionProvider") @@ -1501,6 +1723,32 @@ def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): gc.collect() + @parameterized.expand( + grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["ROCMExecutionProvider"]}) + ) + @require_torch_gpu + @require_ort_rocm + @pytest.mark.rocm_ep_test + def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + model_id = MODEL_NAMES[model_arch] + onnx_model = ORTModelForSequenceClassification.from_pretrained( + self.onnx_model_dirs[model_arch], provider=provider + ) + tokenizer = get_preprocessor(model_id) + pipe = pipeline("text-classification", model=onnx_model, tokenizer=tokenizer, device=0) + text = "My Name is Philipp and i live in Germany." + outputs = pipe(text) + # check model device + self.assertEqual(pipe.model.device.type.lower(), "cuda") + # compare model output class + self.assertGreaterEqual(outputs[0]["score"], 0.0) + self.assertTrue(isinstance(outputs[0]["label"], str)) + + gc.collect() + def test_pipeline_zero_shot_classification(self): onnx_model = ORTModelForSequenceClassification.from_pretrained( "typeform/distilbert-base-uncased-mnli", export=True @@ -1520,7 +1768,7 @@ def test_pipeline_zero_shot_classification(self): @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_torch_gpu - @pytest.mark.gpu_test + @pytest.mark.cuda_ep_test def test_compare_to_io_binding(self, model_arch): model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) @@ -1648,7 +1896,8 @@ def test_pipeline_model_is_none(self): ) ) @require_torch_gpu - @pytest.mark.gpu_test + @pytest.mark.cuda_ep_test + @pytest.mark.trt_ep_test def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): if provider == "TensorrtExecutionProvider" and model_arch != self.__class__.SUPPORTED_ARCHITECTURES[0]: self.skipTest("testing a single arch for TensorrtExecutionProvider") @@ -1671,9 +1920,34 @@ def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): gc.collect() + @parameterized.expand( + grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["ROCMExecutionProvider"]}) + ) + @require_torch_gpu + @require_ort_rocm + @pytest.mark.rocm_ep_test + def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + model_id = MODEL_NAMES[model_arch] + onnx_model = ORTModelForTokenClassification.from_pretrained( + 
self.onnx_model_dirs[model_arch], provider=provider + ) + tokenizer = get_preprocessor(model_id) + pipe = pipeline("token-classification", model=onnx_model, tokenizer=tokenizer, device=0) + text = "My Name is Philipp and i live in Germany." + outputs = pipe(text) + # check model device + self.assertEqual(pipe.model.device.type.lower(), "cuda") + # compare model output class + self.assertTrue(all(item["score"] > 0.0 for item in outputs)) + + gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_torch_gpu - @pytest.mark.gpu_test + @pytest.mark.cuda_ep_test def test_compare_to_io_binding(self, model_arch): model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) @@ -1775,7 +2049,8 @@ def test_pipeline_model_is_none(self): ) ) @require_torch_gpu - @pytest.mark.gpu_test + @pytest.mark.cuda_ep_test + @pytest.mark.trt_ep_test def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): if provider == "TensorrtExecutionProvider" and model_arch != self.__class__.SUPPORTED_ARCHITECTURES[0]: self.skipTest("testing a single arch for TensorrtExecutionProvider") @@ -1796,9 +2071,32 @@ def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): gc.collect() + @parameterized.expand( + grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["ROCMExecutionProvider"]}) + ) + @require_torch_gpu + @require_ort_rocm + @pytest.mark.rocm_ep_test + def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + model_id = MODEL_NAMES[model_arch] + onnx_model = ORTModelForFeatureExtraction.from_pretrained(self.onnx_model_dirs[model_arch], provider=provider) + tokenizer = get_preprocessor(model_id) + pipe = pipeline("feature-extraction", model=onnx_model, tokenizer=tokenizer, device=0) + text = "My Name is Philipp and i live in Germany." 
+ outputs = pipe(text) + # check model device + self.assertEqual(pipe.model.device.type.lower(), "cuda") + # compare model output class + self.assertTrue(all(all(isinstance(item, float) for item in row) for row in outputs[0])) + + gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_torch_gpu - @pytest.mark.gpu_test + @pytest.mark.cuda_ep_test def test_compare_to_io_binding(self, model_arch): model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) @@ -1894,7 +2192,7 @@ def test_compare_to_transformers(self, model_arch): @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_torch_gpu - @pytest.mark.gpu_test + @pytest.mark.cuda_ep_test def test_compare_to_io_binding(self, model_arch): model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) @@ -2145,7 +2443,7 @@ def test_pipeline_model_is_none(self): @parameterized.expand(grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "use_cache": [True]})) @require_torch_gpu - @pytest.mark.gpu_test + @pytest.mark.cuda_ep_test def test_pipeline_on_gpu(self, test_name: str, model_arch: str, use_cache: bool): model_args = {"test_name": test_name, "model_arch": model_arch, "use_cache": use_cache} self._setup(model_args) @@ -2165,10 +2463,33 @@ def test_pipeline_on_gpu(self, test_name: str, model_arch: str, use_cache: bool) gc.collect() + @parameterized.expand(grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "use_cache": [True]})) + @require_torch_gpu + @require_ort_rocm + @pytest.mark.rocm_ep_test + def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, use_cache: bool): + model_args = {"test_name": test_name, "model_arch": model_arch, "use_cache": use_cache} + self._setup(model_args) + + model_id = MODEL_NAMES[model_arch] + onnx_model = ORTModelForCausalLM.from_pretrained(self.onnx_model_dirs[test_name]) + + tokenizer = get_preprocessor(model_id) + pipe = pipeline("text-generation", model=onnx_model, tokenizer=tokenizer, device=0) + text = "My Name is Philipp and i live" + outputs = pipe(text) + # check model device + self.assertEqual(pipe.model.device.type.lower(), "cuda") + # compare model output class + self.assertTrue(isinstance(outputs[0]["generated_text"], str)) + self.assertTrue(len(outputs[0]["generated_text"]) > len(text)) + + gc.collect() + # TRT EP compile time can be long, so we don't test all archs @parameterized.expand(grid_parameters({"model_arch": ["gpt2"], "use_cache": [True, False]})) @require_torch_gpu - @pytest.mark.gpu_test + @pytest.mark.trt_ep_test def test_pipeline_on_trt_execution_provider(self, test_name: str, model_arch: str, use_cache: bool): model_args = {"test_name": test_name, "model_arch": model_arch, "use_cache": use_cache} self._setup(model_args) @@ -2219,7 +2540,7 @@ def test_pipeline_on_trt_execution_provider(self, test_name: str, model_arch: st gc.collect() @parameterized.expand(SUPPORTED_ARCHITECTURES) - @pytest.mark.gpu_test # mark as GPU test as well to run the without/with cache timing test on the slow tests + @pytest.mark.cuda_ep_test # mark as GPU test as well to run the without/with cache timing test on the slow tests def test_compare_with_and_without_past_key_values(self, model_arch): model_args = {"test_name": model_arch + "_False", "model_arch": model_arch, "use_cache": False} self._setup(model_args) @@ -2307,7 +2628,7 @@ def test_compare_merged_and_not_merged_models_outputs(self, test_name: str, mode grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "use_cache": [True], "use_merged": [False, 
True]}) ) @require_torch_gpu - @pytest.mark.gpu_test + @pytest.mark.cuda_ep_test def test_compare_to_io_binding(self, test_name: str, model_arch: str, use_cache: bool, use_merged: bool): model_args = { "test_name": test_name, @@ -2348,7 +2669,7 @@ def test_compare_to_io_binding(self, test_name: str, model_arch: str, use_cache: @parameterized.expand(grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "use_cache": [True]})) @require_torch_gpu - @pytest.mark.gpu_test + @pytest.mark.cuda_ep_test def test_compare_generation_to_io_binding(self, test_name: str, model_arch: str, use_cache: bool): model_args = {"test_name": test_name, "model_arch": model_arch, "use_cache": use_cache} self._setup(model_args) @@ -2470,7 +2791,8 @@ def test_pipeline_model_is_none(self): ) ) @require_torch_gpu - @pytest.mark.gpu_test + @pytest.mark.cuda_ep_test + @pytest.mark.trt_ep_test def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): if provider == "TensorrtExecutionProvider" and model_arch != self.__class__.SUPPORTED_ARCHITECTURES[0]: self.skipTest("testing a single arch for TensorrtExecutionProvider") @@ -2495,9 +2817,36 @@ def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): gc.collect() + @parameterized.expand( + grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["ROCMExecutionProvider"]}) + ) + @require_torch_gpu + @require_ort_rocm + @pytest.mark.rocm_ep_test + def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + model_id = MODEL_NAMES[model_arch] + onnx_model = ORTModelForImageClassification.from_pretrained( + self.onnx_model_dirs[model_arch], provider=provider + ) + preprocessor = get_preprocessor(model_id) + pipe = pipeline("image-classification", model=onnx_model, feature_extractor=preprocessor, device=0) + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + outputs = pipe(url) + # check model device + self.assertEqual(pipe.model.device.type.lower(), "cuda") + + # compare model output class + self.assertGreaterEqual(outputs[0]["score"], 0.0) + self.assertTrue(isinstance(outputs[0]["label"], str)) + + gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_torch_gpu - @pytest.mark.gpu_test + @pytest.mark.cuda_ep_test def test_compare_to_io_binding(self, model_arch): model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) @@ -2607,7 +2956,8 @@ def test_pipeline_model_is_none(self): grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["CUDAExecutionProvider"]}) ) @require_torch_gpu - @pytest.mark.gpu_test + @pytest.mark.cuda_ep_test + @pytest.mark.trt_ep_test def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): if provider == "TensorrtExecutionProvider" and model_arch != self.__class__.SUPPORTED_ARCHITECTURES[0]: self.skipTest("testing a single arch for TensorrtExecutionProvider") @@ -2632,9 +2982,36 @@ def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): gc.collect() + @parameterized.expand( + grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["ROCMExecutionProvider"]}) + ) + @require_torch_gpu + @require_ort_rocm + @pytest.mark.rocm_ep_test + def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + model_id = MODEL_NAMES[model_arch] + 
onnx_model = ORTModelForSemanticSegmentation.from_pretrained( + self.onnx_model_dirs[model_arch], provider=provider + ) + preprocessor = get_preprocessor(model_id) + pipe = pipeline("image-segmentation", model=onnx_model, feature_extractor=preprocessor, device=0) + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + outputs = pipe(url) + # check model device + self.assertEqual(pipe.model.device.type.lower(), "cuda") + + # compare model output class + self.assertTrue(outputs[0]["mask"] is not None) + self.assertTrue(isinstance(outputs[0]["label"], str)) + + gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_torch_gpu - @pytest.mark.gpu_test + @pytest.mark.cuda_ep_test def test_compare_to_io_binding(self, model_arch): model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) @@ -2764,7 +3141,8 @@ def test_pipeline_model_is_none(self): ) ) @require_torch_gpu - @pytest.mark.gpu_test + @pytest.mark.cuda_ep_test + @pytest.mark.trt_ep_test def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): if provider == "TensorrtExecutionProvider" and model_arch != self.__class__.SUPPORTED_ARCHITECTURES[0]: self.skipTest("testing a single arch for TensorrtExecutionProvider") @@ -2788,9 +3166,35 @@ def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): gc.collect() + @parameterized.expand( + grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["ROCMExecutionProvider"]}) + ) + @require_torch_gpu + @require_ort_rocm + @pytest.mark.rocm_ep_test + def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + model_id = MODEL_NAMES[model_arch] + onnx_model = ORTModelForAudioClassification.from_pretrained( + self.onnx_model_dirs[model_arch], provider=provider + ) + processor = AutoFeatureExtractor.from_pretrained(model_id) + pipe = pipeline("audio-classification", model=onnx_model, feature_extractor=processor, device=0) + data = self._generate_random_audio_data() + outputs = pipe(data) + # check model device + self.assertEqual(pipe.model.device.type.lower(), "cuda") + # compare model output class + self.assertGreaterEqual(outputs[0]["score"], 0.0) + self.assertTrue(isinstance(outputs[0]["label"], str)) + + gc.collect() + @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_torch_gpu - @pytest.mark.gpu_test + @pytest.mark.cuda_ep_test def test_compare_to_io_binding(self, model_arch): model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) @@ -2944,7 +3348,7 @@ def test_compare_to_transformers(self, model_arch): @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_torch_gpu - @pytest.mark.gpu_test + @pytest.mark.cuda_ep_test def test_compare_to_io_binding(self, model_arch): model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) @@ -3338,7 +3742,7 @@ def test_pipeline_model_is_none(self): @parameterized.expand(grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "use_cache": [True]})) @require_torch_gpu - @pytest.mark.gpu_test + @pytest.mark.cuda_ep_test def test_pipeline_on_gpu(self, test_name: str, model_arch: str, use_cache: bool): model_args = {"test_name": test_name, "model_arch": model_arch, "use_cache": use_cache} self._setup(model_args) @@ -3374,10 +3778,49 @@ def test_pipeline_on_gpu(self, test_name: str, model_arch: str, use_cache: bool) 
self.assertTrue(isinstance(outputs[0]["translation_token_ids"], torch.Tensor)) self.assertTrue(len(outputs[0]["translation_token_ids"]) > len(text)) + @parameterized.expand(grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "use_cache": [True]})) + @require_torch_gpu + @require_ort_rocm + @pytest.mark.rocm_ep_test + def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, use_cache: bool): + model_args = {"test_name": test_name, "model_arch": model_arch, "use_cache": use_cache} + self._setup(model_args) + + model_ids = self._get_model_ids(model_arch) + for model_id in model_ids: + if ( + model_arch == "encoder-decoder" + and "text2text-generation-with-past" not in MODEL_NAMES[model_arch][model_id] + ): + # The model with use_cache=True is not supported for bert as a decoder" + continue + + onnx_model_dir = self._get_onnx_model_dir(model_id, model_arch, test_name) + onnx_model = ORTModelForSeq2SeqLM.from_pretrained(onnx_model_dir, use_cache=use_cache) + + tokenizer = get_preprocessor(model_id) + pipe = pipeline( + "translation_en_to_de", model=onnx_model, tokenizer=tokenizer, return_tensors=False, device=0 + ) + text = "My Name is Philipp and i live" + outputs = pipe(text, max_length=2 * len(text) + 1) + # check model device + self.assertEqual(pipe.model.device.type.lower(), "cuda") + # compare model output class + self.assertTrue(isinstance(outputs[0]["translation_text"], str)) + + pipe = pipeline( + "translation_en_to_de", model=onnx_model, tokenizer=tokenizer, return_tensors=True, device=0 + ) + + outputs = pipe(text, min_length=len(text) + 1, max_length=2 * len(text) + 1) + self.assertTrue(isinstance(outputs[0]["translation_token_ids"], torch.Tensor)) + self.assertTrue(len(outputs[0]["translation_token_ids"]) > len(text)) + # TRT EP compile time can be long, so we don't test all archs @parameterized.expand(grid_parameters({"model_arch": ["t5"], "use_cache": [True, False]})) @require_torch_gpu - @pytest.mark.gpu_test + @pytest.mark.trt_ep_test def test_pipeline_on_trt_execution_provider(self, test_name: str, model_arch: str, use_cache: bool): model_args = {"test_name": test_name, "model_arch": model_arch, "use_cache": use_cache} self._setup(model_args) @@ -3423,7 +3866,7 @@ def test_pipeline_on_trt_execution_provider(self, test_name: str, model_arch: st gc.collect() @parameterized.expand(SUPPORTED_ARCHITECTURES) - @pytest.mark.gpu_test # mark as GPU test as well to run the without/with cache timing test on the slow tests + @pytest.mark.cuda_ep_test # mark as GPU test as well to run the without/with cache timing test on the slow tests def test_compare_with_and_without_past_key_values(self, model_arch: str): if model_arch == "m2m_100": self.skipTest("m2m_100 comparison with/without pkv fail or is not supported") @@ -3527,7 +3970,7 @@ def test_compare_merged_and_not_merged_models_outputs(self, test_name: str, mode grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "use_cache": [True], "use_merged": [False, True]}) ) @require_torch_gpu - @pytest.mark.gpu_test + @pytest.mark.cuda_ep_test def test_compare_to_io_binding(self, test_name: str, model_arch: str, use_cache: bool, use_merged: bool): if use_cache is False and use_merged is True: self.skipTest("use_cache=False, use_merged=True are uncompatible") @@ -3816,7 +4259,7 @@ def test_pipeline_speech_recognition(self, test_name: str, model_arch: str, use_ @parameterized.expand(grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "use_cache": [True]})) @require_torch_gpu - @pytest.mark.gpu_test + @pytest.mark.cuda_ep_test 
def test_pipeline_on_gpu(self, test_name: str, model_arch: str, use_cache: bool): model_args = {"test_name": test_name, "model_arch": model_arch, "use_cache": use_cache} self._setup(model_args) @@ -3840,8 +4283,35 @@ def test_pipeline_on_gpu(self, test_name: str, model_arch: str, use_cache: bool) # compare model output class self.assertTrue(isinstance(outputs["text"], str)) + @parameterized.expand(grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "use_cache": [True]})) + @require_torch_gpu + @require_ort_rocm + @pytest.mark.rocm_ep_test + def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, use_cache: bool): + model_args = {"test_name": test_name, "model_arch": model_arch, "use_cache": use_cache} + self._setup(model_args) + + model_id = MODEL_NAMES[model_arch] + onnx_model = ORTModelForSpeechSeq2Seq.from_pretrained(self.onnx_model_dirs[test_name], use_cache=use_cache) + processor = get_preprocessor(model_id) + pipe = pipeline( + "automatic-speech-recognition", + model=onnx_model, + tokenizer=processor.tokenizer, + feature_extractor=processor.feature_extractor, + device=0, + ) + + data = self._generate_random_audio_data() + outputs = pipe(data) + + # check model device + self.assertEqual(pipe.model.device.type.lower(), "cuda") + # compare model output class + self.assertTrue(isinstance(outputs["text"], str)) + @parameterized.expand(SUPPORTED_ARCHITECTURES) - @pytest.mark.gpu_test # mark as GPU test as well to run the without/with cache timing test on the slow tests + @pytest.mark.cuda_ep_test # mark as GPU test as well to run the without/with cache timing test on the slow tests def test_compare_with_and_without_past_key_values(self, model_arch: str): model_args = {"test_name": model_arch + "_False", "model_arch": model_arch, "use_cache": False} self._setup(model_args) @@ -3941,7 +4411,7 @@ def test_compare_merged_and_not_merged_models_outputs(self, test_name: str, mode grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "use_cache": [True], "use_merged": [False, True]}) ) @require_torch_gpu - @pytest.mark.gpu_test + @pytest.mark.cuda_ep_test def test_compare_to_io_binding(self, test_name: str, model_arch: str, use_cache: bool, use_merged: bool): if use_cache is False and use_merged is True: self.skipTest("use_cache=False, use_merged=True are uncompatible") @@ -3988,7 +4458,7 @@ def test_compare_to_io_binding(self, test_name: str, model_arch: str, use_cache: grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "use_cache": [True], "use_merged": [False, True]}) ) @require_torch_gpu - @pytest.mark.gpu_test + @pytest.mark.cuda_ep_test def test_compare_generation_to_io_binding( self, test_name: str, model_arch: str, use_cache: bool, use_merged: bool ): @@ -4196,7 +4666,7 @@ def test_pipeline_image_to_text(self, test_name: str, model_arch: str, use_cache @parameterized.expand(grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "use_cache": [True]})) @require_torch_gpu - @pytest.mark.gpu_test + @pytest.mark.cuda_ep_test def test_pipeline_on_gpu(self, test_name: str, model_arch: str, use_cache: bool): model_args = { "test_name": test_name, @@ -4226,8 +4696,41 @@ def test_pipeline_on_gpu(self, test_name: str, model_arch: str, use_cache: bool) # compare model output class self.assertTrue(isinstance(outputs[0]["generated_text"], str)) + @parameterized.expand(grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "use_cache": [True]})) + @require_torch_gpu + @require_ort_rocm + @pytest.mark.rocm_ep_test + def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: 
+        model_args = {
+            "test_name": test_name,
+            "model_arch": model_arch,
+            "use_cache": use_cache,
+        }
+        self._setup(model_args)
+
+        model_id = MODEL_NAMES[model_arch]
+        onnx_model = ORTModelForVision2Seq.from_pretrained(
+            self.onnx_model_dirs[test_name], use_cache=use_cache, use_io_binding=False
+        )
+        feature_extractor, tokenizer = self._get_preprocessors(model_id)
+        pipe = pipeline(
+            "image-to-text",
+            model=onnx_model,
+            tokenizer=tokenizer,
+            feature_extractor=feature_extractor,
+            device=0,
+        )
+
+        data = self._get_sample_image()
+        outputs = pipe(data)
+
+        # check model device
+        self.assertEqual(pipe.model.device.type.lower(), "cuda")
+        # compare model output class
+        self.assertTrue(isinstance(outputs[0]["generated_text"], str))
+
     @parameterized.expand(SUPPORTED_ARCHITECTURES[:1])
-    @pytest.mark.gpu_test  # mark as GPU test as well to run the without/with cache timing test on the slow tests
+    @pytest.mark.cuda_ep_test  # mark as GPU test as well to run the without/with cache timing test on the slow tests
     def test_compare_with_and_without_past_key_values(self, model_arch: str):
         model_args = {"test_name": model_arch + "_False", "model_arch": model_arch, "use_cache": False}
         self._setup(model_args)
@@ -4273,7 +4776,7 @@ def test_compare_with_and_without_past_key_values(self, model_arch: str):
         grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "use_cache": [True], "use_merged": [False, True]})
     )
     @require_torch_gpu
-    @pytest.mark.gpu_test
+    @pytest.mark.cuda_ep_test
     def test_compare_to_io_binding(self, test_name: str, model_arch: str, use_cache: bool, use_merged: bool):
         if use_cache is False and use_merged is True:
             self.skipTest("use_cache=False, use_merged=True are uncompatible")
@@ -4320,7 +4823,7 @@ def test_compare_to_io_binding(self, test_name: str, model_arch: str, use_cache:
         grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "use_cache": [True], "use_merged": [False, True]})
     )
     @require_torch_gpu
-    @pytest.mark.gpu_test
+    @pytest.mark.cuda_ep_test
     def test_compare_generation_to_io_binding(
         self, test_name: str, model_arch: str, use_cache: bool, use_merged: bool
     ):
@@ -4387,7 +4890,7 @@ def test_pipeline_ort_model(self, *args, **kwargs):
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_MODEL_ID.items())
     @require_torch_gpu
-    @pytest.mark.gpu_test
+    @pytest.mark.cuda_ep_test
     def test_pipeline_on_gpu(self, *args, **kwargs):
         model_arch, model_id = args
         onnx_model = ORTModelForCustomTasks.from_pretrained(model_id)
@@ -4400,6 +4903,22 @@ def test_pipeline_on_gpu(self, *args, **kwargs):
         # compare model output class
         self.assertTrue(any(any(isinstance(item, float) for item in row) for row in outputs[0]))
 
+    @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_MODEL_ID.items())
+    @require_torch_gpu
+    @require_ort_rocm
+    @pytest.mark.rocm_ep_test
+    def test_pipeline_on_rocm_ep(self, *args, **kwargs):
+        model_arch, model_id = args
+        onnx_model = ORTModelForCustomTasks.from_pretrained(model_id)
+        tokenizer = get_preprocessor(model_id)
+        pipe = pipeline("feature-extraction", model=onnx_model, tokenizer=tokenizer, device=0)
+        text = "My Name is Philipp and i live in Germany."
+        outputs = pipe(text)
+        # check model device
+        self.assertEqual(pipe.model.device.type.lower(), "cuda")
+        # compare model output class
+        self.assertTrue(any(any(isinstance(item, float) for item in row) for row in outputs[0]))
+
     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_MODEL_ID.items())
     def test_default_pipeline_and_model_device(self, *args, **kwargs):
         model_arch, model_id = args
@@ -4410,7 +4929,7 @@ def test_default_pipeline_and_model_device(self, *args, **kwargs):
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_MODEL_ID.items())
     @require_torch_gpu
-    @pytest.mark.gpu_test
+    @pytest.mark.cuda_ep_test
     def test_compare_to_io_binding(self, *args, **kwargs):
         model_arch, model_id = args
         set_seed(SEED)
@@ -4577,7 +5096,7 @@ def test_compare_to_transformers(self, test_name: str, model_arch: str, use_cach
         gc.collect()
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
-    @pytest.mark.gpu_test  # mark as GPU test as well to run the without/with cache timing test on the slow tests
+    @pytest.mark.cuda_ep_test  # mark as GPU test as well to run the without/with cache timing test on the slow tests
     def test_compare_with_and_without_past_key_values(self, model_arch: str):
         if model_arch == "m2m_100":
             return  # TODO: this test is failing for m2m_100
@@ -4671,7 +5190,7 @@ def test_compare_merged_and_not_merged_models_outputs(self, test_name: str, mode
     @parameterized.expand(
         grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "use_cache": [True], "use_merged": [False, True]})
     )
-    @pytest.mark.gpu_test
+    @pytest.mark.cuda_ep_test
     def test_compare_to_io_binding(self, test_name: str, model_arch: str, use_cache: bool, use_merged: bool):
         if use_cache is False and use_merged is True:
             self.skipTest("use_cache=False, use_merged=True are uncompatible")
diff --git a/tests/onnxruntime/test_optimization.py b/tests/onnxruntime/test_optimization.py
index 6bc8d49fed..e0072b2929 100644
--- a/tests/onnxruntime/test_optimization.py
+++ b/tests/onnxruntime/test_optimization.py
@@ -340,7 +340,7 @@ def test_optimization_levels_cpu(self, test_name: str, model_arch: str, use_cach
         )
     )
     @require_torch_gpu
-    @pytest.mark.gpu_test
+    @pytest.mark.cuda_ep_test
     def test_optimization_levels_gpu(self, test_name: str, model_arch: str, use_cache: bool, optimization_level: str):
         for use_io_binding in [False, True]:
             # TODO: investigate why marian with IO Binding fails
@@ -457,7 +457,7 @@ def test_optimization_levels_cpu(self, test_name: str, model_arch: str, use_cach
         )
     )
     @require_torch_gpu
-    @pytest.mark.gpu_test
+    @pytest.mark.cuda_ep_test
     def test_optimization_levels_gpu(
         self, test_name: str, model_arch: str, use_cache: bool, use_io_binding: bool, optimization_level: str
     ):
@@ -570,7 +570,7 @@ def test_optimization_levels_cpu(
         grid_parameters({**FULL_GRID, "use_cache": [True], "optimization_level": ["O1", "O2", "O3", "O4"]})
     )
     @require_torch_gpu
-    @pytest.mark.gpu_test
+    @pytest.mark.cuda_ep_test
     def test_optimization_levels_gpu(
         self, test_name: str, model_arch: str, use_merged: bool, use_cache: bool, optimization_level: str
     ):
diff --git a/tests/onnxruntime/test_stable_diffusion_pipeline.py b/tests/onnxruntime/test_stable_diffusion_pipeline.py
index 0f166af290..0e56b22f71 100644
--- a/tests/onnxruntime/test_stable_diffusion_pipeline.py
+++ b/tests/onnxruntime/test_stable_diffusion_pipeline.py
@@ -48,7 +48,7 @@
 )
 from optimum.pipelines.diffusers.pipeline_utils import VaeImageProcessor
 from optimum.utils.import_utils import _diffusers_version
-from optimum.utils.testing_utils import grid_parameters, require_diffusers
+from optimum.utils.testing_utils import grid_parameters, require_diffusers, require_ort_rocm
 
 
 if parse(_diffusers_version) > Version("0.21.4"):
@@ -124,7 +124,7 @@ def test_num_images_per_prompt(self, model_arch: str):
         grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["CUDAExecutionProvider"]})
     )
     @require_torch_gpu
-    @pytest.mark.gpu_test
+    @pytest.mark.cuda_ep_test
     @require_diffusers
     def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str):
         model_args = {"test_name": test_name, "model_arch": model_arch}
@@ -139,6 +139,26 @@ def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str):
         self.assertIsInstance(outputs, np.ndarray)
         self.assertEqual(outputs.shape, (batch_size, height, width, 3))
 
+    @parameterized.expand(
+        grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["ROCMExecutionProvider"]})
+    )
+    @require_torch_gpu
+    @require_ort_rocm
+    @pytest.mark.rocm_ep_test
+    @require_diffusers
+    def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str):
+        model_args = {"test_name": test_name, "model_arch": model_arch}
+        self._setup(model_args)
+        pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider)
+        height, width, batch_size = 32, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+        outputs = pipeline(**inputs).images
+        # Verify model devices
+        self.assertEqual(pipeline.device.type.lower(), "cuda")
+        # Verify model outputs
+        self.assertIsInstance(outputs, np.ndarray)
+        self.assertEqual(outputs.shape, (batch_size, height, width, 3))
+
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     @require_diffusers
     def test_callback(self, model_arch: str):
diff --git a/tests/onnxruntime/test_utils.py b/tests/onnxruntime/test_utils.py
index c3dd4c0561..2e30851618 100644
--- a/tests/onnxruntime/test_utils.py
+++ b/tests/onnxruntime/test_utils.py
@@ -1,6 +1,7 @@
 import tempfile
 import unittest
 
+import onnxruntime as ort
 import torch
 
 from optimum.onnxruntime.configuration import AutoQuantizationConfig, OptimizationConfig, ORTConfig
@@ -16,7 +17,11 @@ def test_get_device_for_provider(self):
 
     def test_get_provider_for_device(self):
         self.assertEqual(get_provider_for_device(torch.device("cpu")), "CPUExecutionProvider")
-        self.assertEqual(get_provider_for_device(torch.device("cuda")), "CUDAExecutionProvider")
+
+        if "ROCMExecutionProvider" in ort.get_available_providers():
+            self.assertEqual(get_provider_for_device(torch.device("cuda")), "ROCMExecutionProvider")
+        else:
+            self.assertEqual(get_provider_for_device(torch.device("cuda")), "CUDAExecutionProvider")
 
 
 class ORTConfigTest(unittest.TestCase):