From 9eb0c1de2fb9b56bb4033d8070882cabac6da706 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Sat, 2 Sep 2023 17:17:25 +0000 Subject: [PATCH 1/4] add model --- optimum/onnxruntime/modeling_seq2seq.py | 5 ++++ tests/exporters/exporters_utils.py | 30 ++++++++++++-------- tests/onnxruntime/test_modeling.py | 9 +++--- tests/onnxruntime/utils_onnxruntime_tests.py | 3 +- 4 files changed, 29 insertions(+), 18 deletions(-) diff --git a/optimum/onnxruntime/modeling_seq2seq.py b/optimum/onnxruntime/modeling_seq2seq.py index a6528e3ef4..64bc514cb7 100644 --- a/optimum/onnxruntime/modeling_seq2seq.py +++ b/optimum/onnxruntime/modeling_seq2seq.py @@ -1130,6 +1130,11 @@ def __init__( config.decoder.model_type )(config.decoder) + if self.decoder_with_past is not None: + self.decoder_with_past.normalized_config = NormalizedConfigManager.get_normalized_config_class( + config.decoder.model_type + )(config.decoder) + def _initialize_encoder(self, session: ort.InferenceSession) -> ORTEncoder: return ORTEncoder(session, self) diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py index 000dd59241..9737bb87ac 100644 --- a/tests/exporters/exporters_utils.py +++ b/tests/exporters/exporters_utils.py @@ -55,7 +55,13 @@ "detr": "hf-internal-testing/tiny-random-DetrModel", # hf-internal-testing/tiny-random-detr is larger "distilbert": "hf-internal-testing/tiny-random-DistilBertModel", "electra": "hf-internal-testing/tiny-random-ElectraModel", - "encoder-decoder": "hf-internal-testing/tiny-random-EncoderDecoderModel-bert-bert", + "encoder-decoder": { + "hf-internal-testing/tiny-random-EncoderDecoderModel-bert-bert": [ + "text2text-generation", + "text2text-generation-with-past", + ], + "mohitsha/tiny-random-testing-bert2gpt2": ["text2text-generation", "text2text-generation-with-past"], + }, "flaubert": "hf-internal-testing/tiny-random-flaubert", "gpt2": "hf-internal-testing/tiny-random-gpt2", "gpt-bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel", @@ -136,17 +142,17 @@ "speech-to-text": "hf-internal-testing/tiny-random-Speech2TextModel", "xlm": "hf-internal-testing/tiny-random-XLMModel", "xlm-roberta": "hf-internal-testing/tiny-xlm-roberta", - "vision-encoder-decoder": { - "hf-internal-testing/tiny-random-VisionEncoderDecoderModel-vit-gpt2": [ - "image-to-text", - "image-to-text-with-past", - ], - "microsoft/trocr-small-handwritten": ["image-to-text"], - "fxmarty/tiny-doc-qa-vision-encoder-decoder": [ - "document-question-answering", - "document-question-answering-with-past", - ], - }, + # "vision-encoder-decoder": { + # "hf-internal-testing/tiny-random-VisionEncoderDecoderModel-vit-gpt2": [ + # "image-to-text", + # "image-to-text-with-past", + # ], + # "microsoft/trocr-small-handwritten": ["image-to-text"], + # "fxmarty/tiny-doc-qa-vision-encoder-decoder": [ + # "document-question-answering", + # "document-question-answering-with-past", + # ], + # }, } diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index dcf3a32dfc..0779560864 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -3475,8 +3475,6 @@ def test_compare_merged_and_not_merged_models_outputs(self, test_name: str, mode @require_torch_gpu @pytest.mark.gpu_test def test_compare_to_io_binding(self, test_name: str, model_arch: str, use_cache: bool, use_merged: bool): - if model_arch == "encoder-decoder": - use_cache = False if use_cache is False and use_merged is True: self.skipTest("use_cache=False, use_merged=True are uncompatible") @@ -3503,6 
+3501,9 @@ def test_compare_to_io_binding(self, test_name: str, model_arch: str, use_cache: tokenizer = get_preprocessor(model_id) tokens = tokenizer(["This is a sample output"] * 2, return_tensors="pt").to("cuda") decoder_start_token_id = onnx_model.config.decoder_start_token_id if model_arch != "mbart" else 2 + if model_arch == "encoder-decoder": + decoder_start_token_id = tokenizer.cls_token_id + decoder_inputs = {"decoder_input_ids": torch.ones((2, 1), dtype=torch.long) * decoder_start_token_id} onnx_outputs = onnx_model(**tokens, **decoder_inputs) @@ -3523,8 +3524,6 @@ def test_compare_to_io_binding(self, test_name: str, model_arch: str, use_cache: def test_compare_generation_to_io_binding( self, test_name: str, model_arch: str, use_cache: bool, use_merged: bool ): - if model_arch == "encoder-decoder": - use_cache = False if use_cache is False and use_merged is True: self.skipTest("use_cache=False, use_merged=True are uncompatible") @@ -3958,7 +3957,7 @@ def test_compare_generation_to_io_binding( class ORTModelForVision2SeqIntegrationTest(ORTModelTestMixin): # TODO: speech_to_text should be tested - SUPPORTED_ARCHITECTURES = ["vision-encoder-decoder", "trocr"] + SUPPORTED_ARCHITECTURES = ["trocr"] FULL_GRID = { "model_arch": SUPPORTED_ARCHITECTURES, diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py index fdcd565e9c..6e2dc16ae7 100644 --- a/tests/onnxruntime/utils_onnxruntime_tests.py +++ b/tests/onnxruntime/utils_onnxruntime_tests.py @@ -50,7 +50,8 @@ "detr": "hf-internal-testing/tiny-random-detr", "distilbert": "hf-internal-testing/tiny-random-DistilBertModel", "electra": "hf-internal-testing/tiny-random-ElectraModel", - "encoder-decoder": "hf-internal-testing/tiny-random-EncoderDecoderModel-bert-bert", + # "encoder-decoder": "hf-internal-testing/tiny-random-EncoderDecoderModel-bert-bert", + "encoder-decoder": "mohitsha/tiny-random-testing-bert2gpt2", "flaubert": "hf-internal-testing/tiny-random-flaubert", "gpt2": "hf-internal-testing/tiny-random-gpt2", "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel", From 5dfb62054fd6f6e8ae8556e03d2e71a3b0bcd9f9 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Sun, 3 Sep 2023 18:47:29 +0000 Subject: [PATCH 2/4] fix tests --- tests/exporters/exporters_utils.py | 1 - tests/onnxruntime/test_modeling.py | 522 +++++++++++-------- tests/onnxruntime/utils_onnxruntime_tests.py | 56 +- 3 files changed, 352 insertions(+), 227 deletions(-) diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py index 9737bb87ac..12483303f8 100644 --- a/tests/exporters/exporters_utils.py +++ b/tests/exporters/exporters_utils.py @@ -58,7 +58,6 @@ "encoder-decoder": { "hf-internal-testing/tiny-random-EncoderDecoderModel-bert-bert": [ "text2text-generation", - "text2text-generation-with-past", ], "mohitsha/tiny-random-testing-bert2gpt2": ["text2text-generation", "text2text-generation-with-past"], }, diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index 0779560864..c4fd279fc9 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -3082,6 +3082,25 @@ class ORTModelForSeq2SeqLMIntegrationTest(ORTModelTestMixin): GENERATION_LENGTH = 100 SPEEDUP_CACHE = 1.1 + def _get_model_ids(self, model_arch): + model_ids = MODEL_NAMES[model_arch] + if isinstance(model_ids, dict): + model_ids = list(model_ids.keys()) + else: + model_ids = [model_ids] + return model_ids + + def _get_onnx_model_dir(self, model_id, model_arch, 
test_name): + onnx_model_dir = self.onnx_model_dirs[test_name] + if isinstance(MODEL_NAMES[model_arch], dict): + onnx_model_dir = onnx_model_dir[model_id] + + return onnx_model_dir + + def _load_model(self, model_id, model_arch, test_name, use_cache): + onnx_model_dir = self._get_onnx_model_dir(model_id, model_arch, test_name) + return ORTModelForSeq2SeqLM.from_pretrained(onnx_model_dir, use_cache=use_cache) + def test_inference_old_onnx_model(self): model = ORTModelForSeq2SeqLM.from_pretrained("optimum/t5-small") @@ -3099,84 +3118,98 @@ def test_load_vanilla_transformers_which_is_not_supported(self): @parameterized.expand(grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "use_cache": [True]})) def test_generate_utils(self, test_name: str, model_arch: str, use_cache: str): - if model_arch == "encoder-decoder" and use_cache is True: - self.skipTest("encoder-decoder model type with use_cache=True is not supported for bert as a decoder") model_args = {"test_name": test_name, "model_arch": model_arch, "use_cache": use_cache} self._setup(model_args) - model_id = MODEL_NAMES[model_arch] - model = ORTModelForSeq2SeqLM.from_pretrained(self.onnx_model_dirs[test_name], use_cache=use_cache) - tokenizer = get_preprocessor(model_id) - text = "This is a sample output" - tokens = tokenizer(text, return_tensors="pt") + model_ids = self._get_model_ids(model_arch) + for model_id in model_ids: + if ( + model_arch == "encoder-decoder" + and use_cache is True + and "text2text-generation-with-past" not in MODEL_NAMES[model_arch][model_id] + ): + continue - # General case - outputs = model.generate(**tokens) - res = tokenizer.batch_decode(outputs, skip_special_tokens=True) - self.assertIsInstance(res[0], str) + model = self._load_model(model_id, model_arch, test_name, use_cache) + tokenizer = get_preprocessor(model_id) + text = "This is a sample output" + tokens = tokenizer(text, return_tensors="pt") - # With input ids - outputs = model.generate(input_ids=tokens["input_ids"]) - res = tokenizer.batch_decode(outputs, skip_special_tokens=True) - self.assertIsInstance(res[0], str) + # General case + outputs = model.generate(**tokens) + res = tokenizer.batch_decode(outputs, skip_special_tokens=True) + self.assertIsInstance(res[0], str) + + # With input ids + outputs = model.generate(input_ids=tokens["input_ids"]) + res = tokenizer.batch_decode(outputs, skip_special_tokens=True) + self.assertIsInstance(res[0], str) gc.collect() @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_merge_from_transformers_and_save(self, model_arch): - if model_arch == "encoder-decoder": - self.skipTest("encoder-decoder model type with use_merged=True is not supported for bert as a decoder") - if "text2text-generation-with-past" not in TasksManager.get_supported_tasks_for_model_type( model_arch.replace("_", "-"), exporter="onnx" ): self.skipTest("Unsupported -with-past export case") - model_id = MODEL_NAMES[model_arch] - model = ORTModelForSeq2SeqLM.from_pretrained(model_id, export=True, use_merged=True) - with tempfile.TemporaryDirectory() as tmpdir: - model.save_pretrained(tmpdir) - save_path = os.path.join(tmpdir, ONNX_DECODER_MERGED_NAME) - self.assertTrue(has_onnx_input(save_path, "use_cache_branch")) + model_ids = self._get_model_ids(model_arch) + for model_id in model_ids: + if ( + model_arch == "encoder-decoder" + and "text2text-generation-with-past" not in MODEL_NAMES[model_arch][model_id] + ): + # The model with use_merged=True is not supported for bert as a decoder") + continue - folder_contents = os.listdir(tmpdir) - 
self.assertTrue(ONNX_ENCODER_NAME in folder_contents) - self.assertTrue(ONNX_DECODER_NAME not in folder_contents) - self.assertTrue(ONNX_DECODER_WITH_PAST_NAME not in folder_contents) + model = ORTModelForSeq2SeqLM.from_pretrained(model_id, export=True, use_merged=True) + + with tempfile.TemporaryDirectory() as tmpdir: + model.save_pretrained(tmpdir) + save_path = os.path.join(tmpdir, ONNX_DECODER_MERGED_NAME) + self.assertTrue(has_onnx_input(save_path, "use_cache_branch")) + + folder_contents = os.listdir(tmpdir) + self.assertTrue(ONNX_ENCODER_NAME in folder_contents) + self.assertTrue(ONNX_DECODER_NAME not in folder_contents) + self.assertTrue(ONNX_DECODER_WITH_PAST_NAME not in folder_contents) @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_merge_from_onnx_and_save(self, model_arch): - if model_arch == "encoder-decoder": - self.skipTest("encoder-decoder model type with use_merged=True is not supported for bert as a decoder") - - model_id = MODEL_NAMES[model_arch] task = "text2text-generation-with-past" if task not in TasksManager.get_supported_tasks_for_model_type(model_arch.replace("_", "-"), exporter="onnx"): self.skipTest("Unsupported export case") - with tempfile.TemporaryDirectory() as tmpdir: - main_export(model_id, tmpdir, task=task) + model_ids = self._get_model_ids(model_arch) + for model_id in model_ids: + if ( + model_arch == "encoder-decoder" + and "text2text-generation-with-past" not in MODEL_NAMES[model_arch][model_id] + ): + # The model with use_merged=True is not supported for bert as a decoder") + continue - model = ORTModelForSeq2SeqLM.from_pretrained(tmpdir) + with tempfile.TemporaryDirectory() as tmpdir: + main_export(model_id, tmpdir, task=task) - self.assertTrue(model.use_merged) - self.assertTrue(model.decoder_with_past is None) + model = ORTModelForSeq2SeqLM.from_pretrained(tmpdir) - model.save_pretrained(tmpdir + "_save") - save_path = os.path.join(tmpdir + "_save", ONNX_DECODER_MERGED_NAME) - self.assertTrue(has_onnx_input(save_path, "use_cache_branch")) + self.assertTrue(model.use_merged) + self.assertTrue(model.decoder_with_past is None) - folder_contents = os.listdir(tmpdir + "_save") - self.assertTrue(ONNX_ENCODER_NAME in folder_contents) - self.assertFalse(ONNX_DECODER_NAME in folder_contents) - self.assertFalse(ONNX_DECODER_WITH_PAST_NAME in folder_contents) + model.save_pretrained(tmpdir + "_save") + save_path = os.path.join(tmpdir + "_save", ONNX_DECODER_MERGED_NAME) + self.assertTrue(has_onnx_input(save_path, "use_cache_branch")) + + folder_contents = os.listdir(tmpdir + "_save") + self.assertTrue(ONNX_ENCODER_NAME in folder_contents) + self.assertFalse(ONNX_DECODER_NAME in folder_contents) + self.assertFalse(ONNX_DECODER_WITH_PAST_NAME in folder_contents) @parameterized.expand(grid_parameters(FULL_GRID)) def test_compare_to_transformers(self, test_name: str, model_arch: str, use_cache: bool, use_merged: bool): - if model_arch == "encoder-decoder" and use_cache is True: - self.skipTest("encoder-decoder model type with use_cache=True is not supported for bert as a decoder") - if use_cache is False and use_merged is True: self.skipTest("use_cache=False, use_merged=True are uncompatible") @@ -3189,61 +3222,71 @@ def test_compare_to_transformers(self, test_name: str, model_arch: str, use_cach self._setup(model_args) - model_id = MODEL_NAMES[model_arch] - onnx_model = ORTModelForSeq2SeqLM.from_pretrained(self.onnx_model_dirs[test_name], use_cache=use_cache) - - self.assertIsInstance(onnx_model.encoder, ORTEncoder) - if use_merged is False: - 
model_path = Path(self.onnx_model_dirs[test_name], ONNX_DECODER_NAME) - self.assertFalse(has_onnx_input(model_path, "use_cache_branch")) - self.assertEqual(onnx_model.use_merged, False) - else: - model_path = Path(self.onnx_model_dirs[test_name], ONNX_DECODER_MERGED_NAME) - self.assertTrue(has_onnx_input(model_path, "use_cache_branch")) - self.assertEqual(onnx_model.use_merged, True) - - self.assertIsInstance(onnx_model.decoder, ORTDecoderForSeq2Seq) - if onnx_model.use_cache is True and onnx_model.use_merged is False: - self.assertIsInstance(onnx_model.decoder_with_past, ORTDecoderForSeq2Seq) - if onnx_model.use_cache is True and onnx_model.use_merged is True: - self.assertTrue(onnx_model.decoder_with_past is None) - - self.assertIsInstance(onnx_model.config, PretrainedConfig) - - set_seed(SEED) - transformers_model = AutoModelForSeq2SeqLM.from_pretrained(model_id) - tokenizer = get_preprocessor(model_id) - tokens = tokenizer("This is a sample output", return_tensors="pt") - decoder_start_token_id = transformers_model.config.decoder_start_token_id if model_arch != "mbart" else 2 - if model_arch == "encoder-decoder": - decoder_start_token_id = tokenizer.cls_token_id + model_ids = self._get_model_ids(model_arch) + for model_id in model_ids: + if ( + model_arch == "encoder-decoder" + and use_cache is True + and "text2text-generation-with-past" not in MODEL_NAMES[model_arch][model_id] + ): + # The model with use_cache=True is not supported for bert as a decoder") + continue + + onnx_model_dir = self._get_onnx_model_dir(model_id, model_arch, test_name) + + onnx_model = self._load_model(model_id, model_arch, test_name, use_cache) + + self.assertIsInstance(onnx_model.encoder, ORTEncoder) + if use_merged is False: + model_path = Path(onnx_model_dir, ONNX_DECODER_NAME) + self.assertFalse(has_onnx_input(model_path, "use_cache_branch")) + self.assertEqual(onnx_model.use_merged, False) + else: + model_path = Path(onnx_model_dir, ONNX_DECODER_MERGED_NAME) + self.assertTrue(has_onnx_input(model_path, "use_cache_branch")) + self.assertEqual(onnx_model.use_merged, True) + + self.assertIsInstance(onnx_model.decoder, ORTDecoderForSeq2Seq) + if onnx_model.use_cache is True and onnx_model.use_merged is False: + self.assertIsInstance(onnx_model.decoder_with_past, ORTDecoderForSeq2Seq) + if onnx_model.use_cache is True and onnx_model.use_merged is True: + self.assertTrue(onnx_model.decoder_with_past is None) + + self.assertIsInstance(onnx_model.config, PretrainedConfig) + + set_seed(SEED) + transformers_model = AutoModelForSeq2SeqLM.from_pretrained(model_id) + tokenizer = get_preprocessor(model_id) + tokens = tokenizer("This is a sample output", return_tensors="pt") + decoder_start_token_id = transformers_model.config.decoder_start_token_id if model_arch != "mbart" else 2 + if model_arch == "encoder-decoder": + decoder_start_token_id = tokenizer.cls_token_id - decoder_inputs = {"decoder_input_ids": torch.ones((1, 1), dtype=torch.long) * decoder_start_token_id} + decoder_inputs = {"decoder_input_ids": torch.ones((1, 1), dtype=torch.long) * decoder_start_token_id} - with torch.no_grad(): - transformers_outputs = transformers_model(**tokens, **decoder_inputs) + with torch.no_grad(): + transformers_outputs = transformers_model(**tokens, **decoder_inputs) - for input_type in ["pt", "np"]: - tokens = tokenizer("This is a sample output", return_tensors=input_type) + for input_type in ["pt", "np"]: + tokens = tokenizer("This is a sample output", return_tensors=input_type) - if input_type == "np": - decoder_inputs = 
{"decoder_input_ids": np.ones((1, 1), dtype=np.int64) * decoder_start_token_id} + if input_type == "np": + decoder_inputs = {"decoder_input_ids": np.ones((1, 1), dtype=np.int64) * decoder_start_token_id} - onnx_outputs = onnx_model(**tokens, **decoder_inputs) + onnx_outputs = onnx_model(**tokens, **decoder_inputs) - self.assertTrue("logits" in onnx_outputs) - self.assertIsInstance(onnx_outputs.logits, self.TENSOR_ALIAS_TO_TYPE[input_type]) + self.assertTrue("logits" in onnx_outputs) + self.assertIsInstance(onnx_outputs.logits, self.TENSOR_ALIAS_TO_TYPE[input_type]) - # Compare tensor outputs - self.assertTrue(torch.allclose(torch.Tensor(onnx_outputs.logits), transformers_outputs.logits, atol=1e-4)) + # Compare tensor outputs + self.assertTrue( + torch.allclose(torch.Tensor(onnx_outputs.logits), transformers_outputs.logits, atol=1e-4) + ) gc.collect() @parameterized.expand(grid_parameters(FULL_GRID)) def test_pipeline_text_generation(self, test_name: str, model_arch: str, use_cache: bool, use_merged: bool): - if model_arch == "encoder-decoder" and use_cache is True: - self.skipTest("encoder-decoder model type with use_cache=True is not supported for bert as a decoder") - if use_cache is False and use_merged is True: self.skipTest("use_cache=False, use_merged=True are uncompatible") @@ -3256,34 +3299,44 @@ def test_pipeline_text_generation(self, test_name: str, model_arch: str, use_cac self._setup(model_args) - model_id = MODEL_NAMES[model_arch] - onnx_model = ORTModelForSeq2SeqLM.from_pretrained(self.onnx_model_dirs[test_name], use_cache=use_cache) - tokenizer = get_preprocessor(model_id) + model_ids = self._get_model_ids(model_arch) + for model_id in model_ids: + if ( + model_arch == "encoder-decoder" + and use_cache is True + and "text2text-generation-with-past" not in MODEL_NAMES[model_arch][model_id] + ): + # The model with use_cache=True is not supported for bert as a decoder") + continue - decoder_start_token_id = onnx_model.config.decoder_start_token_id if model_arch != "mbart" else 2 - if model_arch == "encoder-decoder": - decoder_start_token_id = tokenizer.cls_token_id + onnx_model = self._load_model(model_id, model_arch, test_name, use_cache) - # Text2Text generation - pipe = pipeline("text2text-generation", model=onnx_model, tokenizer=tokenizer) - text = "This is a test" - outputs = pipe(text, decoder_start_token_id=decoder_start_token_id) - self.assertEqual(pipe.device, onnx_model.device) - self.assertIsInstance(outputs[0]["generated_text"], str) - - # Summarization - pipe = pipeline("summarization", model=onnx_model, tokenizer=tokenizer) - text = "This is a test" - outputs = pipe(text, decoder_start_token_id=decoder_start_token_id) - self.assertEqual(pipe.device, onnx_model.device) - self.assertIsInstance(outputs[0]["summary_text"], str) + tokenizer = get_preprocessor(model_id) - # Translation - pipe = pipeline("translation_en_to_de", model=onnx_model, tokenizer=tokenizer) - text = "This is a test" - outputs = pipe(text, decoder_start_token_id=decoder_start_token_id) - self.assertEqual(pipe.device, onnx_model.device) - self.assertIsInstance(outputs[0]["translation_text"], str) + decoder_start_token_id = onnx_model.config.decoder_start_token_id if model_arch != "mbart" else 2 + if model_arch == "encoder-decoder": + decoder_start_token_id = tokenizer.cls_token_id + + # Text2Text generation + pipe = pipeline("text2text-generation", model=onnx_model, tokenizer=tokenizer) + text = "This is a test" + outputs = pipe(text, decoder_start_token_id=decoder_start_token_id) + 
self.assertEqual(pipe.device, onnx_model.device) + self.assertIsInstance(outputs[0]["generated_text"], str) + + # Summarization + pipe = pipeline("summarization", model=onnx_model, tokenizer=tokenizer) + text = "This is a test" + outputs = pipe(text, decoder_start_token_id=decoder_start_token_id) + self.assertEqual(pipe.device, onnx_model.device) + self.assertIsInstance(outputs[0]["summary_text"], str) + + # Translation + pipe = pipeline("translation_en_to_de", model=onnx_model, tokenizer=tokenizer) + text = "This is a test" + outputs = pipe(text, decoder_start_token_id=decoder_start_token_id) + self.assertEqual(pipe.device, onnx_model.device) + self.assertIsInstance(outputs[0]["translation_text"], str) gc.collect() @@ -3312,27 +3365,38 @@ def test_pipeline_model_is_none(self): @require_torch_gpu @pytest.mark.gpu_test def test_pipeline_on_gpu(self, test_name: str, model_arch: str, use_cache: bool): - if model_arch == "encoder-decoder": - use_cache = False model_args = {"test_name": test_name, "model_arch": model_arch, "use_cache": use_cache} self._setup(model_args) - model_id = MODEL_NAMES[model_arch] - onnx_model = ORTModelForSeq2SeqLM.from_pretrained(self.onnx_model_dirs[test_name], use_cache=use_cache) - tokenizer = get_preprocessor(model_id) - pipe = pipeline("translation_en_to_de", model=onnx_model, tokenizer=tokenizer, return_tensors=False, device=0) - text = "My Name is Philipp and i live" - outputs = pipe(text, max_length=2 * len(text) + 1) - # check model device - self.assertEqual(pipe.model.device.type.lower(), "cuda") - # compare model output class - self.assertTrue(isinstance(outputs[0]["translation_text"], str)) + model_ids = self._get_model_ids(model_arch) + for model_id in model_ids: + if ( + model_arch == "encoder-decoder" + and "text2text-generation-with-past" not in MODEL_NAMES[model_arch][model_id] + ): + # The model with use_cache=True is not supported for bert as a decoder" + continue - pipe = pipeline("translation_en_to_de", model=onnx_model, tokenizer=tokenizer, return_tensors=True, device=0) + onnx_model = self._load_model(model_id, model_arch, test_name, use_cache) - outputs = pipe(text, min_length=len(text) + 1, max_length=2 * len(text) + 1) - self.assertTrue(isinstance(outputs[0]["translation_token_ids"], torch.Tensor)) - self.assertTrue(len(outputs[0]["translation_token_ids"]) > len(text)) + tokenizer = get_preprocessor(model_id) + pipe = pipeline( + "translation_en_to_de", model=onnx_model, tokenizer=tokenizer, return_tensors=False, device=0 + ) + text = "My Name is Philipp and i live" + outputs = pipe(text, max_length=2 * len(text) + 1) + # check model device + self.assertEqual(pipe.model.device.type.lower(), "cuda") + # compare model output class + self.assertTrue(isinstance(outputs[0]["translation_text"], str)) + + pipe = pipeline( + "translation_en_to_de", model=onnx_model, tokenizer=tokenizer, return_tensors=True, device=0 + ) + + outputs = pipe(text, min_length=len(text) + 1, max_length=2 * len(text) + 1) + self.assertTrue(isinstance(outputs[0]["translation_token_ids"], torch.Tensor)) + self.assertTrue(len(outputs[0]["translation_token_ids"]) > len(text)) # TRT EP compile time can be long, so we don't test all archs @parameterized.expand(grid_parameters({"model_arch": ["t5"], "use_cache": [True, False]})) @@ -3385,51 +3449,57 @@ def test_pipeline_on_trt_execution_provider(self, test_name: str, model_arch: st @parameterized.expand(SUPPORTED_ARCHITECTURES) @pytest.mark.gpu_test # mark as GPU test as well to run the without/with cache timing test on the slow 
tests def test_compare_with_and_without_past_key_values(self, model_arch: str): - if model_arch == "m2m_100" or model_arch == "encoder-decoder": - self.skipTest("m2m_100 and encoder-decoder comparison with/without pkv fail or is not supported") + if model_arch == "m2m_100": + self.skipTest("m2m_100 comparison with/without pkv fail or is not supported") model_args = {"test_name": model_arch + "_False", "model_arch": model_arch, "use_cache": False} self._setup(model_args) model_args = {"test_name": model_arch + "_True", "model_arch": model_arch, "use_cache": True} self._setup(model_args) - model_id = MODEL_NAMES[model_arch] - tokenizer = get_preprocessor(model_id) - text = "This is a sample output" - tokens = tokenizer(text, return_tensors="pt") - model_with_pkv = ORTModelForSeq2SeqLM.from_pretrained( - self.onnx_model_dirs[model_arch + "_True"], use_cache=True - ) + model_ids = self._get_model_ids(model_arch) + for model_id in model_ids: + if ( + model_arch == "encoder-decoder" + and "text2text-generation-with-past" not in MODEL_NAMES[model_arch][model_id] + ): + # The model with use_cache=True is not supported for bert as a decoder") + continue - _ = model_with_pkv.generate(**tokens) # warmup - with Timer() as with_pkv_timer: - outputs_model_with_pkv = model_with_pkv.generate( - **tokens, min_new_tokens=self.GENERATION_LENGTH, max_new_tokens=self.GENERATION_LENGTH, num_beams=1 + tokenizer = get_preprocessor(model_id) + text = "This is a sample output" + tokens = tokenizer(text, return_tensors="pt") + model_with_pkv = ORTModelForSeq2SeqLM.from_pretrained( + self._get_onnx_model_dir(model_id, model_arch, model_arch + "_True"), use_cache=True ) - model_without_pkv = ORTModelForSeq2SeqLM.from_pretrained( - self.onnx_model_dirs[model_arch + "_False"], use_cache=False - ) - _ = model_without_pkv.generate(**tokens) # warmup - with Timer() as without_pkv_timer: - outputs_model_without_pkv = model_without_pkv.generate( - **tokens, min_new_tokens=self.GENERATION_LENGTH, max_new_tokens=self.GENERATION_LENGTH, num_beams=1 + _ = model_with_pkv.generate(**tokens) # warmup + with Timer() as with_pkv_timer: + outputs_model_with_pkv = model_with_pkv.generate( + **tokens, min_new_tokens=self.GENERATION_LENGTH, max_new_tokens=self.GENERATION_LENGTH, num_beams=1 + ) + + model_without_pkv = ORTModelForSeq2SeqLM.from_pretrained( + self._get_onnx_model_dir(model_id, model_arch, model_arch + "_False"), use_cache=False ) + _ = model_without_pkv.generate(**tokens) # warmup + with Timer() as without_pkv_timer: + outputs_model_without_pkv = model_without_pkv.generate( + **tokens, min_new_tokens=self.GENERATION_LENGTH, max_new_tokens=self.GENERATION_LENGTH, num_beams=1 + ) - self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv)) - self.assertEqual(outputs_model_with_pkv.shape[1], self.GENERATION_LENGTH + 1) - self.assertEqual(outputs_model_without_pkv.shape[1], self.GENERATION_LENGTH + 1) + self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv)) + self.assertEqual(outputs_model_with_pkv.shape[1], self.GENERATION_LENGTH + 1) + self.assertEqual(outputs_model_without_pkv.shape[1], self.GENERATION_LENGTH + 1) - if os.environ.get("TEST_LEVEL", 0) == "1": - self.assertTrue( - without_pkv_timer.elapsed / with_pkv_timer.elapsed > self.SPEEDUP_CACHE, - f"With pkv latency: {with_pkv_timer.elapsed:.3f} ms, without pkv latency: {without_pkv_timer.elapsed:.3f} ms," - f" speedup: {without_pkv_timer.elapsed / with_pkv_timer.elapsed:.3f}", - ) + if os.environ.get("TEST_LEVEL", 0) == 
"1": + self.assertTrue( + without_pkv_timer.elapsed / with_pkv_timer.elapsed > self.SPEEDUP_CACHE, + f"With pkv latency: {with_pkv_timer.elapsed:.3f} ms, without pkv latency: {without_pkv_timer.elapsed:.3f} ms," + f" speedup: {without_pkv_timer.elapsed / with_pkv_timer.elapsed:.3f}", + ) @parameterized.expand(grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "use_cache": [True]})) def test_compare_merged_and_not_merged_models_outputs(self, test_name: str, model_arch: str, use_cache: bool): - if model_arch == "encoder-decoder" and use_cache is True: - self.skipTest("encoder-decoder model type with use_cache=True is not supported for bert as a decoder") model_args = { "test_name": test_name + "_True", "model_arch": model_arch, @@ -3445,29 +3515,37 @@ def test_compare_merged_and_not_merged_models_outputs(self, test_name: str, mode } self._setup(model_args) - model_id = MODEL_NAMES[model_arch] - tokenizer = get_preprocessor(model_id) - text = "My Name is Philipp and i live" - tokens = tokenizer(text, return_tensors="pt") + model_ids = self._get_model_ids(model_arch) + for model_id in model_ids: + if ( + model_arch == "encoder-decoder" + and "text2text-generation-with-past" not in MODEL_NAMES[model_arch][model_id] + ): + # The model with use_cache=True is not supported for bert as a decoder") + continue - model_not_merged_dir = self.onnx_model_dirs[test_name + "_False"] - model_merged_dir = self.onnx_model_dirs[test_name + "_True"] + tokenizer = get_preprocessor(model_id) + text = "My Name is Philipp and i live" + tokens = tokenizer(text, return_tensors="pt") - model_not_merged = ORTModelForSeq2SeqLM.from_pretrained(model_not_merged_dir) - not_merged_onnx_path = Path(model_not_merged_dir, ONNX_DECODER_NAME) - self.assertFalse(has_onnx_input(not_merged_onnx_path, "use_cache_branch")) - self.assertEqual(model_not_merged.use_merged, False) + model_not_merged_dir = self._get_onnx_model_dir(model_id, model_arch, test_name + "_False") + model_merged_dir = self._get_onnx_model_dir(model_id, model_arch, test_name + "_True") - model_merged = ORTModelForSeq2SeqLM.from_pretrained(model_merged_dir) - merged_onnx_path = Path(model_merged_dir, ONNX_DECODER_MERGED_NAME) - self.assertTrue(has_onnx_input(merged_onnx_path, "use_cache_branch")) - self.assertEqual(model_merged.decoder_with_past, None) - self.assertEqual(model_merged.use_merged, True) + model_not_merged = ORTModelForSeq2SeqLM.from_pretrained(model_not_merged_dir) + not_merged_onnx_path = Path(model_not_merged_dir, ONNX_DECODER_NAME) + self.assertFalse(has_onnx_input(not_merged_onnx_path, "use_cache_branch")) + self.assertEqual(model_not_merged.use_merged, False) - outputs_model_not_merged = model_not_merged.generate(**tokens) - outputs_model_merged = model_merged.generate(**tokens) + model_merged = ORTModelForSeq2SeqLM.from_pretrained(model_merged_dir) + merged_onnx_path = Path(model_merged_dir, ONNX_DECODER_MERGED_NAME) + self.assertTrue(has_onnx_input(merged_onnx_path, "use_cache_branch")) + self.assertEqual(model_merged.decoder_with_past, None) + self.assertEqual(model_merged.use_merged, True) - self.assertTrue(torch.equal(outputs_model_merged, outputs_model_not_merged)) + outputs_model_not_merged = model_not_merged.generate(**tokens) + outputs_model_merged = model_merged.generate(**tokens) + + self.assertTrue(torch.equal(outputs_model_merged, outputs_model_not_merged)) @parameterized.expand( grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "use_cache": [True], "use_merged": [False, True]}) @@ -3487,33 +3565,41 @@ def 
test_compare_to_io_binding(self, test_name: str, model_arch: str, use_cache: self._setup(model_args) - model_id = MODEL_NAMES[model_arch] - onnx_model = ORTModelForSeq2SeqLM.from_pretrained( - self.onnx_model_dirs[test_name], use_io_binding=False, use_cache=use_cache - ).to("cuda") - io_model = ORTModelForSeq2SeqLM.from_pretrained( - self.onnx_model_dirs[test_name], use_io_binding=True, use_cache=use_cache - ).to("cuda") + model_ids = self._get_model_ids(model_arch) + for model_id in model_ids: + if ( + model_arch == "encoder-decoder" + and "text2text-generation-with-past" not in MODEL_NAMES[model_arch][model_id] + ): + # The model with use_cache=True is not supported for bert as a decoder") + continue - self.assertFalse(onnx_model.use_io_binding) - self.assertTrue(io_model.use_io_binding) + onnx_model = ORTModelForSeq2SeqLM.from_pretrained( + self._get_onnx_model_dir(model_id, model_arch, test_name), use_io_binding=False, use_cache=use_cache + ).to("cuda") + io_model = ORTModelForSeq2SeqLM.from_pretrained( + self._get_onnx_model_dir(model_id, model_arch, test_name), use_io_binding=True, use_cache=use_cache + ).to("cuda") - tokenizer = get_preprocessor(model_id) - tokens = tokenizer(["This is a sample output"] * 2, return_tensors="pt").to("cuda") - decoder_start_token_id = onnx_model.config.decoder_start_token_id if model_arch != "mbart" else 2 - if model_arch == "encoder-decoder": - decoder_start_token_id = tokenizer.cls_token_id + self.assertFalse(onnx_model.use_io_binding) + self.assertTrue(io_model.use_io_binding) - decoder_inputs = {"decoder_input_ids": torch.ones((2, 1), dtype=torch.long) * decoder_start_token_id} + tokenizer = get_preprocessor(model_id) + tokens = tokenizer(["This is a sample output"] * 2, return_tensors="pt").to("cuda") + decoder_start_token_id = onnx_model.config.decoder_start_token_id if model_arch != "mbart" else 2 + if model_arch == "encoder-decoder": + decoder_start_token_id = tokenizer.cls_token_id - onnx_outputs = onnx_model(**tokens, **decoder_inputs) - io_outputs = io_model(**tokens, **decoder_inputs) + decoder_inputs = {"decoder_input_ids": torch.ones((2, 1), dtype=torch.long) * decoder_start_token_id} - self.assertTrue("logits" in io_outputs) - self.assertIsInstance(io_outputs.logits, torch.Tensor) + onnx_outputs = onnx_model(**tokens, **decoder_inputs) + io_outputs = io_model(**tokens, **decoder_inputs) - # compare tensor outputs - self.assertTrue(torch.equal(onnx_outputs.logits, io_outputs.logits)) + self.assertTrue("logits" in io_outputs) + self.assertIsInstance(io_outputs.logits, torch.Tensor) + + # compare tensor outputs + self.assertTrue(torch.equal(onnx_outputs.logits, io_outputs.logits)) gc.collect() @@ -3536,21 +3622,29 @@ def test_compare_generation_to_io_binding( self._setup(model_args) - model_id = MODEL_NAMES[model_arch] - onnx_model = ORTModelForSeq2SeqLM.from_pretrained( - self.onnx_model_dirs[test_name], use_io_binding=False, use_cache=use_cache - ).to("cuda") - io_model = ORTModelForSeq2SeqLM.from_pretrained( - self.onnx_model_dirs[test_name], use_io_binding=True, use_cache=use_cache - ).to("cuda") + model_ids = self._get_model_ids(model_arch) + for model_id in model_ids: + if ( + model_arch == "encoder-decoder" + and "text2text-generation-with-past" not in MODEL_NAMES[model_arch][model_id] + ): + # The model with use_cache=True is not supported for bert as a decoder") + continue - tokenizer = get_preprocessor(model_id) - tokens = tokenizer("This is a sample output", return_tensors="pt").to("cuda") - onnx_outputs = 
onnx_model.generate(**tokens, num_beams=5) - io_outputs = io_model.generate(**tokens, num_beams=5) + onnx_model = ORTModelForSeq2SeqLM.from_pretrained( + self._get_onnx_model_dir(model_id, model_arch, test_name), use_io_binding=False, use_cache=use_cache + ).to("cuda") + io_model = ORTModelForSeq2SeqLM.from_pretrained( + self._get_onnx_model_dir(model_id, model_arch, test_name), use_io_binding=True, use_cache=use_cache + ).to("cuda") - # compare tensor outputs - self.assertTrue(torch.equal(onnx_outputs, io_outputs)) + tokenizer = get_preprocessor(model_id) + tokens = tokenizer("This is a sample output", return_tensors="pt").to("cuda") + onnx_outputs = onnx_model.generate(**tokens, num_beams=5) + io_outputs = io_model.generate(**tokens, num_beams=5) + + # compare tensor outputs + self.assertTrue(torch.equal(onnx_outputs, io_outputs)) gc.collect() diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py index 6e2dc16ae7..44d5caae5e 100644 --- a/tests/onnxruntime/utils_onnxruntime_tests.py +++ b/tests/onnxruntime/utils_onnxruntime_tests.py @@ -50,8 +50,13 @@ "detr": "hf-internal-testing/tiny-random-detr", "distilbert": "hf-internal-testing/tiny-random-DistilBertModel", "electra": "hf-internal-testing/tiny-random-ElectraModel", - # "encoder-decoder": "hf-internal-testing/tiny-random-EncoderDecoderModel-bert-bert", - "encoder-decoder": "mohitsha/tiny-random-testing-bert2gpt2", + # "encoder-decoder": "mohitsha/tiny-random-testing-bert2gpt2", + "encoder-decoder": { + "hf-internal-testing/tiny-random-EncoderDecoderModel-bert-bert": [ + "text2text-generation", + ], + "mohitsha/tiny-random-testing-bert2gpt2": ["text2text-generation", "text2text-generation-with-past"], + }, "flaubert": "hf-internal-testing/tiny-random-flaubert", "gpt2": "hf-internal-testing/tiny-random-gpt2", "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel", @@ -138,22 +143,49 @@ def _setup(self, model_args: Dict): ): self.skipTest("Unsupported export case") + model_ids = MODEL_NAMES[model_arch] + if isinstance(model_ids, dict): + model_ids = list(model_ids.keys()) + else: + model_ids = [model_ids] + if model_arch_and_params not in self.onnx_model_dirs: + self.onnx_model_dirs[model_arch_and_params] = {} + # model_args will contain kwargs to pass to ORTModel.from_pretrained() model_args.pop("test_name") model_args.pop("model_arch") - model_id = ( - self.ARCH_MODEL_MAP[model_arch] if model_arch in self.ARCH_MODEL_MAP else MODEL_NAMES[model_arch] - ) - set_seed(SEED) - onnx_model = self.ORTMODEL_CLASS.from_pretrained(model_id, **model_args, use_io_binding=False, export=True) - - model_dir = tempfile.mkdtemp(prefix=f"{model_arch_and_params}_{self.TASK}_") - onnx_model.save_pretrained(model_dir) - self.onnx_model_dirs[model_arch_and_params] = model_dir + for idx, model_id in enumerate(model_ids): + if model_arch == "encoder-decoder" and task not in MODEL_NAMES[model_arch][model_id]: + # The model with use_cache=True is not supported for bert as a decoder") + continue + + if model_arch in self.ARCH_MODEL_MAP: + if isinstance(MODEL_NAMES[model_arch], dict): + model_id = list(self.ARCH_MODEL_MAP[model_arch].keys())[idx] + else: + model_id = self.ARCH_MODEL_MAP[model_arch] + + set_seed(SEED) + onnx_model = self.ORTMODEL_CLASS.from_pretrained( + model_id, **model_args, use_io_binding=False, export=True + ) + + model_dir = tempfile.mkdtemp( + prefix=f"{model_arch_and_params}_{self.TASK}_{model_id.replace('/', '_')}" + ) + onnx_model.save_pretrained(model_dir) + if 
isinstance(MODEL_NAMES[model_arch], dict): + self.onnx_model_dirs[model_arch_and_params][model_id] = model_dir + else: + self.onnx_model_dirs[model_arch_and_params] = model_dir @classmethod def tearDownClass(cls): for _, dir_path in cls.onnx_model_dirs.items(): - shutil.rmtree(dir_path) + if isinstance(dir_path, dict): + for _, sec_dir_path in dir_path.items(): + shutil.rmtree(sec_dir_path) + else: + shutil.rmtree(dir_path) From 56cd5e876faee5908026ba3cb1eaa468d880c248 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Sun, 3 Sep 2023 19:01:55 +0000 Subject: [PATCH 3/4] remove comments --- tests/exporters/exporters_utils.py | 22 +++++++++++----------- tests/onnxruntime/test_modeling.py | 2 +- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py index 12483303f8..18ce225fcb 100644 --- a/tests/exporters/exporters_utils.py +++ b/tests/exporters/exporters_utils.py @@ -141,17 +141,17 @@ "speech-to-text": "hf-internal-testing/tiny-random-Speech2TextModel", "xlm": "hf-internal-testing/tiny-random-XLMModel", "xlm-roberta": "hf-internal-testing/tiny-xlm-roberta", - # "vision-encoder-decoder": { - # "hf-internal-testing/tiny-random-VisionEncoderDecoderModel-vit-gpt2": [ - # "image-to-text", - # "image-to-text-with-past", - # ], - # "microsoft/trocr-small-handwritten": ["image-to-text"], - # "fxmarty/tiny-doc-qa-vision-encoder-decoder": [ - # "document-question-answering", - # "document-question-answering-with-past", - # ], - # }, + "vision-encoder-decoder": { + "hf-internal-testing/tiny-random-VisionEncoderDecoderModel-vit-gpt2": [ + "image-to-text", + "image-to-text-with-past", + ], + "microsoft/trocr-small-handwritten": ["image-to-text"], + "fxmarty/tiny-doc-qa-vision-encoder-decoder": [ + "document-question-answering", + "document-question-answering-with-past", + ], + }, } diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index c4fd279fc9..a9d9864637 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -4051,7 +4051,7 @@ def test_compare_generation_to_io_binding( class ORTModelForVision2SeqIntegrationTest(ORTModelTestMixin): # TODO: speech_to_text should be tested - SUPPORTED_ARCHITECTURES = ["trocr"] + SUPPORTED_ARCHITECTURES = ["vision-encoder-decoder", "trocr"] FULL_GRID = { "model_arch": SUPPORTED_ARCHITECTURES, From 990bc5944f9673bd2683e33358cbed088f4bab86 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Sun, 3 Sep 2023 20:02:06 +0000 Subject: [PATCH 4/4] remove extra func --- tests/onnxruntime/test_modeling.py | 17 ++++++++--------- tests/onnxruntime/utils_onnxruntime_tests.py | 1 - 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index a9d9864637..d9ca1860ff 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -3097,10 +3097,6 @@ def _get_onnx_model_dir(self, model_id, model_arch, test_name): return onnx_model_dir - def _load_model(self, model_id, model_arch, test_name, use_cache): - onnx_model_dir = self._get_onnx_model_dir(model_id, model_arch, test_name) - return ORTModelForSeq2SeqLM.from_pretrained(onnx_model_dir, use_cache=use_cache) - def test_inference_old_onnx_model(self): model = ORTModelForSeq2SeqLM.from_pretrained("optimum/t5-small") @@ -3130,7 +3126,9 @@ def test_generate_utils(self, test_name: str, model_arch: str, use_cache: str): ): continue - model = self._load_model(model_id, model_arch, 
test_name, use_cache) + onnx_model_dir = self._get_onnx_model_dir(model_id, model_arch, test_name) + model = ORTModelForSeq2SeqLM.from_pretrained(onnx_model_dir, use_cache=use_cache) + tokenizer = get_preprocessor(model_id) text = "This is a sample output" tokens = tokenizer(text, return_tensors="pt") @@ -3233,8 +3231,7 @@ def test_compare_to_transformers(self, test_name: str, model_arch: str, use_cach continue onnx_model_dir = self._get_onnx_model_dir(model_id, model_arch, test_name) - - onnx_model = self._load_model(model_id, model_arch, test_name, use_cache) + onnx_model = ORTModelForSeq2SeqLM.from_pretrained(onnx_model_dir, use_cache=use_cache) self.assertIsInstance(onnx_model.encoder, ORTEncoder) if use_merged is False: @@ -3309,7 +3306,8 @@ def test_pipeline_text_generation(self, test_name: str, model_arch: str, use_cac # The model with use_cache=True is not supported for bert as a decoder") continue - onnx_model = self._load_model(model_id, model_arch, test_name, use_cache) + onnx_model_dir = self._get_onnx_model_dir(model_id, model_arch, test_name) + onnx_model = ORTModelForSeq2SeqLM.from_pretrained(onnx_model_dir, use_cache=use_cache) tokenizer = get_preprocessor(model_id) @@ -3377,7 +3375,8 @@ def test_pipeline_on_gpu(self, test_name: str, model_arch: str, use_cache: bool) # The model with use_cache=True is not supported for bert as a decoder" continue - onnx_model = self._load_model(model_id, model_arch, test_name, use_cache) + onnx_model_dir = self._get_onnx_model_dir(model_id, model_arch, test_name) + onnx_model = ORTModelForSeq2SeqLM.from_pretrained(onnx_model_dir, use_cache=use_cache) tokenizer = get_preprocessor(model_id) pipe = pipeline( diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py index 44d5caae5e..8ec6ce90b0 100644 --- a/tests/onnxruntime/utils_onnxruntime_tests.py +++ b/tests/onnxruntime/utils_onnxruntime_tests.py @@ -50,7 +50,6 @@ "detr": "hf-internal-testing/tiny-random-detr", "distilbert": "hf-internal-testing/tiny-random-DistilBertModel", "electra": "hf-internal-testing/tiny-random-ElectraModel", - # "encoder-decoder": "mohitsha/tiny-random-testing-bert2gpt2", "encoder-decoder": { "hf-internal-testing/tiny-random-EncoderDecoderModel-bert-bert": [ "text2text-generation",
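
As a standalone illustration (not part of the diffs above) of the MODEL_NAMES convention these patches introduce, the sketch below shows how an architecture entry may be either a single checkpoint id or a dict mapping checkpoint ids to the export tasks they support, and how the tests resolve it. The checkpoint ids and task names are taken from the diffs; get_model_ids and supports_with_past are simplified stand-ins for the test helpers (_get_model_ids and the "with-past" skip checks), not functions defined by the patches.

from typing import Dict, List, Union

# An entry is either a plain checkpoint id, or a dict mapping checkpoint id -> supported export tasks.
MODEL_NAMES: Dict[str, Union[str, Dict[str, List[str]]]] = {
    "gpt2": "hf-internal-testing/tiny-random-gpt2",
    "encoder-decoder": {
        "hf-internal-testing/tiny-random-EncoderDecoderModel-bert-bert": ["text2text-generation"],
        "mohitsha/tiny-random-testing-bert2gpt2": [
            "text2text-generation",
            "text2text-generation-with-past",
        ],
    },
}


def get_model_ids(model_arch: str) -> List[str]:
    # Mirrors _get_model_ids(): dict entries expose every checkpoint id, plain strings expose one.
    entry = MODEL_NAMES[model_arch]
    return list(entry.keys()) if isinstance(entry, dict) else [entry]


def supports_with_past(model_arch: str, model_id: str) -> bool:
    # The refactored tests skip use_cache=True for checkpoints whose task list lacks
    # "text2text-generation-with-past", e.g. a BERT decoder inside an EncoderDecoderModel.
    entry = MODEL_NAMES[model_arch]
    return not isinstance(entry, dict) or "text2text-generation-with-past" in entry[model_id]


assert get_model_ids("gpt2") == ["hf-internal-testing/tiny-random-gpt2"]
assert not supports_with_past("encoder-decoder", "hf-internal-testing/tiny-random-EncoderDecoderModel-bert-bert")
assert supports_with_past("encoder-decoder", "mohitsha/tiny-random-testing-bert2gpt2")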