From 9eb0c1de2fb9b56bb4033d8070882cabac6da706 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Sat, 2 Sep 2023 17:17:25 +0000 Subject: [PATCH 1/4] add model --- optimum/onnxruntime/modeling_seq2seq.py | 5 ++++ tests/exporters/exporters_utils.py | 30 ++++++++++++-------- tests/onnxruntime/test_modeling.py | 9 +++--- tests/onnxruntime/utils_onnxruntime_tests.py | 3 +- 4 files changed, 29 insertions(+), 18 deletions(-) diff --git a/optimum/onnxruntime/modeling_seq2seq.py b/optimum/onnxruntime/modeling_seq2seq.py index a6528e3ef4..64bc514cb7 100644 --- a/optimum/onnxruntime/modeling_seq2seq.py +++ b/optimum/onnxruntime/modeling_seq2seq.py @@ -1130,6 +1130,11 @@ def __init__( config.decoder.model_type )(config.decoder) + if self.decoder_with_past is not None: + self.decoder_with_past.normalized_config = NormalizedConfigManager.get_normalized_config_class( + config.decoder.model_type + )(config.decoder) + def _initialize_encoder(self, session: ort.InferenceSession) -> ORTEncoder: return ORTEncoder(session, self) diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py index 000dd59241..9737bb87ac 100644 --- a/tests/exporters/exporters_utils.py +++ b/tests/exporters/exporters_utils.py @@ -55,7 +55,13 @@ "detr": "hf-internal-testing/tiny-random-DetrModel", # hf-internal-testing/tiny-random-detr is larger "distilbert": "hf-internal-testing/tiny-random-DistilBertModel", "electra": "hf-internal-testing/tiny-random-ElectraModel", - "encoder-decoder": "hf-internal-testing/tiny-random-EncoderDecoderModel-bert-bert", + "encoder-decoder": { + "hf-internal-testing/tiny-random-EncoderDecoderModel-bert-bert": [ + "text2text-generation", + "text2text-generation-with-past", + ], + "mohitsha/tiny-random-testing-bert2gpt2": ["text2text-generation", "text2text-generation-with-past"], + }, "flaubert": "hf-internal-testing/tiny-random-flaubert", "gpt2": "hf-internal-testing/tiny-random-gpt2", "gpt-bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel", @@ -136,17 +142,17 @@ "speech-to-text": "hf-internal-testing/tiny-random-Speech2TextModel", "xlm": "hf-internal-testing/tiny-random-XLMModel", "xlm-roberta": "hf-internal-testing/tiny-xlm-roberta", - "vision-encoder-decoder": { - "hf-internal-testing/tiny-random-VisionEncoderDecoderModel-vit-gpt2": [ - "image-to-text", - "image-to-text-with-past", - ], - "microsoft/trocr-small-handwritten": ["image-to-text"], - "fxmarty/tiny-doc-qa-vision-encoder-decoder": [ - "document-question-answering", - "document-question-answering-with-past", - ], - }, + # "vision-encoder-decoder": { + # "hf-internal-testing/tiny-random-VisionEncoderDecoderModel-vit-gpt2": [ + # "image-to-text", + # "image-to-text-with-past", + # ], + # "microsoft/trocr-small-handwritten": ["image-to-text"], + # "fxmarty/tiny-doc-qa-vision-encoder-decoder": [ + # "document-question-answering", + # "document-question-answering-with-past", + # ], + # }, } diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index dcf3a32dfc..0779560864 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -3475,8 +3475,6 @@ def test_compare_merged_and_not_merged_models_outputs(self, test_name: str, mode @require_torch_gpu @pytest.mark.gpu_test def test_compare_to_io_binding(self, test_name: str, model_arch: str, use_cache: bool, use_merged: bool): - if model_arch == "encoder-decoder": - use_cache = False if use_cache is False and use_merged is True: self.skipTest("use_cache=False, use_merged=True are uncompatible") @@ -3503,6 
+3501,9 @@ def test_compare_to_io_binding(self, test_name: str, model_arch: str, use_cache: tokenizer = get_preprocessor(model_id) tokens = tokenizer(["This is a sample output"] * 2, return_tensors="pt").to("cuda") decoder_start_token_id = onnx_model.config.decoder_start_token_id if model_arch != "mbart" else 2 + if model_arch == "encoder-decoder": + decoder_start_token_id = tokenizer.cls_token_id + decoder_inputs = {"decoder_input_ids": torch.ones((2, 1), dtype=torch.long) * decoder_start_token_id} onnx_outputs = onnx_model(**tokens, **decoder_inputs) @@ -3523,8 +3524,6 @@ def test_compare_to_io_binding(self, test_name: str, model_arch: str, use_cache: def test_compare_generation_to_io_binding( self, test_name: str, model_arch: str, use_cache: bool, use_merged: bool ): - if model_arch == "encoder-decoder": - use_cache = False if use_cache is False and use_merged is True: self.skipTest("use_cache=False, use_merged=True are uncompatible") @@ -3958,7 +3957,7 @@ def test_compare_generation_to_io_binding( class ORTModelForVision2SeqIntegrationTest(ORTModelTestMixin): # TODO: speech_to_text should be tested - SUPPORTED_ARCHITECTURES = ["vision-encoder-decoder", "trocr"] + SUPPORTED_ARCHITECTURES = ["trocr"] FULL_GRID = { "model_arch": SUPPORTED_ARCHITECTURES, diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py index fdcd565e9c..6e2dc16ae7 100644 --- a/tests/onnxruntime/utils_onnxruntime_tests.py +++ b/tests/onnxruntime/utils_onnxruntime_tests.py @@ -50,7 +50,8 @@ "detr": "hf-internal-testing/tiny-random-detr", "distilbert": "hf-internal-testing/tiny-random-DistilBertModel", "electra": "hf-internal-testing/tiny-random-ElectraModel", - "encoder-decoder": "hf-internal-testing/tiny-random-EncoderDecoderModel-bert-bert", + # "encoder-decoder": "hf-internal-testing/tiny-random-EncoderDecoderModel-bert-bert", + "encoder-decoder": "mohitsha/tiny-random-testing-bert2gpt2", "flaubert": "hf-internal-testing/tiny-random-flaubert", "gpt2": "hf-internal-testing/tiny-random-gpt2", "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel", From 5dfb62054fd6f6e8ae8556e03d2e71a3b0bcd9f9 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Sun, 3 Sep 2023 18:47:29 +0000 Subject: [PATCH 2/4] fix tests --- tests/exporters/exporters_utils.py | 1 - tests/onnxruntime/test_modeling.py | 522 +++++++++++-------- tests/onnxruntime/utils_onnxruntime_tests.py | 56 +- 3 files changed, 352 insertions(+), 227 deletions(-) diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py index 9737bb87ac..12483303f8 100644 --- a/tests/exporters/exporters_utils.py +++ b/tests/exporters/exporters_utils.py @@ -58,7 +58,6 @@ "encoder-decoder": { "hf-internal-testing/tiny-random-EncoderDecoderModel-bert-bert": [ "text2text-generation", - "text2text-generation-with-past", ], "mohitsha/tiny-random-testing-bert2gpt2": ["text2text-generation", "text2text-generation-with-past"], }, diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index 0779560864..c4fd279fc9 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -3082,6 +3082,25 @@ class ORTModelForSeq2SeqLMIntegrationTest(ORTModelTestMixin): GENERATION_LENGTH = 100 SPEEDUP_CACHE = 1.1 + def _get_model_ids(self, model_arch): + model_ids = MODEL_NAMES[model_arch] + if isinstance(model_ids, dict): + model_ids = list(model_ids.keys()) + else: + model_ids = [model_ids] + return model_ids + + def _get_onnx_model_dir(self, model_id, model_arch, 
test_name): + onnx_model_dir = self.onnx_model_dirs[test_name] + if isinstance(MODEL_NAMES[model_arch], dict): + onnx_model_dir = onnx_model_dir[model_id] + + return onnx_model_dir + + def _load_model(self, model_id, model_arch, test_name, use_cache): + onnx_model_dir = self._get_onnx_model_dir(model_id, model_arch, test_name) + return ORTModelForSeq2SeqLM.from_pretrained(onnx_model_dir, use_cache=use_cache) + def test_inference_old_onnx_model(self): model = ORTModelForSeq2SeqLM.from_pretrained("optimum/t5-small") @@ -3099,84 +3118,98 @@ def test_load_vanilla_transformers_which_is_not_supported(self): @parameterized.expand(grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "use_cache": [True]})) def test_generate_utils(self, test_name: str, model_arch: str, use_cache: str): - if model_arch == "encoder-decoder" and use_cache is True: - self.skipTest("encoder-decoder model type with use_cache=True is not supported for bert as a decoder") model_args = {"test_name": test_name, "model_arch": model_arch, "use_cache": use_cache} self._setup(model_args) - model_id = MODEL_NAMES[model_arch] - model = ORTModelForSeq2SeqLM.from_pretrained(self.onnx_model_dirs[test_name], use_cache=use_cache) - tokenizer = get_preprocessor(model_id) - text = "This is a sample output" - tokens = tokenizer(text, return_tensors="pt") + model_ids = self._get_model_ids(model_arch) + for model_id in model_ids: + if ( + model_arch == "encoder-decoder" + and use_cache is True + and "text2text-generation-with-past" not in MODEL_NAMES[model_arch][model_id] + ): + continue - # General case - outputs = model.generate(**tokens) - res = tokenizer.batch_decode(outputs, skip_special_tokens=True) - self.assertIsInstance(res[0], str) + model = self._load_model(model_id, model_arch, test_name, use_cache) + tokenizer = get_preprocessor(model_id) + text = "This is a sample output" + tokens = tokenizer(text, return_tensors="pt") - # With input ids - outputs = model.generate(input_ids=tokens["input_ids"]) - res = tokenizer.batch_decode(outputs, skip_special_tokens=True) - self.assertIsInstance(res[0], str) + # General case + outputs = model.generate(**tokens) + res = tokenizer.batch_decode(outputs, skip_special_tokens=True) + self.assertIsInstance(res[0], str) + + # With input ids + outputs = model.generate(input_ids=tokens["input_ids"]) + res = tokenizer.batch_decode(outputs, skip_special_tokens=True) + self.assertIsInstance(res[0], str) gc.collect() @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_merge_from_transformers_and_save(self, model_arch): - if model_arch == "encoder-decoder": - self.skipTest("encoder-decoder model type with use_merged=True is not supported for bert as a decoder") - if "text2text-generation-with-past" not in TasksManager.get_supported_tasks_for_model_type( model_arch.replace("_", "-"), exporter="onnx" ): self.skipTest("Unsupported -with-past export case") - model_id = MODEL_NAMES[model_arch] - model = ORTModelForSeq2SeqLM.from_pretrained(model_id, export=True, use_merged=True) - with tempfile.TemporaryDirectory() as tmpdir: - model.save_pretrained(tmpdir) - save_path = os.path.join(tmpdir, ONNX_DECODER_MERGED_NAME) - self.assertTrue(has_onnx_input(save_path, "use_cache_branch")) + model_ids = self._get_model_ids(model_arch) + for model_id in model_ids: + if ( + model_arch == "encoder-decoder" + and "text2text-generation-with-past" not in MODEL_NAMES[model_arch][model_id] + ): + # The model with use_merged=True is not supported for bert as a decoder") + continue - folder_contents = os.listdir(tmpdir) - 
self.assertTrue(ONNX_ENCODER_NAME in folder_contents) - self.assertTrue(ONNX_DECODER_NAME not in folder_contents) - self.assertTrue(ONNX_DECODER_WITH_PAST_NAME not in folder_contents) + model = ORTModelForSeq2SeqLM.from_pretrained(model_id, export=True, use_merged=True) + + with tempfile.TemporaryDirectory() as tmpdir: + model.save_pretrained(tmpdir) + save_path = os.path.join(tmpdir, ONNX_DECODER_MERGED_NAME) + self.assertTrue(has_onnx_input(save_path, "use_cache_branch")) + + folder_contents = os.listdir(tmpdir) + self.assertTrue(ONNX_ENCODER_NAME in folder_contents) + self.assertTrue(ONNX_DECODER_NAME not in folder_contents) + self.assertTrue(ONNX_DECODER_WITH_PAST_NAME not in folder_contents) @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_merge_from_onnx_and_save(self, model_arch): - if model_arch == "encoder-decoder": - self.skipTest("encoder-decoder model type with use_merged=True is not supported for bert as a decoder") - - model_id = MODEL_NAMES[model_arch] task = "text2text-generation-with-past" if task not in TasksManager.get_supported_tasks_for_model_type(model_arch.replace("_", "-"), exporter="onnx"): self.skipTest("Unsupported export case") - with tempfile.TemporaryDirectory() as tmpdir: - main_export(model_id, tmpdir, task=task) + model_ids = self._get_model_ids(model_arch) + for model_id in model_ids: + if ( + model_arch == "encoder-decoder" + and "text2text-generation-with-past" not in MODEL_NAMES[model_arch][model_id] + ): + # The model with use_merged=True is not supported for bert as a decoder") + continue - model = ORTModelForSeq2SeqLM.from_pretrained(tmpdir) + with tempfile.TemporaryDirectory() as tmpdir: + main_export(model_id, tmpdir, task=task) - self.assertTrue(model.use_merged) - self.assertTrue(model.decoder_with_past is None) + model = ORTModelForSeq2SeqLM.from_pretrained(tmpdir) - model.save_pretrained(tmpdir + "_save") - save_path = os.path.join(tmpdir + "_save", ONNX_DECODER_MERGED_NAME) - self.assertTrue(has_onnx_input(save_path, "use_cache_branch")) + self.assertTrue(model.use_merged) + self.assertTrue(model.decoder_with_past is None) - folder_contents = os.listdir(tmpdir + "_save") - self.assertTrue(ONNX_ENCODER_NAME in folder_contents) - self.assertFalse(ONNX_DECODER_NAME in folder_contents) - self.assertFalse(ONNX_DECODER_WITH_PAST_NAME in folder_contents) + model.save_pretrained(tmpdir + "_save") + save_path = os.path.join(tmpdir + "_save", ONNX_DECODER_MERGED_NAME) + self.assertTrue(has_onnx_input(save_path, "use_cache_branch")) + + folder_contents = os.listdir(tmpdir + "_save") + self.assertTrue(ONNX_ENCODER_NAME in folder_contents) + self.assertFalse(ONNX_DECODER_NAME in folder_contents) + self.assertFalse(ONNX_DECODER_WITH_PAST_NAME in folder_contents) @parameterized.expand(grid_parameters(FULL_GRID)) def test_compare_to_transformers(self, test_name: str, model_arch: str, use_cache: bool, use_merged: bool): - if model_arch == "encoder-decoder" and use_cache is True: - self.skipTest("encoder-decoder model type with use_cache=True is not supported for bert as a decoder") - if use_cache is False and use_merged is True: self.skipTest("use_cache=False, use_merged=True are uncompatible") @@ -3189,61 +3222,71 @@ def test_compare_to_transformers(self, test_name: str, model_arch: str, use_cach self._setup(model_args) - model_id = MODEL_NAMES[model_arch] - onnx_model = ORTModelForSeq2SeqLM.from_pretrained(self.onnx_model_dirs[test_name], use_cache=use_cache) - - self.assertIsInstance(onnx_model.encoder, ORTEncoder) - if use_merged is False: - 
model_path = Path(self.onnx_model_dirs[test_name], ONNX_DECODER_NAME) - self.assertFalse(has_onnx_input(model_path, "use_cache_branch")) - self.assertEqual(onnx_model.use_merged, False) - else: - model_path = Path(self.onnx_model_dirs[test_name], ONNX_DECODER_MERGED_NAME) - self.assertTrue(has_onnx_input(model_path, "use_cache_branch")) - self.assertEqual(onnx_model.use_merged, True) - - self.assertIsInstance(onnx_model.decoder, ORTDecoderForSeq2Seq) - if onnx_model.use_cache is True and onnx_model.use_merged is False: - self.assertIsInstance(onnx_model.decoder_with_past, ORTDecoderForSeq2Seq) - if onnx_model.use_cache is True and onnx_model.use_merged is True: - self.assertTrue(onnx_model.decoder_with_past is None) - - self.assertIsInstance(onnx_model.config, PretrainedConfig) - - set_seed(SEED) - transformers_model = AutoModelForSeq2SeqLM.from_pretrained(model_id) - tokenizer = get_preprocessor(model_id) - tokens = tokenizer("This is a sample output", return_tensors="pt") - decoder_start_token_id = transformers_model.config.decoder_start_token_id if model_arch != "mbart" else 2 - if model_arch == "encoder-decoder": - decoder_start_token_id = tokenizer.cls_token_id + model_ids = self._get_model_ids(model_arch) + for model_id in model_ids: + if ( + model_arch == "encoder-decoder" + and use_cache is True + and "text2text-generation-with-past" not in MODEL_NAMES[model_arch][model_id] + ): + # The model with use_cache=True is not supported for bert as a decoder") + continue + + onnx_model_dir = self._get_onnx_model_dir(model_id, model_arch, test_name) + + onnx_model = self._load_model(model_id, model_arch, test_name, use_cache) + + self.assertIsInstance(onnx_model.encoder, ORTEncoder) + if use_merged is False: + model_path = Path(onnx_model_dir, ONNX_DECODER_NAME) + self.assertFalse(has_onnx_input(model_path, "use_cache_branch")) + self.assertEqual(onnx_model.use_merged, False) + else: + model_path = Path(onnx_model_dir, ONNX_DECODER_MERGED_NAME) + self.assertTrue(has_onnx_input(model_path, "use_cache_branch")) + self.assertEqual(onnx_model.use_merged, True) + + self.assertIsInstance(onnx_model.decoder, ORTDecoderForSeq2Seq) + if onnx_model.use_cache is True and onnx_model.use_merged is False: + self.assertIsInstance(onnx_model.decoder_with_past, ORTDecoderForSeq2Seq) + if onnx_model.use_cache is True and onnx_model.use_merged is True: + self.assertTrue(onnx_model.decoder_with_past is None) + + self.assertIsInstance(onnx_model.config, PretrainedConfig) + + set_seed(SEED) + transformers_model = AutoModelForSeq2SeqLM.from_pretrained(model_id) + tokenizer = get_preprocessor(model_id) + tokens = tokenizer("This is a sample output", return_tensors="pt") + decoder_start_token_id = transformers_model.config.decoder_start_token_id if model_arch != "mbart" else 2 + if model_arch == "encoder-decoder": + decoder_start_token_id = tokenizer.cls_token_id - decoder_inputs = {"decoder_input_ids": torch.ones((1, 1), dtype=torch.long) * decoder_start_token_id} + decoder_inputs = {"decoder_input_ids": torch.ones((1, 1), dtype=torch.long) * decoder_start_token_id} - with torch.no_grad(): - transformers_outputs = transformers_model(**tokens, **decoder_inputs) + with torch.no_grad(): + transformers_outputs = transformers_model(**tokens, **decoder_inputs) - for input_type in ["pt", "np"]: - tokens = tokenizer("This is a sample output", return_tensors=input_type) + for input_type in ["pt", "np"]: + tokens = tokenizer("This is a sample output", return_tensors=input_type) - if input_type == "np": - decoder_inputs = 
{"decoder_input_ids": np.ones((1, 1), dtype=np.int64) * decoder_start_token_id} + if input_type == "np": + decoder_inputs = {"decoder_input_ids": np.ones((1, 1), dtype=np.int64) * decoder_start_token_id} - onnx_outputs = onnx_model(**tokens, **decoder_inputs) + onnx_outputs = onnx_model(**tokens, **decoder_inputs) - self.assertTrue("logits" in onnx_outputs) - self.assertIsInstance(onnx_outputs.logits, self.TENSOR_ALIAS_TO_TYPE[input_type]) + self.assertTrue("logits" in onnx_outputs) + self.assertIsInstance(onnx_outputs.logits, self.TENSOR_ALIAS_TO_TYPE[input_type]) - # Compare tensor outputs - self.assertTrue(torch.allclose(torch.Tensor(onnx_outputs.logits), transformers_outputs.logits, atol=1e-4)) + # Compare tensor outputs + self.assertTrue( + torch.allclose(torch.Tensor(onnx_outputs.logits), transformers_outputs.logits, atol=1e-4) + ) gc.collect() @parameterized.expand(grid_parameters(FULL_GRID)) def test_pipeline_text_generation(self, test_name: str, model_arch: str, use_cache: bool, use_merged: bool): - if model_arch == "encoder-decoder" and use_cache is True: - self.skipTest("encoder-decoder model type with use_cache=True is not supported for bert as a decoder") - if use_cache is False and use_merged is True: self.skipTest("use_cache=False, use_merged=True are uncompatible") @@ -3256,34 +3299,44 @@ def test_pipeline_text_generation(self, test_name: str, model_arch: str, use_cac self._setup(model_args) - model_id = MODEL_NAMES[model_arch] - onnx_model = ORTModelForSeq2SeqLM.from_pretrained(self.onnx_model_dirs[test_name], use_cache=use_cache) - tokenizer = get_preprocessor(model_id) + model_ids = self._get_model_ids(model_arch) + for model_id in model_ids: + if ( + model_arch == "encoder-decoder" + and use_cache is True + and "text2text-generation-with-past" not in MODEL_NAMES[model_arch][model_id] + ): + # The model with use_cache=True is not supported for bert as a decoder") + continue - decoder_start_token_id = onnx_model.config.decoder_start_token_id if model_arch != "mbart" else 2 - if model_arch == "encoder-decoder": - decoder_start_token_id = tokenizer.cls_token_id + onnx_model = self._load_model(model_id, model_arch, test_name, use_cache) - # Text2Text generation - pipe = pipeline("text2text-generation", model=onnx_model, tokenizer=tokenizer) - text = "This is a test" - outputs = pipe(text, decoder_start_token_id=decoder_start_token_id) - self.assertEqual(pipe.device, onnx_model.device) - self.assertIsInstance(outputs[0]["generated_text"], str) - - # Summarization - pipe = pipeline("summarization", model=onnx_model, tokenizer=tokenizer) - text = "This is a test" - outputs = pipe(text, decoder_start_token_id=decoder_start_token_id) - self.assertEqual(pipe.device, onnx_model.device) - self.assertIsInstance(outputs[0]["summary_text"], str) + tokenizer = get_preprocessor(model_id) - # Translation - pipe = pipeline("translation_en_to_de", model=onnx_model, tokenizer=tokenizer) - text = "This is a test" - outputs = pipe(text, decoder_start_token_id=decoder_start_token_id) - self.assertEqual(pipe.device, onnx_model.device) - self.assertIsInstance(outputs[0]["translation_text"], str) + decoder_start_token_id = onnx_model.config.decoder_start_token_id if model_arch != "mbart" else 2 + if model_arch == "encoder-decoder": + decoder_start_token_id = tokenizer.cls_token_id + + # Text2Text generation + pipe = pipeline("text2text-generation", model=onnx_model, tokenizer=tokenizer) + text = "This is a test" + outputs = pipe(text, decoder_start_token_id=decoder_start_token_id) + 
self.assertEqual(pipe.device, onnx_model.device) + self.assertIsInstance(outputs[0]["generated_text"], str) + + # Summarization + pipe = pipeline("summarization", model=onnx_model, tokenizer=tokenizer) + text = "This is a test" + outputs = pipe(text, decoder_start_token_id=decoder_start_token_id) + self.assertEqual(pipe.device, onnx_model.device) + self.assertIsInstance(outputs[0]["summary_text"], str) + + # Translation + pipe = pipeline("translation_en_to_de", model=onnx_model, tokenizer=tokenizer) + text = "This is a test" + outputs = pipe(text, decoder_start_token_id=decoder_start_token_id) + self.assertEqual(pipe.device, onnx_model.device) + self.assertIsInstance(outputs[0]["translation_text"], str) gc.collect() @@ -3312,27 +3365,38 @@ def test_pipeline_model_is_none(self): @require_torch_gpu @pytest.mark.gpu_test def test_pipeline_on_gpu(self, test_name: str, model_arch: str, use_cache: bool): - if model_arch == "encoder-decoder": - use_cache = False model_args = {"test_name": test_name, "model_arch": model_arch, "use_cache": use_cache} self._setup(model_args) - model_id = MODEL_NAMES[model_arch] - onnx_model = ORTModelForSeq2SeqLM.from_pretrained(self.onnx_model_dirs[test_name], use_cache=use_cache) - tokenizer = get_preprocessor(model_id) - pipe = pipeline("translation_en_to_de", model=onnx_model, tokenizer=tokenizer, return_tensors=False, device=0) - text = "My Name is Philipp and i live" - outputs = pipe(text, max_length=2 * len(text) + 1) - # check model device - self.assertEqual(pipe.model.device.type.lower(), "cuda") - # compare model output class - self.assertTrue(isinstance(outputs[0]["translation_text"], str)) + model_ids = self._get_model_ids(model_arch) + for model_id in model_ids: + if ( + model_arch == "encoder-decoder" + and "text2text-generation-with-past" not in MODEL_NAMES[model_arch][model_id] + ): + # The model with use_cache=True is not supported for bert as a decoder" + continue - pipe = pipeline("translation_en_to_de", model=onnx_model, tokenizer=tokenizer, return_tensors=True, device=0) + onnx_model = self._load_model(model_id, model_arch, test_name, use_cache) - outputs = pipe(text, min_length=len(text) + 1, max_length=2 * len(text) + 1) - self.assertTrue(isinstance(outputs[0]["translation_token_ids"], torch.Tensor)) - self.assertTrue(len(outputs[0]["translation_token_ids"]) > len(text)) + tokenizer = get_preprocessor(model_id) + pipe = pipeline( + "translation_en_to_de", model=onnx_model, tokenizer=tokenizer, return_tensors=False, device=0 + ) + text = "My Name is Philipp and i live" + outputs = pipe(text, max_length=2 * len(text) + 1) + # check model device + self.assertEqual(pipe.model.device.type.lower(), "cuda") + # compare model output class + self.assertTrue(isinstance(outputs[0]["translation_text"], str)) + + pipe = pipeline( + "translation_en_to_de", model=onnx_model, tokenizer=tokenizer, return_tensors=True, device=0 + ) + + outputs = pipe(text, min_length=len(text) + 1, max_length=2 * len(text) + 1) + self.assertTrue(isinstance(outputs[0]["translation_token_ids"], torch.Tensor)) + self.assertTrue(len(outputs[0]["translation_token_ids"]) > len(text)) # TRT EP compile time can be long, so we don't test all archs @parameterized.expand(grid_parameters({"model_arch": ["t5"], "use_cache": [True, False]})) @@ -3385,51 +3449,57 @@ def test_pipeline_on_trt_execution_provider(self, test_name: str, model_arch: st @parameterized.expand(SUPPORTED_ARCHITECTURES) @pytest.mark.gpu_test # mark as GPU test as well to run the without/with cache timing test on the slow 
tests def test_compare_with_and_without_past_key_values(self, model_arch: str): - if model_arch == "m2m_100" or model_arch == "encoder-decoder": - self.skipTest("m2m_100 and encoder-decoder comparison with/without pkv fail or is not supported") + if model_arch == "m2m_100": + self.skipTest("m2m_100 comparison with/without pkv fail or is not supported") model_args = {"test_name": model_arch + "_False", "model_arch": model_arch, "use_cache": False} self._setup(model_args) model_args = {"test_name": model_arch + "_True", "model_arch": model_arch, "use_cache": True} self._setup(model_args) - model_id = MODEL_NAMES[model_arch] - tokenizer = get_preprocessor(model_id) - text = "This is a sample output" - tokens = tokenizer(text, return_tensors="pt") - model_with_pkv = ORTModelForSeq2SeqLM.from_pretrained( - self.onnx_model_dirs[model_arch + "_True"], use_cache=True - ) + model_ids = self._get_model_ids(model_arch) + for model_id in model_ids: + if ( + model_arch == "encoder-decoder" + and "text2text-generation-with-past" not in MODEL_NAMES[model_arch][model_id] + ): + # The model with use_cache=True is not supported for bert as a decoder") + continue - _ = model_with_pkv.generate(**tokens) # warmup - with Timer() as with_pkv_timer: - outputs_model_with_pkv = model_with_pkv.generate( - **tokens, min_new_tokens=self.GENERATION_LENGTH, max_new_tokens=self.GENERATION_LENGTH, num_beams=1 + tokenizer = get_preprocessor(model_id) + text = "This is a sample output" + tokens = tokenizer(text, return_tensors="pt") + model_with_pkv = ORTModelForSeq2SeqLM.from_pretrained( + self._get_onnx_model_dir(model_id, model_arch, model_arch + "_True"), use_cache=True ) - model_without_pkv = ORTModelForSeq2SeqLM.from_pretrained( - self.onnx_model_dirs[model_arch + "_False"], use_cache=False - ) - _ = model_without_pkv.generate(**tokens) # warmup - with Timer() as without_pkv_timer: - outputs_model_without_pkv = model_without_pkv.generate( - **tokens, min_new_tokens=self.GENERATION_LENGTH, max_new_tokens=self.GENERATION_LENGTH, num_beams=1 + _ = model_with_pkv.generate(**tokens) # warmup + with Timer() as with_pkv_timer: + outputs_model_with_pkv = model_with_pkv.generate( + **tokens, min_new_tokens=self.GENERATION_LENGTH, max_new_tokens=self.GENERATION_LENGTH, num_beams=1 + ) + + model_without_pkv = ORTModelForSeq2SeqLM.from_pretrained( + self._get_onnx_model_dir(model_id, model_arch, model_arch + "_False"), use_cache=False ) + _ = model_without_pkv.generate(**tokens) # warmup + with Timer() as without_pkv_timer: + outputs_model_without_pkv = model_without_pkv.generate( + **tokens, min_new_tokens=self.GENERATION_LENGTH, max_new_tokens=self.GENERATION_LENGTH, num_beams=1 + ) - self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv)) - self.assertEqual(outputs_model_with_pkv.shape[1], self.GENERATION_LENGTH + 1) - self.assertEqual(outputs_model_without_pkv.shape[1], self.GENERATION_LENGTH + 1) + self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv)) + self.assertEqual(outputs_model_with_pkv.shape[1], self.GENERATION_LENGTH + 1) + self.assertEqual(outputs_model_without_pkv.shape[1], self.GENERATION_LENGTH + 1) - if os.environ.get("TEST_LEVEL", 0) == "1": - self.assertTrue( - without_pkv_timer.elapsed / with_pkv_timer.elapsed > self.SPEEDUP_CACHE, - f"With pkv latency: {with_pkv_timer.elapsed:.3f} ms, without pkv latency: {without_pkv_timer.elapsed:.3f} ms," - f" speedup: {without_pkv_timer.elapsed / with_pkv_timer.elapsed:.3f}", - ) + if os.environ.get("TEST_LEVEL", 0) == 
"1": + self.assertTrue( + without_pkv_timer.elapsed / with_pkv_timer.elapsed > self.SPEEDUP_CACHE, + f"With pkv latency: {with_pkv_timer.elapsed:.3f} ms, without pkv latency: {without_pkv_timer.elapsed:.3f} ms," + f" speedup: {without_pkv_timer.elapsed / with_pkv_timer.elapsed:.3f}", + ) @parameterized.expand(grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "use_cache": [True]})) def test_compare_merged_and_not_merged_models_outputs(self, test_name: str, model_arch: str, use_cache: bool): - if model_arch == "encoder-decoder" and use_cache is True: - self.skipTest("encoder-decoder model type with use_cache=True is not supported for bert as a decoder") model_args = { "test_name": test_name + "_True", "model_arch": model_arch, @@ -3445,29 +3515,37 @@ def test_compare_merged_and_not_merged_models_outputs(self, test_name: str, mode } self._setup(model_args) - model_id = MODEL_NAMES[model_arch] - tokenizer = get_preprocessor(model_id) - text = "My Name is Philipp and i live" - tokens = tokenizer(text, return_tensors="pt") + model_ids = self._get_model_ids(model_arch) + for model_id in model_ids: + if ( + model_arch == "encoder-decoder" + and "text2text-generation-with-past" not in MODEL_NAMES[model_arch][model_id] + ): + # The model with use_cache=True is not supported for bert as a decoder") + continue - model_not_merged_dir = self.onnx_model_dirs[test_name + "_False"] - model_merged_dir = self.onnx_model_dirs[test_name + "_True"] + tokenizer = get_preprocessor(model_id) + text = "My Name is Philipp and i live" + tokens = tokenizer(text, return_tensors="pt") - model_not_merged = ORTModelForSeq2SeqLM.from_pretrained(model_not_merged_dir) - not_merged_onnx_path = Path(model_not_merged_dir, ONNX_DECODER_NAME) - self.assertFalse(has_onnx_input(not_merged_onnx_path, "use_cache_branch")) - self.assertEqual(model_not_merged.use_merged, False) + model_not_merged_dir = self._get_onnx_model_dir(model_id, model_arch, test_name + "_False") + model_merged_dir = self._get_onnx_model_dir(model_id, model_arch, test_name + "_True") - model_merged = ORTModelForSeq2SeqLM.from_pretrained(model_merged_dir) - merged_onnx_path = Path(model_merged_dir, ONNX_DECODER_MERGED_NAME) - self.assertTrue(has_onnx_input(merged_onnx_path, "use_cache_branch")) - self.assertEqual(model_merged.decoder_with_past, None) - self.assertEqual(model_merged.use_merged, True) + model_not_merged = ORTModelForSeq2SeqLM.from_pretrained(model_not_merged_dir) + not_merged_onnx_path = Path(model_not_merged_dir, ONNX_DECODER_NAME) + self.assertFalse(has_onnx_input(not_merged_onnx_path, "use_cache_branch")) + self.assertEqual(model_not_merged.use_merged, False) - outputs_model_not_merged = model_not_merged.generate(**tokens) - outputs_model_merged = model_merged.generate(**tokens) + model_merged = ORTModelForSeq2SeqLM.from_pretrained(model_merged_dir) + merged_onnx_path = Path(model_merged_dir, ONNX_DECODER_MERGED_NAME) + self.assertTrue(has_onnx_input(merged_onnx_path, "use_cache_branch")) + self.assertEqual(model_merged.decoder_with_past, None) + self.assertEqual(model_merged.use_merged, True) - self.assertTrue(torch.equal(outputs_model_merged, outputs_model_not_merged)) + outputs_model_not_merged = model_not_merged.generate(**tokens) + outputs_model_merged = model_merged.generate(**tokens) + + self.assertTrue(torch.equal(outputs_model_merged, outputs_model_not_merged)) @parameterized.expand( grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "use_cache": [True], "use_merged": [False, True]}) @@ -3487,33 +3565,41 @@ def 
test_compare_to_io_binding(self, test_name: str, model_arch: str, use_cache: self._setup(model_args) - model_id = MODEL_NAMES[model_arch] - onnx_model = ORTModelForSeq2SeqLM.from_pretrained( - self.onnx_model_dirs[test_name], use_io_binding=False, use_cache=use_cache - ).to("cuda") - io_model = ORTModelForSeq2SeqLM.from_pretrained( - self.onnx_model_dirs[test_name], use_io_binding=True, use_cache=use_cache - ).to("cuda") + model_ids = self._get_model_ids(model_arch) + for model_id in model_ids: + if ( + model_arch == "encoder-decoder" + and "text2text-generation-with-past" not in MODEL_NAMES[model_arch][model_id] + ): + # The model with use_cache=True is not supported for bert as a decoder") + continue - self.assertFalse(onnx_model.use_io_binding) - self.assertTrue(io_model.use_io_binding) + onnx_model = ORTModelForSeq2SeqLM.from_pretrained( + self._get_onnx_model_dir(model_id, model_arch, test_name), use_io_binding=False, use_cache=use_cache + ).to("cuda") + io_model = ORTModelForSeq2SeqLM.from_pretrained( + self._get_onnx_model_dir(model_id, model_arch, test_name), use_io_binding=True, use_cache=use_cache + ).to("cuda") - tokenizer = get_preprocessor(model_id) - tokens = tokenizer(["This is a sample output"] * 2, return_tensors="pt").to("cuda") - decoder_start_token_id = onnx_model.config.decoder_start_token_id if model_arch != "mbart" else 2 - if model_arch == "encoder-decoder": - decoder_start_token_id = tokenizer.cls_token_id + self.assertFalse(onnx_model.use_io_binding) + self.assertTrue(io_model.use_io_binding) - decoder_inputs = {"decoder_input_ids": torch.ones((2, 1), dtype=torch.long) * decoder_start_token_id} + tokenizer = get_preprocessor(model_id) + tokens = tokenizer(["This is a sample output"] * 2, return_tensors="pt").to("cuda") + decoder_start_token_id = onnx_model.config.decoder_start_token_id if model_arch != "mbart" else 2 + if model_arch == "encoder-decoder": + decoder_start_token_id = tokenizer.cls_token_id - onnx_outputs = onnx_model(**tokens, **decoder_inputs) - io_outputs = io_model(**tokens, **decoder_inputs) + decoder_inputs = {"decoder_input_ids": torch.ones((2, 1), dtype=torch.long) * decoder_start_token_id} - self.assertTrue("logits" in io_outputs) - self.assertIsInstance(io_outputs.logits, torch.Tensor) + onnx_outputs = onnx_model(**tokens, **decoder_inputs) + io_outputs = io_model(**tokens, **decoder_inputs) - # compare tensor outputs - self.assertTrue(torch.equal(onnx_outputs.logits, io_outputs.logits)) + self.assertTrue("logits" in io_outputs) + self.assertIsInstance(io_outputs.logits, torch.Tensor) + + # compare tensor outputs + self.assertTrue(torch.equal(onnx_outputs.logits, io_outputs.logits)) gc.collect() @@ -3536,21 +3622,29 @@ def test_compare_generation_to_io_binding( self._setup(model_args) - model_id = MODEL_NAMES[model_arch] - onnx_model = ORTModelForSeq2SeqLM.from_pretrained( - self.onnx_model_dirs[test_name], use_io_binding=False, use_cache=use_cache - ).to("cuda") - io_model = ORTModelForSeq2SeqLM.from_pretrained( - self.onnx_model_dirs[test_name], use_io_binding=True, use_cache=use_cache - ).to("cuda") + model_ids = self._get_model_ids(model_arch) + for model_id in model_ids: + if ( + model_arch == "encoder-decoder" + and "text2text-generation-with-past" not in MODEL_NAMES[model_arch][model_id] + ): + # The model with use_cache=True is not supported for bert as a decoder") + continue - tokenizer = get_preprocessor(model_id) - tokens = tokenizer("This is a sample output", return_tensors="pt").to("cuda") - onnx_outputs = 
onnx_model.generate(**tokens, num_beams=5) - io_outputs = io_model.generate(**tokens, num_beams=5) + onnx_model = ORTModelForSeq2SeqLM.from_pretrained( + self._get_onnx_model_dir(model_id, model_arch, test_name), use_io_binding=False, use_cache=use_cache + ).to("cuda") + io_model = ORTModelForSeq2SeqLM.from_pretrained( + self._get_onnx_model_dir(model_id, model_arch, test_name), use_io_binding=True, use_cache=use_cache + ).to("cuda") - # compare tensor outputs - self.assertTrue(torch.equal(onnx_outputs, io_outputs)) + tokenizer = get_preprocessor(model_id) + tokens = tokenizer("This is a sample output", return_tensors="pt").to("cuda") + onnx_outputs = onnx_model.generate(**tokens, num_beams=5) + io_outputs = io_model.generate(**tokens, num_beams=5) + + # compare tensor outputs + self.assertTrue(torch.equal(onnx_outputs, io_outputs)) gc.collect() diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py index 6e2dc16ae7..44d5caae5e 100644 --- a/tests/onnxruntime/utils_onnxruntime_tests.py +++ b/tests/onnxruntime/utils_onnxruntime_tests.py @@ -50,8 +50,13 @@ "detr": "hf-internal-testing/tiny-random-detr", "distilbert": "hf-internal-testing/tiny-random-DistilBertModel", "electra": "hf-internal-testing/tiny-random-ElectraModel", - # "encoder-decoder": "hf-internal-testing/tiny-random-EncoderDecoderModel-bert-bert", - "encoder-decoder": "mohitsha/tiny-random-testing-bert2gpt2", + # "encoder-decoder": "mohitsha/tiny-random-testing-bert2gpt2", + "encoder-decoder": { + "hf-internal-testing/tiny-random-EncoderDecoderModel-bert-bert": [ + "text2text-generation", + ], + "mohitsha/tiny-random-testing-bert2gpt2": ["text2text-generation", "text2text-generation-with-past"], + }, "flaubert": "hf-internal-testing/tiny-random-flaubert", "gpt2": "hf-internal-testing/tiny-random-gpt2", "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel", @@ -138,22 +143,49 @@ def _setup(self, model_args: Dict): ): self.skipTest("Unsupported export case") + model_ids = MODEL_NAMES[model_arch] + if isinstance(model_ids, dict): + model_ids = list(model_ids.keys()) + else: + model_ids = [model_ids] + if model_arch_and_params not in self.onnx_model_dirs: + self.onnx_model_dirs[model_arch_and_params] = {} + # model_args will contain kwargs to pass to ORTModel.from_pretrained() model_args.pop("test_name") model_args.pop("model_arch") - model_id = ( - self.ARCH_MODEL_MAP[model_arch] if model_arch in self.ARCH_MODEL_MAP else MODEL_NAMES[model_arch] - ) - set_seed(SEED) - onnx_model = self.ORTMODEL_CLASS.from_pretrained(model_id, **model_args, use_io_binding=False, export=True) - - model_dir = tempfile.mkdtemp(prefix=f"{model_arch_and_params}_{self.TASK}_") - onnx_model.save_pretrained(model_dir) - self.onnx_model_dirs[model_arch_and_params] = model_dir + for idx, model_id in enumerate(model_ids): + if model_arch == "encoder-decoder" and task not in MODEL_NAMES[model_arch][model_id]: + # The model with use_cache=True is not supported for bert as a decoder") + continue + + if model_arch in self.ARCH_MODEL_MAP: + if isinstance(MODEL_NAMES[model_arch], dict): + model_id = list(self.ARCH_MODEL_MAP[model_arch].keys())[idx] + else: + model_id = self.ARCH_MODEL_MAP[model_arch] + + set_seed(SEED) + onnx_model = self.ORTMODEL_CLASS.from_pretrained( + model_id, **model_args, use_io_binding=False, export=True + ) + + model_dir = tempfile.mkdtemp( + prefix=f"{model_arch_and_params}_{self.TASK}_{model_id.replace('/', '_')}" + ) + onnx_model.save_pretrained(model_dir) + if 
isinstance(MODEL_NAMES[model_arch], dict): + self.onnx_model_dirs[model_arch_and_params][model_id] = model_dir + else: + self.onnx_model_dirs[model_arch_and_params] = model_dir @classmethod def tearDownClass(cls): for _, dir_path in cls.onnx_model_dirs.items(): - shutil.rmtree(dir_path) + if isinstance(dir_path, dict): + for _, sec_dir_path in dir_path.items(): + shutil.rmtree(sec_dir_path) + else: + shutil.rmtree(dir_path) From 56cd5e876faee5908026ba3cb1eaa468d880c248 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Sun, 3 Sep 2023 19:01:55 +0000 Subject: [PATCH 3/4] remove comments --- tests/exporters/exporters_utils.py | 22 +++++++++++----------- tests/onnxruntime/test_modeling.py | 2 +- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py index 12483303f8..18ce225fcb 100644 --- a/tests/exporters/exporters_utils.py +++ b/tests/exporters/exporters_utils.py @@ -141,17 +141,17 @@ "speech-to-text": "hf-internal-testing/tiny-random-Speech2TextModel", "xlm": "hf-internal-testing/tiny-random-XLMModel", "xlm-roberta": "hf-internal-testing/tiny-xlm-roberta", - # "vision-encoder-decoder": { - # "hf-internal-testing/tiny-random-VisionEncoderDecoderModel-vit-gpt2": [ - # "image-to-text", - # "image-to-text-with-past", - # ], - # "microsoft/trocr-small-handwritten": ["image-to-text"], - # "fxmarty/tiny-doc-qa-vision-encoder-decoder": [ - # "document-question-answering", - # "document-question-answering-with-past", - # ], - # }, + "vision-encoder-decoder": { + "hf-internal-testing/tiny-random-VisionEncoderDecoderModel-vit-gpt2": [ + "image-to-text", + "image-to-text-with-past", + ], + "microsoft/trocr-small-handwritten": ["image-to-text"], + "fxmarty/tiny-doc-qa-vision-encoder-decoder": [ + "document-question-answering", + "document-question-answering-with-past", + ], + }, } diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index c4fd279fc9..a9d9864637 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -4051,7 +4051,7 @@ def test_compare_generation_to_io_binding( class ORTModelForVision2SeqIntegrationTest(ORTModelTestMixin): # TODO: speech_to_text should be tested - SUPPORTED_ARCHITECTURES = ["trocr"] + SUPPORTED_ARCHITECTURES = ["vision-encoder-decoder", "trocr"] FULL_GRID = { "model_arch": SUPPORTED_ARCHITECTURES, From 990bc5944f9673bd2683e33358cbed088f4bab86 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Sun, 3 Sep 2023 20:02:06 +0000 Subject: [PATCH 4/4] remove extra func --- tests/onnxruntime/test_modeling.py | 17 ++++++++--------- tests/onnxruntime/utils_onnxruntime_tests.py | 1 - 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index a9d9864637..d9ca1860ff 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -3097,10 +3097,6 @@ def _get_onnx_model_dir(self, model_id, model_arch, test_name): return onnx_model_dir - def _load_model(self, model_id, model_arch, test_name, use_cache): - onnx_model_dir = self._get_onnx_model_dir(model_id, model_arch, test_name) - return ORTModelForSeq2SeqLM.from_pretrained(onnx_model_dir, use_cache=use_cache) - def test_inference_old_onnx_model(self): model = ORTModelForSeq2SeqLM.from_pretrained("optimum/t5-small") @@ -3130,7 +3126,9 @@ def test_generate_utils(self, test_name: str, model_arch: str, use_cache: str): ): continue - model = self._load_model(model_id, model_arch, 
test_name, use_cache) + onnx_model_dir = self._get_onnx_model_dir(model_id, model_arch, test_name) + model = ORTModelForSeq2SeqLM.from_pretrained(onnx_model_dir, use_cache=use_cache) + tokenizer = get_preprocessor(model_id) text = "This is a sample output" tokens = tokenizer(text, return_tensors="pt") @@ -3233,8 +3231,7 @@ def test_compare_to_transformers(self, test_name: str, model_arch: str, use_cach continue onnx_model_dir = self._get_onnx_model_dir(model_id, model_arch, test_name) - - onnx_model = self._load_model(model_id, model_arch, test_name, use_cache) + onnx_model = ORTModelForSeq2SeqLM.from_pretrained(onnx_model_dir, use_cache=use_cache) self.assertIsInstance(onnx_model.encoder, ORTEncoder) if use_merged is False: @@ -3309,7 +3306,8 @@ def test_pipeline_text_generation(self, test_name: str, model_arch: str, use_cac # The model with use_cache=True is not supported for bert as a decoder") continue - onnx_model = self._load_model(model_id, model_arch, test_name, use_cache) + onnx_model_dir = self._get_onnx_model_dir(model_id, model_arch, test_name) + onnx_model = ORTModelForSeq2SeqLM.from_pretrained(onnx_model_dir, use_cache=use_cache) tokenizer = get_preprocessor(model_id) @@ -3377,7 +3375,8 @@ def test_pipeline_on_gpu(self, test_name: str, model_arch: str, use_cache: bool) # The model with use_cache=True is not supported for bert as a decoder" continue - onnx_model = self._load_model(model_id, model_arch, test_name, use_cache) + onnx_model_dir = self._get_onnx_model_dir(model_id, model_arch, test_name) + onnx_model = ORTModelForSeq2SeqLM.from_pretrained(onnx_model_dir, use_cache=use_cache) tokenizer = get_preprocessor(model_id) pipe = pipeline( diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py index 44d5caae5e..8ec6ce90b0 100644 --- a/tests/onnxruntime/utils_onnxruntime_tests.py +++ b/tests/onnxruntime/utils_onnxruntime_tests.py @@ -50,7 +50,6 @@ "detr": "hf-internal-testing/tiny-random-detr", "distilbert": "hf-internal-testing/tiny-random-DistilBertModel", "electra": "hf-internal-testing/tiny-random-ElectraModel", - # "encoder-decoder": "mohitsha/tiny-random-testing-bert2gpt2", "encoder-decoder": { "hf-internal-testing/tiny-random-EncoderDecoderModel-bert-bert": [ "text2text-generation",
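
As a standalone illustration (not part of the diffs above) of the MODEL_NAMES convention these patches introduce, the sketch below shows how an architecture entry may be either a single checkpoint id or a dict mapping checkpoint ids to the export tasks they support, and how the tests resolve it. The checkpoint ids and task names are taken from the diffs; get_model_ids and supports_with_past are simplified stand-ins for the test helpers (_get_model_ids and the "with-past" skip checks), not functions defined by the patches.

from typing import Dict, List, Union

# An entry is either a plain checkpoint id, or a dict mapping checkpoint id -> supported export tasks.
MODEL_NAMES: Dict[str, Union[str, Dict[str, List[str]]]] = {
    "gpt2": "hf-internal-testing/tiny-random-gpt2",
    "encoder-decoder": {
        "hf-internal-testing/tiny-random-EncoderDecoderModel-bert-bert": ["text2text-generation"],
        "mohitsha/tiny-random-testing-bert2gpt2": [
            "text2text-generation",
            "text2text-generation-with-past",
        ],
    },
}


def get_model_ids(model_arch: str) -> List[str]:
    # Mirrors _get_model_ids(): dict entries expose every checkpoint id, plain strings expose one.
    entry = MODEL_NAMES[model_arch]
    return list(entry.keys()) if isinstance(entry, dict) else [entry]


def supports_with_past(model_arch: str, model_id: str) -> bool:
    # The refactored tests skip use_cache=True for checkpoints whose task list lacks
    # "text2text-generation-with-past", e.g. a BERT decoder inside an EncoderDecoderModel.
    entry = MODEL_NAMES[model_arch]
    return not isinstance(entry, dict) or "text2text-generation-with-past" in entry[model_id]


assert get_model_ids("gpt2") == ["hf-internal-testing/tiny-random-gpt2"]
assert not supports_with_past("encoder-decoder", "hf-internal-testing/tiny-random-EncoderDecoderModel-bert-bert")
assert supports_with_past("encoder-decoder", "mohitsha/tiny-random-testing-bert2gpt2")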