From 8599b7136bb86e8b890ea21799a75a795e5c4b0f Mon Sep 17 00:00:00 2001 From: ydshieh Date: Mon, 6 Nov 2023 18:15:08 +0100 Subject: [PATCH 1/3] fix --- src/transformers/models/kosmos2/processing_kosmos2.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/kosmos2/processing_kosmos2.py b/src/transformers/models/kosmos2/processing_kosmos2.py index 5dc0fad0de0107..6943c12c487179 100644 --- a/src/transformers/models/kosmos2/processing_kosmos2.py +++ b/src/transformers/models/kosmos2/processing_kosmos2.py @@ -211,7 +211,9 @@ def __call__( image_embeds_position_mask.append(mask) if isinstance(text, list): - sorted_length = sorted([(idx, len(x)) for idx, x in enumerate(text_encoding.input_ids)]) + sorted_length = sorted( + [(idx, len(x)) for idx, x in enumerate(text_encoding.input_ids)], key=lambda x: x[-1] + ) _, min_len_not_padded = sorted_length[0] idx, _ = sorted_length[-1] From 796b83918a84a275922a32ab987d2d651b2da9f9 Mon Sep 17 00:00:00 2001 From: ydshieh Date: Mon, 6 Nov 2023 18:35:36 +0100 Subject: [PATCH 2/3] fix --- tests/models/kosmos2/test_modeling_kosmos2.py | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/models/kosmos2/test_modeling_kosmos2.py b/tests/models/kosmos2/test_modeling_kosmos2.py index 3f55ad9759dd0f..19500d5d8dee68 100644 --- a/tests/models/kosmos2/test_modeling_kosmos2.py +++ b/tests/models/kosmos2/test_modeling_kosmos2.py @@ -686,7 +686,7 @@ def test_snowman_image_captioning_batch(self): model = AutoModelForVision2Seq.from_pretrained("microsoft/kosmos-2-patch14-224").to(torch_device) - prompt = ["An image of", "Describe this image in detail:"] + prompt = ["Describe this image in detail:", "An image of"] # left padding processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224", padding_side="left") @@ -699,10 +699,6 @@ def test_snowman_image_captioning_batch(self): # left padding gives identical results as non-padding EXPECTED_PROCESSED_TEXT_0 = ( - " An image of a snowman " - "warming himself by a fire." - ) - EXPECTED_PROCESSED_TEXT_1 = ( " Describe this image in detail: The image features a snowman sitting by a campfire" " in the snow. He is wearing a hat" ", scarf" @@ -712,21 +708,21 @@ def test_snowman_image_captioning_batch(self): "nearby. The snowman appears to be enjoying the warmth of the fire, and it appears to have a warm and cozy " "atmosphere." ) + EXPECTED_PROCESSED_TEXT_1 = ( + " An image of a snowman " + "warming himself by a fire." + ) self.assertListEqual(processed_text, [EXPECTED_PROCESSED_TEXT_0, EXPECTED_PROCESSED_TEXT_1]) - EXPECTED_FINAL_TEXT_0 = "An image of a snowman warming himself by a fire." - EXPECTED_FINAL_TEXT_1 = ( + EXPECTED_FINAL_TEXT_0 = ( "Describe this image in detail: The image features a snowman sitting by a campfire in the snow. He is " "wearing a hat, scarf, and gloves, with a pot nearby and a cup placed nearby. The snowman appears to be " "enjoying the warmth of the fire, and it appears to have a warm and cozy atmosphere." ) + EXPECTED_FINAL_TEXT_1 = "An image of a snowman warming himself by a fire." self.assertListEqual(all_final_text, [EXPECTED_FINAL_TEXT_0, EXPECTED_FINAL_TEXT_1]) EXPECTED_ENTITIES_0 = [ - ("a snowman", (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]), - ("a fire", (41, 47), [(0.171875, 0.015625, 0.484375, 0.890625)]), - ] - EXPECTED_ENTITIES_1 = [ ("a campfire", (71, 81), [(0.171875, 0.015625, 0.484375, 0.984375)]), ("a hat", (109, 114), [(0.515625, 0.046875, 0.828125, 0.234375)]), ("scarf", (116, 121), [(0.515625, 0.234375, 0.890625, 0.578125)]), @@ -734,6 +730,10 @@ def test_snowman_image_captioning_batch(self): ("a pot", (140, 145), [(0.078125, 0.609375, 0.265625, 0.859375)]), ("a cup", (157, 162), [(0.890625, 0.765625, 0.984375, 0.984375)]), ] + EXPECTED_ENTITIES_1 = [ + ("a snowman", (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]), + ("a fire", (41, 47), [(0.171875, 0.015625, 0.484375, 0.890625)]), + ] self.assertListEqual(all_entities, [EXPECTED_ENTITIES_0, EXPECTED_ENTITIES_1]) # right padding @@ -746,6 +746,6 @@ def test_snowman_image_captioning_batch(self): all_entities = [x[1] for x in final_text_with_entities] # For right padding, only the non-padded sequences will give the same results as non-padding - self.assertEqual(processed_text[1], EXPECTED_PROCESSED_TEXT_1) - self.assertEqual(all_final_text[1], EXPECTED_FINAL_TEXT_1) - self.assertListEqual(all_entities[1], EXPECTED_ENTITIES_1) + self.assertEqual(processed_text[0], EXPECTED_PROCESSED_TEXT_1) + self.assertEqual(all_final_text[0], EXPECTED_FINAL_TEXT_1) + self.assertListEqual(all_entities[0], EXPECTED_ENTITIES_1) From 84a2df50d8f003be3433a4348031e1484bf82dd8 Mon Sep 17 00:00:00 2001 From: ydshieh Date: Mon, 6 Nov 2023 18:38:27 +0100 Subject: [PATCH 3/3] fix --- tests/models/kosmos2/test_modeling_kosmos2.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/models/kosmos2/test_modeling_kosmos2.py b/tests/models/kosmos2/test_modeling_kosmos2.py index 19500d5d8dee68..5491ded1bc8152 100644 --- a/tests/models/kosmos2/test_modeling_kosmos2.py +++ b/tests/models/kosmos2/test_modeling_kosmos2.py @@ -746,6 +746,6 @@ def test_snowman_image_captioning_batch(self): all_entities = [x[1] for x in final_text_with_entities] # For right padding, only the non-padded sequences will give the same results as non-padding - self.assertEqual(processed_text[0], EXPECTED_PROCESSED_TEXT_1) - self.assertEqual(all_final_text[0], EXPECTED_FINAL_TEXT_1) - self.assertListEqual(all_entities[0], EXPECTED_ENTITIES_1) + self.assertEqual(processed_text[0], EXPECTED_PROCESSED_TEXT_0) + self.assertEqual(all_final_text[0], EXPECTED_FINAL_TEXT_0) + self.assertListEqual(all_entities[0], EXPECTED_ENTITIES_0)