Fix Kosmos2Processor batch mode #27323

Merged · 3 commits · Nov 6, 2023
src/transformers/models/kosmos2/processing_kosmos2.py: 4 changes (3 additions, 1 deletion)
@@ -211,7 +211,9 @@ def __call__(
                 image_embeds_position_mask.append(mask)

             if isinstance(text, list):
-                sorted_length = sorted([(idx, len(x)) for idx, x in enumerate(text_encoding.input_ids)])
+                sorted_length = sorted(
+                    [(idx, len(x)) for idx, x in enumerate(text_encoding.input_ids)], key=lambda x: x[-1]
+                )
                 _, min_len_not_padded = sorted_length[0]
                 idx, _ = sorted_length[-1]
Collaborator Author:

Here my intention was to sort by the length, not the index. However, I forgot to specify the key, so it did not do what I expected.
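For context: Python compares tuples element-wise, so without a `key` these `(index, length)` pairs are simply ordered by batch index rather than by sequence length. A minimal sketch of the difference, with made-up lengths for illustration:

```python
# Hypothetical (batch index, token length) pairs for a batch of three sequences.
pairs = [(0, 7), (1, 12), (2, 5)]

# Default tuple comparison sorts by the first element, i.e. the batch index.
print(sorted(pairs))                        # [(0, 7), (1, 12), (2, 5)]

# Sorting by the last element gives the intended shortest-to-longest order.
print(sorted(pairs, key=lambda x: x[-1]))   # [(2, 5), (0, 7), (1, 12)]
```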

Collaborator Author:

Note that we do have tests covering the batch mode, but the examples I used have the longest sequence as the last example in the batch, so they didn't detect this issue 😭
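To see why that ordering hides the bug (again with made-up lengths): when the longest sequence is already last, sorting by index and sorting by length both leave it at position `[-1]`, so `idx, _ = sorted_length[-1]` still selects the right sequence and the assertions pass despite the missing key.

```python
# Made-up (batch index, token length) pairs where the longest sequence comes last,
# mirroring the original test batch.
pairs = [(0, 5), (1, 12)]
buggy = sorted(pairs)                        # index order  -> [(0, 5), (1, 12)]
fixed = sorted(pairs, key=lambda x: x[-1])   # length order -> [(0, 5), (1, 12)]
assert buggy[-1] == fixed[-1]                # both end at index 1, so the bug is invisible

# With the longest sequence first (as in the updated test below), the two orderings
# disagree and the missing key would have been caught.
pairs = [(0, 12), (1, 5)]
assert sorted(pairs)[-1] != sorted(pairs, key=lambda x: x[-1])[-1]
```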

Collaborator:

Can we update the test so that it would have been caught?

Collaborator Author:

OK!

tests/models/kosmos2/test_modeling_kosmos2.py: 28 changes (14 additions, 14 deletions)
@@ -686,7 +686,7 @@ def test_snowman_image_captioning_batch(self):

         model = AutoModelForVision2Seq.from_pretrained("microsoft/kosmos-2-patch14-224").to(torch_device)

-        prompt = ["<grounding>An image of", "<grounding>Describe this image in detail:"]
+        prompt = ["<grounding>Describe this image in detail:", "<grounding>An image of"]

         # left padding
         processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224", padding_side="left")
@@ -699,10 +699,6 @@

         # left padding gives identical results as non-padding
         EXPECTED_PROCESSED_TEXT_0 = (
-            "<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> "
-            "warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>."
-        )
-        EXPECTED_PROCESSED_TEXT_1 = (
             "<grounding> Describe this image in detail: The image features a snowman sitting by<phrase> a campfire"
             "</phrase><object><patch_index_0005><patch_index_1007></object> in the snow. He is wearing<phrase> a hat"
             "</phrase><object><patch_index_0048><patch_index_0250></object>,<phrase> scarf</phrase><object>"
@@ -712,28 +708,32 @@
             "nearby. The snowman appears to be enjoying the warmth of the fire, and it appears to have a warm and cozy "
             "atmosphere."
         )
+        EXPECTED_PROCESSED_TEXT_1 = (
+            "<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> "
+            "warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>."
+        )
         self.assertListEqual(processed_text, [EXPECTED_PROCESSED_TEXT_0, EXPECTED_PROCESSED_TEXT_1])

-        EXPECTED_FINAL_TEXT_0 = "An image of a snowman warming himself by a fire."
-        EXPECTED_FINAL_TEXT_1 = (
+        EXPECTED_FINAL_TEXT_0 = (
             "Describe this image in detail: The image features a snowman sitting by a campfire in the snow. He is "
             "wearing a hat, scarf, and gloves, with a pot nearby and a cup placed nearby. The snowman appears to be "
             "enjoying the warmth of the fire, and it appears to have a warm and cozy atmosphere."
         )
+        EXPECTED_FINAL_TEXT_1 = "An image of a snowman warming himself by a fire."
         self.assertListEqual(all_final_text, [EXPECTED_FINAL_TEXT_0, EXPECTED_FINAL_TEXT_1])

         EXPECTED_ENTITIES_0 = [
-            ("a snowman", (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]),
-            ("a fire", (41, 47), [(0.171875, 0.015625, 0.484375, 0.890625)]),
-        ]
-        EXPECTED_ENTITIES_1 = [
             ("a campfire", (71, 81), [(0.171875, 0.015625, 0.484375, 0.984375)]),
             ("a hat", (109, 114), [(0.515625, 0.046875, 0.828125, 0.234375)]),
             ("scarf", (116, 121), [(0.515625, 0.234375, 0.890625, 0.578125)]),
             ("gloves", (127, 133), [(0.515625, 0.390625, 0.640625, 0.515625)]),
             ("a pot", (140, 145), [(0.078125, 0.609375, 0.265625, 0.859375)]),
             ("a cup", (157, 162), [(0.890625, 0.765625, 0.984375, 0.984375)]),
         ]
+        EXPECTED_ENTITIES_1 = [
+            ("a snowman", (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]),
+            ("a fire", (41, 47), [(0.171875, 0.015625, 0.484375, 0.890625)]),
+        ]
         self.assertListEqual(all_entities, [EXPECTED_ENTITIES_0, EXPECTED_ENTITIES_1])

         # right padding
@@ -746,6 +746,6 @@
         all_entities = [x[1] for x in final_text_with_entities]

         # For right padding, only the non-padded sequences will give the same results as non-padding
-        self.assertEqual(processed_text[1], EXPECTED_PROCESSED_TEXT_1)
-        self.assertEqual(all_final_text[1], EXPECTED_FINAL_TEXT_1)
-        self.assertListEqual(all_entities[1], EXPECTED_ENTITIES_1)
+        self.assertEqual(processed_text[0], EXPECTED_PROCESSED_TEXT_0)
+        self.assertEqual(all_final_text[0], EXPECTED_FINAL_TEXT_0)
+        self.assertListEqual(all_entities[0], EXPECTED_ENTITIES_0)