Fix Kosmos2Processor batch mode #27323

Merged · 3 commits · Nov 6, 2023
src/transformers/models/kosmos2/processing_kosmos2.py: 4 changes (3 additions, 1 deletion)
@@ -211,7 +211,9 @@ def __call__(
                 image_embeds_position_mask.append(mask)

             if isinstance(text, list):
-                sorted_length = sorted([(idx, len(x)) for idx, x in enumerate(text_encoding.input_ids)])
+                sorted_length = sorted(
+                    [(idx, len(x)) for idx, x in enumerate(text_encoding.input_ids)], key=lambda x: x[-1]
+                )
                 _, min_len_not_padded = sorted_length[0]
                 idx, _ = sorted_length[-1]
Collaborator Author:

Here my intention was to sort by the length, not the index. However, I forgot to specify the key, so it did not do what I expected.
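For context: Python compares tuples element-wise, so without a `key` these `(index, length)` pairs are simply ordered by batch index rather than by sequence length. A minimal sketch of the difference, with made-up lengths for illustration:

```python
# Hypothetical (batch index, token length) pairs for a batch of three sequences.
pairs = [(0, 7), (1, 12), (2, 5)]

# Default tuple comparison sorts by the first element, i.e. the batch index.
print(sorted(pairs))                        # [(0, 7), (1, 12), (2, 5)]

# Sorting by the last element gives the intended shortest-to-longest order.
print(sorted(pairs, key=lambda x: x[-1]))   # [(2, 5), (0, 7), (1, 12)]
```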

Collaborator Author:

Note that we do have tests covering the batch mode, but the examples I used have the longest sequence as the last example in the batch, so they didn't detect this issue 😭
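To see why that ordering hides the bug (again with made-up lengths): when the longest sequence is already last, sorting by index and sorting by length both leave it at position `[-1]`, so `idx, _ = sorted_length[-1]` still selects the right sequence and the assertions pass despite the missing key.

```python
# Made-up (batch index, token length) pairs where the longest sequence comes last,
# mirroring the original test batch.
pairs = [(0, 5), (1, 12)]
buggy = sorted(pairs)                        # index order  -> [(0, 5), (1, 12)]
fixed = sorted(pairs, key=lambda x: x[-1])   # length order -> [(0, 5), (1, 12)]
assert buggy[-1] == fixed[-1]                # both end at index 1, so the bug is invisible

# With the longest sequence first (as in the updated test below), the two orderings
# disagree and the missing key would have been caught.
pairs = [(0, 12), (1, 5)]
assert sorted(pairs)[-1] != sorted(pairs, key=lambda x: x[-1])[-1]
```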

Collaborator:

Can we update the test so that it would have been caught?

Collaborator Author:

OK!

tests/models/kosmos2/test_modeling_kosmos2.py: 28 changes (14 additions, 14 deletions)
@@ -686,7 +686,7 @@ def test_snowman_image_captioning_batch(self):

         model = AutoModelForVision2Seq.from_pretrained("microsoft/kosmos-2-patch14-224").to(torch_device)

-        prompt = ["<grounding>An image of", "<grounding>Describe this image in detail:"]
+        prompt = ["<grounding>Describe this image in detail:", "<grounding>An image of"]

         # left padding
         processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224", padding_side="left")
@@ -699,10 +699,6 @@

         # left padding gives identical results as non-padding
         EXPECTED_PROCESSED_TEXT_0 = (
-            "<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> "
-            "warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>."
-        )
-        EXPECTED_PROCESSED_TEXT_1 = (
             "<grounding> Describe this image in detail: The image features a snowman sitting by<phrase> a campfire"
             "</phrase><object><patch_index_0005><patch_index_1007></object> in the snow. He is wearing<phrase> a hat"
             "</phrase><object><patch_index_0048><patch_index_0250></object>,<phrase> scarf</phrase><object>"
@@ -712,28 +708,32 @@
             "nearby. The snowman appears to be enjoying the warmth of the fire, and it appears to have a warm and cozy "
             "atmosphere."
         )
+        EXPECTED_PROCESSED_TEXT_1 = (
+            "<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> "
+            "warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>."
+        )
         self.assertListEqual(processed_text, [EXPECTED_PROCESSED_TEXT_0, EXPECTED_PROCESSED_TEXT_1])

-        EXPECTED_FINAL_TEXT_0 = "An image of a snowman warming himself by a fire."
-        EXPECTED_FINAL_TEXT_1 = (
+        EXPECTED_FINAL_TEXT_0 = (
             "Describe this image in detail: The image features a snowman sitting by a campfire in the snow. He is "
             "wearing a hat, scarf, and gloves, with a pot nearby and a cup placed nearby. The snowman appears to be "
             "enjoying the warmth of the fire, and it appears to have a warm and cozy atmosphere."
         )
+        EXPECTED_FINAL_TEXT_1 = "An image of a snowman warming himself by a fire."
         self.assertListEqual(all_final_text, [EXPECTED_FINAL_TEXT_0, EXPECTED_FINAL_TEXT_1])

         EXPECTED_ENTITIES_0 = [
-            ("a snowman", (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]),
-            ("a fire", (41, 47), [(0.171875, 0.015625, 0.484375, 0.890625)]),
-        ]
-        EXPECTED_ENTITIES_1 = [
             ("a campfire", (71, 81), [(0.171875, 0.015625, 0.484375, 0.984375)]),
             ("a hat", (109, 114), [(0.515625, 0.046875, 0.828125, 0.234375)]),
             ("scarf", (116, 121), [(0.515625, 0.234375, 0.890625, 0.578125)]),
             ("gloves", (127, 133), [(0.515625, 0.390625, 0.640625, 0.515625)]),
             ("a pot", (140, 145), [(0.078125, 0.609375, 0.265625, 0.859375)]),
             ("a cup", (157, 162), [(0.890625, 0.765625, 0.984375, 0.984375)]),
         ]
+        EXPECTED_ENTITIES_1 = [
+            ("a snowman", (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]),
+            ("a fire", (41, 47), [(0.171875, 0.015625, 0.484375, 0.890625)]),
+        ]
         self.assertListEqual(all_entities, [EXPECTED_ENTITIES_0, EXPECTED_ENTITIES_1])

         # right padding
@@ -746,6 +746,6 @@
         all_entities = [x[1] for x in final_text_with_entities]

         # For right padding, only the non-padded sequences will give the same results as non-padding
-        self.assertEqual(processed_text[1], EXPECTED_PROCESSED_TEXT_1)
-        self.assertEqual(all_final_text[1], EXPECTED_FINAL_TEXT_1)
-        self.assertListEqual(all_entities[1], EXPECTED_ENTITIES_1)
+        self.assertEqual(processed_text[0], EXPECTED_PROCESSED_TEXT_0)
+        self.assertEqual(all_final_text[0], EXPECTED_FINAL_TEXT_0)
+        self.assertListEqual(all_entities[0], EXPECTED_ENTITIES_0)