Jamba: update integration tests #32250
Changes from all commits:
520bec2
2237449
94e81a1
4ea6855
a305dda
fbd83c2
619d88f
1b81b63
c372fd9
27ce70a
9912a22
@@ -650,82 +650,112 @@ def test_new_cache_format(self, num_beams, do_sample):
 class JambaModelIntegrationTest(unittest.TestCase):
     model = None
     tokenizer = None
+    # This variable is used to determine which CUDA device we are using for our runners (A10 or T4).
+    # Depending on the hardware we get different logits / generations.
+    cuda_compute_capability_major_version = None
 
     @classmethod
     def setUpClass(cls):
         model_id = "ai21labs/Jamba-tiny-random"
         cls.model = JambaForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True)
         cls.tokenizer = AutoTokenizer.from_pretrained(model_id)
+        if is_torch_available() and torch.cuda.is_available():
+            # 8 is for A100 / A10 and 7 for T4
+            cls.cuda_compute_capability_major_version = torch.cuda.get_device_capability()[0]
 
     @slow
     def test_simple_generate(self):
+        # Key 9 for MI300, key 8 for A100/A10, and key 7 for T4.
+        #
+        # Note: key 9 is currently set for MI300, but may need future adjustments for H100s,
+        # considering differences in hardware processing and potential deviations in generated text.
+        EXPECTED_TEXTS = {
+            7: "<|startoftext|>Hey how are you doing on this lovely evening? Canyon rins hugaughter glamour Rutgers Singh<|reserved_797|>cw algunas",
+            8: "<|startoftext|>Hey how are you doing on this lovely evening? Canyon rins hugaughter glamour Rutgers Singh Hebrew llam bb",
+            9: "<|startoftext|>Hey how are you doing on this lovely evening? Canyon rins hugaughter glamour Rutgers Singh Hebrew llam bb",
+        }
+
         self.model.to(torch_device)
 
         input_ids = self.tokenizer("Hey how are you doing on this lovely evening?", return_tensors="pt")[
             "input_ids"
         ].to(torch_device)
         out = self.model.generate(input_ids, do_sample=False, max_new_tokens=10)
         output_sentence = self.tokenizer.decode(out[0, :])
-        self.assertEqual(
-            output_sentence,
-            "<|startoftext|>Hey how are you doing on this lovely evening? Canyon rins hugaughter glamour Rutgers Singh Hebrew cases Cats",
-        )
+        self.assertEqual(output_sentence, EXPECTED_TEXTS[self.cuda_compute_capability_major_version])
 
-        with torch.no_grad():
-            logits = self.model(input_ids=input_ids).logits
+        # TODO: there are significant differences in the logits across major CUDA versions, which shouldn't exist
+        if self.cuda_compute_capability_major_version == 8:
+            with torch.no_grad():
+                logits = self.model(input_ids=input_ids).logits
 
-        EXPECTED_LOGITS_NO_GRAD = torch.tensor(
-            [
-                0.0140, -0.2246, 0.0408, -0.1016, 0.0471, 0.2715, -0.1465, 0.1631,
-                -0.2949, -0.0297, 0.0250, -0.5586, -0.2139, -0.1426, -0.1602, 0.1309,
-                0.0703, 0.2236, 0.1729, -0.2285, -0.1152, -0.1177, -0.1367, 0.0289,
-                0.1245, 0.2363, 0.0442, 0.1094, -0.1348, -0.2295, 0.1494, -0.3945,
-                0.1777, -0.4570, -0.0408, 0.2412, 0.1562, -0.1943, 0.2373, -0.0593
-            ]
-            , dtype=torch.float32)  # fmt: skip
+            EXPECTED_LOGITS_NO_GRAD = torch.tensor(
+                [
+                    0.0134, -0.2197, 0.0396, -0.1011, 0.0459, 0.2793, -0.1465, 0.1660,
+                    -0.2930, -0.0278, 0.0269, -0.5586, -0.2109, -0.1426, -0.1553, 0.1279,
+                    0.0713, 0.2246, 0.1660, -0.2314, -0.1187, -0.1162, -0.1377, 0.0292,
+                    0.1245, 0.2275, 0.0374, 0.1089, -0.1348, -0.2305, 0.1484, -0.3906,
+                    0.1709, -0.4590, -0.0447, 0.2422, 0.1592, -0.1855, 0.2441, -0.0562
+                ]
+                , dtype=torch.float32)  # fmt: skip
 
-        torch.testing.assert_close(logits[0, -1, :40].cpu(), EXPECTED_LOGITS_NO_GRAD, rtol=1e-3, atol=1e-3)
+            torch.testing.assert_close(logits[0, -1, :40].cpu(), EXPECTED_LOGITS_NO_GRAD, rtol=1e-3, atol=1e-3)
 
     @slow
     def test_simple_batched_generate_with_padding(self):
+        # Key 9 for MI300, key 8 for A100/A10, and key 7 for T4.
+        #
+        # Note: key 9 is currently set for MI300, but may need future adjustments for H100s,
+        # considering differences in hardware processing and potential deviations in generated text.
+        EXPECTED_TEXTS = {
+            7: [
+                "<|startoftext|>Hey how are you doing on this lovely evening? Canyon rins hugaughter glamour Rutgers Singh Hebrew cases Cats",
+                "<|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|startoftext|>Tell me a storyptus Nets Madison El chamadamodern updximVaparsed",
+            ],
+            8: [
+                "<|startoftext|>Hey how are you doing on this lovely evening? Canyon rins hugaughter glamour Rutgers Singh<|reserved_797|>cw algunas",
+                "<|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|startoftext|>Tell me a storyptus Nets Madison El chamadamodern updximVaparsed",
+            ],
+            9: [
+                "<|startoftext|>Hey how are you doing on this lovely evening? Canyon rins hugaughter glamour Rutgers Singh<|reserved_797|>cw algunas",
+                "<|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|startoftext|>Tell me a storyptus Nets Madison El chamadamodern updximVaparsed",
+            ],
+        }
+
         self.model.to(torch_device)
 
         inputs = self.tokenizer(
             ["Hey how are you doing on this lovely evening?", "Tell me a story"], padding=True, return_tensors="pt"
         ).to(torch_device)
         out = self.model.generate(**inputs, do_sample=False, max_new_tokens=10)
         output_sentences = self.tokenizer.batch_decode(out)
-        self.assertEqual(
-            output_sentences[0],
-            "<|startoftext|>Hey how are you doing on this lovely evening? Canyon rins hugaughter glamour Rutgers Singh Hebrew cases Cats",
-        )
-        self.assertEqual(
-            output_sentences[1],
-            "<|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|startoftext|>Tell me a storyptus Nets Madison El chamadamodern updximVaparsed",
-        )
+        self.assertEqual(output_sentences[0], EXPECTED_TEXTS[self.cuda_compute_capability_major_version][0])
+        self.assertEqual(output_sentences[1], EXPECTED_TEXTS[self.cuda_compute_capability_major_version][1])
 
-        with torch.no_grad():
-            logits = self.model(input_ids=inputs["input_ids"]).logits
-
-        EXPECTED_LOGITS_NO_GRAD_0 = torch.tensor(
-            [
-                0.0140, -0.2246, 0.0408, -0.1016, 0.0471, 0.2715, -0.1465, 0.1631,
-                -0.2949, -0.0297, 0.0250, -0.5586, -0.2139, -0.1426, -0.1602, 0.1309,
-                0.0703, 0.2236, 0.1729, -0.2285, -0.1152, -0.1177, -0.1367, 0.0289,
-                0.1245, 0.2363, 0.0442, 0.1094, -0.1348, -0.2295, 0.1494, -0.3945,
-                0.1777, -0.4570, -0.0408, 0.2412, 0.1562, -0.1943, 0.2373, -0.0593
-            ]
-            , dtype=torch.float32)  # fmt: skip
-
-        EXPECTED_LOGITS_NO_GRAD_1 = torch.tensor(
-            [
-                -0.1289, 0.2363, -0.4180, -0.0302, -0.0476, 0.0327, 0.2578, 0.0874,
-                0.1484, 0.2305, -0.1152, -0.1396, -0.1494, -0.1113, -0.0021, -0.2832,
-                0.2002, -0.2676, 0.0598, -0.1982, -0.2539, -0.1133, -0.1973, 0.2148,
-                0.0559, 0.1670, 0.1846, 0.1270, 0.1680, -0.1250, -0.2656, -0.2871,
-                0.2344, 0.2637, 0.0510, -0.1855, 0.2158, -0.1289, 0.1758, 0.0074
-            ]
-            , dtype=torch.float32)  # fmt: skip
-
-        torch.testing.assert_close(logits[0, -1, :40].cpu(), EXPECTED_LOGITS_NO_GRAD_0, rtol=1e-3, atol=1e-3)
-        torch.testing.assert_close(logits[1, -1, :40].cpu(), EXPECTED_LOGITS_NO_GRAD_1, rtol=1e-3, atol=1e-3)
+        # TODO: there are significant differences in the logits across major CUDA versions, which shouldn't exist
+        if self.cuda_compute_capability_major_version == 8:
Review comment (on the line above): maybe better to use […]

Reply: oops, merged before seeing this comment! You have a good point; in fact, we should split the test in two so the logits check can be tested (or skipped) separately. A sketch of that split follows the diff below.
+            with torch.no_grad():
+                logits = self.model(input_ids=inputs["input_ids"]).logits
+
+            EXPECTED_LOGITS_NO_GRAD_0 = torch.tensor(
+                [
+                    0.0166, -0.2227, 0.0396, -0.1035, 0.0459, 0.2754, -0.1445, 0.1641,
+                    -0.2910, -0.0273, 0.0227, -0.5547, -0.2139, -0.1396, -0.1582, 0.1289,
+                    0.0713, 0.2256, 0.1699, -0.2295, -0.1182, -0.1167, -0.1387, 0.0261,
+                    0.1270, 0.2285, 0.0403, 0.1108, -0.1318, -0.2334, 0.1455, -0.3945,
+                    0.1729, -0.4609, -0.0410, 0.2412, 0.1572, -0.1895, 0.2402, -0.0583
+                ]
+                , dtype=torch.float32)  # fmt: skip
+
+            EXPECTED_LOGITS_NO_GRAD_1 = torch.tensor(
+                [
+                    -0.1318, 0.2354, -0.4160, -0.0325, -0.0461, 0.0342, 0.2578, 0.0874,
+                    0.1484, 0.2266, -0.1182, -0.1396, -0.1494, -0.1089, -0.0019, -0.2852,
+                    0.1973, -0.2676, 0.0586, -0.1992, -0.2520, -0.1147, -0.1973, 0.2129,
+                    0.0520, 0.1699, 0.1816, 0.1289, 0.1699, -0.1216, -0.2656, -0.2891,
+                    0.2363, 0.2656, 0.0488, -0.1875, 0.2148, -0.1250, 0.1816, 0.0077
+                ]
+                , dtype=torch.float32)  # fmt: skip
+
+            torch.testing.assert_close(logits[0, -1, :40].cpu(), EXPECTED_LOGITS_NO_GRAD_0, rtol=1e-3, atol=1e-3)
+            torch.testing.assert_close(logits[1, -1, :40].cpu(), EXPECTED_LOGITS_NO_GRAD_1, rtol=1e-3, atol=1e-3)
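As a rough sketch of the split suggested in the review thread above (hypothetical class and test names, not code from this PR): the text comparison keeps running on every runner, since EXPECTED_TEXTS has an entry per compute capability, while the hardware-sensitive logits comparison skips itself explicitly on hardware the baseline was not recorded on, instead of silently passing inside the if block.

# Hypothetical sketch of splitting the logits check into its own test; the
# detection logic mirrors setUpClass in the diff above, everything else is
# illustrative only.
import unittest

import torch


class JambaLogitsSplitSketch(unittest.TestCase):
    cuda_compute_capability_major_version = None

    @classmethod
    def setUpClass(cls):
        # Same detection idiom as in the diff above.
        if torch.cuda.is_available():
            cls.cuda_compute_capability_major_version = torch.cuda.get_device_capability()[0]

    def test_simple_generate_text(self):
        # The generation call and the EXPECTED_TEXTS lookup would live here
        # and run on every runner.
        pass

    def test_simple_generate_logits(self):
        # Skip (rather than silently pass) on hardware the logits baseline
        # was not recorded on; the diff above only asserts on capability 8.
        if self.cuda_compute_capability_major_version != 8:
            self.skipTest("logits baseline recorded on compute capability 8 (A100/A10)")
        # The torch.no_grad() forward pass and torch.testing.assert_close
        # checks from the diff above would go here.


if __name__ == "__main__":
    unittest.main()

One advantage of the explicit skip is visibility: a skipped test shows up as such in the report, whereas the merged if-guarded version reports a pass even when nothing was asserted.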
Review comment: This cuda_compute_capability_major_version pattern is copied from other models, e.g. gemma.
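For context, a minimal standalone illustration of that detection idiom (not the gemma test itself):

# torch.cuda.get_device_capability() returns a (major, minor) tuple,
# e.g. (7, 5) on T4, (8, 0) on A100, (9, 0) on H100; the tests above
# key their expectations on the major value alone.
import torch

if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability()
    print(f"CUDA compute capability: {major}.{minor}")
else:
    print("No CUDA device available; hardware-specific expectations don't apply.")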