Squashed commit of the following: (#8157)
Merge remote-tracking branch 'origin/develop' into bugfix/sample_generate
    Update Makefile
    Update Makefile
    remove paddle as a requirement.
    fix PaddlePaddle/Paddle#62860
    fix test_sample_generate bug.
ZHUI committed Mar 20, 2024
1 parent b6dcb4e commit ac57ad7
Showing 6 changed files with 15 additions and 14 deletions.
1 change: 1 addition & 0 deletions Makefile
@@ -45,6 +45,7 @@ unit-test:

.PHONY: install
install:
+pip install paddlepaddle==0.0.0 -f https://www.paddlepaddle.org.cn/whl/linux/cpu-mkl/develop.html
pip install -r requirements-dev.txt
pip install -r requirements.txt
pip install -r paddlenlp/experimental/autonlp/requirements.txt
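
Note: the ==0.0.0 pin is how Paddle publishes its nightly develop wheels, so this added line installs the latest develop build rather than a tagged release. As a quick sanity check (a hypothetical snippet, not part of this commit), the installed build can be inspected like so:

    import paddle

    print(paddle.__version__)     # nightly develop wheels report "0.0.0"
    print(paddle.version.commit)  # the Paddle commit the wheel was built from
    paddle.utils.run_check()      # Paddle's built-in installation self-test
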
12 changes: 0 additions & 12 deletions paddlenlp/generation/utils.py
@@ -1209,18 +1209,6 @@ def sample(

# multinomial already supports fp16 and bf16 currently, fix issue: https://github.com/PaddlePaddle/Paddle/issues/51852
next_tokens = paddle.multinomial(probs)
-# # multinomial not support fp16 and bf16 currently, issue: https://github.com/PaddlePaddle/Paddle/issues/51852
-# if probs.dtype == paddle.bfloat16 and top_k == 1:
-#     probs = probs.astype("float32")
-#     next_tokens = paddle.unsqueeze(paddle.argmax(probs, axis=-1), -1)
-# else:
-#     # next_tokens = paddle.multinomial(probs)
-#     probs = probs.cpu()
-#     from paddlenlp.transformers.utils import device_guard
-
-#     with device_guard("cpu"):
-#         next_tokens = paddle.multinomial(probs)
-#     next_tokens = next_tokens.cuda()

if self.config.tensor_parallel_degree > 1:
# Maybe no need to broadcast if seed is set correctly.
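
For reference, the retained line relies on paddle.multinomial accepting fp16/bf16 inputs directly, which the linked Paddle issue tracks. A minimal sketch of the now-supported path (shapes are illustrative, and it assumes a Paddle build that contains that fix):

    import paddle

    # Per-row sampling distribution in bfloat16, e.g. softmax over a tiny vocab.
    logits = paddle.randn([2, 8])
    probs = paddle.nn.functional.softmax(logits, axis=-1).astype("bfloat16")

    # Previously this needed a float32 cast or a CPU fallback (the deleted
    # workaround above); with the fix it runs directly on half precision.
    next_tokens = paddle.multinomial(probs)  # shape [2, 1], one token id per row
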
4 changes: 4 additions & 0 deletions paddlenlp/transformers/blenderbot/modeling.py
@@ -193,6 +193,8 @@ def __init__(
normalize_before=True,
weight_attr=None,
bias_attr=None,
+*args,
+**kwargs,
):
super(BlenderbotDecoderLayer, self).__init__(
d_model=d_model,
@@ -205,6 +207,8 @@
normalize_before=normalize_before,
weight_attr=weight_attr,
bias_attr=bias_attr,
+*args,
+**kwargs,
)

def forward(self, tgt, memory=None, tgt_mask=None, memory_mask=None, cache=None):
4 changes: 4 additions & 0 deletions paddlenlp/transformers/blenderbot_small/modeling.py
@@ -126,6 +126,8 @@ def __init__(
normalize_before=True,
weight_attr=None,
bias_attr=None,
+*args,
+**kwargs,
):
super(BlenderbotSmallDecoderLayer, self).__init__(
d_model=d_model,
@@ -138,6 +140,8 @@
normalize_before=normalize_before,
weight_attr=weight_attr,
bias_attr=bias_attr,
+*args,
+**kwargs,
)

def forward(self, tgt, memory=None, tgt_mask=None, memory_mask=None, cache=None):
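
Both Blenderbot decoder layers receive the same change: extra positional and keyword arguments are forwarded to the parent class, so the subclasses keep working if the base-class constructor signature grows. A minimal, framework-free sketch of the pattern (class and parameter names here are illustrative, not the real Paddle API):

    # The base class gains a parameter in a newer framework version.
    class BaseDecoderLayer:
        def __init__(self, d_model, nhead, new_option=None):
            self.d_model = d_model
            self.nhead = nhead
            self.new_option = new_option

    class MyDecoderLayer(BaseDecoderLayer):
        def __init__(self, d_model=1280, nhead=32, *args, **kwargs):
            # Unknown extras pass straight through instead of raising TypeError.
            super().__init__(d_model, nhead, *args, **kwargs)

    layer = MyDecoderLayer(new_option=42)  # accepted via **kwargs
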
1 change: 0 additions & 1 deletion requirements-dev.txt
@@ -1,4 +1,3 @@
-paddlepaddle==2.5.1
paddleocr<2.7
pre-commit
pytest
7 changes: 6 additions & 1 deletion tests/transformers/test_generation_utils.py
@@ -74,6 +74,10 @@ def _get_input_ids_and_config(self):
max_batch_size = 2
sequence_length = input_ids.shape[-1] // 2
input_ids = input_ids[:max_batch_size, :sequence_length]
+# For test_sample_generate, e.g.: NVIDIA_TF32_OVERRIDE=0 FLAGS_cudnn_deterministic=1 python3.10 -m pytest -svv tests/transformers/bloom/test_modeling.py::BloomModelTest_0::test_sample_generate
+# There is a serious memory bug with this tensor slice, which reuses the original tensor's memory pointer on a cold start.
+# Here we simply clone the tensor to avoid this problem.
+input_ids = input_ids.clone()
attention_mask = attention_mask[:max_batch_size, :sequence_length].unsqueeze([1, 2])

attention_mask = attention_mask * attention_mask.transpose([0, 1, 3, 2])
@@ -270,6 +274,7 @@ def _sample_generate(
logits_warper,
process_kwargs,
):
+
with paddle.no_grad():
output_generate = model.generate(
input_ids,
@@ -440,9 +445,9 @@ def test_greedy_generate(self):
self.assertListEqual(output_greedy[0].tolist(), output_generate[0].tolist())

def test_sample_generate(self):
+
for model_class in self.all_generative_model_classes.keys():
config, input_ids, attention_mask, max_length = self._get_input_ids_and_config()
-input_ids = input_ids.clone()
paddle.seed(124)
model = self._make_model_instance(config, model_class)
model.eval()
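
The clone() added above is a defensive copy: per the test comment, slicing can hand back a tensor that reuses the parent's memory pointer, so later use may read unintended data. A minimal sketch of the pattern (the exact aliasing conditions are an assumption based on that comment):

    import paddle

    full = paddle.arange(12, dtype="int64").reshape([2, 6])

    # A plain slice may share the parent tensor's buffer on some devices/builds,
    # which is the "original tensor mem ptr" problem described in the test.
    view = full[:2, :3]

    # clone() forces an owning copy, decoupling the slice from full's storage.
    safe = full[:2, :3].clone()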
