fix textcnn inference and text_classification distributed training bugs #1839

Merged: 3 commits, Mar 28, 2022
5 changes: 2 additions & 3 deletions docs/model_zoo/taskflow.md
@@ -112,8 +112,7 @@ from paddlenlp import Taskflow
In the default and accurate modes, each line of the dictionary file consists of one or more custom items. Example dictionary file `user_dict.txt`:
```text
平原上的火焰
上 映
```

In fast mode, each line of the dictionary file is a custom item + "\t" + word frequency (the frequency may be omitted, in which case a frequency that guarantees the word can be segmented out is computed automatically). Blacklist dictionaries are not supported yet (i.e., listing "年" and "末" separately in order to force "年末" to be split). Example dictionary file `user_dict.txt`:
@@ -129,7 +128,7 @@ from paddlenlp import Taskflow
>>> seg("平原上的火焰宣布延期上映")
['平原', '上', '的', '火焰', '宣布', '延期', '上映']
>>> seg = Taskflow("word_segmentation", user_dict="user_dict.txt")
>>> seg("平原上的火焰计划于年末上映")
>>> seg("平原上的火焰宣布延期上映")
['平原上的火焰', '宣布', '延期', '上', '映']
```
#### Parameter description
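For reference, a fast-mode dictionary in the format described above might look like the following (illustrative entries only; the frequency column is tab-separated and optional):

```text
平原上的火焰	10
上映
```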
40 changes: 27 additions & 13 deletions examples/sentiment_analysis/textcnn/deploy/python/predict.py
@@ -14,13 +14,12 @@

import argparse

import numpy as np
import paddle
import paddle.nn.functional as F
from paddle import inference
from paddlenlp.data import JiebaTokenizer, Pad, Vocab

from data import preprocess_prediction_data

# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--model_file", type=str, required=True,
@@ -38,6 +37,17 @@
# yapf: enable


def convert_example(data, tokenizer, pad_token_id=0, max_ngram_filter_size=3):
"""convert_example"""
input_ids = tokenizer.encode(data)
seq_len = len(input_ids)
    # The sequence length must be at least the maximum ngram_filter_size in the TextCNN model.
if seq_len < max_ngram_filter_size:
input_ids.extend([pad_token_id] * (max_ngram_filter_size - seq_len))
input_ids = np.array(input_ids, dtype='int64')
return input_ids
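A quick sanity check of the helper above, using a stub tokenizer with made-up ids (assumes `convert_example` and `numpy` from this hunk are in scope):

```python
class StubTokenizer:
    """Stand-in for JiebaTokenizer; returns fixed, made-up token ids."""
    def encode(self, text):
        return [15, 8]  # two ids, i.e. shorter than max_ngram_filter_size=3

# convert_example right-pads with pad_token_id=0 up to length 3, so the
# widest convolution filter in the TextCNN always fits the input.
print(convert_example("哈喽", StubTokenizer()))  # -> [15  8  0]
```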


class Predictor(object):
def __init__(self, model_file, params_file, device, max_seq_length):
self.max_seq_length = max_seq_length
@@ -64,31 +74,35 @@ def __init__(self, model_file, params_file, device, max_seq_length):
self.output_handle = self.predictor.get_output_handle(
self.predictor.get_output_names()[0])

def predict(self, data, label_map, batch_size=1, pad_token_id=0):
def predict(self, data, tokenizer, label_map, batch_size=1, pad_token_id=0):
"""
Predicts the data labels.

Args:
data (obj:`list`): The processed data whose each element
is a `list` object, which contains

- word_ids(obj:`list[int]`): The list of word ids.
data (obj:`list(str)`): Data to be predicted.
            tokenizer(obj:`paddlenlp.data.JiebaTokenizer`): It uses jieba to cut the Chinese string into tokens.
label_map(obj:`dict`): The label id (key) to label str (value) map.
            batch_size(obj:`int`, defaults to 1): The batch size.
pad_token_id(obj:`int`, optional, defaults to 0): The pad token index.

Returns:
            results(obj:`dict`): All the predicted labels.
"""
examples = []
for text in data:
input_ids = convert_example(text, tokenizer)
examples.append(input_ids)

        # Separate the data into batches.
batches = [
data[idx:idx + batch_size]
for idx in range(0, len(data), batch_size)
examples[idx:idx + batch_size]
for idx in range(0, len(examples), batch_size)
]

batchify_fn = lambda samples, fn=Pad(
axis=0, pad_val=pad_token_id
): [data for data in fn(samples)]
axis=0,
pad_val=pad_token_id # input
): fn(samples)

results = []
for batch in batches:
@@ -117,10 +131,10 @@ def predict(self, data, label_map, batch_size=1, pad_token_id=0):

    # First preprocess the prediction data, then run prediction.
data = ['你再骂我我真的不跟你聊了', '你看看我附近有什么好吃的', '我喜欢画画也喜欢唱歌']
examples = preprocess_prediction_data(data, tokenizer, pad_token_id)

results = predictor.predict(
examples,
data,
tokenizer,
label_map,
batch_size=args.batch_size,
pad_token_id=pad_token_id)
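With these changes the deploy script passes raw strings and the tokenizer directly to `Predictor.predict` instead of preprocessing up front. A minimal sketch of the resulting call site, assuming the `Predictor` class above and hypothetical model/vocab paths standing in for the script's argparse values:

```python
from paddlenlp.data import JiebaTokenizer, Vocab

# Hypothetical artifacts; the real script takes these from command-line args.
vocab = Vocab.load_vocabulary("vocab.txt", unk_token="[UNK]", pad_token="[PAD]")
tokenizer = JiebaTokenizer(vocab)
pad_token_id = vocab.to_indices("[PAD]")
label_map = {0: "negative", 1: "neutral", 2: "positive"}

predictor = Predictor("textcnn.pdmodel", "textcnn.pdiparams", "cpu", max_seq_length=128)
data = ["你再骂我我真的不跟你聊了", "我喜欢画画也喜欢唱歌"]
print(predictor.predict(data, tokenizer, label_map,
                        batch_size=2, pad_token_id=pad_token_id))
```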
4 changes: 0 additions & 4 deletions examples/text_classification/pretrained_models/train.py
@@ -175,7 +175,6 @@ def do_train():
scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss)
global_step = 0
tic_train = time.time()
total_train_time = 0
for epoch in range(1, args.epochs + 1):
for step, batch in enumerate(train_data_loader, start=1):
input_ids, token_type_ids, labels = batch
@@ -201,7 +200,6 @@
global_step += 1
if global_step % args.logging_steps == 0 and rank == 0:
time_diff = time.time() - tic_train
total_train_time += time_diff
print(
"global step %d, epoch: %d, batch: %d, loss: %.5f, accuracy: %.5f, speed: %.2f step/s"
% (global_step, epoch, step, loss, acc,
@@ -220,8 +218,6 @@
tokenizer.save_pretrained(save_dir)
tic_train = time.time()

print("Speed: %.2f steps/s" % (global_step / total_train_time))
Collaborator comment: A question: why was this speed-logging code deleted?

if __name__ == "__main__":
do_train()
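On the reviewer's question above, one plausible reading of the deletion: `total_train_time` was only incremented inside the `rank == 0` logging branch, so the final `global_step / total_train_time` divides by zero on every other rank under distributed launch. A self-contained sketch of the hazard (made-up numbers):

```python
rank = 1                      # any non-zero rank under paddle.distributed.launch
total_train_time = 0
global_step = 0
for _ in range(100):
    global_step += 1
    if global_step % 10 == 0 and rank == 0:
        total_train_time += 0.5   # only rank 0 ever accumulates time
# ZeroDivisionError on every rank except 0:
print("Speed: %.2f steps/s" % (global_step / total_train_time))
```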
2 changes: 1 addition & 1 deletion paddlenlp/taskflow/knowledge_mining.py
@@ -429,7 +429,7 @@ def _construct_model(self, model):
Construct the inference model for the predictor.
"""
model_instance = ErnieCtmWordtagModel.from_pretrained(
model, num_tag=len(self._tags_to_index))
self._task_path, num_tag=len(self._tags_to_index))
if self._params_path is not None:
state_dict = paddle.load(self._params_path)
model_instance.set_dict(state_dict)
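The knowledge-mining fix makes `_construct_model` load weights from the task path rather than the model name, so a custom `task_path` is actually honored. A hedged usage sketch (hypothetical checkpoint directory):

```python
from paddlenlp import Taskflow

# Hypothetical local checkpoint; before this fix, ErnieCtmWordtagModel was
# loaded from the default model name even when task_path pointed elsewhere.
wordtag = Taskflow("knowledge_mining", model="wordtag", task_path="./my_wordtag_ckpt")
print(wordtag("美人鱼是周星驰执导的电影"))
```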