fix textcnn inference and text_classification distributed training bugs #1839

Merged: 3 commits, Mar 28, 2022
5 changes: 2 additions & 3 deletions docs/model_zoo/taskflow.md
@@ -112,8 +112,7 @@ from paddlenlp import Taskflow
In the default and accurate modes, each line of the dictionary file consists of one or more custom items. Example dictionary file `user_dict.txt`:
```text
平原上的火焰
上 映
```

In fast mode, each line of the dictionary file is a custom item + "\t" + word frequency (the frequency may be omitted, in which case a frequency that guarantees the word can be segmented out is computed automatically). Blacklist dictionaries are not supported yet (i.e., listing "年" and "末" separately in order to force "年末" to be split). Example dictionary file `user_dict.txt`:
@@ -129,7 +128,7 @@ from paddlenlp import Taskflow
>>> seg("平原上的火焰宣布延期上映")
['平原', '上', '的', '火焰', '宣布', '延期', '上映']
>>> seg = Taskflow("word_segmentation", user_dict="user_dict.txt")
>>> seg("平原上的火焰计划于年末上映")
>>> seg("平原上的火焰宣布延期上映")
['平原上的火焰', '宣布', '延期', '上', '映']
```
#### Parameter description
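For reference, a fast-mode dictionary in the format described above might look like the following (illustrative entries only; the frequency column is tab-separated and optional):

```text
平原上的火焰	10
上映
```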
40 changes: 27 additions & 13 deletions examples/sentiment_analysis/textcnn/deploy/python/predict.py
@@ -14,13 +14,12 @@

import argparse

import numpy as np
import paddle
import paddle.nn.functional as F
from paddle import inference
from paddlenlp.data import JiebaTokenizer, Pad, Vocab

from data import preprocess_prediction_data

# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--model_file", type=str, required=True,
@@ -38,6 +37,17 @@
# yapf: enable


def convert_example(data, tokenizer, pad_token_id=0, max_ngram_filter_size=3):
"""convert_example"""
input_ids = tokenizer.encode(data)
seq_len = len(input_ids)
    # The sequence length must be at least the maximum ngram_filter_size in the TextCNN model.
if seq_len < max_ngram_filter_size:
input_ids.extend([pad_token_id] * (max_ngram_filter_size - seq_len))
input_ids = np.array(input_ids, dtype='int64')
return input_ids
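A quick sanity check of the helper above, using a stub tokenizer with made-up ids (assumes `convert_example` and `numpy` from this hunk are in scope):

```python
class StubTokenizer:
    """Stand-in for JiebaTokenizer; returns fixed, made-up token ids."""
    def encode(self, text):
        return [15, 8]  # two ids, i.e. shorter than max_ngram_filter_size=3

# convert_example right-pads with pad_token_id=0 up to length 3, so the
# widest convolution filter in the TextCNN always fits the input.
print(convert_example("哈喽", StubTokenizer()))  # -> [15  8  0]
```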


class Predictor(object):
def __init__(self, model_file, params_file, device, max_seq_length):
self.max_seq_length = max_seq_length
@@ -64,31 +74,35 @@ def __init__(self, model_file, params_file, device, max_seq_length):
self.output_handle = self.predictor.get_output_handle(
self.predictor.get_output_names()[0])

def predict(self, data, label_map, batch_size=1, pad_token_id=0):
def predict(self, data, tokenizer, label_map, batch_size=1, pad_token_id=0):
"""
Predicts the data labels.

Args:
data (obj:`list`): The processed data whose each element
is a `list` object, which contains

- word_ids(obj:`list[int]`): The list of word ids.
data (obj:`list(str)`): Data to be predicted.
            tokenizer(obj:`paddlenlp.data.JiebaTokenizer`): It uses jieba to cut the Chinese string into tokens.
label_map(obj:`dict`): The label id (key) to label str (value) map.
            batch_size(obj:`int`, defaults to 1): The batch size.
pad_token_id(obj:`int`, optional, defaults to 0): The pad token index.

Returns:
            results(obj:`dict`): All the predicted labels.
"""
examples = []
for text in data:
input_ids = convert_example(text, tokenizer)
examples.append(input_ids)

        # Separate the data into batches.
batches = [
data[idx:idx + batch_size]
for idx in range(0, len(data), batch_size)
examples[idx:idx + batch_size]
for idx in range(0, len(examples), batch_size)
]

batchify_fn = lambda samples, fn=Pad(
axis=0, pad_val=pad_token_id
): [data for data in fn(samples)]
axis=0,
pad_val=pad_token_id # input
): fn(samples)

results = []
for batch in batches:
@@ -117,10 +131,10 @@ def predict(self, data, label_map, batch_size=1, pad_token_id=0):

    # First preprocess the prediction data, then run prediction.
data = ['你再骂我我真的不跟你聊了', '你看看我附近有什么好吃的', '我喜欢画画也喜欢唱歌']
examples = preprocess_prediction_data(data, tokenizer, pad_token_id)

results = predictor.predict(
examples,
data,
tokenizer,
label_map,
batch_size=args.batch_size,
pad_token_id=pad_token_id)
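With these changes the deploy script passes raw strings and the tokenizer directly to `Predictor.predict` instead of preprocessing up front. A minimal sketch of the resulting call site, assuming the `Predictor` class above and hypothetical model/vocab paths standing in for the script's argparse values:

```python
from paddlenlp.data import JiebaTokenizer, Vocab

# Hypothetical artifacts; the real script takes these from command-line args.
vocab = Vocab.load_vocabulary("vocab.txt", unk_token="[UNK]", pad_token="[PAD]")
tokenizer = JiebaTokenizer(vocab)
pad_token_id = vocab.to_indices("[PAD]")
label_map = {0: "negative", 1: "neutral", 2: "positive"}

predictor = Predictor("textcnn.pdmodel", "textcnn.pdiparams", "cpu", max_seq_length=128)
data = ["你再骂我我真的不跟你聊了", "我喜欢画画也喜欢唱歌"]
print(predictor.predict(data, tokenizer, label_map,
                        batch_size=2, pad_token_id=pad_token_id))
```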
4 changes: 0 additions & 4 deletions examples/text_classification/pretrained_models/train.py
@@ -175,7 +175,6 @@ def do_train():
scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss)
global_step = 0
tic_train = time.time()
total_train_time = 0
for epoch in range(1, args.epochs + 1):
for step, batch in enumerate(train_data_loader, start=1):
input_ids, token_type_ids, labels = batch
@@ -201,7 +200,6 @@
global_step += 1
if global_step % args.logging_steps == 0 and rank == 0:
time_diff = time.time() - tic_train
total_train_time += time_diff
print(
"global step %d, epoch: %d, batch: %d, loss: %.5f, accuracy: %.5f, speed: %.2f step/s"
% (global_step, epoch, step, loss, acc,
@@ -220,8 +218,6 @@
tokenizer.save_pretrained(save_dir)
tic_train = time.time()

print("Speed: %.2f steps/s" % (global_step / total_train_time))
Collaborator comment: A question: why was this speed-logging code deleted?

if __name__ == "__main__":
do_train()
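On the reviewer's question above, one plausible reading of the deletion: `total_train_time` was only incremented inside the `rank == 0` logging branch, so the final `global_step / total_train_time` divides by zero on every other rank under distributed launch. A self-contained sketch of the hazard (made-up numbers):

```python
rank = 1                      # any non-zero rank under paddle.distributed.launch
total_train_time = 0
global_step = 0
for _ in range(100):
    global_step += 1
    if global_step % 10 == 0 and rank == 0:
        total_train_time += 0.5   # only rank 0 ever accumulates time
# ZeroDivisionError on every rank except 0:
print("Speed: %.2f steps/s" % (global_step / total_train_time))
```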
2 changes: 1 addition & 1 deletion paddlenlp/taskflow/knowledge_mining.py
@@ -429,7 +429,7 @@ def _construct_model(self, model):
Construct the inference model for the predictor.
"""
model_instance = ErnieCtmWordtagModel.from_pretrained(
model, num_tag=len(self._tags_to_index))
self._task_path, num_tag=len(self._tags_to_index))
if self._params_path is not None:
state_dict = paddle.load(self._params_path)
model_instance.set_dict(state_dict)
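The knowledge-mining fix makes `_construct_model` load weights from the task path rather than the model name, so a custom `task_path` is actually honored. A hedged usage sketch (hypothetical checkpoint directory):

```python
from paddlenlp import Taskflow

# Hypothetical local checkpoint; before this fix, ErnieCtmWordtagModel was
# loaded from the default model name even when task_path pointed elsewhere.
wordtag = Taskflow("knowledge_mining", model="wordtag", task_path="./my_wordtag_ckpt")
print(wordtag("美人鱼是周星驰执导的电影"))
```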