-
Notifications
You must be signed in to change notification settings - Fork 47
/
dataset.py
42 lines (37 loc) · 1.8 KB
/
dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import pandas as pd
import torch
from torch.utils.data import Dataset
class SSTDataset(Dataset):
    """
    Stanford Sentiment Treebank V1.0
    Recursive Deep Models for Semantic Compositionality Over a Sentiment Treebank
    Richard Socher, Alex Perelygin, Jean Wu, Jason Chuang, Christopher Manning, Andrew Ng and Christopher Potts
    Conference on Empirical Methods in Natural Language Processing (EMNLP 2013)

    Map-style dataset over a tab-separated file that has a "sentence" column
    and a "label" column. Every item is tokenized, wrapped in the tokenizer's
    start/separator tokens and padded or truncated to ``maxlen`` tokens.
    """

    # Fallback spellings, used when the tokenizer does not expose the
    # corresponding ``cls_token`` / ``sep_token`` / ``pad_token`` attribute.
    _DEFAULT_CLS = "[CLS]"
    _DEFAULT_SEP = "[SEP]"
    _DEFAULT_PAD = "[PAD]"

    def __init__(self, filename, maxlen: int, tokenizer):
        """
        Args:
            filename: Path (or buffer) of the tab-separated file handed to
                ``pandas.read_csv``.
            maxlen: Fixed number of tokens every returned sequence has.
            tokenizer: Object providing ``tokenize(text)`` and
                ``convert_tokens_to_ids(tokens)``. If it also exposes string
                attributes ``cls_token``, ``sep_token`` and ``pad_token``,
                those are used instead of the BERT-style defaults.
        """
        # Store the contents of the file in a pandas dataframe.
        self.df = pd.read_csv(filename, delimiter="\t")
        # Tokenizer of the desired transformer model.
        self.tokenizer = tokenizer
        # Maximum length of the token list, so all sequences have a fixed size.
        self.maxlen = maxlen

    def __len__(self) -> int:
        """Return the number of rows in the dataframe."""
        return len(self.df)

    @staticmethod
    def _special_token(tokenizer, attribute: str, default: str) -> str:
        """Return ``tokenizer.<attribute>`` if it is a non-empty string, else ``default``."""
        value = getattr(tokenizer, attribute, None)
        # Some tokenizers define the attribute but leave it unset (None);
        # only a real, non-empty string is usable as a token.
        if isinstance(value, str) and value:
            return value
        return default

    def __getitem__(self, index):
        """
        Return ``(input_ids, attention_mask, label)`` for the row at ``index``.

        ``input_ids`` holds the token ids of the padded/truncated sequence and
        ``attention_mask`` is a long tensor of the same length with 1 for real
        tokens and 0 for padding. ``label`` is the raw value of the "label"
        column.
        """
        # Select sentence and label at the specified index of the dataframe.
        sentence = self.df.loc[index, "sentence"]
        label = self.df.loc[index, "label"]

        # Use the tokenizer's own special tokens when it declares them, so the
        # dataset also works for vocabularies that do not spell them the BERT
        # way (e.g. "<pad>" instead of "[PAD]").
        cls_token = self._special_token(self.tokenizer, "cls_token", self._DEFAULT_CLS)
        sep_token = self._special_token(self.tokenizer, "sep_token", self._DEFAULT_SEP)
        pad_token = self._special_token(self.tokenizer, "pad_token", self._DEFAULT_PAD)

        # Preprocess text to be suitable for the transformer.
        tokens = [cls_token] + self.tokenizer.tokenize(sentence) + [sep_token]
        if len(tokens) > self.maxlen:
            # Truncate, but keep the separator as the final token.
            tokens = tokens[: self.maxlen - 1] + [sep_token]

        # Remember how many tokens are real before padding: the mask is built
        # from this count instead of comparing ids with 0, because the padding
        # id is not 0 in every vocabulary and a real token may have id 0.
        num_real = len(tokens)
        num_pad = max(self.maxlen - num_real, 0)
        tokens = tokens + [pad_token] * num_pad

        # Obtain indices of tokens and convert them to a tensor.
        input_ids = torch.tensor(self.tokenizer.convert_tokens_to_ids(tokens))
        # Attention mask: 1s for non-padded tokens and 0s for padded ones.
        attention_mask = torch.tensor([1] * num_real + [0] * num_pad, dtype=torch.long)

        # Return input IDs, attention mask, and label.
        return input_ids, attention_mask, label