#! /usr/bin/env python
#
# Uses BERT for multiple-label classification, based on the following blog:
# https://towardsdatascience.com/beginners-guide-to-bert-for-multi-classification-task-92f5445c2d7c
#
# Notes:
# - Setup instructions:
# -- sudo pip3 install virtualenv
# -- virtualenv bertenv
# -- python3 -m venv bertenv
# -- source bertenv/bin/activate
# -- tensorflow >= 1.11.0 # CPU Version of TensorFlow.
# -- tensorflow-gpu >= 1.11.0 # GPU version of TensorFlow.
# - Sample invocation:
# TODO
#
# TODO:
# - Use Main class to add in support for argument parsing
# - Generalize to handling other data files.
# - Add options for BERT model and other parameters.
# - Convert into calling module for run_classifier.py directly (i.e., without using shell).
# - Rework sample invocation so that environment settings are temporary.
#
"""BERT for multiple-label classification"""
## TODO: import os
import re
import sys
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow
# TODO: make the following optional (so that tensorflow_cpu can be used)
## OLD import tensorflow_gpu
import debug
from regex import my_re
import glue_helpers as gh
import system
# Note: the defaults are unintuitive, but they match the blog article.
MODEL_DIR = system.getenv_text("MODEL_DIR", "./model")
## TODO: BERT_DIR = system.getenv_text("BERT_DIR", MODEL_DIR)
DATA_DIR = system.getenv_text("DATA_DIR", "./dataset")
OUTPUT_DIR = system.getenv_text("OUTPUT_DIR", "./bert_output")
TASK_NAME = system.getenv_text("TASK_NAME", "cola")
USE_TSV_INPUT = system.getenv_bool("USE_TSV_INPUT", False)
CLASSIFIER_INVOCATION = system.getenv_text("CLASSIFIER_INVOCATION", "run_classifier.py")
# Note: USE_TSV_INPUT implies you use a tab-separated format properly formatted for BERT,
# so set BERT_FORMATTED False (e.g., USE_TSV_INPUT=1 BERT_FORMATTED=0 bert_multi_classification.py ...)
BERT_FORMATTED = system.getenv_bool("BERT_FORMATTED", USE_TSV_INPUT)
LOWER_CASE = system.getenv_bool("LOWER_CASE", False)
USE_ALBERT_DEFAULT = ("albert" in CLASSIFIER_INVOCATION)
USE_ALBERT = system.getenv_bool("USE_ALBERT", USE_ALBERT_DEFAULT)
BERT_NAME = "bert" if (not USE_ALBERT) else "albert"
CONFIG_FILE_DEFAULT = system.form_path(MODEL_DIR, "{b}_config.json".format(b=BERT_NAME))
BERT_CONFIG_FILE = system.getenv_text("BERT_CONFIG_FILE", CONFIG_FILE_DEFAULT)
# Get label list and split into columns
INPUT_LABELS = system.getenv_text("INPUT_LABELS", "id, label, text")
## TODO: INPUT_LABEL_LIST = re.split(r"\s, *", INPUT_LABELS)
# Labels to use for ID, CLASS_VALUE, and TEXT
INPUT_LABEL_LIST = re.split(r",", INPUT_LABELS)
debug.assertion(len(INPUT_LABEL_LIST) == 3)
ID_COL = INPUT_LABEL_LIST[0]
LABEL_COL = INPUT_LABEL_LIST[1]
TEXT_COL = INPUT_LABEL_LIST[2]
#-------------------------------------------------------------------------------
# General purpose helper functions
# TODO: put into text_utils, etc.
def version_to_number(version, max_padding=3):
    """Converts VERSION to number that can be used in comparisons
    Note: The result will be of the form M.mmmmrrrroooo..., where M is the
    major number, m is the minor, r is the revision and o is other.
    Each component after the major one is zero-padded on the left to
    (1 + MAX_PADDING) digits (i.e., up to MAX_PADDING [3] 0's are prepended).
    Notes:
    - letters in the version are ignored (e.g., "1.11.2a" is treated as "1.11.2")
    - whitespace in the version is ignored
    - components after the major one with more than (1 + MAX_PADDING) digits are
      treated as all 9's
    - conversion stops at the first component that is not a plain number
    - 0 is returned if version string is non-standard (i.e., no leading number)"""
    # EX: version_to_number("1.11.1") => 1.00110001
    # EX: version_to_number("1") => 1
    # EX: version_to_number("") => 0
    # EX: (version_to_number("1.9.0") < version_to_number("1.11.0")) => True
    # TODO: support string (e.g., 1.11.2a).
    version_number = 0
    max_component_length = (1 + max_padding)
    debug.trace_fmt(5, "version_to_number({v})", v=version)

    # Remove all alphabetic components
    # Note: the flags must be passed by keyword, because the fourth positional
    # argument of re.sub is the replacement count.
    version_text = re.sub(r"[a-z]", "", version, flags=re.IGNORECASE)
    if (version_text != version):
        debug.trace_fmt(2, "Warning: stripped alphabetic components from version: {v} => {nv}",
                        v=version, nv=version_text)

    # Remove all whitespace (including tabs and newlines)
    version_text = re.sub(r"\s+", "", version_text)

    # Convert component numbers iteratively and zero-pad if necessary
    # NOTE: Components greater than max-padding + 1 treated as all 9's.
    debug.trace_fmt(4, "version_text: {vt}", vt=version_text)
    new_version_text = ""
    num_components = 0
    for raw_component in version_text.split("."):
        # Stop at first non-standard component (e.g., empty or with punctuation)
        if not re.fullmatch(r"\d+", raw_component):
            break
        num_components += 1
        # Normalize the component by dropping any leading zeros
        component = str(int(raw_component))
        if (num_components == 1):
            # The major number forms the integral part as is
            new_version_text = component + "."
        else:
            if (len(component) > max_component_length):
                old_component = component
                component = "9" * max_component_length
                debug.trace_fmt(2, "Warning: replaced overly long component #{n} {oc} with {c}",
                                n=num_components, oc=old_component, c=component)
            # Fixed-width components keep the fractional digits aligned across versions
            new_version_text += component.zfill(max_component_length)
        debug.trace_fmt(4, "Component {n}: {c}", n=num_components, c=component)
    if (num_components > 0):
        version_number = float(new_version_text)
    debug.trace_fmt(4, "version_to_number({v}) => {n}", v=version, n=version_number)
    return version_number
#-------------------------------------------------------------------------------
# Helper functions specific to BERT
def ensure_bert_data_frame(data_frame, is_test=False):
    """Ensures data frame is in BERT format from input DATA_FRAME, using dummy values for alpha
    column.
    Returns DATA_FRAME itself if BERT_FORMATTED. Otherwise, returns a new data frame
    with the columns guid, label, alpha and text (or just guid and text if IS_TEST),
    which is the layout of the sample rows shown in main. None is returned if the
    conversion fails (the exception is reported on stderr).
    Notes:
    - See comments in blog mentioned in header.
    - Uses global constants BERT_FORMATTED, ID_COL, LABEL_COL and TEXT_COL.
    - A first row with a single word for the text is presumed to be a header and is dropped."""
    # TODO: add parameter mapping input column names to ones assumed here (i.e., id, label, & text)
    debug.trace_fmt(5, "ensure_bert_data_frame({df})", df=data_frame)
    df_bert = None
    if BERT_FORMATTED:
        df_bert = data_frame
    else:
        try:
            # Note: positional access is used, as the index labels need not start at 0
            first_sentence = data_frame[TEXT_COL].iloc[0]
            debug.trace_fmt(5, "First sentence: {s}", s=first_sentence)
            # Ignore header row if given (TODO, add parameter to make this optional)
            # Note: This assumes single word tokens without punctuation can't be a sentence.
            if isinstance(first_sentence, str) and re.search(r"^\w+$", first_sentence):
                debug.trace_fmt(4, "Removing presumed header row of data frame")
                debug.assertion(is_test)
                data_frame = data_frame.drop(data_frame.index[0])
            if is_test:
                # Test data has neither the label nor the dummy alpha column
                column_names = ['guid', 'text']
                data_hash = {'guid': data_frame[ID_COL],
                             'text': data_frame[TEXT_COL]}
            else:
                column_names = ['guid', 'label', 'alpha', 'text']
                data_hash = {'guid': data_frame[ID_COL],
                             'label': data_frame[LABEL_COL],
                             'alpha': (['-'] * data_frame.shape[0]),
                             'text': data_frame[TEXT_COL]}
            # Note: the column order is given explicitly, because the frames get
            # written without a header and so are interpreted by position.
            df_bert = pd.DataFrame(data_hash, columns=column_names)
        except Exception:
            debug.raise_exception(5)
            system.print_stderr("Exception converting data frame to BERT format: {exc}",
                                exc=sys.exc_info())
    debug.trace_fmt(4, "ensure_bert_data_frame(_) => {r}", r=df_bert)
    return df_bert
#--------------------------------------------------------------------------------
# Check that the installed TensorFlow is version 1.11.0 or later
V1_11_0 = version_to_number("1.11.0")
debug.assertion(version_to_number(tensorflow.__version__) >= V1_11_0)
## OLD: debug.assertion(V1_11_0 <= version_to_number("tensorflow_gpu.__version__"))
#-------------------------------------------------------------------------------
## TODO: changes to BERT run_classifier.py code for different label sets
## NOTE: not needed if running one of GLUE tasks with support built into classifier (e.g., CoLA)
## def get_labels(self):
## return ["0", "1"]
## def get_labels(self):
## return ["0", "1", "2", "3", "4"]
## def get_labels(self):
## return ["POSITIVE", "NEGATIVE"]
def main():
    """Entry point for script: converts the input data into BERT's TSV format (unless
    USE_TSV_INPUT) and then runs the classifier script over it via the shell.
    Notes:
    - Input is read from the train, dev and test files under DATA_DIR (.csv by default
      or .tsv if USE_TSV_INPUT). If there is no dev file, 1% of the training data is
      held out instead.
    - Converted files are written to OUTPUT_DIR, which is created if necessary and
      then is also passed to the classifier as its data directory.
    - The output of nvidia-smi and of the classifier is printed to stdout."""
    # Note: imported locally because the module-level import of os is still a TODO
    import os

    ## Setup:
    ##   sudo apt-get install python3-pip
    ##   pip install pandas
    ##   pip install sklearn
    ## Sample train/dev data in BERT format (tab-separated guid, label, alpha and text; no header):
    ##   a550d 1 a To clarify, I didn't delete these pages.
    ##   kcd12 0 a Dear god this site is horrible.
    ##   7379b 1 a I think this is not appropriate.
    ##   cccfd 2 a The title is fine as it is.
    ## Sample test data in BERT format (tab-separated, with header):
    ##   guid text
    ##   casd4 I am not going to buy this useless stuff.
    ##   3ndf9 I wanna be the very best, like no one ever was
    ## Sample CSV input:
    ##   id,text,label
    ##   sadcc,This is not what I want.,1
    ##   cj1ne,He seriously have no idea what it is all about,0
    ##   123nj,I don't think that we have any right to judge others,2
    in_separator = ","
    in_ext = ".csv"
    if USE_TSV_INPUT:
        in_separator = "\t"
        in_ext = ".tsv"

    # Read training data from source file
    df_train = pd.read_csv(gh.form_path(DATA_DIR, "train" + in_ext), sep=in_separator, names=INPUT_LABEL_LIST)
    df_bert_train = ensure_bert_data_frame(df_train)

    # Read test data, which just has the ID and text columns
    test_columns = [INPUT_LABEL_LIST[0], INPUT_LABEL_LIST[2]]
    df_test = pd.read_csv(gh.form_path(DATA_DIR, "test" + in_ext), sep=in_separator, names=test_columns)
    df_bert_test = ensure_bert_data_frame(df_test, is_test=True)

    # Use dev file if available; otherwise, split training data into train and dev
    dev_file = gh.form_path(DATA_DIR, "dev" + in_ext)
    if system.file_exists(dev_file):
        df_dev = pd.read_csv(dev_file, sep=in_separator, names=INPUT_LABEL_LIST)
        df_bert_dev = ensure_bert_data_frame(df_dev)
    else:
        df_bert_train, df_bert_dev = train_test_split(df_bert_train, test_size=0.01)

    # Output tsv files, with no header for train and dev
    # Note: the classifier must read the converted files, so the data directory
    # passed to it is the one written to here.
    bert_data_dir = DATA_DIR
    if not USE_TSV_INPUT:
        bert_data_dir = OUTPUT_DIR
        # to_csv does not create missing directories
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        df_bert_train.to_csv(gh.form_path(OUTPUT_DIR, 'train.tsv'), sep='\t', index=False, header=False)
        df_bert_dev.to_csv(gh.form_path(OUTPUT_DIR, 'dev.tsv'), sep='\t', index=False, header=False)
        df_bert_test.to_csv(gh.form_path(OUTPUT_DIR, 'test.tsv'), sep='\t', index=False, header=True)

    ## TODO: work example error from run_classifier.py customization into assertion
    ##   label_id = label_map[example.label]
    ##   KeyError: '2'
    ## TODO: Run NVIDIA CUDA utility and make sure capable of running TensorFlow w/ GPU's.
    ## Also, warn if graphics memory is too low.
    system.setenv("BERT_BASE_DIR", MODEL_DIR)
    print("Make sure your GPU Processor has sufficient memory, besides adequate number of units")
    print(gh.run("nvidia-smi"))
    # note: 0 is the order, not the total number
    system.setenv("CUDA_VISIBLE_DEVICES", "0")

    # Run the classifier
    # TODO: use run and do sanity checks on output
    # NOTE(review): the alternative run shown below uses --do_predict rather than
    # --do_test; confirm that the classifier being invoked supports --do_test.
    is_lower_case = system.to_string(LOWER_CASE).lower()
    if USE_ALBERT:
        bert_proper_args = "--spm_model_file={md}/albert.model".format(md=MODEL_DIR)
    else:
        bert_proper_args = "--vocab_file={md}/vocab.txt".format(md=MODEL_DIR)
    command_template = ("{ci} {bpa} --task_name={t} --do_train=true --do_eval=true --do_test=true"
                        " --data_dir={dd} --{bn}_config_file={bcf} --init_checkpoint={md}/{bn}_model.ckpt"
                        " --max_seq_length=64 --train_batch_size=2 --learning_rate=2e-5 --num_train_epochs=3.0"
                        " --output_dir={od} --do_lower_case={lc} --save_checkpoints_steps 10000")
    print(gh.run(command_template, ci=CLASSIFIER_INVOCATION, bpa=bert_proper_args, t=TASK_NAME,
                 dd=bert_data_dir, md=MODEL_DIR, od=OUTPUT_DIR, lc=is_lower_case,
                 bn=BERT_NAME, bcf=BERT_CONFIG_FILE))

    ## Sample output:
    ##   eval_accuracy = 0.96741855 eval_loss = 0.17597112 global_step = 236962 loss = 0.17553209
    ## Alternative run (prediction only):
    ##   CUDA_VISIBLE_DEVICES=0 python run_classifier.py --task_name=cola --do_predict=true --data_dir=./dataset --vocab_file=./model/vocab.txt --bert_config_file=./model/bert_config.json --init_checkpoint=./bert_output/model.ckpt-236962 --max_seq_length=64 --output_dir=./bert_output/
    ## Mapping the resulting class probabilities back to the test data:
    ##   # read the original test data for the text and id
    ##   df_test = pd.read_csv(gh.form_path(OUTPUT_DIR, 'test.tsv'), sep='\t')
    ##   # read the results data for the probabilities
    ##   df_result = pd.read_csv('bert_output/test_results.tsv', sep='\t', header=None)
    ##   # create a new dataframe
    ##   df_map_result = pd.DataFrame({'guid': df_test['guid'],
    ##                                 'text': df_test['text'],
    ##                                 'label': df_result.idxmax(axis=1)})
    ##   # view sample rows of the newly created dataframe
    ##   df_map_result.sample(10)
#------------------------------------------------------------------------
# Only run the pipeline when invoked as a script (i.e., not when imported)
if __name__ == "__main__":
    main()