Commit

quality
hSterz committed Nov 16, 2020
1 parent 5a58ca4 commit 91af39c
Showing 175 changed files with 756 additions and 2,880 deletions.
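
The hunks shown below are formatting-only: argument lists that were split one per line are collapsed onto a single line, keeping the trailing comma, with no change in behavior. This is consistent with re-running an automated code formatter (for example, black with a longer line-length limit), although the commit message does not say which tool was used. A hypothetical before/after illustration of the pattern (not taken from this diff):

    # before: one argument per line
    message = "cached_{}_{}".format(
        "dev",
        str(128),
    )

    # after: arguments joined onto one line, trailing comma kept
    message = "cached_{}_{}".format("dev", str(128),)
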
10 changes: 2 additions & 8 deletions examples/adversarial/utils_hans.py
@@ -112,10 +112,7 @@ def __init__(
cached_features_file = os.path.join(
data_dir,
"cached_{}_{}_{}_{}".format(
"dev" if evaluate else "train",
tokenizer.__class__.__name__,
str(max_seq_length),
task,
"dev" if evaluate else "train", tokenizer.__class__.__name__, str(max_seq_length), task,
),
)
label_list = processor.get_labels()
@@ -281,10 +278,7 @@ def _create_examples(self, lines, set_type):


def hans_convert_examples_to_features(
examples: List[InputExample],
label_list: List[str],
max_length: int,
tokenizer: PreTrainedTokenizer,
examples: List[InputExample], label_list: List[str], max_length: int, tokenizer: PreTrainedTokenizer,
):
"""
Loads a data file into a list of ``InputFeatures``
10 changes: 3 additions & 7 deletions examples/benchmarking/plot_csv_file.py
@@ -20,9 +20,7 @@ class PlotArguments:
Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
"""

csv_file: str = field(
metadata={"help": "The csv file to plot."},
)
csv_file: str = field(metadata={"help": "The csv file to plot."},)
plot_along_batch: bool = field(
default=False,
metadata={"help": "Whether to plot along batch size or sequence lengh. Defaults to sequence length."},
@@ -32,8 +30,7 @@ class PlotArguments:
metadata={"help": "Whether the csv file has time results or memory results. Defaults to memory results."},
)
no_log_scale: bool = field(
default=False,
metadata={"help": "Disable logarithmic scale when plotting"},
default=False, metadata={"help": "Disable logarithmic scale when plotting"},
)
is_train: bool = field(
default=False,
@@ -42,8 +39,7 @@ class PlotArguments:
},
)
figure_png_file: Optional[str] = field(
default=None,
metadata={"help": "Filename under which the plot will be saved. If unused no plot is saved."},
default=None, metadata={"help": "Filename under which the plot will be saved. If unused no plot is saved."},
)
short_model_names: Optional[List[str]] = list_field(
default=None, metadata={"help": "List of model names that are used instead of the ones in the csv file."}
10 changes: 2 additions & 8 deletions examples/bert-loses-patience/pabee/modeling_pabee_albert.py
@@ -157,10 +157,7 @@ def forward(
res = []
for i in range(self.config.num_hidden_layers):
encoder_outputs = self.encoder.adaptive_forward(
encoder_outputs,
current_layer=i,
attention_mask=extended_attention_mask,
head_mask=head_mask,
encoder_outputs, current_layer=i, attention_mask=extended_attention_mask, head_mask=head_mask,
)

pooled_output = self.pooler_activation(self.pooler(encoder_outputs[0][:, 0]))
@@ -177,10 +174,7 @@ def forward(
for i in range(self.config.num_hidden_layers):
calculated_layer_num += 1
encoder_outputs = self.encoder.adaptive_forward(
encoder_outputs,
current_layer=i,
attention_mask=extended_attention_mask,
head_mask=head_mask,
encoder_outputs, current_layer=i, attention_mask=extended_attention_mask, head_mask=head_mask,
)

pooled_output = self.pooler_activation(self.pooler(encoder_outputs[0][:, 0]))
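
The two loops above drive PABEE's layer-by-layer ("adaptive") forward pass from the bert-loses-patience example: the encoder is run one layer at a time, and at inference the model can stop early once its per-layer classifiers keep producing the same prediction. A simplified, self-contained sketch of that patience-based early-exit idea (illustrative only; the layer, classifier, and pooling details below are placeholders, not the repository's implementation):

    import torch

    def patience_early_exit(layers, classifiers, hidden_states, patience=3):
        # Run one encoder layer at a time and exit once `patience` consecutive
        # internal classifiers predict the same label.
        prev_pred, agreement = None, 0
        for layer, classifier in zip(layers, classifiers):
            hidden_states = layer(hidden_states)
            pred = classifier(hidden_states[:, 0]).argmax(dim=-1)  # first-token pooling
            agreement = agreement + 1 if prev_pred is not None and torch.equal(pred, prev_pred) else 0
            prev_pred = pred
            if agreement >= patience:
                break
        return pred
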
83 changes: 18 additions & 65 deletions examples/bert-loses-patience/run_glue_with_pabee.py
@@ -120,10 +120,7 @@ def train(args, train_dataset, model, tokenizer):
# Distributed training (should be after apex fp16 initialization)
if args.local_rank != -1:
model = torch.nn.parallel.DistributedDataParallel(
model,
device_ids=[args.local_rank],
output_device=args.local_rank,
find_unused_parameters=True,
model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True,
)

# Train!
@@ -154,17 +151,13 @@ def train(args, train_dataset, model, tokenizer):
logger.info(" Continuing training from epoch %d", epochs_trained)
logger.info(" Continuing training from global step %d", global_step)
logger.info(
" Will skip the first %d steps in the first epoch",
steps_trained_in_current_epoch,
" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch,
)

tr_loss, logging_loss = 0.0, 0.0
model.zero_grad()
train_iterator = trange(
epochs_trained,
int(args.num_train_epochs),
desc="Epoch",
disable=args.local_rank not in [-1, 0],
epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0],
)
set_seed(args) # Added here for reproductibility
for _ in train_iterator:
@@ -379,11 +372,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
)
features = convert_examples_to_features(
examples,
tokenizer,
label_list=label_list,
max_length=args.max_seq_length,
output_mode=output_mode,
examples, tokenizer, label_list=label_list, max_length=args.max_seq_length, output_mode=output_mode,
)
if args.local_rank in [-1, 0]:
logger.info("Saving features into cached file %s", cached_features_file)
@@ -445,24 +434,15 @@ def main():
help="The output directory where the model predictions and checkpoints will be written.",
)
parser.add_argument(
"--patience",
default="0",
type=str,
required=False,
"--patience", default="0", type=str, required=False,
)
parser.add_argument(
"--regression_threshold",
default=0,
type=float,
required=False,
"--regression_threshold", default=0, type=float, required=False,
)

# Other parameters
parser.add_argument(
"--config_name",
default="",
type=str,
help="Pretrained config name or path if not the same as model_name",
"--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name",
)
parser.add_argument(
"--tokenizer_name",
@@ -486,27 +466,17 @@ def main():
parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
parser.add_argument(
"--evaluate_during_training",
action="store_true",
help="Run evaluation during training at each logging step.",
"--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step.",
)
parser.add_argument(
"--do_lower_case",
action="store_true",
help="Set this flag if you are using an uncased model.",
"--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model.",
)

parser.add_argument(
"--per_gpu_train_batch_size",
default=8,
type=int,
help="Batch size per GPU/CPU for training.",
"--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.",
)
parser.add_argument(
"--per_gpu_eval_batch_size",
default=1,
type=int,
help="Batch size per GPU/CPU for evaluation.",
"--per_gpu_eval_batch_size", default=1, type=int, help="Batch size per GPU/CPU for evaluation.",
)
parser.add_argument(
"--gradient_accumulation_steps",
@@ -515,19 +485,13 @@ def main():
help="Number of updates steps to accumulate before performing a backward/update pass.",
)
parser.add_argument(
"--learning_rate",
default=5e-5,
type=float,
help="The initial learning rate for Adam.",
"--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.",
)
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
parser.add_argument(
"--num_train_epochs",
default=3.0,
type=float,
help="Total number of training epochs to perform.",
"--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.",
)
parser.add_argument(
"--max_steps",
@@ -539,10 +503,7 @@ def main():

parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
parser.add_argument(
"--save_steps",
type=int,
default=500,
help="Save checkpoint every X updates steps.",
"--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.",
)
parser.add_argument(
"--eval_all_checkpoints",
@@ -551,14 +512,10 @@ def main():
)
parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
parser.add_argument(
"--overwrite_output_dir",
action="store_true",
help="Overwrite the content of the output directory",
"--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory",
)
parser.add_argument(
"--overwrite_cache",
action="store_true",
help="Overwrite the cached training and evaluation sets",
"--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets",
)
parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")

@@ -575,10 +532,7 @@ def main():
"See details at https://nvidia.github.io/apex/amp.html",
)
parser.add_argument(
"--local_rank",
type=int,
default=-1,
help="For distributed training: local_rank",
"--local_rank", type=int, default=-1, help="For distributed training: local_rank",
)
parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
@@ -680,8 +634,7 @@ def main():
print("Output Layers Parameters:", output_layers_param_num)
single_output_layer_param_num = sum(param.numel() for param in model.classifiers[0].parameters())
print(
"Added Output Layers Parameters:",
output_layers_param_num - single_output_layer_param_num,
"Added Output Layers Parameters:", output_layers_param_num - single_output_layer_param_num,
)

logger.info("Training/evaluation parameters %s", args)
3 changes: 1 addition & 2 deletions examples/bertology/run_bertology.py
@@ -395,8 +395,7 @@ def main():
cache_dir=args.cache_dir,
)
tokenizer = AutoTokenizer.from_pretrained(
args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
cache_dir=args.cache_dir,
args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, cache_dir=args.cache_dir,
)
model = AutoModelForSequenceClassification.from_pretrained(
args.model_name_or_path,
5 changes: 1 addition & 4 deletions examples/contrib/mm-imdb/utils_mmimdb.py
@@ -138,9 +138,6 @@ def get_image_transforms():
transforms.Resize(256),
transforms.CenterCrop(224),
transforms.ToTensor(),
transforms.Normalize(
mean=[0.46777044, 0.44531429, 0.40661017],
std=[0.12221994, 0.12145835, 0.14380469],
),
transforms.Normalize(mean=[0.46777044, 0.44531429, 0.40661017], std=[0.12221994, 0.12145835, 0.14380469],),
]
)
6 changes: 1 addition & 5 deletions examples/contrib/run_camembert.py
@@ -30,11 +30,7 @@ def fill_mask(masked_input, model, tokenizer, topk=5):
)
else:
topk_filled_outputs.append(
(
masked_input.replace(masked_token, predicted_token),
values[index].item(),
predicted_token,
)
(masked_input.replace(masked_token, predicted_token), values[index].item(), predicted_token,)
)
return topk_filled_outputs

5 changes: 1 addition & 4 deletions examples/contrib/run_openai_gpt.py
@@ -83,10 +83,7 @@ def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, d
mc_token_ids = np.zeros((n_batch, 2), dtype=np.int64)
lm_labels = np.full((n_batch, 2, input_len), fill_value=-100, dtype=np.int64)
mc_labels = np.zeros((n_batch,), dtype=np.int64)
for (
i,
(story, cont1, cont2, mc_label),
) in enumerate(dataset):
for (i, (story, cont1, cont2, mc_label),) in enumerate(dataset):
with_cont1 = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
with_cont2 = [start_token] + story[:cap_length] + [delimiter_token] + cont2[:cap_length] + [clf_token]
input_ids[i, 0, : len(with_cont1)] = with_cont1
4 changes: 1 addition & 3 deletions examples/contrib/run_swag.py
@@ -629,9 +629,7 @@ def main():
torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab

config = AutoConfig.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(
args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
)
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,)
model = AutoModelForMultipleChoice.from_pretrained(
args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config
)
6 changes: 1 addition & 5 deletions examples/deebert/run_glue_deebert.py
@@ -358,11 +358,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
)
features = convert_examples_to_features(
examples,
tokenizer,
label_list=label_list,
max_length=args.max_seq_length,
output_mode=output_mode,
examples, tokenizer, label_list=label_list, max_length=args.max_seq_length, output_mode=output_mode,
)
if args.local_rank in [-1, 0]:
logger.info("Saving features into cached file %s", cached_features_file)
3 changes: 1 addition & 2 deletions examples/deebert/src/modeling_highway_bert.py
@@ -103,8 +103,7 @@ def forward(


@add_start_docstrings(
"The Bert Model transformer with early exiting (DeeBERT). ",
BERT_START_DOCSTRING,
"The Bert Model transformer with early exiting (DeeBERT). ", BERT_START_DOCSTRING,
)
class DeeBertModel(BertPreTrainedModel):
def __init__(self, config):
3 changes: 1 addition & 2 deletions examples/deebert/src/modeling_highway_roberta.py
@@ -11,8 +11,7 @@


@add_start_docstrings(
"The RoBERTa Model transformer with early exiting (DeeRoBERTa). ",
ROBERTA_START_DOCSTRING,
"The RoBERTa Model transformer with early exiting (DeeRoBERTa). ", ROBERTA_START_DOCSTRING,
)
class DeeRobertaModel(DeeBertModel):

22 changes: 8 additions & 14 deletions examples/distillation/run_squad_w_distillation.py
@@ -228,20 +228,14 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
assert end_logits_tea.size() == end_logits_stu.size()

loss_fct = nn.KLDivLoss(reduction="batchmean")
loss_start = (
loss_fct(
F.log_softmax(start_logits_stu / args.temperature, dim=-1),
F.softmax(start_logits_tea / args.temperature, dim=-1),
)
* (args.temperature ** 2)
)
loss_end = (
loss_fct(
F.log_softmax(end_logits_stu / args.temperature, dim=-1),
F.softmax(end_logits_tea / args.temperature, dim=-1),
)
* (args.temperature ** 2)
)
loss_start = loss_fct(
F.log_softmax(start_logits_stu / args.temperature, dim=-1),
F.softmax(start_logits_tea / args.temperature, dim=-1),
) * (args.temperature ** 2)
loss_end = loss_fct(
F.log_softmax(end_logits_stu / args.temperature, dim=-1),
F.softmax(end_logits_tea / args.temperature, dim=-1),
) * (args.temperature ** 2)
loss_ce = (loss_start + loss_end) / 2.0

loss = args.alpha_ce * loss_ce + args.alpha_squad * loss
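
The block reformatted above computes the standard temperature-scaled knowledge-distillation term: a KL divergence between the softened student and teacher distributions, multiplied by temperature**2 so that gradient magnitudes stay comparable across temperatures. A minimal standalone sketch of that term (illustrative; the function and variable names below are not the repository's):

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    def distillation_term(logits_stu, logits_tea, temperature=2.0):
        # KLDivLoss expects log-probabilities as input and probabilities as target.
        loss_fct = nn.KLDivLoss(reduction="batchmean")
        return loss_fct(
            F.log_softmax(logits_stu / temperature, dim=-1),
            F.softmax(logits_tea / temperature, dim=-1),
        ) * (temperature ** 2)

    # e.g. start-position logits for a batch of 4 examples over 384 positions
    loss_start = distillation_term(torch.randn(4, 384), torch.randn(4, 384))
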
3 changes: 1 addition & 2 deletions examples/distillation/utils.py
@@ -118,8 +118,7 @@ def init_gpu_params(params):
if params.multi_gpu:
logger.info("Initializing PyTorch distributed")
torch.distributed.init_process_group(
init_method="env://",
backend="nccl",
init_method="env://", backend="nccl",
)


(Diffs for the remaining changed files in this commit are not shown.)
