Commit

quality
hSterz committed Nov 16, 2020
1 parent 5a58ca4 commit 91af39c
Showing 175 changed files with 756 additions and 2,880 deletions.
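
The hunks shown below are formatting-only: argument lists that were split one per line are collapsed onto a single line, keeping the trailing comma, with no change in behavior. This is consistent with re-running an automated code formatter (for example, black with a longer line-length limit), although the commit message does not say which tool was used. A hypothetical before/after illustration of the pattern (not taken from this diff):

    # before: one argument per line
    message = "cached_{}_{}".format(
        "dev",
        str(128),
    )

    # after: arguments joined onto one line, trailing comma kept
    message = "cached_{}_{}".format("dev", str(128),)
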
10 changes: 2 additions & 8 deletions examples/adversarial/utils_hans.py
@@ -112,10 +112,7 @@ def __init__(
cached_features_file = os.path.join(
data_dir,
"cached_{}_{}_{}_{}".format(
"dev" if evaluate else "train",
tokenizer.__class__.__name__,
str(max_seq_length),
task,
"dev" if evaluate else "train", tokenizer.__class__.__name__, str(max_seq_length), task,
),
)
label_list = processor.get_labels()
@@ -281,10 +278,7 @@ def _create_examples(self, lines, set_type):


def hans_convert_examples_to_features(
examples: List[InputExample],
label_list: List[str],
max_length: int,
tokenizer: PreTrainedTokenizer,
examples: List[InputExample], label_list: List[str], max_length: int, tokenizer: PreTrainedTokenizer,
):
"""
Loads a data file into a list of ``InputFeatures``
10 changes: 3 additions & 7 deletions examples/benchmarking/plot_csv_file.py
@@ -20,9 +20,7 @@ class PlotArguments:
Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
"""

csv_file: str = field(
metadata={"help": "The csv file to plot."},
)
csv_file: str = field(metadata={"help": "The csv file to plot."},)
plot_along_batch: bool = field(
default=False,
metadata={"help": "Whether to plot along batch size or sequence lengh. Defaults to sequence length."},
@@ -32,8 +30,7 @@ class PlotArguments:
metadata={"help": "Whether the csv file has time results or memory results. Defaults to memory results."},
)
no_log_scale: bool = field(
default=False,
metadata={"help": "Disable logarithmic scale when plotting"},
default=False, metadata={"help": "Disable logarithmic scale when plotting"},
)
is_train: bool = field(
default=False,
@@ -42,8 +39,7 @@ class PlotArguments:
},
)
figure_png_file: Optional[str] = field(
default=None,
metadata={"help": "Filename under which the plot will be saved. If unused no plot is saved."},
default=None, metadata={"help": "Filename under which the plot will be saved. If unused no plot is saved."},
)
short_model_names: Optional[List[str]] = list_field(
default=None, metadata={"help": "List of model names that are used instead of the ones in the csv file."}
10 changes: 2 additions & 8 deletions examples/bert-loses-patience/pabee/modeling_pabee_albert.py
@@ -157,10 +157,7 @@ def forward(
res = []
for i in range(self.config.num_hidden_layers):
encoder_outputs = self.encoder.adaptive_forward(
encoder_outputs,
current_layer=i,
attention_mask=extended_attention_mask,
head_mask=head_mask,
encoder_outputs, current_layer=i, attention_mask=extended_attention_mask, head_mask=head_mask,
)

pooled_output = self.pooler_activation(self.pooler(encoder_outputs[0][:, 0]))
@@ -177,10 +174,7 @@ def forward(
for i in range(self.config.num_hidden_layers):
calculated_layer_num += 1
encoder_outputs = self.encoder.adaptive_forward(
encoder_outputs,
current_layer=i,
attention_mask=extended_attention_mask,
head_mask=head_mask,
encoder_outputs, current_layer=i, attention_mask=extended_attention_mask, head_mask=head_mask,
)

pooled_output = self.pooler_activation(self.pooler(encoder_outputs[0][:, 0]))
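
The two loops above drive PABEE's layer-by-layer ("adaptive") forward pass from the bert-loses-patience example: the encoder is run one layer at a time, and at inference the model can stop early once its per-layer classifiers keep producing the same prediction. A simplified, self-contained sketch of that patience-based early-exit idea (illustrative only; the layer, classifier, and pooling details below are placeholders, not the repository's implementation):

    import torch

    def patience_early_exit(layers, classifiers, hidden_states, patience=3):
        # Run one encoder layer at a time and exit once `patience` consecutive
        # internal classifiers predict the same label.
        prev_pred, agreement = None, 0
        for layer, classifier in zip(layers, classifiers):
            hidden_states = layer(hidden_states)
            pred = classifier(hidden_states[:, 0]).argmax(dim=-1)  # first-token pooling
            agreement = agreement + 1 if prev_pred is not None and torch.equal(pred, prev_pred) else 0
            prev_pred = pred
            if agreement >= patience:
                break
        return pred
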
83 changes: 18 additions & 65 deletions examples/bert-loses-patience/run_glue_with_pabee.py
@@ -120,10 +120,7 @@ def train(args, train_dataset, model, tokenizer):
# Distributed training (should be after apex fp16 initialization)
if args.local_rank != -1:
model = torch.nn.parallel.DistributedDataParallel(
model,
device_ids=[args.local_rank],
output_device=args.local_rank,
find_unused_parameters=True,
model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True,
)

# Train!
@@ -154,17 +151,13 @@ def train(args, train_dataset, model, tokenizer):
logger.info(" Continuing training from epoch %d", epochs_trained)
logger.info(" Continuing training from global step %d", global_step)
logger.info(
" Will skip the first %d steps in the first epoch",
steps_trained_in_current_epoch,
" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch,
)

tr_loss, logging_loss = 0.0, 0.0
model.zero_grad()
train_iterator = trange(
epochs_trained,
int(args.num_train_epochs),
desc="Epoch",
disable=args.local_rank not in [-1, 0],
epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0],
)
set_seed(args) # Added here for reproductibility
for _ in train_iterator:
@@ -379,11 +372,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
)
features = convert_examples_to_features(
examples,
tokenizer,
label_list=label_list,
max_length=args.max_seq_length,
output_mode=output_mode,
examples, tokenizer, label_list=label_list, max_length=args.max_seq_length, output_mode=output_mode,
)
if args.local_rank in [-1, 0]:
logger.info("Saving features into cached file %s", cached_features_file)
@@ -445,24 +434,15 @@ def main():
help="The output directory where the model predictions and checkpoints will be written.",
)
parser.add_argument(
"--patience",
default="0",
type=str,
required=False,
"--patience", default="0", type=str, required=False,
)
parser.add_argument(
"--regression_threshold",
default=0,
type=float,
required=False,
"--regression_threshold", default=0, type=float, required=False,
)

# Other parameters
parser.add_argument(
"--config_name",
default="",
type=str,
help="Pretrained config name or path if not the same as model_name",
"--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name",
)
parser.add_argument(
"--tokenizer_name",
@@ -486,27 +466,17 @@ def main():
parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
parser.add_argument(
"--evaluate_during_training",
action="store_true",
help="Run evaluation during training at each logging step.",
"--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step.",
)
parser.add_argument(
"--do_lower_case",
action="store_true",
help="Set this flag if you are using an uncased model.",
"--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model.",
)

parser.add_argument(
"--per_gpu_train_batch_size",
default=8,
type=int,
help="Batch size per GPU/CPU for training.",
"--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.",
)
parser.add_argument(
"--per_gpu_eval_batch_size",
default=1,
type=int,
help="Batch size per GPU/CPU for evaluation.",
"--per_gpu_eval_batch_size", default=1, type=int, help="Batch size per GPU/CPU for evaluation.",
)
parser.add_argument(
"--gradient_accumulation_steps",
@@ -515,19 +485,13 @@ def main():
help="Number of updates steps to accumulate before performing a backward/update pass.",
)
parser.add_argument(
"--learning_rate",
default=5e-5,
type=float,
help="The initial learning rate for Adam.",
"--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.",
)
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
parser.add_argument(
"--num_train_epochs",
default=3.0,
type=float,
help="Total number of training epochs to perform.",
"--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.",
)
parser.add_argument(
"--max_steps",
@@ -539,10 +503,7 @@ def main():

parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
parser.add_argument(
"--save_steps",
type=int,
default=500,
help="Save checkpoint every X updates steps.",
"--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.",
)
parser.add_argument(
"--eval_all_checkpoints",
@@ -551,14 +512,10 @@ def main():
)
parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
parser.add_argument(
"--overwrite_output_dir",
action="store_true",
help="Overwrite the content of the output directory",
"--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory",
)
parser.add_argument(
"--overwrite_cache",
action="store_true",
help="Overwrite the cached training and evaluation sets",
"--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets",
)
parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")

@@ -575,10 +532,7 @@ def main():
"See details at https://nvidia.github.io/apex/amp.html",
)
parser.add_argument(
"--local_rank",
type=int,
default=-1,
help="For distributed training: local_rank",
"--local_rank", type=int, default=-1, help="For distributed training: local_rank",
)
parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
@@ -680,8 +634,7 @@ def main():
print("Output Layers Parameters:", output_layers_param_num)
single_output_layer_param_num = sum(param.numel() for param in model.classifiers[0].parameters())
print(
"Added Output Layers Parameters:",
output_layers_param_num - single_output_layer_param_num,
"Added Output Layers Parameters:", output_layers_param_num - single_output_layer_param_num,
)

logger.info("Training/evaluation parameters %s", args)
3 changes: 1 addition & 2 deletions examples/bertology/run_bertology.py
@@ -395,8 +395,7 @@ def main():
cache_dir=args.cache_dir,
)
tokenizer = AutoTokenizer.from_pretrained(
args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
cache_dir=args.cache_dir,
args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, cache_dir=args.cache_dir,
)
model = AutoModelForSequenceClassification.from_pretrained(
args.model_name_or_path,
5 changes: 1 addition & 4 deletions examples/contrib/mm-imdb/utils_mmimdb.py
@@ -138,9 +138,6 @@ def get_image_transforms():
transforms.Resize(256),
transforms.CenterCrop(224),
transforms.ToTensor(),
transforms.Normalize(
mean=[0.46777044, 0.44531429, 0.40661017],
std=[0.12221994, 0.12145835, 0.14380469],
),
transforms.Normalize(mean=[0.46777044, 0.44531429, 0.40661017], std=[0.12221994, 0.12145835, 0.14380469],),
]
)
6 changes: 1 addition & 5 deletions examples/contrib/run_camembert.py
@@ -30,11 +30,7 @@ def fill_mask(masked_input, model, tokenizer, topk=5):
)
else:
topk_filled_outputs.append(
(
masked_input.replace(masked_token, predicted_token),
values[index].item(),
predicted_token,
)
(masked_input.replace(masked_token, predicted_token), values[index].item(), predicted_token,)
)
return topk_filled_outputs

5 changes: 1 addition & 4 deletions examples/contrib/run_openai_gpt.py
@@ -83,10 +83,7 @@ def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, d
mc_token_ids = np.zeros((n_batch, 2), dtype=np.int64)
lm_labels = np.full((n_batch, 2, input_len), fill_value=-100, dtype=np.int64)
mc_labels = np.zeros((n_batch,), dtype=np.int64)
for (
i,
(story, cont1, cont2, mc_label),
) in enumerate(dataset):
for (i, (story, cont1, cont2, mc_label),) in enumerate(dataset):
with_cont1 = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
with_cont2 = [start_token] + story[:cap_length] + [delimiter_token] + cont2[:cap_length] + [clf_token]
input_ids[i, 0, : len(with_cont1)] = with_cont1
4 changes: 1 addition & 3 deletions examples/contrib/run_swag.py
@@ -629,9 +629,7 @@ def main():
torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab

config = AutoConfig.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(
args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
)
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,)
model = AutoModelForMultipleChoice.from_pretrained(
args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config
)
6 changes: 1 addition & 5 deletions examples/deebert/run_glue_deebert.py
@@ -358,11 +358,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
)
features = convert_examples_to_features(
examples,
tokenizer,
label_list=label_list,
max_length=args.max_seq_length,
output_mode=output_mode,
examples, tokenizer, label_list=label_list, max_length=args.max_seq_length, output_mode=output_mode,
)
if args.local_rank in [-1, 0]:
logger.info("Saving features into cached file %s", cached_features_file)
3 changes: 1 addition & 2 deletions examples/deebert/src/modeling_highway_bert.py
@@ -103,8 +103,7 @@ def forward(


@add_start_docstrings(
"The Bert Model transformer with early exiting (DeeBERT). ",
BERT_START_DOCSTRING,
"The Bert Model transformer with early exiting (DeeBERT). ", BERT_START_DOCSTRING,
)
class DeeBertModel(BertPreTrainedModel):
def __init__(self, config):
3 changes: 1 addition & 2 deletions examples/deebert/src/modeling_highway_roberta.py
@@ -11,8 +11,7 @@


@add_start_docstrings(
"The RoBERTa Model transformer with early exiting (DeeRoBERTa). ",
ROBERTA_START_DOCSTRING,
"The RoBERTa Model transformer with early exiting (DeeRoBERTa). ", ROBERTA_START_DOCSTRING,
)
class DeeRobertaModel(DeeBertModel):

22 changes: 8 additions & 14 deletions examples/distillation/run_squad_w_distillation.py
@@ -228,20 +228,14 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
assert end_logits_tea.size() == end_logits_stu.size()

loss_fct = nn.KLDivLoss(reduction="batchmean")
loss_start = (
loss_fct(
F.log_softmax(start_logits_stu / args.temperature, dim=-1),
F.softmax(start_logits_tea / args.temperature, dim=-1),
)
* (args.temperature ** 2)
)
loss_end = (
loss_fct(
F.log_softmax(end_logits_stu / args.temperature, dim=-1),
F.softmax(end_logits_tea / args.temperature, dim=-1),
)
* (args.temperature ** 2)
)
loss_start = loss_fct(
F.log_softmax(start_logits_stu / args.temperature, dim=-1),
F.softmax(start_logits_tea / args.temperature, dim=-1),
) * (args.temperature ** 2)
loss_end = loss_fct(
F.log_softmax(end_logits_stu / args.temperature, dim=-1),
F.softmax(end_logits_tea / args.temperature, dim=-1),
) * (args.temperature ** 2)
loss_ce = (loss_start + loss_end) / 2.0

loss = args.alpha_ce * loss_ce + args.alpha_squad * loss
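
The block reformatted above computes the standard temperature-scaled knowledge-distillation term: a KL divergence between the softened student and teacher distributions, multiplied by temperature**2 so that gradient magnitudes stay comparable across temperatures. A minimal standalone sketch of that term (illustrative; the function and variable names below are not the repository's):

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    def distillation_term(logits_stu, logits_tea, temperature=2.0):
        # KLDivLoss expects log-probabilities as input and probabilities as target.
        loss_fct = nn.KLDivLoss(reduction="batchmean")
        return loss_fct(
            F.log_softmax(logits_stu / temperature, dim=-1),
            F.softmax(logits_tea / temperature, dim=-1),
        ) * (temperature ** 2)

    # e.g. start-position logits for a batch of 4 examples over 384 positions
    loss_start = distillation_term(torch.randn(4, 384), torch.randn(4, 384))
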
3 changes: 1 addition & 2 deletions examples/distillation/utils.py
@@ -118,8 +118,7 @@ def init_gpu_params(params):
if params.multi_gpu:
logger.info("Initializing PyTorch distributed")
torch.distributed.init_process_group(
init_method="env://",
backend="nccl",
init_method="env://", backend="nccl",
)


(Diffs for the remaining changed files in this commit are not shown.)
